# 2 - Metadata Extraction and Chunking

This Lab material is an adaptation of the course 'Preprocessing Unstructured Data for LLM Applications', Coursera, March 2024.  

There is also a new Unstructured.io SDK available:  https://docs.unstructured.io/api-reference/api-services/examples#ingest-python

In the previous jupyter notebook, we were able to obtain document elements and metadata.  In this jupyter notebook, we are now ready to perform metadata extraction and chunking.

Let's get started!

In [1]:
# Install Unstructured.io libraries and chromadb.  Chromadb is an in memory vector database.  If you don't know what a vector database
# is, read the section 'What is a Vector Database?'in the geeksforgeeks.org website.

!pip install unstructured_client
!pip install unstructured
!pip install chromadb
!pip install pysqlite3-binary


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting unstructured
  Downloading unstructured-0.16.5-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting python-iso639
  Downloading python_iso639-2024.10.22-py3-none-any.whl (274 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.9/274.9 kB[0m [31m214.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filetype
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m213.3 MB/s[0m eta [36m0:00:00[0m
Collecting python-oxmsg
  Downloading 


**Note:**
Chroma requires SQLite > 3.35.  If you are on a linux system, you can install pysqlite3-binary and then override 
the default sqlite3 library before running Chroma.


In [2]:
import pysqlite3
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [3]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [4]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [5]:
import json
from IPython.display import JSON

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.staging.base import dict_to_elements

import chromadb

In [11]:
#Connection to Unstructured model API
    #Use the DLAI_API_KEY & DLAI_API_URL that you obtained from Unstructured.io
    #UNSTRUCTURED_API_KEY = 'your DLAI_API_KEY'
    #UNSTRUCTURED_API_URL = 'your DLAI_API_URL'

#Example using your credentials
UNSTRUCTURED_API_KEY = 'tkp3I9iABLDbcJvfgGvnELB4Y2usgn'
UNSTRUCTURED_API_URL = 'https://naaissa-62qdjqlm.api.unstructuredapp.io/'

#Create a client
client = UnstructuredClient(
    api_key_auth=UNSTRUCTURED_API_KEY,
    server_url=UNSTRUCTURED_API_URL,
)

## View the content of the file
- <a href="example_files/CP_CHRT_C_G4M3BA_De-identified.pdf">Patient Chart (View PDF) -- Click Here</a>

## Use Unstructured API

In [12]:
filename = "example_files/CP_CHRT_C_G4M3BA_De-identified.pdf"

with open(filename, "rb") as f:
    files=shared.Files(
        content=f.read(),
        file_name=filename,
    )

req = {
    "partition_parameters": {
        "files": {
            "content": open(filename, "rb"),
            "file_name": filename,
        },
        "strategy": shared.Strategy.HI_RES,
        "languages": ['eng'],
        "split_pdf_page": True,            # If True, splits the PDF file into smaller chunks of pages.
        "split_pdf_allow_failed": True,    # If True, the partitioning continues even if some pages fail.
        "split_pdf_concurrency_level": 15  # Set the number of concurrent request to the maximum value: 15.
    }
}

try:
    res = client.general.partition(request=req)
    element_dicts = [element for element in res.elements]

    # Print the processed data's first element only.
    print(element_dicts[0])

    # Write the processed data to a local file.
    json_elements = json.dumps(element_dicts, indent=2)

    with open("PATH_TO_OUTPUT_FILE", "w") as file:
        file.write(json_elements)
except Exception as e:
    print(e)

INFO: HTTP Request: POST https://naaissa-62qdjqlm.api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


{'type': 'Title', 'element_id': 'da061e7b23ad36a4729f7ee39657b7a2', 'text': 'PAST MEDICAL HISTORY', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'CP_CHRT_C_G4M3BA_De-identified.pdf'}}


In [15]:
JSON(json.dumps(res.elements[0:20], indent=2))

<IPython.core.display.JSON object>

## Find elements associated with patient chart sections

In [16]:
[x for x in res.elements if x['type'] == 'Title' and 'SURGICAL HISTORY' in x['text'].lower()]

[]

In [17]:
sections = [
    "PAST MEDICAL HISTORY",
    "VACCINE HISTORY",
    "SURGICAL HISTORY",
    "SOCIAL HISTORY",
    "VITALS",
    "VIDEO EXAM VIA TELEMEDICINE",
    "ASSESSMENT & PLAN",
    "FOLLOW UP",
    "SIGNATURE",
]

In [19]:
section_ids = {}
for element in res.elements:
    for section in sections:
        if element["text"] == section and element["type"] == "Title":
            section_ids[element["element_id"]] = section
            break

In [20]:
section_ids

{'da061e7b23ad36a4729f7ee39657b7a2': 'PAST MEDICAL HISTORY',
 '6c8dbb5374802cd239e1b79bb3a0bcf4': 'VACCINE HISTORY',
 '739ec169e9f67ffdd42e595921338a4e': 'SURGICAL HISTORY',
 '6c3cb75b3189082a62400014b632894b': 'SOCIAL HISTORY',
 'a3a664150ef280a7af534a7c1e1114db': 'VIDEO EXAM VIA TELEMEDICINE',
 'b217300f8cc2fa7150c186ea32efe177': 'ASSESSMENT & PLAN'}

In [21]:
section_to_id = {v: k for k, v in section_ids.items()}
[x for x in res.elements if x["metadata"].get("parent_id") == section_to_id["SURGICAL HISTORY"]][0]

{'type': 'NarrativeText',
 'element_id': 'a1cc57d6952d90eba8bf16566cc4b75c',
 'text': '1. Pr explore parathyroid glands n/a. Procedure: neck exploration with parathyroidectomy',
 'metadata': {'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'parent_id': '739ec169e9f67ffdd42e595921338a4e',
  'filename': 'CP_CHRT_C_G4M3BA_De-identified.pdf'}}

## Load documents into a vector db

In [22]:
client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
client.reset()

INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


True

In [23]:
collection = client.create_collection(
    name="patientcharts",
    metadata={"hnsw:space": "cosine"}
)

In [24]:
for element in res.elements:
    parent_id = element["metadata"].get("parent_id")
    section = section_ids.get(parent_id, "")
    collection.add(
        documents=[element["text"]],
        ids=[element["element_id"]],
        metadatas=[{"section": section}]
    )

INFO: HTTP Request: GET https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz "HTTP/1.1 200 OK"
/opt/app-root/src/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 103MiB/s] 


## See the elements in Vector DB

In [25]:
results = collection.peek()
print(results["documents"])

['PAST MEDICAL HISTORY', 'Patient has a past medical history of Anxiety, Disease of thyroid gland, Migraine, Obstructive sleep apnea i Prediabetes.', "She has no past medical history of Angina pectoris (CMS/HCC), Arthritis, Asthma, Atrial fibrillation (CMS/HCC), Awareness under anesthesia, Basal cell carcinoma, Cancer (CMS/HCC), Chronic kidney disease, Chronic pain disorder, Chronic renal failure, COPD (chronic obstructive pulmonary disease) (CMS/HCC), Deep vein thrombosis (CMS/HCC), Delayed emergence from general anesthesia, Depression, Diabetes mellitus type I (CMS/HCC), Diabetic retinopathy (CMS/HCC), Dry eyes, Epilepsy (CMS/HCC), Eye trauma, GERD (gastroesophageal reflux disease), Glaucoma, Hard to intubate, Heart disease, Heart murmur, Hiatal hemia, HIV disease (CMS/HCC), Hypertension, Hypertensive retinopathy, Infectious viral hepatitis, Macular degeneration, Malignant hyperthermia, Melanoma (CMS/HCC), Mitral valve prolapse, Motion sickness, Myocardial infarction (CMS/HCC), Parki

## Perform a hybrid search with metadata

In [26]:
result = collection.query(
    query_texts=["Did the patient have a skin graft?"],
    n_results=2,
    where={"section": "SURGICAL HISTORY"},
)
print(json.dumps(result, indent=2))

{
  "ids": [
    [
      "36478bc9dee3d62edbb9a77020367156",
      "7adba53b444557ec7f50ff8815231a98"
    ]
  ],
  "embeddings": null,
  "documents": [
    [
      "4. Skin Graft: suffered burns and skin grafts from truck explosion",
      "3. Right Procedure: strabismus surgery, right eye, Ophthalmology"
    ]
  ],
  "uris": null,
  "data": null,
  "metadatas": [
    [
      {
        "section": "SURGICAL HISTORY"
      },
      {
        "section": "SURGICAL HISTORY"
      }
    ]
  ],
  "distances": [
    [
      0.3832554706013752,
      0.6725163885487607
    ]
  ],
  "included": [
    "distances",
    "documents",
    "metadatas"
  ]
}


## Chunking Content

In [27]:
elements = dict_to_elements(res.elements)

In [28]:
chunks = chunk_by_title(
    elements,
    combine_text_under_n_chars=100,
    max_characters=3000,
)

In [29]:
JSON(json.dumps(chunks[0].to_dict(), indent=2))

<IPython.core.display.JSON object>

In [30]:
len(elements)

50

In [31]:
len(chunks)

7