# Metadata Extraction and Chunking

In [None]:
# Coursera, Preprocessing Unstructured Data for LLM Applications, March 2024
# Modified code for this demo:  https://github.com/redhat-na-ssa/patientcharts2

In [None]:
# Warning control: install a blanket "ignore" filter so library warnings
# do not clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import logging
# getLogger() with no name returns the root logger; raising its level to
# CRITICAL hides lower-severity log records in the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [None]:
import json
from IPython.display import JSON

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.staging.base import dict_to_elements

import chromadb

In [None]:
from Utils import Utils

# The local Utils helper supplies the API key and server URL used below.
utils = Utils()
DLAI_API_KEY = utils.get_dlai_api_key()
DLAI_API_URL = utils.get_dlai_url()

# Client used for every Unstructured API call in this notebook.
s = UnstructuredClient(server_url=DLAI_API_URL, api_key_auth=DLAI_API_KEY)

## View the content of the file
- <a href="example_files/CP_CHRT_C_G4M3BA_De-identified.pdf">Patient Chart (View PDF) -- Click Here</a>

## Use Unstructured API

In [None]:
filename = "example_files/CP_CHRT_C_G4M3BA_De-identified.pdf"

# Read the PDF as raw bytes first; the file handle is closed before the
# request objects are built.
with open(filename, "rb") as f:
    pdf_bytes = f.read()

# Wrap the bytes and file name in the SDK's upload model, then build the
# partition request around it.
files = shared.Files(content=pdf_bytes, file_name=filename)
req = shared.PartitionParameters(files=files)

In [None]:
# Partition the document through the Unstructured API. On failure, report the
# SDK error and re-raise: every later cell reads `resp`, so carrying on would
# either hit a NameError or silently reuse a `resp` left over from an earlier
# run of this cell.
try:
    resp = s.general.partition(req)
except SDKError as e:
    print(e)
    raise

In [None]:
# Preview the first three partitioned elements. IPython's JSON display takes
# the list itself; a pre-serialized string would only be parsed straight back.
JSON(resp.elements[0:3])

## Find elements associated with patient chart sections

In [None]:
# Case-insensitive search for Title elements that mention the surgical-history
# section. The needle is lowercase to match the lowercased text; an upper-case
# needle could never be found in lowercased text.
[x for x in resp.elements if x["type"] == "Title" and "surgical history" in x["text"].lower()]

In [None]:
# Patient-chart section headings of interest. They are compared for exact
# equality against the text of Title elements when building `section_ids`.
sections = [
    "PAST MEDICAL HISTORY",
    "VACCINE HISTORY",
    "SURGICAL HISTORY",
    "SOCIAL HISTORY",
    "VITALS",
    "VIDEO EXAM VIA TELEMEDICINE",
    "ASSESSMENT & PLAN",
    "FOLLOW UP",
    "SIGNATURE",
]

In [None]:
# Map the element_id of each section-heading Title element to its section
# name, so other elements can later be tied to a section via their parent_id.
section_ids = {}
for element in resp.elements:
    heading = element["text"]
    # The text must equal one of the known section names exactly.
    if heading in sections and element["type"] == "Title":
        section_ids[element["element_id"]] = heading

In [None]:
# Display the element_id -> section name mapping built above.
section_ids

In [None]:
# Invert the mapping so a section name resolves to its Title element's id.
section_to_id = {v: k for k, v in section_ids.items()}

# Look the section up once, outside the comprehension: the id does not change
# per element, and a missing section now fails here with a clear KeyError.
surgical_history_id = section_to_id["SURGICAL HISTORY"]

# Show the first element whose parent is the SURGICAL HISTORY heading.
[x for x in resp.elements if x["metadata"].get("parent_id") == surgical_history_id][0]

## Load documents into a vector DB

In [None]:
# Persistent Chroma store under ./chroma_tmp. The settings enable reset, and
# the reset call is made immediately so the notebook starts from a clean store.
settings = chromadb.Settings(allow_reset=True)
client = chromadb.PersistentClient(path="chroma_tmp", settings=settings)
client.reset()

In [None]:
# Collection holding the chart elements; the "hnsw:space" metadata key selects
# cosine distance for the index.
collection_metadata = {"hnsw:space": "cosine"}
collection = client.create_collection(name="patientcharts", metadata=collection_metadata)

In [None]:
# Index every element with one batched `add` call instead of one call per
# element. Each element is tagged with the section whose Title element is its
# parent, or with an empty string when the parent is not a known section heading.
ids, documents, metadatas = [], [], []
seen_ids = set()
for element in resp.elements:
    element_id = element["element_id"]
    # Chroma rejects duplicate IDs within a single batch. Keeping only the
    # first occurrence mirrors the per-element loop, where re-adding an
    # existing ID is documented to be ignored.
    if element_id in seen_ids:
        continue
    seen_ids.add(element_id)
    parent_id = element["metadata"].get("parent_id")
    ids.append(element_id)
    documents.append(element["text"])
    metadatas.append({"section": section_ids.get(parent_id, "")})

# Skip the call entirely when there is nothing to add; Chroma does not accept
# an empty batch.
if ids:
    collection.add(documents=documents, ids=ids, metadatas=metadatas)

## View the elements in the vector DB

In [None]:
# Peek at a sample of stored records and print just their document texts.
results = collection.peek()
stored_documents = results["documents"]
print(stored_documents)

## Perform a hybrid search with metadata

In [None]:
# Similarity query restricted by metadata: only records tagged with the
# SURGICAL HISTORY section are candidates, and the top two are returned.
question = "Did the patient have a skin graft?"
section_filter = {"section": "SURGICAL HISTORY"}
result = collection.query(
    query_texts=[question],
    where=section_filter,
    n_results=2,
)
print(json.dumps(result, indent=2))

## Chunking Content

In [None]:
# Convert the API's element dicts into unstructured element objects, which is
# what the chunking call below is given.
elements = dict_to_elements(resp.elements)

In [None]:
# Title-based chunking of the partitioned elements, with a 3000-character
# maximum and a 100-character threshold for combining short text.
chunking_options = {
    "combine_text_under_n_chars": 100,
    "max_characters": 3000,
}
chunks = chunk_by_title(elements, **chunking_options)

In [None]:
# Show the first chunk as an interactive JSON tree. IPython's JSON display
# takes the dict itself; a pre-serialized string would only be parsed straight
# back.
JSON(chunks[0].to_dict())

In [None]:
# Number of elements before chunking.
len(elements)

In [None]:
# Number of chunks produced by chunk_by_title, for comparison with the
# element count.
len(chunks)