# Preprocessing PDFs

In [None]:
# Coursera, "Preprocessing Unstructured Data for LLM Applications", March 2024
# Modified code for this demo: https://github.com/redhat-na-ssa/patientcharts2

In [None]:
# Warning control: install an "ignore" filter with no category or message
# restriction, so warnings raised after this cell runs are not shown.
import warnings

warnings.filterwarnings(action="ignore")

In [None]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf

from unstructured.staging.base import dict_to_elements

In [None]:
# Build the Unstructured API client. The key and server URL are read through
# the project-local Utils helper rather than being written into the notebook.
from Utils import Utils

utils = Utils()
DLAI_API_KEY = utils.get_dlai_api_key()
DLAI_API_URL = utils.get_dlai_url()

# `s` is the client object used for the API request further down.
s = UnstructuredClient(api_key_auth=DLAI_API_KEY, server_url=DLAI_API_URL)

## Example Patient Chart: PDF

### View the content of the file
- <a href="example_files/CP_CHRT_C_G4M3BA_De-identified.pdf">Patient Chart (View PDF) -- Click Here</a>


## Process the PDF with Document Layout Detection

In [None]:
# Partition the sample patient chart with the unstructured library's
# partition_pdf, using the "fast" strategy. `filename` is reused below
# when the same file is sent to the API.
filename = "example_files/CP_CHRT_C_G4M3BA_De-identified.pdf"
pdf_elements = partition_pdf(
    filename=filename,
    strategy="fast",
)

In [None]:
# Preview the first ten elements from the "fast" partition, one per line,
# as "<CATEGORY>: <text>".
pdf_preview = pdf_elements[:10]
for element in pdf_preview:
    print(f"{element.category.upper()}: {element.text}")

In [None]:
# Read the PDF as bytes and wrap it in the upload object passed to the API.
with open(filename, "rb") as f:
    files = shared.Files(
        content=f.read(),
        file_name=filename,
    )

# Request the "hi_res" strategy with the "yolox" model for this file.
req = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    hi_res_model_name="yolox",
)

# Reset the result before the request. If the call fails, the cells below
# then see an empty list instead of raising NameError or silently reusing
# elements left over from an earlier successful run of this cell.
dld_elements = []
try:
    resp = s.general.partition(req)
    # Convert the element dicts in the response into element objects.
    dld_elements = dict_to_elements(resp.elements)
except SDKError as e:
    # Best-effort: report the API error and keep the notebook running.
    print(e)

In [None]:
# Preview the first ten elements from the "hi_res" API response, one per
# line, as "<CATEGORY>: <text>".
dld_preview = dld_elements[:10]
for element in dld_preview:
    print(f"{element.category.upper()}: {element.text}")

In [None]:
import collections

In [None]:
# Number of elements in dld_elements (displayed as the cell's output).
len(dld_elements)

In [None]:
# Tally how many elements of each category are in dld_elements and display
# the (category, count) pairs ordered from most to least common.
dld_categories = [element.category for element in dld_elements]
category_counts = collections.Counter(dld_categories)
category_counts.most_common()