In [None]:
import json
from dotenv import load_dotenv
# Load environment variables from a local .env file (if one exists) before
# any client that reads its configuration from the environment is created.
load_dotenv()

# Read the pre-processed document chunks. The encoding is given explicitly so
# that non-ASCII report text decodes identically on every platform instead of
# depending on the locale's default encoding.
with open('docs.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# The chunk list lives under the "processed_data" key of the JSON document.
docs = data['processed_data']
docs

In [None]:
def combine_docs(docs):
    """Combine document chunks into one markdown-formatted string.

    Each chunk is rendered according to the ``category`` found in its
    metadata. Chunks categorised as ``Footer`` or ``PageNumber`` are dropped;
    any other unrecognised category is emitted verbatim.

    Args:
        docs: Iterable of dicts. Each dict must have a ``"page_content"`` key
            and may have a ``"metadata"`` dict containing ``"category"``.
            A missing or ``None`` ``"metadata"`` is treated as uncategorised
            text instead of raising.

    Returns:
        str: The rendered chunks joined by blank lines (``"\\n\\n"``); an empty
        string when there is nothing to render.

    Raises:
        KeyError: If a chunk has no ``"page_content"`` key.
    """
    formatted_chunks = []

    for doc in docs:
        content = doc["page_content"]
        # Tolerate chunks without metadata (key absent or explicitly None).
        metadata = doc.get("metadata") or {}
        category = metadata.get("category", "UncategorizedText")

        # Format based on element type
        # NOTE(review): "Title" maps to a level-2 heading and "Header" to a
        # level-1 heading — confirm this ordering is intended.
        if category == "Title":
            formatted = f"## {content}"
        elif category == "Header":
            formatted = f"# {content}"
        elif category == "ListItem":
            formatted = f"* {content}"
        elif category == "FigureCaption":
            formatted = f"*Figure: {content}*"
        elif category == "Formula":
            formatted = f"```math\n{content}\n```"
        elif category == "CodeSnippet":
            formatted = f"```\n{content}\n```"
        elif category == "Table":
            # Tables might need more complex handling depending on structure
            formatted = f"| {content} |"
        elif category in ["Footer", "PageNumber"]:
            # Skip footer and page numbers
            continue
        else:
            # Default handling for NarrativeText and other types
            formatted = content

        formatted_chunks.append(formatted)

    return "\n\n".join(formatted_chunks)

# Render the loaded document chunks as a single markdown string and show it
text = combine_docs(docs)
print(text)

In [None]:
from pydantic import BaseModel, Field
from typing import List, Literal, Optional, Union

# Base FHIR Models
# FHIR-style Quantity: a measured amount with optional unit coding.
# Every field is optional and defaults to None.
class Quantity(BaseModel):
    value: Optional[float] = None 
    # Restricted to the listed symbols.
    # NOTE(review): "ad" does not appear to be a FHIR R4 comparator code (it looks
    # like an R5 addition) while the extraction prompt asks for R4 — confirm.
    comparator: Optional[Literal["<", "<=", ">=", ">", "ad"]] = None  # How to understand the value
    unit: Optional[str] = None 
    system: Optional[str] = None 
    code: Optional[str] = None

# FHIR-style Coding: one code taken from a code system.
# `system` and `code` are required; `display` is optional and defaults to None.
class Coding(BaseModel):
    system: str
    code: str
    display: Optional[str] = None

# FHIR-style CodeableConcept: a list of Coding entries plus optional free text.
class CodeableConcept(BaseModel):
    # default_factory gives each instance its own empty list
    coding: List[Coding] = Field(default_factory=list)
    text: Optional[str] = None

# FHIR-style Identifier: a required `value`, optionally qualified by `system`.
class Identifier(BaseModel):
    system: Optional[str] = None
    value: str

# FHIR-style HumanName. No field is required: the string fields default to
# None and the list fields default to a fresh empty list per instance.
class HumanName(BaseModel):
    text: Optional[str] = None
    family: Optional[str] = None
    given: Optional[List[str]] = Field(default_factory=list)
    prefix: Optional[List[str]] = Field(default_factory=list)
    suffix: Optional[List[str]] = Field(default_factory=list)

# FHIR-style Address. No field is required: the string fields default to
# None and `line` defaults to a fresh empty list per instance.
class Address(BaseModel):
    text: Optional[str] = None
    line: Optional[List[str]] = Field(default_factory=list)
    city: Optional[str] = None
    state: Optional[str] = None
    postalCode: Optional[str] = None
    country: Optional[str] = None

# FHIR-style Dosage: optional free text plus optional coded timing, route and
# method. Every field defaults to None.
class Dosage(BaseModel):
    text: Optional[str] = None
    timing: Optional[CodeableConcept] = None
    route: Optional[CodeableConcept] = None
    method: Optional[CodeableConcept] = None

# FHIR Resources
# Patient resource model. No field has to be supplied by the caller.
class Patient(BaseModel):
    # Fixed discriminator: only the value "Patient" validates, and it is the default.
    resourceType: Literal["Patient"] = "Patient"
    identifier: Optional[List[Identifier]] = Field(default_factory=list)
    name: Optional[List[HumanName]] = Field(default_factory=list)
    gender: Optional[Literal["male", "female", "other", "unknown"]] = None
    # Stored as a plain string; no date format is enforced by this model.
    birthDate: Optional[str] = None
    address: Optional[List[Address]] = Field(default_factory=list)

# Observation resource model. `status` and `code` are required; the rest is optional.
class Observation(BaseModel):
    # Fixed discriminator: only the value "Observation" validates, and it is the default.
    resourceType: Literal["Observation"] = "Observation"
    identifier: Optional[List[Identifier]] = Field(default_factory=list)
    # Must be one of the listed status codes.
    status: Literal["registered", "preliminary", "final", "amended", "corrected", "cancelled", "entered-in-error", "unknown"]
    code: CodeableConcept
    # Stored as a plain string; no date-time format is enforced by this model.
    effectiveDateTime: Optional[str] = None
    valueQuantity: Optional[Quantity] = None  # For numeric measurements

# Condition resource model. `clinicalStatus` and `code` are required.
class Condition(BaseModel):
    # Fixed discriminator: only the value "Condition" validates, and it is the default.
    resourceType: Literal["Condition"] = "Condition"
    identifier: Optional[List[Identifier]] = Field(default_factory=list)
    clinicalStatus: CodeableConcept
    code: CodeableConcept
    # Stored as a plain string; no date-time format is enforced by this model.
    onsetDateTime: Optional[str] = None

# MedicationStatement resource model. Only `status` is required.
class MedicationStatement(BaseModel):
    # Fixed discriminator: only "MedicationStatement" validates, and it is the default.
    resourceType: Literal["MedicationStatement"] = "MedicationStatement"
    identifier: Optional[List[Identifier]] = Field(default_factory=list)
    # NOTE(review): these three codes look like the FHIR R5 status value set, while
    # the extraction prompt asks for R4 (active, completed, ...) — confirm which
    # FHIR version this model should follow.
    status: Literal["recorded", "entered-in-error", "draft"]
    medicationCodeableConcept: Optional[CodeableConcept] = None
    # Stored as a plain string; no date-time format is enforced by this model.
    effectiveDateTime: Optional[str] = None
    dosage: Optional[List[Dosage]] = Field(default_factory=list)

# Top-level container used as the structured-output schema for FHIR extraction.
# `type` and `timestamp` are required.
class Bundle(BaseModel):
    # Fixed discriminator: only the value "Bundle" validates, and it is the default.
    resourceType: Literal["Bundle"] = "Bundle"
    # Only the value "document" is accepted.
    type: Literal["document"]
    # Stored as a plain string; no timestamp format is enforced by this model.
    timestamp: str
    coding: List[Coding] = Field(default_factory=list)
    # Resources are held directly in the list; each item must validate as one of
    # the four resource models.
    # NOTE(review): there is no wrapper element per entry here — presumably a
    # deliberate simplification of the FHIR Bundle layout; confirm.
    entry: List[Union[Patient, Observation, Condition, MedicationStatement]] = Field(default_factory=list)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

def create_fhir_extraction_chain(model: str = "gpt-4o-mini", temperature: float = 0):
    """Create an LLM chain that extracts a FHIR ``Bundle`` from a medical report.

    Args:
        model: Name of the OpenAI chat model handed to ``ChatOpenAI``.
            Defaults to ``"gpt-4o-mini"``.
        temperature: Sampling temperature handed to ``ChatOpenAI``.
            Defaults to ``0``.

    Returns:
        The prompt piped into the LLM configured for structured output with
        ``Bundle`` as schema (``method="json_schema"``, ``strict=True``).
        Invoke it with ``{"text": <report text>}``.
    """
    # NOTE(review): the guidelines below ask for UUIDs, patient references and
    # observation categories, but the Bundle schema defines no fields that
    # could carry them — confirm whether the prompt or the schema should change.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert at converting medical documents into HL7 FHIR resources.
        Extract all relevant clinical information and structure it according to FHIR R4 standards.
        
        Important guidelines:
        - Ensure all required FHIR fields are populated
        - Use standard FHIR codings (LOINC, SNOMED CT, etc.) where applicable
        - Include proper status and category fields for observations
        - Generate unique UUIDs for all resources
        - Link observations to the patient using proper references
        - Include dates and times when available
        - Only extract information explicitly stated in the source
        - Try to extract as much information as possible.
        - Try to fill in all optional fields.
        - Try to keep the format as close to the original as possible.
        - Keep the original language.
        """),
        ("human", "Medical Report:\n{text}")
    ])

    llm = ChatOpenAI(model=model, temperature=temperature)
    return prompt | llm.with_structured_output(schema=Bundle, method="json_schema", strict=True)

In [None]:
# Build the FHIR extraction chain and run it on the combined markdown text
chain = create_fhir_extraction_chain()

bundle = chain.invoke({"text": text})
# Bare last expression so the notebook displays the result
bundle

In [None]:
# Print the extracted bundle serialized as JSON.
# NOTE(review): `.json()` is deprecated in Pydantic v2 (replacement:
# `.model_dump_json()`) — confirm which Pydantic version is installed.
print(bundle.json())

In [None]:
# Structured-output schema for the plain-text extraction chain: a single
# required string field.
class PlainText(BaseModel):
    text: str

def create_plain_text_extraction_chain(model: str = "gpt-4o-mini", temperature: float = 0):
    """Create an LLM chain that rewrites a markdown medical report as plain text.

    Args:
        model: Name of the OpenAI chat model handed to ``ChatOpenAI``.
            Defaults to ``"gpt-4o-mini"``.
        temperature: Sampling temperature handed to ``ChatOpenAI``.
            Defaults to ``0``.

    Returns:
        The prompt piped into the LLM configured for structured output with
        ``PlainText`` as schema (``method="json_schema"``, ``strict=True``).
        Invoke it with ``{"text": <markdown report>}``.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert at extracting relevant medical information from documents.
        Convert the markdown-formatted medical report into clean plaintext while:
        
        - Preserve all important medical information including:
          - Patient details
          - Diagnoses
          - Medications
          - Test results
          - Treatment plans
          - Clinical observations
        - Keep original medical terminology and exact values
        - Maintain the original sentence structure where possible
        - Remove formatting markers, headers, footers, page numbers
        - Skip administrative metadata
        - Present information in a clear, readable format
        - Preserve the logical flow of information
        - Keep the text as close to the original as possible.
        - Keep the original language.
        
        Return only the essential medical content in plain text format.
        """),
        ("human", "{text}")
    ])

    llm = ChatOpenAI(model=model, temperature=temperature)
    return prompt | llm.with_structured_output(schema=PlainText, method="json_schema", strict=True)

In [None]:
# Build the plain-text extraction chain, run it on the combined markdown text
# and print the returned object
plain_text_chain = create_plain_text_extraction_chain()
plain_text = plain_text_chain.invoke({"text": text})
print(plain_text)

In [None]:
# Print the plain-text result serialized as JSON.
# NOTE(review): `.json()` is deprecated in Pydantic v2 (replacement:
# `.model_dump_json()`) — confirm which Pydantic version is installed.
print(plain_text.json())

In [None]:
# Print only the extracted text, without the surrounding model wrapper
print(plain_text.text)

In [None]:
from langchain_core.runnables import RunnableParallel

# Fresh instances of both chains, combined so that one invocation feeds the
# same input to each and returns a dict keyed by "fhir" and "plain_text".
fhir_extraction_chain = create_fhir_extraction_chain()
plain_text_chain = create_plain_text_extraction_chain()

extraction_chains = RunnableParallel(fhir=fhir_extraction_chain, plain_text=plain_text_chain)

results = extraction_chains.invoke({"text": text})

# Heading and payload go on separate lines, one pair per result.
print("FHIR Bundle:", results["fhir"], sep="\n")
print("\nPlain Text:", results["plain_text"], sep="\n")
