In [None]:
import json

# Load the pre-processed document chunks from disk. JSON text is UTF-8 by
# specification, so the encoding is pinned instead of being left to the
# platform's locale default (which is not UTF-8 on every system).
with open('docs.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# The chunks consumed by combine_docs live under the 'processed_data' key.
docs = data['processed_data']
docs

In [None]:
def combine_docs(docs):
    """Render document chunks as a single markdown string, styled per element category.

    Args:
        docs: Iterable of mappings, each with a "page_content" value and a
            "metadata" mapping. The optional "category" entry of the metadata
            (default "UncategorizedText") selects the markdown wrapper.

    Returns:
        The rendered chunks joined by blank lines. "Footer" and "PageNumber"
        chunks are omitted; categories without a wrapper are passed through
        unchanged.
    """
    # Page furniture that should not appear in the combined text.
    dropped = ("Footer", "PageNumber")

    # (category, wrapper) pairs; "{}" marks where the chunk text is inserted.
    wrappers = (
        ("Title", "## {}"),
        ("Header", "# {}"),
        ("ListItem", "* {}"),
        ("FigureCaption", "*Figure: {}*"),
        ("Formula", "```math\n{}\n```"),
        ("CodeSnippet", "```\n{}\n```"),
        # Tables are only wrapped in pipes; no row/column structure is rebuilt.
        ("Table", "| {} |"),
    )

    def render(kind, body):
        """Wrap body with the markdown for kind, or return it untouched."""
        for name, wrapper in wrappers:
            if kind == name:
                return wrapper.format(body)
        # NarrativeText and every other category are emitted as-is.
        return body

    pieces = []
    for chunk in docs:
        body = chunk["page_content"]
        kind = chunk["metadata"].get("category", "UncategorizedText")
        if kind in dropped:
            continue
        pieces.append(render(kind, body))

    return "\n\n".join(pieces)

# Render the loaded chunks with combine_docs and print the combined text.
# NOTE(review): the original comment called this "Process the PDF"; the chunks
# are read from docs.json, so the PDF origin is presumed — confirm.
text = combine_docs(docs)
print(text)

In [None]:
# NOTE(review): presumably pulls settings such as the OpenAI API key from a
# local .env file into the environment before ChatOpenAI is used — confirm.
from dotenv import load_dotenv
load_dotenv()


In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from fhir.resources.patient import Patient
from fhir.resources.observation import Observation
from fhir.resources.bundle import Bundle
from fhir.resources.fhirtypes import BundleEntryType

def combine_docs(docs):
    """Join document chunks into one text, prefixing each with its category.

    Args:
        docs: Iterable of mappings, each with a "page_content" value and a
            "metadata" mapping whose optional "category" entry defaults to
            "UncategorizedText".

    Returns:
        One "<category>: <content>" line group per chunk, separated by blank
        lines.
    """
    def labelled(chunk):
        """Return a single chunk rendered as "<category>: <content>"."""
        body = chunk["page_content"]
        kind = chunk["metadata"].get("category", "UncategorizedText")
        return f"{kind}: {body}"

    return "\n\n".join(map(labelled, docs))

def create_fhir_extraction_chain():
    """Build the prompt-plus-model chain that extracts clinical data as structured output.

    The chain pipes a two-message chat prompt (system instructions plus the
    report text supplied as the "text" variable) into a "gpt-4o-mini" chat
    model at temperature 0, constrained to a JSON schema with "patient",
    "observations", "conditions" and "medicationStatements" properties.

    Returns:
        The composed runnable; invoke it with {"text": <report text>}.
    """
    system_instructions = """You are an expert at converting medical documents into HL7 FHIR resources.
        Extract all relevant clinical information and structure it according to FHIR R5 standards.
        Only include information that is explicitly stated in the source document.
        Return a JSON object with separate 'patient','observations','conditions','medicationStatements' keys."""

    extraction_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        ("human", "Medical Report:\n{text}"),
    ])

    def object_list(description):
        """Schema fragment for an array of free-form objects."""
        return {
            "type": "array",
            "description": description,
            "items": {"type": "object"},
        }

    # Only the top-level keys are pinned; the inner objects are left
    # unconstrained by this schema.
    output_schema = {
        "title": "MedicalExtraction",
        "description": "Extract patient and observation data from medical text",
        "type": "object",
        "properties": {
            "patient": {
                "type": "object",
                "description": "Patient information",
            },
            "observations": object_list("List of observations"),
            "conditions": object_list("List of conditions"),
            "medicationStatements": object_list("List of medication statements"),
        },
    }

    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    structured_model = chat_model.with_structured_output(output_schema)
    return extraction_prompt | structured_model

def create_fhir_bundle(extracted_data):
    """Convert extracted data into a FHIR Bundle of type "collection".

    Only the patient and the observations are bundled; any "conditions" or
    "medicationStatements" present in extracted_data are not included.

    The Bundle is built in a single validated constructor call rather than
    created empty and mutated afterwards: `type` is a required Bundle element,
    and plain dicts appended to `entry` after construction would bypass the
    model's validation.

    Args:
        extracted_data: Mapping with a "patient" object and an optional
            "observations" list, in the shape produced by the extraction chain.

    Returns:
        A Bundle whose first entry is the patient, followed by one entry per
        observation.

    Raises:
        KeyError: If extracted_data has no "patient" key.
        Exception: Whatever validation error the FHIR models raise for
            malformed patient or observation data.
    """
    import uuid  # local import keeps this cell self-contained

    def _entry(resource):
        """Build one bundle entry, minting an id when the resource has none."""
        # Without this, a resource lacking an id would get the fullUrl
        # "urn:uuid:None", which identifies nothing.
        if not resource.id:
            resource.id = str(uuid.uuid4())
        return {
            "resource": resource,
            "fullUrl": f"urn:uuid:{resource.id}",
        }

    # Create Patient resource
    patient = Patient.parse_obj(extracted_data["patient"])

    # Create Observation resources; the key may be absent or null because the
    # extraction schema does not mark it as required.
    observations = [
        Observation.parse_obj(obs)
        for obs in (extracted_data.get("observations") or [])
    ]

    # Patient first, then the observations, in extraction order.
    entries = [_entry(patient)]
    entries.extend(_entry(obs) for obs in observations)

    return Bundle(type="collection", entry=entries)

# Main processing pipeline
def process_medical_report(docs):
    """Run the extraction chain over the combined chunks and return its output.

    Args:
        docs: Document chunks accepted by combine_docs.

    Returns:
        The structured data produced by the extraction chain. Conversion to a
        FHIR bundle via create_fhir_bundle is not applied here.
    """
    report_text = combine_docs(docs)
    chain = create_fhir_extraction_chain()
    return chain.invoke({"text": report_text})

# Run the pipeline on the loaded chunks. As the cell's last expression, the
# returned extraction is what the notebook displays.
# Disabled variant that expected a Bundle model back (process_medical_report
# above returns the raw extraction instead):
# fhir_bundle = process_medical_report(docs)
# print(fhir_bundle.json(indent=2))
process_medical_report(docs)

In [None]:
def create_fhir_extraction_chain():
    """Create an LLM chain that returns a FHIR Bundle for a medical report.

    The chain pipes a two-message chat prompt (system guidelines plus the
    report text supplied as the "text" variable) into a "gpt-4o-mini" chat
    model at temperature 0, constrained to a JSON schema describing a Bundle
    with a list of entries.

    The requested Bundle type is "collection". A FHIR "document" Bundle must
    start with a Composition resource, which these guidelines do not ask for
    (they put the Patient first), so requesting "document" would yield a
    structurally invalid Bundle.

    Returns:
        The composed runnable; invoke it with {"text": <report text>}.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert at converting medical documents into HL7 FHIR resources.
        Create a FHIR Bundle (R4) containing all relevant clinical information from the medical report.
        
        Important guidelines:
        - Create a Bundle of type "collection"
        - Include a Patient resource as the first entry
        - Add Observation resources for measurements and findings
        - Add Condition resources for diagnoses
        - Add MedicationStatement resources for medications
        - Ensure all resources have unique UUIDs
        - Use proper FHIR references between resources
        - Use standard codings (LOINC, SNOMED CT) where applicable
        - Include proper status and category fields
        - Only include information explicitly stated in the source
        
        Return a complete FHIR Bundle as a JSON object with all required fields.
        """),
        ("human", "Medical Report:\n{text}")
    ])

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    return prompt | llm.with_structured_output({
        "title": "MedicalExtraction",
        # Describes what this schema actually holds: a Bundle, not the
        # separate patient/observation keys of the earlier extraction schema.
        "description": "FHIR Bundle of resources extracted from a medical report",
        "type": "object",
        "properties": {
            "resourceType": {"type": "string", "enum": ["Bundle"]},
            # Must agree with the Bundle type named in the system prompt.
            "type": {"type": "string", "enum": ["collection"]},
            "entry": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        # Resource contents are left unconstrained here.
                        "resource": {"type": "object"}
                    }
                }
            }
        }
    })

def process_medical_report(docs):
    """Turn document chunks into the Bundle-shaped output of the extraction chain.

    Args:
        docs: Document chunks accepted by combine_docs.

    Returns:
        The chain's structured output, handed back as-is. No validation step
        (such as Bundle.parse_obj on the result) is performed here.
    """
    report_text = combine_docs(docs)
    chain = create_fhir_extraction_chain()
    return chain.invoke({"text": report_text})
# Run the pipeline and print the chain's output exactly as returned.
raw_bundle = process_medical_report(docs)
# NOTE(review): `bundle` is not defined at this level in the visible notebook;
# the disabled line below would need a validated Bundle model first.
# print(bundle.json(indent=2))
print(raw_bundle)

In [None]:
import json
from fhir.resources.R4B.bundle import Bundle


# Read a previously saved bundle from disk. JSON text is UTF-8 by
# specification, so the encoding is pinned instead of being left to the
# platform's locale default.
with open('bundle.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Parse the loaded object with the R4B Bundle model imported in this cell; the
# parsed model is the cell's displayed value.
# NOTE(review): presumably raises a validation error on a malformed bundle — confirm.
Bundle.parse_obj(data)