In [None]:
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.chains import LLMChain
import json

# LLM backend: the DeepSeek-R1 1.5B model, accessed through LangChain's Ollama wrapper.
llm = Ollama(
    model="deepseek-r1:1.5b",
)

# Fields the model is asked to extract. Each ResponseSchema name becomes a key
# in the parsed result; each description is inserted into the prompt's format
# instructions, so it must describe the field accurately.
response_schemas = [
    ResponseSchema(name="Name", description="The full name of the person mentioned in the text"),
    ResponseSchema(name="Age", description="The age of the person (if available)"),
    # Previously described as the person's employer, which pointed the model
    # at the wrong entity for a hospital field.
    ResponseSchema(name="Hospital Name", description="The name of the hospital where the patient was admitted or treated"),
    ResponseSchema(name="Diagnosis", description="Disease, illness or injury"),
    ResponseSchema(name="Salary", description="The salary or compensation details (if available)"),
    ResponseSchema(name="Location", description="The city or country where the person resides"),
    ResponseSchema(name="Policy ID", description="Policy ID of the claim"),
    ResponseSchema(name="Claim ID", description="Unique claim ID"),
    ResponseSchema(name="Amount Charged", description="The total amount billed for the service,dont include currency"),
    ResponseSchema(name="Amount Paid", description="The amount actually paid by insurance,dont include currency"),
    ResponseSchema(name="Admit Date", description="The date when patient was admitted to hospital"),
    ResponseSchema(name="Discharge Date", description="The date when patient was Discharged from hospital"),
    ResponseSchema(name="Surgery Performed", description="Name of surgery/treatment performed"),
    ResponseSchema(name="Medications", description="Names of medications, painkillers, antibiotics, etc prescribed"),
]

# Parser built from the schemas above; it supplies the format instructions for
# the prompt and is used to parse the model's reply.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Prompt for the extraction chain. `text` is supplied per call; the parser's
# format instructions are bound once via partial_variables.
# The bullet list uses exactly the same field names as `response_schemas`
# ("Admit Date", "Medications") so the model is not given two competing
# spellings for the keys the parser expects.
prompt_template = PromptTemplate(
    input_variables=["text"],
    template="""
    Extract the following details from the provided unstructured text and return them in JSON format, Dont Include currency in Amount:

    If any field is missing, explicitly return it as `null` (None in Python). Ensure the response is **only valid JSON** without extra text.

    - Name
    - Age
    - Hospital Name
    - Diagnosis
    - Salary
    - Location
    - Policy ID
    - Claim ID
    - Amount Paid
    - Amount Charged
    - Admit Date
    - Discharge Date
    - Surgery Performed
    - Medications
    
    Text:
    {text}

    Use the following JSON format:
    {format_instructions}
    """,
    partial_variables={"format_instructions": output_parser.get_format_instructions()}
)

# Chain that renders `prompt_template` and sends the result to `llm`.
llm_chain = LLMChain(
    prompt=prompt_template,
    llm=llm,
)

def extract_features(unstructured_text):
    """Run the extraction chain on raw text and return the parsed result.

    The text is passed to ``llm_chain`` as the prompt's ``text`` variable, and
    the chain's reply is handed to ``output_parser.parse``; whatever the
    parser returns is returned unchanged.
    """
    raw_completion = llm_chain.run(text=unstructured_text)
    return output_parser.parse(raw_completion)

def clean_and_convert_to_json(extracted_data):
    """
    Serialize extracted fields to a JSON string, mapping blank strings to null.

    A value counts as blank when it is a string that is empty or contains only
    whitespace. Every other value (numbers, None, non-blank strings, ...) is
    passed through untouched.

    Args:
    - extracted_data (dict): Dictionary containing extracted features.

    Returns:
    - str: JSON text indented by 4 spaces, with blank strings emitted as null.
    """
    normalized = {}
    for field, raw in extracted_data.items():
        is_blank_text = isinstance(raw, str) and not raw.strip()
        normalized[field] = None if is_blank_text else raw

    return json.dumps(normalized, indent=4)

# Example usage: extract fields from a sample claim narrative, print them, and
# save the cleaned result as JSON.
if __name__ == "__main__":
    text_data = """
    Sarvesh Pujare, aged 24, was admitted to Jupiter Hospital on February 12, 2024, after he was hit by a short-distance delivery while playing cricket, resulting in a fractured right forearm. He was diagnosed with a displaced radius fracture and underwent an Open Reduction and Internal Fixation (ORIF) surgery on the same day. His insurance policy ID is HPX12345678 under MediCare Insurance, and his claim ID is CLM20240212002. The total amount charged for his hospitalization, surgery, and post-operative care was ₹2,75,000, out of which the insurance provider approved and paid ₹2,50,000. The remaining ₹25,000 was covered by Sarvesh as part of his deductible and co-payment. He was discharged on February 15, 2024, with a prescribed physiotherapy plan for recovery over the next six weeks. Medications prescribed included painkillers (Ibuprofen 400mg), antibiotics (Amoxicillin 500mg), and calcium supplements for bone healing.  
    """

    extracted_info = extract_features(text_data)
    print("Extracted Features:", extracted_info)

    # Write the *cleaned* JSON to disk. The original computed `json_output`
    # but then dumped the raw `extracted_info`, so blank strings were never
    # converted to null in the saved file.
    json_output = clean_and_convert_to_json(extracted_info)
    with open("extracted_claim.json", "w", encoding="utf-8") as json_file:
        json_file.write(json_output)
    



Extracted Features: {'Name': 'Sarvesh Pujare', 'Age': 24, 'Hospital Name': 'Jupiter Hospital', 'Diagnosis': 'He was hit by a short-distance delivery while playing cricket, resulting in a fractured right forearm.', 'Salary': None, 'Location': None, 'Policy ID': 'HPX12345678', 'Claim ID': 'CLM20240212002', 'Amount Charged': '₹2,75,000', 'Amount Paid': '₹2,50,000', 'Admit Date': '2024-02-12', 'Discharge Date': '2024-02-15', 'Surgery Performed': 'Open Reduction and Internal Fixation (ORIF) surgery on the same day', 'Medications': 'painkillers, antibiotics, calcium supplements'}
