In [3]:
import numpy as np
import pandas as pd
import os
import csv
import re
from dotenv import load_dotenv
import json
from openai import OpenAI
from urllib.parse import urlparse
import hashlib
import pdfkit
import requests
from playwright.sync_api import sync_playwright
from openai import OpenAI
import json
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

In [4]:
path = '/home/cptaswadu/RESCUE-n8n/insurance'
load_dotenv(dotenv_path=os.path.join(path, ".env"))
openai_api_key = os.getenv("OPEN_AI_API_KEY")
perplexity_api_key = os.getenv("PERPLEXITY_API_KEY")
client = OpenAI(api_key=openai_api_key)

In [5]:
def generate_search_query_from_patient_info(patient_info_text):
    insurance_provider = None
    if "United Healthcare" in patient_info_text or "UnitedHealthcare" in patient_info_text:
        insurance_provider = "UnitedHealthcare"
    elif "Aetna" in patient_info_text:
        insurance_provider = "Aetna"
    elif "Cigna" in patient_info_text:
        insurance_provider = "Cigna"
    elif "Blue Cross" in patient_info_text:
        insurance_provider = "Blue Cross"

    plan_match = re.search(r"covered by (.*?) and resides", patient_info_text)
    plan = plan_match.group(1) if plan_match else ""

    location_match = re.search(r"resides in ([A-Za-z ]+)[\\.]?", patient_info_text)
    location = location_match.group(1).strip() if location_match else ""

    if "Whole exome sequencing" in patient_info_text or "WES" in patient_info_text:
        test = "Whole Exome Sequencing"
    elif "BRCA1" in patient_info_text or "BRCA2" in patient_info_text:
        test = "BRCA1/BRCA2"
    else:
        test = "Genetic Testing"

    search_query = f"{insurance_provider} {plan} {location} {test} coverage policy"
    return search_query

In [6]:
def find_policy_url_with_web_search(patient_info_text):
    search_query = generate_search_query_from_patient_info(patient_info_text)

    prompt = f"""
You are a clinical insurance assistant.

Search for the most relevant and up-to-date official insurance policy document URL that describes genetic testing coverage for this patient, considering the patient's clinical situation and insurance plan.

PATIENT INFORMATION:
{patient_info_text}

Search Query (for reference):
{search_query}

Instructions:
- Prioritize policies that are specific to the patient’s clinical scenario and insurance plan, not generic ones.
- If available, return the URL for the most relevant coverage policy page that discusses medical necessity, clinical guidelines, prior authorization, and coverage conditions for the requested genetic test in this patient's context.
- Return ONLY the policy document URL. Do not include any explanation or extra text.

If no appropriate policy is found, respond with "No policy found."
"""

    response = client.responses.create(
        model="gpt-4o",
        input=[
            {"role": "system", "content": "You are a clinical insurance assistant."},
            {"role": "user", "content": prompt}
        ],
        tools=[{"type": "web_search_preview"}]
    )

    result_text = response.output_text.strip()

    if not result_text.startswith("http"):
        result_text = "No policy found."

    return result_text


In [7]:
def extract_policy_summary(patient_info_text, policy_url):
    if policy_url == "No policy found.":
        return "(No relevant policy document. Use patient information for all decisions.)"

    prompt = f"""
You are a clinical insurance assistant.

You will be provided:
1. Patient clinical information (very important context).
2. Insurance policy document URL (to read and extract relevant coverage rules).

PATIENT INFORMATION:
{patient_info_text}

Read the following insurance policy document from this URL:

{policy_url}

Extract ONLY the coverage criteria and important policy rules that are relevant to THIS patient’s situation. 
Focus especially on factors such as:
- Age requirements
- Clinical guidelines
- Medical necessity
- Prior authorization criteria 
- Any other relevant rules related to genetic testing coverage for this patient.

Do NOT include website menus, navigation elements, disclaimers, or unrelated information.

Summarize only the meaningful policy content that is helpful for deciding test coverage for this specific patient based on their clinical information.

If the page does not load or no relevant rules are found, respond with: "No relevant coverage criteria found."

Return the summarized policy text below.
"""

    response = client.responses.create(
        model="gpt-4o",
        input=[
            {"role": "system", "content": "You are a clinical insurance assistant."},
            {"role": "user", "content": prompt}
        ],
        tools=[{"type": "web_search_preview"}]
    )

    result_text = response.output_text.strip()

    if "No relevant coverage criteria" in result_text or len(result_text) < 50:
        return "(No relevant policy document. Use patient information for all decisions.)"

    return result_text

In [45]:
def find_and_extract_policy_summary(patient_info_text):
    policy_url = find_policy_url_with_web_search(patient_info_text)
    return extract_policy_summary(patient_info_text, policy_url)

In [8]:
questions_file_path = "/home/cptaswadu/RESCUE-n8n/insurance/dataset/Insurance_Genetic_Testing_QA.json"

with open(questions_file_path, "r") as f:
    questions_data = json.load(f)

questions_list = questions_data["questions"]

def format_questions(questions_list):
    formatted_questions = []
    for q in questions_list:
        question_line = f"{q['id']}. {q['question']}"

        # Check if this is Free text question
        if q.get("options") == ["Free text answer"]:
            question_line += "\n(Free text answer allowed.)"

        else:
            # It is options-based question
            question_line += f"\nOptions: {q['options']}"

            # Additional follow up if Yes
            if "additional_if_yes" in q and q["additional_if_yes"]:
                question_line += f"\nIf you answer 'Yes', ALSO select from: {q['additional_if_yes']}"

            # Additional follow up if No
            if "additional_if_no" in q and q["additional_if_no"]:
                question_line += f"\nIf you answer 'No', ALSO select from: {q['additional_if_no']}"

        formatted_questions.append(question_line)

    return "\n\n".join(formatted_questions)

In [9]:
def run_qna(patient_info_text, policy_url, policy_summary, questions_list):
    questions_formatted = format_questions(questions_list)

    prompt = f"""
You are a clinical insurance assistant specializing in genetic testing coverage policies.

You will be given:
1. Patient clinical information (very important for decision making)
2. Summarized policy coverage criteria text (use this when available)

Instructions:
- If policy criteria clearly apply, follow them.
- If policy criteria do NOT exist or are vague, rely on the patient's clinical information and your prior knowledge and reasoning.
- For each question:
    - Answer "Yes" or "No" based on the policy criteria and patient information.
    - If the question is a Free text question, provide a free text answer.
    - Strictly choose the answer from the options provided.
    - If options are provided, choose ONLY from those options.
    - If the question says "(Free text answer allowed)", you may write your answer freely.
    - If the question says "If Yes, ALSO select from ..." and you answered "Yes", you MUST also select from those follow-up options.
    - If the question says "If No, ALSO select from ..." and you answered "No", you MUST also select from those follow-up options.
- Output answers in JSON format ONLY, with no explanation.
- At the end, include the policy_url as "policy_url".

==== PATIENT INFORMATION ====
{patient_info_text}

==== SUMMARIZED POLICY COVERAGE CRITERIA (from URL: {policy_url}) ====
{policy_summary}

==== QUESTIONS ====
{questions_formatted}

Output your answers in JSON format only and include the policy_url at the end.
"""


    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a clinical insurance assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    result_content = response.choices[0].message.content.strip()

    print("\n===== QnA Result =====")
    print(result_content)
    print("======================\n")

    return result_content

In [10]:
case_ex = [
    {
        "id": "Case1",
        "patient_info": "An 8-year-old boy with neurodevelopmental delay and seizures. A prior chromosomal microarray test was negative. Whole exome sequencing (WES) has been requested by the genetic counselor to investigate potential underlying genetic causes that may guide diagnosis and future treatment decisions. There is also a family history of neurodevelopmental disorders, as his older brother was diagnosed with autism spectrum disorder. The patient is covered by United Healthcare Choice Plus through a family plan and resides in New Jersey."
    },
    {
        "id": "Case2",
        "patient_info": "An 8-year-old boy with mild learning difficulties and no significant neurological symptoms. There is no family history of genetic conditions, and his prior chromosomal microarray test was negative. Whole exome sequencing (WES) has been requested by his primary care provider (PCP) to explore potential genetic factors as part of general health screening and educational planning. The patient is covered by United Healthcare Choice Plus through a family plan and resides in New Jersey."
    },
    {
        "id": "Case3",
        "patient_info": "A 35-year-old woman with a strong family history of breast and ovarian cancer. Her mother was diagnosed with breast cancer at age 42, and her maternal aunt had ovarian cancer in her 50s. The patient herself has no history of cancer but has dense breast tissue and is considered at increased risk. The genetic counselor has recommended BRCA1/BRCA2 testing to assess her hereditary cancer risk and guide risk-reducing management decisions, including potential prophylactic options. The patient is covered by Aetna Open Access Managed Choice Plan and resides in California."
    },
    {
        "id": "Case4",
        "patient_info": "A 28-year-old woman with no family history of breast or ovarian cancer. The patient requested BRCA1/BRCA2 genetic testing after reading about genetic risks online. There were no prior specialist consultations or referrals, and no other clinical risk factors have been identified. The test was ordered directly by her primary care physician at the patient's request. The patient is enrolled in Aetna Open Access Managed Choice Plan and lives in Texas."
    }
]

In [11]:
policy_records = {}

for case in case_ex:
    case_id = case["id"]
    patient_info = case["patient_info"]

    print(f"\n=== Running for {case_id} ===")

    policy_url = find_policy_url_with_web_search(patient_info)
    print("Policy URL:", policy_url)

    policy_summary = extract_policy_summary(patient_info, policy_url)
    print("Policy Summary Extracted:", policy_summary)

    policy_records[case_id] = {
        "policy_url": policy_url,
        "policy_summary": policy_summary
    }

    qna_result = run_qna(patient_info, policy_url, policy_summary, questions_list)
    policy_records[case_id]["qna_result"] = qna_result


=== Running for Case1 ===
Policy URL: https://www.uhcprovider.com/content/dam/provider/docs/public/policies/comm-medical-drug/whole-exome-and-whole-genome-sequencing.pdf
Policy Summary Extracted: Based on UnitedHealthcare's policy on Whole Exome Sequencing (WES) for non-oncology conditions, effective May 1, 2025, the following coverage criteria are relevant to the 8-year-old patient with neurodevelopmental delay and seizures:

**Coverage Criteria:**

- **Clinical Presentation:** The patient's symptoms are nonspecific and do not align with a well-defined syndrome for which a specific or targeted gene test is available.

- **Ordering Physician:** WES must be ordered by a medical geneticist, neonatologist, neurologist, immunologist, or developmental pediatrician.

- **Clinical History and Features:** The patient's clinical history strongly suggests a genetic cause, and at least one of the following features is present:
  - Multiple congenital anomalies affecting different organ systems
 

In [48]:
for case in case_ex:
    case_id = case["id"]
    patient_info = case["patient_info"]

    print(f"\n=== Running for {case_id} ===")

    policy_summary = find_and_extract_policy_summary(patient_info)
    
    if "(No relevant policy document." in policy_summary:
        policy_url = "No policy found."
    else:
        policy_url = find_policy_url_with_web_search(patient_info)

    print("Policy URL:", policy_url)
    print("Policy Summary Extracted:", policy_summary)
    print("Patient Info:", patient_info)

    policy_records[case_id] = {
        "policy_url": policy_url,
        "policy_summary": policy_summary
    }

    qna_result = run_qna(patient_info, policy_url, policy_summary, questions_list)
    policy_records[case_id]["qna_result"] = qna_result



=== Running for Case1 ===
Policy URL: No policy found.
Policy Summary Extracted: Based on UnitedHealthcare's policy effective May 1, 2025, Whole Exome Sequencing (WES) is considered medically necessary for diagnosing or evaluating a genetic disorder when the results are expected to directly influence medical management and clinical outcomes, provided the following criteria are met:

1. **Clinical Presentation**: The patient's clinical presentation is nonspecific and does not fit a well-defined syndrome for which a specific or targeted gene test is available. If a specific genetic syndrome is suspected, a single gene or targeted gene panel should be performed prior to determining if WES is necessary.

2. **Ordering Physician**: WES must be ordered by a medical geneticist, neonatologist, neurologist, immunologist, or developmental pediatrician.

3. **Clinical History**: The patient's clinical history strongly suggests a genetic cause, and one or more of the following features are presen

In [13]:
def extract_qna_results_only(policy_records):
    """
    Extract only the qna_result part from policy_records and convert to JSON dict.
    
    Returns:
        dict: case_id -> parsed qna_result (dict)
    """
    qna_results_only = {}

    for case_id, record in policy_records.items():
        raw_qna = record.get("qna_result", "").strip()

        # Clean the ```json ... ``` wrapper if exists
        if raw_qna.startswith("```json"):
            raw_qna = raw_qna.replace("```json", "").strip()
        if raw_qna.endswith("```"):
            raw_qna = raw_qna[:-3].strip()

        # Parse JSON safely
        try:
            parsed_qna = json.loads(raw_qna)
            qna_results_only[case_id] = parsed_qna
        except json.JSONDecodeError as e:
            print(f"JSON parsing error in {case_id}: {e}")
            qna_results_only[case_id] = None  # Optional: Could also skip or raise error

    return qna_results_only
    
qna_result = extract_qna_results_only(policy_records)

In [37]:
ground_truth = {
    "Case1": {
        "Q0": "Whole Exome Sequencing (WES)",
        "Q1": "Yes",
        "Q2": "Yes",
        "Q3": "Yes",
        "Q4": "Yes",
        "Q4_followup": [
      "ACMG"
    ],
        "Q5": "Yes",
        "Q6": "No",
        "Q7": "Yes",
        "Q8": "No",
        "Q9": "Yes",
        "Q9_followup": [
      "Yes"
    ],
        "Q10": "No",
        "Q10_followup": [
      "Diagnostic"
    ],
        "Q11": "Yes",
        "Q12": "No",
        "Q13": "Yes",
        "Q14": "Yes",
        "Q14_followup": [
      "81415", "81416"
    ],
        "Q15": "No",
        "Q16": "Yes"
  },
  
    "Case2": {
        "Q0": "Whole Exome Sequencing (WES)",
        "Q1": "Yes",
        "Q2": "No",
        "Q3": "No",
        "Q4": "No",
        "Q5": "Yes",
        "Q6": "No",
        "Q7": "No",
        "Q8": "No",
        "Q9": "Yes",
        "Q9_followup": [
      "No"
    ],
        "Q10": "No",
        "Q10_followup": [
      "Other"
    ],
        "Q11": "Yes",
        "Q12": "No",
        "Q13": "Yes",
        "Q14": "Yes",
        "Q14_followup": [
      "81415", "81416"
    ],
        "Q15": "No",
        "Q16": "No"
  },
  
    "Case3": {
        "Q0": "BRCA1/BRCA2 genetic testing",
        "Q1": "Yes",
        "Q2": "Yes",
        "Q3": "Yes",
        "Q4": "Yes",
        "Q4_followup": [
      "NCCN"
    ],
        "Q5": "Yes",
        "Q6": "No",
        "Q7": "No",
        "Q8": "No",
        "Q9": "Yes",
        "Q9_followup": [
      "Yes"
    ],
        "Q10": "No",
        "Q10_followup": [
      "Risk Assessment"
    ],
        "Q11": "Yes",
        "Q12": "No",
        "Q13": "Yes",
        "Q14": "Yes",
        "Q14_followup": [
      "81162"
    ],
        "Q15": "No",
        "Q16": "Yes"
  },
  
    "Case4": {
        "Q0": "BRCA1/BRCA2 genetic testing",
        "Q1": "Yes",
        "Q2": "Yes",
        "Q3": "No",
        "Q4": "No",
        "Q5": "No",
        "Q6": "No",
        "Q7": "No",
        "Q8": "No",
        "Q9": "Yes",
        "Q9_followup": [
      "No"
    ],
        "Q10": "No",
        "Q10_followup": [
      "Risk Assessment"
    ],
        "Q11": "No",
        "Q12": "No",
        "Q13": "Yes",
        "Q14": "Yes",
        "Q14_followup": [
      "81162"
    ],
        "Q15": "No",
        "Q16": "No"
  }
    
}

In [None]:
def evaluate_qna_result(case_id, predicted_result, gold_result):
    records = []
    correct_count = 0
    total_count = 0

    for qid in gold_result:
        if not qid.startswith("Q") or qid == "policy_url" or qid == "Q17" or "_followup" in qid:
            continue

        pred_answer = predicted_result.get(qid, "").strip()
        gold_answer = gold_result.get(qid, "").strip()

        is_correct = pred_answer == gold_answer
        score = 1 if is_correct else 0

        records.append({
            "Case": case_id,
            "Question": qid,
            "Predicted": pred_answer,
            "Gold": gold_answer,
            "Score": score
        })

        total_count += 1
        correct_count += score

        # follow-up 평가
        followup_key = qid + "_followup"
        pred_followup = predicted_result.get(followup_key, None)
        gold_followup = gold_result.get(followup_key, None)

        if is_correct and gold_followup is not None:

            def normalize(ans):
                if ans is None:
                    return "None"
                if isinstance(ans, list):
                    return ", ".join(ans)
                return ans

            pred_followup_norm = normalize(pred_followup)
            gold_followup_norm = normalize(gold_followup)

            followup_score = 1 if pred_followup_norm == gold_followup_norm else 0

            records.append({
                "Case": case_id,
                "Question": followup_key,
                "Predicted": pred_followup_norm,
                "Gold": gold_followup_norm,
                "Score": followup_score
            })

            total_count += 1
            correct_count += followup_score

    accuracy = correct_count / total_count * 100 if total_count > 0 else 0

    records.append({
        "Case": case_id,
        "Question": "TOTAL",
        "Predicted": f"Correct: {correct_count}",
        "Gold": f"Incorrect: {total_count - correct_count}",
        "Score": f"Accuracy: {accuracy:.2f}%"
    })

    df_records = pd.DataFrame(records)
    df_records
    return df_records


In [34]:
evaluate_qna_result('Case1', qna_result['Case1'], ground_truth['Case1'])


Unnamed: 0,Case,Question,Predicted,Gold,Score
0,Case1,Q0,Whole Exome Sequencing (WES),Whole Exome Sequencing (WES),1
1,Case1,Q1,Yes,Yes,1
2,Case1,Q2,Not specified,Yes,0
3,Case1,Q3,Yes,Yes,1
4,Case1,Q4,Yes,Yes,1
5,Case1,Q4_followup,None of the above,ACMG,0
6,Case1,Q5,Yes,Yes,1
7,Case1,Q6,No,No,1
8,Case1,Q7,Yes,Yes,1
9,Case1,Q8,No,No,1


In [43]:
output_csv_path = "/home/cptaswadu/RESCUE-n8n/insurance/results/LLM_QnA/qna_eval_results.csv"
def evaluate_all_cases(policy_records, gold_answers, output_csv_path):
    """
    Evaluate all cases using the evaluate_qna_result function and save the results to a CSV file.

    Args:
        policy_records (dict): Predicted results, in the format {"CaseID": {"qna_result": dict}}
        gold_answers (dict): Ground truth answers, in the format {"CaseID": {"Q0": ..., "Q1": ..., ...}}
        output_csv_path (str): Path to save the evaluation result CSV file.
    """
    all_dfs = []
    
    policy_records = extract_qna_results_only(policy_records)
    for case_id in policy_records:
        pred_result = policy_records[case_id]
        gold_result = gold_answers.get(case_id)

        if pred_result is None or gold_result is None:
            print(f"Skipping {case_id} due to missing data.")
            continue

        # ✅ evaluate_qna_result already returns DataFrame
        df_case = evaluate_qna_result(case_id, pred_result, gold_result)
        all_dfs.append(df_case)

    # Concatenate all DataFrames
    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
        final_df.to_csv(output_csv_path, index=False)
        print(f"✅ Evaluation completed and saved to {output_csv_path}")
        print(final_df)
        return final_df
    else:
        print("❗ No records to save.")
        return pd.DataFrame()  # empty dataframe

In [44]:
evaluate_all_cases(policy_records, ground_truth, output_csv_path)

✅ Evaluation completed and saved to /home/cptaswadu/RESCUE-n8n/insurance/results/LLM_QnA/qna_eval_results.csv
     Case Question                     Predicted  \
0   Case1       Q0  Whole Exome Sequencing (WES)   
1   Case1       Q1                           Yes   
2   Case1       Q2                 Not specified   
3   Case1       Q3                           Yes   
4   Case1       Q4                           Yes   
..    ...      ...                           ...   
75  Case4      Q13                 Not specified   
76  Case4      Q14                    Not listed   
77  Case4      Q15                            No   
78  Case4      Q16                            No   
79  Case4    TOTAL                   Correct: 11   

                            Gold             Score  
0   Whole Exome Sequencing (WES)                 1  
1                            Yes                 1  
2                            Yes                 0  
3                            Yes                 1  


Unnamed: 0,Case,Question,Predicted,Gold,Score
0,Case1,Q0,Whole Exome Sequencing (WES),Whole Exome Sequencing (WES),1
1,Case1,Q1,Yes,Yes,1
2,Case1,Q2,Not specified,Yes,0
3,Case1,Q3,Yes,Yes,1
4,Case1,Q4,Yes,Yes,1
...,...,...,...,...,...
75,Case4,Q13,Not specified,Yes,0
76,Case4,Q14,Not listed,Yes,0
77,Case4,Q15,No,No,1
78,Case4,Q16,No,No,1
