In [1]:
import numpy as np
import pandas as pd
import os
import csv
import re
from dotenv import load_dotenv
import json
from openai import OpenAI

In [2]:
df = pd.read_csv('/home/cptaswadu/RESCUE-n8n/insurance/Providers_Network.csv')
df.head()

Unnamed: 0,In-network Provider
0,Advocate Health
1,Aetna
2,Aetna Better Health
3,Affinity by Molina HealthCare
4,Alabama Medicaid


In [3]:
os.chdir('/home/cptaswadu/RESCUE-n8n/insurance')
load_dotenv(dotenv_path='/home/cptaswadu/RESCUE-n8n/insurance/.env')
openai_api_key = os.getenv("OPEN_AI_API_KEY")
perplexity_api_key = os.getenv("PERPLEXITY_API_KEY")


# First Prompt

In [4]:
client = OpenAI(api_key=openai_api_key)
messages = [
    {
        "role": "system",
        "content": (
            "You are an artificial intelligence assistant and you need to "
            "engage in a helpful, detailed, polite conversation with a user."
        ),
    },
    {   
        "role": "user",
        "content": (
            "List all the medical insurance providers that are currently in-network with GeneDx."
            "Output the result strictly in JSON format using the following structure:"
            "{\"Providers\": [list of provider names], \"source_url\": \"link to the official source\"}."
            "Only include links from the official GeneDx website or affiliated trusted sources."
            "Do not include any introduction, explanation, or extra commentary — only return the JSON object."

        ),
    },
]


response = client.responses.create(
    model="gpt-4o",  
    tools=[{"type": "web_search_preview"}],
    input=messages
)

print(response.output_text)

response_text = response.output_text.strip()
if response_text.startswith("```json"):
    response_text = re.sub(r"^```json\s*", "", response_text)
if response_text.endswith("```"):
    response_text = re.sub(r"\s*```$", "", response_text)

result = json.loads(response_text)
print(result)

```json
{
  "Providers": [
    "Absolute Total Care",
    "Aetna",
    "Aetna Affordable Health Choices",
    "Aetna Better Health Illinois",
    "Aetna Better Health Kentucky",
    "Aetna Better Health Maryland",
    "Aetna Better Health of Texas",
    "Aetna HMO",
    "Aetna Medicare",
    "Aetna PPO",
    "Ambetter (Centene)",
    "Amerigroup Georgia",
    "Amerigroup Texas",
    "Amerigroup/Wellpoint Maryland Medicaid",
    "Amerihealth",
    "AmeriHealth Caritas (NC)",
    "Amerihealth Caritas Ohio",
    "Anthem BCBS Indiana",
    "Anthem BCBS Indiana Blue Access PPO",
    "Anthem BCBS Indiana Medicaid",
    "Anthem BCBS Kentucky Medicaid",
    "Anthem BCBS Ohio Medicaid",
    "Anthem Hoosier Healthwise",
    "Arkansas Health and Wellness",
    "Aspirus Health Plan",
    "BC Community Options",
    "BCBS Arizona",
    "BCBS Illinois",
    "BCBS Kansas City",
    "BCBS Minnesota",
    "BCBS Minnesota Blue Plus Medicaid",
    "BCBS North Carolina",
    "BCBS North Dakota",
    "BCBS

In [5]:
chatgpt_provider_list = result.get("Providers", [])
num_providers = len(result.get("Providers", []))
print(f"✅ Number of companies: {len(chatgpt_provider_list)}")
print(chatgpt_provider_list)

✅ Number of companies: 165
['Absolute Total Care', 'Aetna', 'Aetna Affordable Health Choices', 'Aetna Better Health Illinois', 'Aetna Better Health Kentucky', 'Aetna Better Health Maryland', 'Aetna Better Health of Texas', 'Aetna HMO', 'Aetna Medicare', 'Aetna PPO', 'Ambetter (Centene)', 'Amerigroup Georgia', 'Amerigroup Texas', 'Amerigroup/Wellpoint Maryland Medicaid', 'Amerihealth', 'AmeriHealth Caritas (NC)', 'Amerihealth Caritas Ohio', 'Anthem BCBS Indiana', 'Anthem BCBS Indiana Blue Access PPO', 'Anthem BCBS Indiana Medicaid', 'Anthem BCBS Kentucky Medicaid', 'Anthem BCBS Ohio Medicaid', 'Anthem Hoosier Healthwise', 'Arkansas Health and Wellness', 'Aspirus Health Plan', 'BC Community Options', 'BCBS Arizona', 'BCBS Illinois', 'BCBS Kansas City', 'BCBS Minnesota', 'BCBS Minnesota Blue Plus Medicaid', 'BCBS North Carolina', 'BCBS North Dakota', 'BCBS South Carolina (Avalon)', 'BCBS Tennessee', 'BCBS Tennessee Medicaid MCO', 'BCBS Texas Medicaid', 'Blue Choice/Healthy Blue South Caro

In [6]:
real_list = df["In-network Provider"].dropna().str.strip().tolist()
print(real_list)

['Advocate Health', 'Aetna', 'Aetna Better Health', 'Affinity by Molina HealthCare', 'Alabama Medicaid', 'Alameda Alliance for Health', 'Alaska Medicaid', 'Alignment Healthcare', 'Allina Health', 'American Indian Health Program', 'Amerigroup District of Columbia', 'Amerigroup Georgia', 'Amerigroup Nevada', 'Amerigroup Real Solutions', 'Amerigroup Tennessee', 'Amerihealth', 'Amerihealth Caritas (NH)', 'Amerihealth Caritas (NC)', 'Amerihealth Caritas Ohio', 'Amerihealth Caritas (PA)', 'Anthem BC of California', 'Anthem BCBS Connecticut', 'Anthem BCBS Georgia', 'Anthem BCBS Indiana', 'Anthem BCBS Kentucky', 'Anthem BCBS Maine', 'Anthem BCBS Missouri', 'Anthem BCBS New Hampshire', 'Anthem BCBS Wisconsin', 'Anthem Health Plans of Virginia', 'APC Passe LLC dba Summit Community Care', 'Arizona Medicare', 'Arkansas Medicaid', 'Asuris Northwest Health', 'Avera Health Plans', 'Banner Health', 'BC of Idaho', 'BCBS Alabama', 'BCBS Arizona', 'BCBS Federal Employee Plan', 'BCBS Hawaii', 'BCBS Illino

In [7]:
real_set = set(real_list)
chatgpt_set = set(chatgpt_provider_list)

common = real_set & chatgpt_set
missing = real_set - chatgpt_set
extra = chatgpt_set - real_set

precision = len(common) / len(chatgpt_set) * 100 if chatgpt_set else 0
recall = len(common) / len(real_set) * 100 if real_set else 0

comparison_summary = {
    "real_list_count": len(real_list),
    "chatgpt_list_count": len(chatgpt_provider_list),
    "common_count": len(common),
    "missing_from_chatgpt_count": len(missing),
    "extra_in_chatgpt_count": len(extra),
    "Precision (%)": round(precision, 2),
    "Recall (%)": round(recall, 2)
}

comparison_summary

{'real_list_count': 255,
 'chatgpt_list_count': 165,
 'common_count': 44,
 'missing_from_chatgpt_count': 211,
 'extra_in_chatgpt_count': 121,
 'Precision (%)': 26.67,
 'Recall (%)': 17.25}

In [16]:
print(common)

{'United Healthcare Community Plan (FL)', 'Amerihealth Caritas Ohio', 'Amerigroup Georgia', 'Anthem BCBS Indiana', 'Montana Medicaid', 'United Healthcare Community Plan (NJ)', 'Tennessee Medicaid', 'Maryland Physicians Care', 'United Healthcare Community Plan (NY)', 'United Healthcare Community Plan (MD)', 'Health New England', 'Independence Blue Cross', 'United Healthcare Community Plan (KY)', 'Cigna', 'United Healthcare Community Plan (OH)', 'United Healthcare Community Plan (MI)', 'United Healthcare', 'BCBS Minnesota', 'North Dakota Medicaid', 'United Healthcare Community Plan (LA)', 'Community First Health Plans', 'Molina Healthcare of Illinois', 'Molina Healthcare of Texas', 'Amerihealth', 'BCBS Arizona', 'Coordinated Care Health Plan', 'BCBS North Carolina', 'United Healthcare Community Plan (TN)', 'CareSource Georgia', 'Healthy Blue Missouri', 'CareSource Ohio', 'Aetna', 'Johns Hopkins Healthcare', 'United Healthcare Community Plan (MO)', 'Dean Health Plan', 'BCBS Illinois', 'Ro

In [8]:
def normalize_provider(name):
    name = name.strip()

    
    if name.startswith("Aetna Better Health"):
        return "Aetna Better Health"
    elif name.startswith("Aetna"):
        return "Aetna"
    
    if re.search(r"\s+Medicaid$", name):
        name = re.sub(r"\s+Medicaid$", "", name)

    # 4. "[기업명] (Medicaid)" → "[기업명]"
    if re.search(r"\s+\(Medicaid\)$", name):
        name = re.sub(r"\s+\(Medicaid\)$", "", name)

    # 5. Kansas City → Kansas
    if "Kansas City" in name:
        name = name.replace("Kansas City", "Kansas")

    return name


chatgpt_normalized_list = [normalize_provider(item) for item in chatgpt_provider_list]
chatgpt_normalized_list = sorted(set(chatgpt_normalized_list))
normalized_num_providers = len(chatgpt_normalized_list)
print(chatgpt_normalized_list)
print(normalized_num_providers)

['Absolute Total Care', 'Aetna', 'Aetna Better Health', 'Ambetter (Centene)', 'AmeriHealth Caritas (NC)', 'Amerigroup Georgia', 'Amerigroup Texas', 'Amerigroup/Wellpoint Maryland', 'Amerihealth', 'Amerihealth Caritas Ohio', 'Anthem BCBS Indiana', 'Anthem BCBS Indiana Blue Access PPO', 'Anthem BCBS Kentucky', 'Anthem BCBS Ohio', 'Anthem Hoosier Healthwise', 'Arkansas Health and Wellness', 'Aspirus Health Plan', 'BC Community Options', 'BCBS Arizona', 'BCBS Illinois', 'BCBS Kansas', 'BCBS Minnesota', 'BCBS Minnesota Blue Plus', 'BCBS North Carolina', 'BCBS North Dakota', 'BCBS South Carolina (Avalon)', 'BCBS Tennessee', 'BCBS Tennessee Medicaid MCO', 'BCBS Texas', 'Blue Choice/Healthy Blue South Carolina', 'Blue Cross Community Options Illinois', 'Blue Shield California', 'Buckeye Health Plan', 'Capital Blue Cross', 'CareFirst Community Health Plan', 'CareSource Georgia', 'CareSource Indiana', 'CareSource Ohio', 'Carolina Complete Health', 'Centene Corporation', 'Cigna', 'Community First

In [9]:
real_set = set(real_list)
chatgpt_normalized_set = set(chatgpt_normalized_list)

common_normalized = real_set & chatgpt_normalized_set
missing_normalized = real_set - chatgpt_normalized_set
extra_normalized = chatgpt_normalized_set - real_set

precision_normalized = len(common) / len(chatgpt_normalized_set) * 100 if chatgpt_normalized_set else 0
recall_normalized = len(common) / len(real_set) * 100 if real_set else 0

normalized_comparison_summary = {
    "real_list_count": len(real_list),
    "chatgpt_list_count": len(chatgpt_normalized_list),
    "common_count": len(common_normalized),
    "missing_from_chatgpt_count": len(missing_normalized),
    "extra_in_chatgpt_count": len(extra_normalized),
    "Precision (%)": round(precision_normalized, 2),
    "Recall (%)": round(recall_normalized, 2)
}

normalized_comparison_summary

{'real_list_count': 255,
 'chatgpt_list_count': 156,
 'common_count': 45,
 'missing_from_chatgpt_count': 210,
 'extra_in_chatgpt_count': 111,
 'Precision (%)': 28.21,
 'Recall (%)': 17.25}

# Second Prompt

In [10]:
messages = [
    {
        "role": "system",
        "content": (
            "You are an artificial intelligence assistant and you need to "
            "engage in a helpful, detailed, polite conversation with a user."
        ),
    },
    {   
        "role": "user",
        "content": (
            "List all the 255 medical insurance providers that are currently in-network with GeneDx."
            "Output the result strictly in JSON format using the following structure:"
            "{\"Providers\": [list of provider names], \"source_url\": \"link to the official source\"}."
            "Only include links from the official GeneDx website or affiliated trusted sources."
            "Do not include any introduction, explanation, or extra commentary — only return the JSON object."

        ),
    },
]


response = client.responses.create(
    model="gpt-4o",  
    tools=[{"type": "web_search_preview"}],
    input=messages
)

print(response.output_text)

response_text = response.output_text.strip()
if response_text.startswith("```json"):
    response_text = re.sub(r"^```json\s*", "", response_text)
if response_text.endswith("```"):
    response_text = re.sub(r"\s*```$", "", response_text)

result = json.loads(response_text)
print(result)

```json
{
  "Providers": [
    "Absolute Total Care",
    "Aetna",
    "Aetna Affordable Health Choices",
    "Aetna Better Health Illinois",
    "Aetna Better Health Kentucky",
    "Aetna Better Health Maryland",
    "Aetna Better Health of Texas",
    "Aetna HMO",
    "Aetna Medicare",
    "Aetna PPO",
    "Ambetter (Centene)",
    "Amerigroup Georgia",
    "Amerigroup Texas",
    "Amerigroup/Wellpoint Maryland Medicaid",
    "Amerihealth",
    "AmeriHealth Caritas (NC)",
    "Amerihealth Caritas Ohio",
    "Anthem BCBS Indiana",
    "Anthem BCBS Indiana Blue Access PPO",
    "Anthem BCBS Indiana Medicaid",
    "Anthem BCBS Kentucky Medicaid",
    "Anthem BCBS Ohio Medicaid",
    "Anthem Hoosier Healthwise",
    "Arkansas Health and Wellness",
    "Aspirus Health Plan",
    "BC Community Options",
    "BCBS Arizona",
    "BCBS Illinois",
    "BCBS Kansas City",
    "BCBS Minnesota",
    "BCBS Minnesota Blue Plus Medicaid",
    "BCBS North Carolina",
    "BCBS North Dakota",
    "BCBS

In [11]:
chatgpt_provider_list = result.get("Providers", [])
num_providers = len(result.get("Providers", []))
print(f"✅ Number of companies: {len(chatgpt_provider_list)}")
print(chatgpt_provider_list)

✅ Number of companies: 165
['Absolute Total Care', 'Aetna', 'Aetna Affordable Health Choices', 'Aetna Better Health Illinois', 'Aetna Better Health Kentucky', 'Aetna Better Health Maryland', 'Aetna Better Health of Texas', 'Aetna HMO', 'Aetna Medicare', 'Aetna PPO', 'Ambetter (Centene)', 'Amerigroup Georgia', 'Amerigroup Texas', 'Amerigroup/Wellpoint Maryland Medicaid', 'Amerihealth', 'AmeriHealth Caritas (NC)', 'Amerihealth Caritas Ohio', 'Anthem BCBS Indiana', 'Anthem BCBS Indiana Blue Access PPO', 'Anthem BCBS Indiana Medicaid', 'Anthem BCBS Kentucky Medicaid', 'Anthem BCBS Ohio Medicaid', 'Anthem Hoosier Healthwise', 'Arkansas Health and Wellness', 'Aspirus Health Plan', 'BC Community Options', 'BCBS Arizona', 'BCBS Illinois', 'BCBS Kansas City', 'BCBS Minnesota', 'BCBS Minnesota Blue Plus Medicaid', 'BCBS North Carolina', 'BCBS North Dakota', 'BCBS South Carolina (Avalon)', 'BCBS Tennessee', 'BCBS Tennessee Medicaid MCO', 'BCBS Texas Medicaid', 'Blue Choice/Healthy Blue South Caro

In [12]:
real_set = set(real_list)
chatgpt_set = set(chatgpt_provider_list)

common = real_set & chatgpt_set
missing = real_set - chatgpt_set
extra = chatgpt_set - real_set

precision = len(common) / len(chatgpt_set) * 100 if chatgpt_set else 0
recall = len(common) / len(real_set) * 100 if real_set else 0

comparison_summary = {
    "real_list_count": len(real_list),
    "chatgpt_list_count": len(chatgpt_provider_list),
    "common_count": len(common),
    "missing_from_chatgpt_count": len(missing),
    "extra_in_chatgpt_count": len(extra),
    "Precision (%)": round(precision, 2),
    "Recall (%)": round(recall, 2)
}

comparison_summary

{'real_list_count': 255,
 'chatgpt_list_count': 165,
 'common_count': 44,
 'missing_from_chatgpt_count': 211,
 'extra_in_chatgpt_count': 121,
 'Precision (%)': 26.67,
 'Recall (%)': 17.25}

# Third prompt

In [17]:
messages = [
    {
        "role": "system",
        "content": (
            "You are an artificial intelligence assistant and you need to "
            "engage in a helpful, detailed, polite conversation with a user."
        ),
    },
    {   
        "role": "user",
        "content": (
            "List all the medical insurance providers that are currently in-network with GeneDx."
            "You may use the official GeneDx insurance network page at "
            "https://www.genedx.com/commercial-insurance-in-network-contracts/ as the primary source of information. "
            "Output the result strictly in JSON format using the following structure:"
            "{\"Providers\": [list of provider names], \"source_url\": \"link to the official source\"}."
            "Only include links from the official GeneDx website or affiliated trusted sources."
            "Do not include any introduction, explanation, or extra commentary — only return the JSON object."

        ),
    },
]


response = client.responses.create(
    model="gpt-4o",  
    tools=[{"type": "web_search_preview"}],
    input=messages
)

print(response.output_text)


```json
{
  "Providers": [
    "Aetna",
    "Alameda Alliance for Health",
    "American Indian Health Program (AIHP)",
    "AmeriHealth",
    "AmeriHealth Caritas NH",
    "AmeriHealth Caritas OH",
    "AmeriHealth Caritas PA",
    "Amerigroup Real Solutions",
    "Anthem",
    "Anthem – Medicaid",
    "Asuris Northwest Health",
    "Avera Health Plans",
    "Banner Health | Aetna",
    "BCBS Alabama",
    "BCBS Arizona",
    "BCBS Federal Employee Plan",
    "BCBS Illinois",
    "BCBS Kansas",
    "BCBS Massachusetts",
    "BCBS Michigan",
    "BCBS Minnesota",
    "BCBS North Carolina",
    "BCBS North Dakota",
    "BCBS Rhode Island",
    "BCBS South Carolina",
    "BCBS Tennessee",
    "BCBS Texas",
    "BCBS Virginia",
    "BCBS Wisconsin",
    "Blue Cross Complete of Michigan",
    "Blue Cross Idaho",
    "Blue Shield of California",
    "Blue Shield of California Promise Health Plan",
    "BlueChoice HealthPlan of South Carolina – Medicaid",
    "Capital Blue Cross",
    "Capit

In [18]:
response_text = response.output_text.strip()
if response_text.startswith("```json"):
    response_text = re.sub(r"^```json\s*", "", response_text)
if response_text.endswith("```"):
    response_text = re.sub(r"\s*```$", "", response_text)

result = json.loads(response_text)
print(result)

{'Providers': ['Aetna', 'Alameda Alliance for Health', 'American Indian Health Program (AIHP)', 'AmeriHealth', 'AmeriHealth Caritas NH', 'AmeriHealth Caritas OH', 'AmeriHealth Caritas PA', 'Amerigroup Real Solutions', 'Anthem', 'Anthem – Medicaid', 'Asuris Northwest Health', 'Avera Health Plans', 'Banner Health | Aetna', 'BCBS Alabama', 'BCBS Arizona', 'BCBS Federal Employee Plan', 'BCBS Illinois', 'BCBS Kansas', 'BCBS Massachusetts', 'BCBS Michigan', 'BCBS Minnesota', 'BCBS North Carolina', 'BCBS North Dakota', 'BCBS Rhode Island', 'BCBS South Carolina', 'BCBS Tennessee', 'BCBS Texas', 'BCBS Virginia', 'BCBS Wisconsin', 'Blue Cross Complete of Michigan', 'Blue Cross Idaho', 'Blue Shield of California', 'Blue Shield of California Promise Health Plan', 'BlueChoice HealthPlan of South Carolina – Medicaid', 'Capital Blue Cross', 'Capital Health Plan', 'CareFirst BCBS', 'CareFirst BCBS – Medicaid', 'CarePartners', 'CareSource', 'CareSource – Medicaid', 'Cigna', 'Common Ground Healthcare Co

In [19]:
chatgpt_provider_list = result.get("Providers", [])
num_providers = len(result.get("Providers", []))
print(f"✅ Number of companies: {len(chatgpt_provider_list)}")
print(chatgpt_provider_list)

✅ Number of companies: 169
['Aetna', 'Alameda Alliance for Health', 'American Indian Health Program (AIHP)', 'AmeriHealth', 'AmeriHealth Caritas NH', 'AmeriHealth Caritas OH', 'AmeriHealth Caritas PA', 'Amerigroup Real Solutions', 'Anthem', 'Anthem – Medicaid', 'Asuris Northwest Health', 'Avera Health Plans', 'Banner Health | Aetna', 'BCBS Alabama', 'BCBS Arizona', 'BCBS Federal Employee Plan', 'BCBS Illinois', 'BCBS Kansas', 'BCBS Massachusetts', 'BCBS Michigan', 'BCBS Minnesota', 'BCBS North Carolina', 'BCBS North Dakota', 'BCBS Rhode Island', 'BCBS South Carolina', 'BCBS Tennessee', 'BCBS Texas', 'BCBS Virginia', 'BCBS Wisconsin', 'Blue Cross Complete of Michigan', 'Blue Cross Idaho', 'Blue Shield of California', 'Blue Shield of California Promise Health Plan', 'BlueChoice HealthPlan of South Carolina – Medicaid', 'Capital Blue Cross', 'Capital Health Plan', 'CareFirst BCBS', 'CareFirst BCBS – Medicaid', 'CarePartners', 'CareSource', 'CareSource – Medicaid', 'Cigna', 'Common Ground 

In [20]:
real_set = set(real_list)
chatgpt_set = set(chatgpt_provider_list)

common = real_set & chatgpt_set
missing = real_set - chatgpt_set
extra = chatgpt_set - real_set

precision = len(common) / len(chatgpt_set) * 100 if chatgpt_set else 0
recall = len(common) / len(real_set) * 100 if real_set else 0

comparison_summary = {
    "real_list_count": len(real_list),
    "chatgpt_list_count": len(chatgpt_provider_list),
    "common_count": len(common),
    "missing_from_chatgpt_count": len(missing),
    "extra_in_chatgpt_count": len(extra),
    "Precision (%)": round(precision, 2),
    "Recall (%)": round(recall, 2)
}

comparison_summary

{'real_list_count': 255,
 'chatgpt_list_count': 169,
 'common_count': 55,
 'missing_from_chatgpt_count': 200,
 'extra_in_chatgpt_count': 114,
 'Precision (%)': 32.54,
 'Recall (%)': 21.57}

In [35]:
evaluation_df = pd.DataFrame([comparison_summary])
print(evaluation_df)

   real_list_count  chatgpt_list_count  common_count  \
0              255                 156            45   

   missing_from_chatgpt_count  extra_in_chatgpt_count  Precision (%)  \
0                         210                     111          28.85   

   Recall (%)  
0       17.65  
