In [19]:
import numpy as np
import pandas as pd
import os
import csv
import re
from dotenv import load_dotenv
import json
from openai import OpenAI
from urllib.parse import urlparse
import hashlib
import pdfkit
import requests

In [3]:
# Load the provider table from the project's CSV file (absolute path on the author's machine).
df = pd.read_csv('/home/cptaswadu/RESCUE-n8n/insurance/Providers_Network.csv')
# Show the first rows as this cell's output.
df.head()

Unnamed: 0,In-network Provider
0,Advocate Health
1,Aetna
2,Aetna Better Health
3,Affinity by Molina HealthCare
4,Alabama Medicaid


In [4]:
# Turn the "In-network Provider" column into a plain list: drop missing entries, trim surrounding whitespace.
provider_list = df["In-network Provider"].dropna().str.strip().tolist()
# Prints the complete list in one go, which makes for a very long output line.
print(provider_list)

['Advocate Health', 'Aetna', 'Aetna Better Health', 'Affinity by Molina HealthCare', 'Alabama Medicaid', 'Alameda Alliance for Health', 'Alaska Medicaid', 'Alignment Healthcare', 'Allina Health', 'American Indian Health Program', 'Amerigroup District of Columbia', 'Amerigroup Georgia', 'Amerigroup Nevada', 'Amerigroup Real Solutions', 'Amerigroup Tennessee', 'Amerihealth', 'Amerihealth Caritas (NH)', 'Amerihealth Caritas (NC)', 'Amerihealth Caritas Ohio', 'Amerihealth Caritas (PA)', 'Anthem BC of California', 'Anthem BCBS Connecticut', 'Anthem BCBS Georgia', 'Anthem BCBS Indiana', 'Anthem BCBS Kentucky', 'Anthem BCBS Maine', 'Anthem BCBS Missouri', 'Anthem BCBS New Hampshire', 'Anthem BCBS Wisconsin', 'Anthem Health Plans of Virginia', 'APC Passe LLC dba Summit Community Care', 'Arizona Medicare', 'Arkansas Medicaid', 'Asuris Northwest Health', 'Avera Health Plans', 'Banner Health', 'BC of Idaho', 'BCBS Alabama', 'BCBS Arizona', 'BCBS Federal Employee Plan', 'BCBS Hawaii', 'BCBS Illino

In [2]:
# Switch to the project directory; the relative paths used by later cells
# (e.g. "llm_searched", the result CSVs) resolve against it.
os.chdir('/home/cptaswadu/RESCUE-n8n/insurance')
# Load variables from the project's .env file so the keys are not hard-coded in the notebook.
load_dotenv(dotenv_path='/home/cptaswadu/RESCUE-n8n/insurance/.env')
# os.getenv returns None for an unset variable; nothing in this cell checks for that.
openai_api_key = os.getenv("OPEN_AI_API_KEY")
# Not referenced by any of the cells visible in this notebook section.
perplexity_api_key = os.getenv("PERPLEXITY_API_KEY")

# Sample Experiment

In [20]:
# OpenAI client used by the search cells; the key comes from the OPEN_AI_API_KEY environment variable.
client = OpenAI(api_key=openai_api_key)

def build_prompt(provider_name):
    """Compose the policy-link search prompt for a single insurer.

    Args:
        provider_name: Insurance provider name; it is embedded as-is, wrapped
            in single quotes, in the opening sentence of the prompt.

    Returns:
        One string that asks for links to official genetic testing coverage
        policy documents and instructs the model to reply with a JSON object
        holding the arrays "pdf_links" and "webpage_links".
    """
    fragments = [
        "Find and list all the links to official policy documents that contain genetic testing coverage policies ",
        f"for the insurance provider '{provider_name}'. ",
        "Include both PDF files and webpage URLs if the information is only available on the website. ",
        "Only include links from official sources such as the insurance company's website or regulatory bodies. ",
        "Exclude links from news articles, blog posts, or discussion forums. ",
        "If the policy is available as a PDF, return the direct PDF link under the key \"pdf_links\". ",
        "If the policy is available only as an HTML webpage, return the webpage URL under the key \"webpage_links\". ",
        "The response must be strictly in JSON format with two single keys: ",
        "\"pdf_links\", containing an array of valid PDF URLs, and ",
        "\"webpage_links\", containing an array of valid webpage URLs. ",
        "Do not include any additional text or explanations—only the JSON object.",
    ]
    # Each fragment already ends with its own separating space, so join with no separator.
    return "".join(fragments)

def download_pdf(url, save_path):
    """Fetch ``url`` and write it to ``save_path`` when the payload is a PDF.

    Args:
        url: Address of the document to download.
        save_path: Destination file path. A missing parent directory is created.

    Returns:
        True if a PDF was written to ``save_path``. False on any failure
        (request error, HTTP error status, non-PDF payload, file-system
        error); the reason is printed instead of raised so that a batch of
        links can keep going.
    """
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        payload = response.content

        # A 200 status is not proof of a PDF: stale links often serve an HTML
        # error or login page. PDF files carry the "%PDF-" marker at (or within
        # the first bytes of) the start, so anything without it is rejected
        # rather than saved under a .pdf name and counted as a success.
        if b"%PDF-" not in payload[:1024]:
            content_type = response.headers.get("Content-Type", "unknown")
            raise ValueError(f"response is not a PDF (Content-Type: {content_type})")

        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(save_path, "wb") as f:
            f.write(payload)
        print(f"✅ Downloaded PDF: {save_path}")
        return True
    except Exception as e:
        # Deliberately broad: best-effort helper, one bad link should not abort the caller.
        print(f"❌ Failed to download PDF from {url}: {e}")
        return False

def save_webpage_as_pdf(url, save_path):
    """Render the web page at ``url`` into a PDF file at ``save_path``.

    Args:
        url: Address of the page handed to ``pdfkit.from_url``.
        save_path: Output path handed to ``pdfkit.from_url``.

    Returns:
        True when ``pdfkit.from_url`` finished without raising, False when it
        raised; in the latter case the error is printed, not propagated.
    """
    try:
        pdfkit.from_url(url, save_path)
    except Exception as err:
        print(f"❌ Failed to save {url} as PDF: {err}")
        return False

    print(f"✅ Saved webpage as PDF: {save_path}")
    return True

# Small hand-picked sample of insurers for the trial run.
providers = ["United Healthcare", "Cigna", "Capital BC", "Carelon"]


def _url_list(value):
    """Return the string items of ``value`` if it is a list, otherwise ``[]``.

    Guards against replies where a key is missing, ``null``, a bare string,
    or contains non-string entries; such values are dropped instead of
    aborting the whole provider.
    """
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str)]


def _target_filename(link, taken, as_rendered_page):
    """Pick a non-empty file name for ``link`` that is not yet in ``taken``.

    Args:
        link: Source URL. Only its path component is used, so query strings
            and fragments never end up in the file name.
        taken: Set of names already handed out for the current provider
            folder; the chosen name is added to it.
        as_rendered_page: True for web pages that get rendered to PDF (the
            URL's own suffix is replaced by ``.pdf``); False for direct PDF
            downloads (the URL's file name is kept as-is).

    Returns:
        The file name, without any directory part.
    """
    name = os.path.basename(urlparse(link).path.rstrip("/"))
    if as_rendered_page:
        name = (name.split(".")[0] or "webpage") + ".pdf"
    elif not name:
        # The URL has no usable last path segment (e.g. it ends at the host).
        name = "document.pdf"
    if name in taken:
        # A different link already produced this name: add a short hash of the
        # URL so the later file does not overwrite the earlier one. (The very
        # same URL listed twice maps to the same name again, which is harmless.)
        stem, suffix = os.path.splitext(name)
        digest = hashlib.sha1(link.encode("utf-8")).hexdigest()[:8]
        name = f"{stem}_{digest}{suffix}"
    taken.add(name)
    return name


# One summary row per provider; turned into a DataFrame by a later cell.
results = []

for provider in providers:
    print(f"🔍 Searching for: {provider}")
    messages = [
        {"role": "system", "content": "You are a helpful and precise research assistant."},
        {"role": "user", "content": build_prompt(provider)}
    ]

    try:
        response = client.responses.create(
            model="gpt-4o",
            tools=[{"type": "web_search_preview"}],
            input=messages
        )

        # Parse JSON from result: keep only the span from the first "{" to the
        # last "}" so Markdown fences or commentary around the object are
        # ignored. With no braces the slice is empty and json.loads raises,
        # which lands in the except branch below.
        result_text = response.output_text
        result_text = result_text[result_text.find("{"):result_text.rfind("}") + 1]
        result_json = json.loads(result_text)

        pdf_links = _url_list(result_json.get("pdf_links"))
        webpage_links = _url_list(result_json.get("webpage_links"))
        all_links = pdf_links + webpage_links

        # Replace spaces and path separators so a provider name never creates nested folders.
        folder = os.path.join("llm_searched", re.sub(r"[ /\\]", "_", provider))
        os.makedirs(folder, exist_ok=True)
        taken_names = set()

        downloaded_pdfs = 0
        for link in pdf_links:
            save_path = os.path.join(folder, _target_filename(link, taken_names, False))
            if download_pdf(link, save_path):
                downloaded_pdfs += 1

        saved_webpages = 0
        for link in webpage_links:
            save_path = os.path.join(folder, _target_filename(link, taken_names, True))
            if save_webpage_as_pdf(link, save_path):
                saved_webpages += 1

        results.append({
            "Provider": provider,
            "PDF Links": json.dumps(pdf_links),
            "PDF Count": len(pdf_links),
            "Downloaded PDFs": downloaded_pdfs,
            "Webpage Links": json.dumps(webpage_links),
            "Webpage Count": len(webpage_links),
            "Saved Webpages as PDF": saved_webpages,
            "All Links": json.dumps(all_links),
            "Total Count": len(all_links)
        })

    except Exception as e:
        # Best-effort batch: report the failure and record an all-zero row so
        # every provider still appears in the results table.
        print(f"❌ Error processing {provider}: {e}")
        results.append({
            "Provider": provider,
            "PDF Links": "[]",
            "PDF Count": 0,
            "Downloaded PDFs": 0,
            "Webpage Links": "[]",
            "Webpage Count": 0,
            "Saved Webpages as PDF": 0,
            "All Links": "[]",
            "Total Count": 0
        })

        

🔍 Searching for: United Healthcare
✅ Downloaded PDF: llm_searched/United_Healthcare/genetic-testing-for-hereditary-cancer.pdf
✅ Downloaded PDF: llm_searched/United_Healthcare/carrier-testing-for-genetic-diseases.pdf
❌ Failed to save https://www.uhcprovider.com/en/prior-auth-advance-notification/genetic-molecular-lab.html as PDF: No wkhtmltopdf executable found: "b''"
If this file exists please check that this process can read it or you can pass path to it manually in method call, check README. Otherwise please install wkhtmltopdf - https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf
❌ Failed to save https://www.uhcprovider.com/en/resource-library/news/2024/changes-genetic-molecular-testing-coverage-pa-requirements.html as PDF: No wkhtmltopdf executable found: "b''"
If this file exists please check that this process can read it or you can pass path to it manually in method call, check README. Otherwise please install wkhtmltopdf - https://github.com/JazzCore/python-pdf

In [21]:
# Tabulate the per-provider rows collected in `results`.
# Note: this rebinds `df`, so from here on it no longer refers to the table read from Providers_Network.csv.
df = pd.DataFrame(results)
# index=False keeps the DataFrame's row index out of the CSV file.
df.to_csv("genetic_policy_links_with_downloads.csv", index=False)
print("\n✅ Finished! Results saved to 'genetic_policy_links_with_downloads.csv'")


✅ Finished! Results saved to 'genetic_policy_links_with_downloads.csv'


In [12]:
# Build the results table from the current `results` list (rebinds `df`) and display it.
# NOTE(review): the output saved with this cell has no download-count columns, so it
# appears to come from a run over a different `results` list — re-run before relying on it.
df = pd.DataFrame(results)
df

Unnamed: 0,Provider,PDF Links,PDF Count,Webpage Links,Webpage Count,All Links,Total Count
0,United Healthcare,[],0,[],0,[],0
1,Cigna,[],0,[],0,[],0
2,Capital BC,[],0,[],0,[],0
3,Carelon,[],0,[],0,[],0


In [None]:
# Write whatever `df` currently holds to CSV, without the row index.
# This cell has no recorded execution count, so which run (if any) produced the file is not visible here.
df.to_csv("genetic_policy_links_by_provider_ex3.csv", index=False)

# 1. Starting from provider retrieval

In [29]:
# Ask the model (with web search enabled) for the insurers that are in-network with GeneDx.
messages = [
    {
        "role": "system",
        "content": (
            "You are an artificial intelligence assistant and you need to "
            "engage in a helpful, detailed, polite conversation with a user."
        ),
    },
    {
        "role": "user",
        # Adjacent string literals are concatenated verbatim, so every sentence
        # carries its own trailing space; without it the prompt runs together
        # ("...with GeneDx.Output the result...").
        "content": (
            "List all the medical insurance providers that are currently in-network with GeneDx. "
            "Output the result strictly in JSON format using the following structure: "
            "{\"Providers\": [list of provider names], \"source_url\": \"link to the official source\"}. "
            "Only include links from the official GeneDx website or affiliated trusted sources. "
            "Do not include any introduction, explanation, or extra commentary — only return the JSON object."
        ),
    },
]

response = client.responses.create(
    model="gpt-4o",
    tools=[{"type": "web_search_preview"}],
    input=messages
)

# Raw reply, kept visible for inspection.
print(response.output_text)

# Keep only the span from the first "{" to the last "}". This drops Markdown
# fences of any flavour as well as commentary before or after the object.
# If the reply holds no braces the slice is empty and json.loads raises.
response_text = response.output_text
response_text = response_text[response_text.find("{"):response_text.rfind("}") + 1]

result = json.loads(response_text)
# `or []` also covers a reply where "Providers" is present but null.
# Note: this rebinds `provider_list`, replacing any list built by an earlier cell.
provider_list = result.get("Providers") or []

print(provider_list)

```json
{
  "Providers": [
    "Absolute Total Care",
    "Aetna",
    "Aetna Affordable Health Choices",
    "Aetna Better Health Illinois",
    "Aetna Better Health Kentucky",
    "Aetna Better Health Maryland",
    "Aetna Better Health of Texas",
    "Aetna HMO",
    "Aetna Medicare",
    "Aetna PPO",
    "Ambetter (Centene)",
    "Amerigroup Georgia",
    "Amerigroup Texas",
    "Amerigroup/Wellpoint Maryland Medicaid",
    "Amerihealth",
    "AmeriHealth Caritas (NC)",
    "Amerihealth Caritas Ohio",
    "Anthem BCBS Indiana",
    "Anthem BCBS Indiana Blue Access PPO",
    "Anthem BCBS Indiana Medicaid",
    "Anthem BCBS Kentucky Medicaid",
    "Anthem BCBS Ohio Medicaid",
    "Anthem Hoosier Healthwise",
    "Arkansas Health and Wellness",
    "Aspirus Health Plan",
    "BC Community Options",
    "BCBS Arizona",
    "BCBS Illinois",
    "BCBS Kansas City",
    "BCBS Minnesota",
    "BCBS Minnesota Blue Plus Medicaid",
    "BCBS North Carolina",
    "BCBS North Dakota",
    "BCBS

In [30]:
# Search policy links for every name in `provider_list` (whatever the most
# recently executed cell assigned to it).
providers = provider_list


# One summary row per provider.
results = []

for provider in providers:
    print(f"🔍 Searching for: {provider}")
    messages = [
        {"role": "system", "content": "You are a helpful and precise research assistant."},
        {"role": "user", "content": build_prompt(provider)}
    ]

    try:
        response = client.responses.create(
            model="gpt-4o",
            tools=[{"type": "web_search_preview"}],
            input=messages
        )

        # Parse JSON from result: keep only the span from the first "{" to the
        # last "}" so Markdown fences or commentary around the object do not
        # break parsing. With no braces the slice is empty and json.loads
        # raises, which is handled by the except branch below.
        result_text = response.output_text
        result_text = result_text[result_text.find("{"):result_text.rfind("}") + 1]
        result_json = json.loads(result_text)

        # `or []` covers keys that are missing or explicitly null in the reply.
        pdf_links = result_json.get("pdf_links") or []
        webpage_links = result_json.get("webpage_links") or []
        all_links = pdf_links + webpage_links

        results.append({
            "Provider": provider,
            "PDF Links": json.dumps(pdf_links),
            "PDF Count": len(pdf_links),
            "Webpage Links": json.dumps(webpage_links),
            "Webpage Count": len(webpage_links),
            "All Links": json.dumps(all_links),
            "Total Count": len(all_links)
        })

    except Exception as e:
        # Best-effort batch: report the failure and record an all-zero row so
        # the provider still appears in the table. Such a row looks the same
        # as a genuine "no links found" answer; check the printed log to tell
        # them apart.
        print(f"❌ Failed for {provider}: {e}")
        results.append({
            "Provider": provider,
            "PDF Links": "[]",
            "PDF Count": 0,
            "Webpage Links": "[]",
            "Webpage Count": 0,
            "All Links": "[]",
            "Total Count": 0
        })

# Tabulate, persist (without the row index) and display the results.
df2 = pd.DataFrame(results)
df2.to_csv("genetic_policy_links_by_provider2.csv", index=False)
df2

🔍 Searching for: Absolute Total Care
🔍 Searching for: Aetna
🔍 Searching for: Aetna Affordable Health Choices
🔍 Searching for: Aetna Better Health Illinois
🔍 Searching for: Aetna Better Health Kentucky
🔍 Searching for: Aetna Better Health Maryland
🔍 Searching for: Aetna Better Health of Texas
🔍 Searching for: Aetna HMO
🔍 Searching for: Aetna Medicare
🔍 Searching for: Aetna PPO
🔍 Searching for: Ambetter (Centene)
🔍 Searching for: Amerigroup Georgia
🔍 Searching for: Amerigroup Texas
🔍 Searching for: Amerigroup/Wellpoint Maryland Medicaid
🔍 Searching for: Amerihealth
🔍 Searching for: AmeriHealth Caritas (NC)
🔍 Searching for: Amerihealth Caritas Ohio
🔍 Searching for: Anthem BCBS Indiana
🔍 Searching for: Anthem BCBS Indiana Blue Access PPO
🔍 Searching for: Anthem BCBS Indiana Medicaid
🔍 Searching for: Anthem BCBS Kentucky Medicaid
🔍 Searching for: Anthem BCBS Ohio Medicaid
🔍 Searching for: Anthem Hoosier Healthwise
🔍 Searching for: Arkansas Health and Wellness
🔍 Searching for: Aspirus Health

Unnamed: 0,Provider,PDF Links,PDF Count,Webpage Links,Webpage Count,All Links,Total Count
0,Absolute Total Care,"[""https://www.absolutetotalcare.com/content/da...",40,"[""https://www.absolutetotalcare.com/providers/...",1,"[""https://www.absolutetotalcare.com/content/da...",41
1,Aetna,[],0,"[""https://www.aetna.com/cpb/medical/data/100_1...",3,"[""https://www.aetna.com/cpb/medical/data/100_1...",3
2,Aetna Affordable Health Choices,[],0,"[""https://www.aetna.com/cpb/medical/data/100_1...",3,"[""https://www.aetna.com/cpb/medical/data/100_1...",3
3,Aetna Better Health Illinois,[],0,"[""https://www.aetnabetterhealth.com/illinois/m...",4,"[""https://www.aetnabetterhealth.com/illinois/m...",4
4,Aetna Better Health Kentucky,[],0,"[""https://es.aetna.com/cpb/medical/data/100_19...",1,"[""https://es.aetna.com/cpb/medical/data/100_19...",1
...,...,...,...,...,...,...,...
160,WellCare (KY),"[""https://www.wellcare.com/-/media/PDFs/CCG/Ge...",26,"[""https://www.wellcare.com/Kentucky/Providers/...",1,"[""https://www.wellcare.com/-/media/PDFs/CCG/Ge...",27
161,WellCare (NC),[],0,[],0,[],0
162,Wellmark BCBS,[],0,[],0,[],0
163,Wellpoint (Amerigroup Washington State Medicaid),"[""https://lawfilesext.leg.wa.gov/law/wsr/2009/...",2,"[""https://geneticspolicy.nccrcg.org/medicaid-p...",1,"[""https://lawfilesext.leg.wa.gov/law/wsr/2009/...",3


# 2. Retrieval from the collected provider list

In [6]:
# Creates the OpenAI client again for this section, using the same key variable as before.
client = OpenAI(api_key=openai_api_key)

def build_prompt(provider_name):
    """Compose the policy-link search prompt for a single insurer.

    Args:
        provider_name: Insurance provider name; it is embedded as-is, wrapped
            in single quotes, in the opening sentence of the prompt.

    Returns:
        One string that asks for links to official genetic testing coverage
        policy documents and instructs the model to reply with a JSON object
        holding the arrays "pdf_links" and "webpage_links".
    """
    fragments = [
        "Find and list all the links to official policy documents that contain genetic testing coverage policies ",
        f"for the insurance provider '{provider_name}'. ",
        "Include both PDF files and webpage URLs if the information is only available on the website. ",
        "Only include links from official sources such as the insurance company's website or regulatory bodies. ",
        "Exclude links from news articles, blog posts, or discussion forums. ",
        "If the policy is available as a PDF, return the direct PDF link under the key \"pdf_links\". ",
        "If the policy is available only as an HTML webpage, return the webpage URL under the key \"webpage_links\". ",
        "The response must be strictly in JSON format with two single keys: ",
        "\"pdf_links\", containing an array of valid PDF URLs, and ",
        "\"webpage_links\", containing an array of valid webpage URLs. ",
        "Do not include any additional text or explanations—only the JSON object.",
    ]
    # Each fragment already ends with its own separating space, so join with no separator.
    return "".join(fragments)

# Re-read the provider table here instead of relying on `df`: earlier cells
# rebind `df` to a results table that has no "In-network Provider" column, so
# a top-to-bottom run would fail on the column lookup.
provider_network = pd.read_csv('/home/cptaswadu/RESCUE-n8n/insurance/Providers_Network.csv')
providers = provider_network["In-network Provider"].dropna().str.strip().tolist()


# One summary row per provider.
results = []

for provider in providers:
    print(f"🔍 Searching for: {provider}")
    messages = [
        {"role": "system", "content": "You are a helpful and precise research assistant."},
        {"role": "user", "content": build_prompt(provider)}
    ]

    try:
        response = client.responses.create(
            model="gpt-4o",
            tools=[{"type": "web_search_preview"}],
            input=messages
        )

        # Parse JSON from result: keep only the span from the first "{" to the
        # last "}" so Markdown fences or commentary around the object do not
        # break parsing. With no braces the slice is empty and json.loads
        # raises, which is handled by the except branch below.
        result_text = response.output_text
        result_text = result_text[result_text.find("{"):result_text.rfind("}") + 1]
        result_json = json.loads(result_text)

        # `or []` covers keys that are missing or explicitly null in the reply.
        pdf_links = result_json.get("pdf_links") or []
        webpage_links = result_json.get("webpage_links") or []
        all_links = pdf_links + webpage_links

        results.append({
            "Provider": provider,
            "PDF Links": json.dumps(pdf_links),
            "PDF Count": len(pdf_links),
            "Webpage Links": json.dumps(webpage_links),
            "Webpage Count": len(webpage_links),
            "All Links": json.dumps(all_links),
            "Total Count": len(all_links)
        })

    except Exception as e:
        # Best-effort batch: report the failure and record an all-zero row so
        # the provider still appears in the table. Such a row looks the same
        # as a genuine "no links found" answer; check the printed log to tell
        # them apart.
        print(f"❌ Failed for {provider}: {e}")
        results.append({
            "Provider": provider,
            "PDF Links": "[]",
            "PDF Count": 0,
            "Webpage Links": "[]",
            "Webpage Count": 0,
            "All Links": "[]",
            "Total Count": 0
        })

df3 = pd.DataFrame(results)
# Own output file: "genetic_policy_links_by_provider2.csv" is already written
# by the previous section, and reusing that name would overwrite its results.
df3.to_csv("genetic_policy_links_by_provider3.csv", index=False)
df3

🔍 Searching for: Advocate Health
🔍 Searching for: Aetna
🔍 Searching for: Aetna Better Health
🔍 Searching for: Affinity by Molina HealthCare
🔍 Searching for: Alabama Medicaid
🔍 Searching for: Alameda Alliance for Health
🔍 Searching for: Alaska Medicaid
🔍 Searching for: Alignment Healthcare
🔍 Searching for: Allina Health
🔍 Searching for: American Indian Health Program
🔍 Searching for: Amerigroup District of Columbia
🔍 Searching for: Amerigroup Georgia
🔍 Searching for: Amerigroup Nevada
🔍 Searching for: Amerigroup Real Solutions
🔍 Searching for: Amerigroup Tennessee
🔍 Searching for: Amerihealth
🔍 Searching for: Amerihealth Caritas (NH)
🔍 Searching for: Amerihealth Caritas (NC)
🔍 Searching for: Amerihealth Caritas Ohio
🔍 Searching for: Amerihealth Caritas (PA)
🔍 Searching for: Anthem BC of California
🔍 Searching for: Anthem BCBS Connecticut
🔍 Searching for: Anthem BCBS Georgia
🔍 Searching for: Anthem BCBS Indiana
🔍 Searching for: Anthem BCBS Kentucky
🔍 Searching for: Anthem BCBS Maine
🔍 Se

Unnamed: 0,Provider,PDF Links,PDF Count,Webpage Links,Webpage Count,All Links,Total Count
0,Advocate Health,[],0,"[""https://www.advocatehealth.com/health-servic...",4,"[""https://www.advocatehealth.com/health-servic...",4
1,Aetna,[],0,"[""https://www.aetna.com/cpb/medical/data/100_1...",3,"[""https://www.aetna.com/cpb/medical/data/100_1...",3
2,Aetna Better Health,"[""https://es.aetnabetterhealth.com/pennsylvani...",2,"[""https://es.aetna.com/cpb/medical/data/100_19...",1,"[""https://es.aetnabetterhealth.com/pennsylvani...",3
3,Affinity by Molina HealthCare,[],0,"[""https://www.molinahealthcare.com/providers/i...",1,"[""https://www.molinahealthcare.com/providers/i...",1
4,Alabama Medicaid,"[""https://medicaid.alabama.gov/documents/9.0_R...",1,"[""https://medicaid.alabama.gov/alert_detail.as...",2,"[""https://medicaid.alabama.gov/documents/9.0_R...",3
...,...,...,...,...,...,...,...
250,Wellpoint TX,[],0,"[""https://www.wellpoint.com/tx/provider/state-...",4,"[""https://www.wellpoint.com/tx/provider/state-...",4
251,Wellpoint Washington,"[""https://www.provider.wellpoint.com/docs/gpp/...",2,"[""https://www.provider.wellpoint.com/washingto...",2,"[""https://www.provider.wellpoint.com/docs/gpp/...",4
252,West Virginia Medicaid,[],0,"[""https://www.wvlegislature.gov/Bill_Status/bi...",3,"[""https://www.wvlegislature.gov/Bill_Status/bi...",3
253,Wisconsin Medicaid,"[""https://www.dhs.wisconsin.gov/forms/f1/f1101...",2,"[""https://www.forwardhealth.wi.gov/WIPortal/Su...",3,"[""https://www.dhs.wisconsin.gov/forms/f1/f1101...",5
