In [3]:
import numpy as np
import pandas as pd
import os
import csv
import re
from dotenv import load_dotenv
import json
from openai import OpenAI
from urllib.parse import urlparse
import hashlib
import pdfkit
import requests
from playwright.sync_api import sync_playwright

In [4]:
# Work from the project's insurance directory so that relative paths used
# later in the notebook (e.g. "llm_searched/...") resolve there, then read
# the API keys from the .env file in that same directory.
os.chdir('/home/cptaswadu/RESCUE-n8n/insurance')
load_dotenv(dotenv_path='/home/cptaswadu/RESCUE-n8n/insurance/.env')

# Either key is None when the variable is absent from the environment.
# NOTE(review): perplexity_api_key is not used by any cell visible here.
openai_api_key, perplexity_api_key = (
    os.environ.get("OPEN_AI_API_KEY"),
    os.environ.get("PERPLEXITY_API_KEY"),
)
client = OpenAI(api_key=openai_api_key)

In [5]:
# Load the in-network provider table. The file's first column has no header,
# so pandas labels it "Unnamed: 0" (visible in the preview below).
providers_csv_path = '/home/cptaswadu/RESCUE-n8n/insurance/Providers_Network.csv'
df = pd.read_csv(providers_csv_path)
df.head()

Unnamed: 0,In-network Provider
0,Advocate Health
1,Aetna
2,Aetna Better Health
3,Affinity by Molina HealthCare
4,Alabama Medicaid


In [6]:
# Provider names with missing entries dropped and surrounding whitespace
# trimmed, as a plain Python list.
provider_names = df["In-network Provider"].dropna()
provider_list = provider_names.str.strip().tolist()
print(provider_list)

['Advocate Health', 'Aetna', 'Aetna Better Health', 'Affinity by Molina HealthCare', 'Alabama Medicaid', 'Alameda Alliance for Health', 'Alaska Medicaid', 'Alignment Healthcare', 'Allina Health', 'American Indian Health Program', 'Amerigroup District of Columbia', 'Amerigroup Georgia', 'Amerigroup Nevada', 'Amerigroup Real Solutions', 'Amerigroup Tennessee', 'Amerihealth', 'Amerihealth Caritas (NH)', 'Amerihealth Caritas (NC)', 'Amerihealth Caritas Ohio', 'Amerihealth Caritas (PA)', 'Anthem BC of California', 'Anthem BCBS Connecticut', 'Anthem BCBS Georgia', 'Anthem BCBS Indiana', 'Anthem BCBS Kentucky', 'Anthem BCBS Maine', 'Anthem BCBS Missouri', 'Anthem BCBS New Hampshire', 'Anthem BCBS Wisconsin', 'Anthem Health Plans of Virginia', 'APC Passe LLC dba Summit Community Care', 'Arizona Medicare', 'Arkansas Medicaid', 'Asuris Northwest Health', 'Avera Health Plans', 'Banner Health', 'BC of Idaho', 'BCBS Alabama', 'BCBS Arizona', 'BCBS Federal Employee Plan', 'BCBS Hawaii', 'BCBS Illino

In [7]:
def build_prompt(provider_name):
    """Compose the search prompt for one insurance provider.

    The prompt asks for official genetic-testing coverage policy links and
    requires a JSON-only reply with the keys "pdf_links" and "webpage_links".

    Args:
        provider_name: Name interpolated (in single quotes) into the prompt.

    Returns:
        The complete prompt as a single string.
    """
    sentences = [
        "Find and list all the links to official policy documents that contain genetic testing coverage policies ",
        f"for the insurance provider '{provider_name}'. ",
        "Include both PDF files and webpage URLs if the information is only available on the website. ",
        "Only include links from official sources such as the insurance company's website or regulatory bodies. ",
        "Exclude links from news articles, blog posts, or discussion forums. ",
        'If the policy is available as a PDF, return the direct PDF link under the key "pdf_links". ',
        'If the policy is available only as an HTML webpage, return the webpage URL under the key "webpage_links". ',
        "The response must be strictly in JSON format with two single keys: ",
        '"pdf_links", containing an array of valid PDF URLs, and ',
        '"webpage_links", containing an array of valid webpage URLs. ',
        "Do not include any additional text or explanations—only the JSON object.",
    ]
    # Every fragment already carries its own trailing space, so join with "".
    return "".join(sentences)

In [8]:
def download_pdf(url, save_path):
    """Fetch `url` and write the response body to `save_path`.

    Best-effort: any exception (request failure, HTTP error status, file
    error) is reported on stdout instead of being raised.

    Args:
        url: Address to fetch with ``requests.get`` (timeout=15).
        save_path: Destination file, opened in binary write mode.

    Returns:
        True if the body was written, False if anything failed.
    """
    succeeded = False
    try:
        reply = requests.get(url, timeout=15)
        # Turn 4xx/5xx responses into exceptions so they count as failures.
        reply.raise_for_status()
        with open(save_path, "wb") as out_file:
            out_file.write(reply.content)
        print(f"✅ Downloaded PDF: {save_path}")
        succeeded = True
    except Exception as err:
        print(f"❌ Failed to download PDF from {url}: {err}")
    return succeeded


def save_webpage_as_pdf(url, save_path):
    """Render the page at `url` into a PDF file via ``pdfkit.from_url``.

    Best-effort: any exception is reported on stdout instead of being raised.

    Args:
        url: Web page address handed to pdfkit.
        save_path: Output path handed to pdfkit.

    Returns:
        True if pdfkit completed without raising, False otherwise.
    """
    succeeded = False
    try:
        pdfkit.from_url(url, save_path)
        print(f"✅ Saved webpage as PDF: {save_path}")
        succeeded = True
    except Exception as err:
        print(f"❌ Failed to save {url} as PDF: {err}")
    return succeeded

In [9]:
def _parse_link_payload(raw_text):
    """Extract ``(pdf_links, webpage_links)`` from the model's reply text.

    Raises ValueError (json.JSONDecodeError is a subclass) when no JSON
    object can be recovered from the reply.
    """
    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()
    try:
        payload = json.loads(cleaned)
    except json.JSONDecodeError:
        # Despite the prompt, the reply may wrap the JSON in prose; retry on
        # the outermost {...} span before giving up.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end < start:
            raise
        payload = json.loads(cleaned[start:end + 1])
    if not isinstance(payload, dict):
        raise ValueError("expected a JSON object with 'pdf_links' and 'webpage_links'")

    def as_url_list(value):
        # A null or scalar value would otherwise break the list concatenation
        # in the caller and discard every link found for the provider.
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, str) and item.strip()]

    return as_url_list(payload.get("pdf_links")), as_url_list(payload.get("webpage_links"))


def _unique_filename(link, taken, fallback_stem, as_webpage):
    """Derive a non-empty file name for `link` that is not yet in `taken`.

    The chosen name is added to `taken` before it is returned.
    """
    # urlparse().path excludes the query string and fragment; stripping the
    # trailing slash keeps ".../genetics/" from producing an empty name.
    base = os.path.basename(urlparse(link).path.rstrip("/"))
    if as_webpage:
        # Rendered pages always get a .pdf extension.
        stem, ext = (base.split(".")[0] or fallback_stem), ".pdf"
    else:
        stem, ext = os.path.splitext(base or fallback_stem + ".pdf")
    candidate, suffix = stem + ext, 1
    while candidate in taken:
        # Two links with the same basename must not overwrite each other.
        suffix += 1
        candidate = f"{stem}_{suffix}{ext}"
    taken.add(candidate)
    return candidate


def retrieve_and_save_for_provider(provider):
    """Ask the model for a provider's policy links and save each one locally.

    Sends the prompt from ``build_prompt(provider)`` to the ``gpt-4o`` model
    with the ``web_search_preview`` tool, parses the JSON reply, then stores
    the results under ``llm_searched/<provider with spaces as underscores>``:
    PDF links through ``download_pdf`` and webpage links through
    ``save_webpage_as_pdf``.

    Args:
        provider: Insurance provider name.

    Returns:
        A dict with the keys "Provider", "PDF Links", "PDF Count",
        "Downloaded PDFs", "Webpage Links", "Webpage Count",
        "Saved Webpages as PDF", "All Links" and "Total Count". Link lists
        are JSON-encoded strings. If any step raises, the error is printed
        and a row with empty lists and zero counts is returned instead.
    """
    print(f"\n🔍 Searching for: {provider}")
    messages = [
        {"role": "system", "content": "You are a helpful and precise research assistant."},
        {"role": "user", "content": build_prompt(provider)}
    ]

    try:
        response = client.responses.create(
            model="gpt-4o",
            tools=[{"type": "web_search_preview"}],
            input=messages
        )

        pdf_links, webpage_links = _parse_link_payload(response.output_text)
        all_links = pdf_links + webpage_links

        folder = os.path.join("llm_searched", provider.replace(" ", "_"))
        os.makedirs(folder, exist_ok=True)

        # Shared across both loops so a rendered page cannot overwrite a
        # downloaded PDF that happens to have the same stem.
        taken = set()

        # Both helpers return a bool, so the sums count successes.
        downloaded_pdfs = sum(
            download_pdf(link, os.path.join(folder, _unique_filename(link, taken, "document", False)))
            for link in pdf_links
        )

        saved_webpages = sum(
            save_webpage_as_pdf(link, os.path.join(folder, _unique_filename(link, taken, "webpage", True)))
            for link in webpage_links
        )

        return {
            "Provider": provider,
            "PDF Links": json.dumps(pdf_links),
            "PDF Count": len(pdf_links),
            "Downloaded PDFs": downloaded_pdfs,
            "Webpage Links": json.dumps(webpage_links),
            "Webpage Count": len(webpage_links),
            "Saved Webpages as PDF": saved_webpages,
            "All Links": json.dumps(all_links),
            "Total Count": len(all_links)
        }

    except Exception as e:
        # Deliberately broad: one failing provider must not abort a batch run.
        print(f"❌ Error processing {provider}: {e}")
        return {
            "Provider": provider,
            "PDF Links": "[]",
            "PDF Count": 0,
            "Downloaded PDFs": 0,
            "Webpage Links": "[]",
            "Webpage Count": 0,
            "Saved Webpages as PDF": 0,
            "All Links": "[]",
            "Total Count": 0
        }


In [10]:
def compute_md5(file_path):
    """Return the MD5 hex digest of the file at `file_path`.

    The file is read in 8192-byte chunks, so it is never held in memory
    all at once.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()


def get_md5_map(folder):
    """Hash every file under `folder`, including files in subfolders.

    Entries are keyed by the file's path relative to `folder`, not by its
    bare name, so same-named files in different subfolders (for example one
    "webpage.pdf" per provider directory) each keep their own entry. For a
    file directly inside `folder` the key is just the file name.

    Args:
        folder: Root directory to walk. A missing directory yields an
            empty dict, because ``os.walk`` produces nothing for it.

    Returns:
        dict mapping relative file path -> MD5 hex digest from
        ``compute_md5``.
    """
    md5_map = {}
    for root, dirs, files in os.walk(folder):
        # Sorting in place fixes the traversal order, which keeps the dict
        # order (and anything printed from it) stable between runs.
        dirs.sort()
        for file in sorted(files):
            path = os.path.join(root, file)
            md5_map[os.path.relpath(path, folder)] = compute_md5(path)
    return md5_map

def compare_folders(manual_folder, llm_folder):
    """Report which files in `llm_folder` also exist, by content, in `manual_folder`.

    Files are compared by MD5 digest, so file names play no part in the
    match. Prints the number of distinct matched digests, the number of
    distinct digests found only in `llm_folder`, then one MATCHED or
    UNMATCHED line per entry of the LLM folder's map. Returns None.
    """
    manual_digests = set(get_md5_map(manual_folder).values())
    llm_files = get_md5_map(llm_folder)
    llm_digests = set(llm_files.values())

    shared = manual_digests.intersection(llm_digests)
    llm_only = llm_digests.difference(manual_digests)

    print(f"\n✅ Matched files: {len(shared)}")
    print(f"❌ Only in LLM searched folder: {len(llm_only)}")

    for name, digest in llm_files.items():
        label = "✔️ MATCHED" if digest in shared else "❌ UNMATCHED (LLM-only)"
        print(f"{label}: {name}")


In [11]:
# Run the search-and-download step for a small set of providers; each call
# returns one summary row.
providers = ["United Healthcare", "Cigna", "Aetna"]
results = list(map(retrieve_and_save_for_provider, providers))

# Persist the summary rows.
# NOTE: this rebinds `df`, replacing the provider table loaded earlier.
df = pd.DataFrame(results)
df.to_csv("genetic_policy_links_with_downloads.csv", index=False)

# Check, by file content, which LLM-found files match the manually collected
# policy documents.
compare_folders(
    "/home/cptaswadu/RESCUE-n8n/insurance/insurance_policy",
    "/home/cptaswadu/RESCUE-n8n/insurance/llm_searched",
)


🔍 Searching for: United Healthcare
✅ Downloaded PDF: llm_searched/United_Healthcare/carrier-testing-for-genetic-diseases.pdf
✅ Downloaded PDF: llm_searched/United_Healthcare/genetic-testing-for-hereditary-cancer.pdf
✅ Saved webpage as PDF: llm_searched/United_Healthcare/genetic-molecular-lab.pdf
✅ Saved webpage as PDF: llm_searched/United_Healthcare/prior-auth-requirement-update-genetic-molecular-codes.pdf
✅ Saved webpage as PDF: llm_searched/United_Healthcare/changes-genetic-molecular-testing-coverage-pa-requirements.pdf
✅ Saved webpage as PDF: llm_searched/United_Healthcare/prior-auth-non-invasive-prenatal.pdf
✅ Saved webpage as PDF: llm_searched/United_Healthcare/nicu-genetic-testing-program.pdf

🔍 Searching for: Cigna
❌ Failed to download PDF from https://chk.static.cigna.com/assets/chcp/resourceLibrary/coveragePolicies/medical/genetic_testing_for_hereditary_and_multifactorial_conditions.pdf: 404 Client Error: Not Found for url: https://chk.static.cigna.com/assets/chcp/resourceLib

In [None]:
def main():
    """Run the full pipeline for a fixed list of providers.

    Collects one summary row per provider from
    ``retrieve_and_save_for_provider``, writes the rows to
    "genetic_policy_links_with_downloads.csv" in the current directory, and
    then compares the manually collected policy folder against the
    United Healthcare folder of LLM-found files.
    """
    rows = list(map(
        retrieve_and_save_for_provider,
        ["United Healthcare", "Cigna", "Capital BC", "Carelon"],
    ))

    pd.DataFrame(rows).to_csv("genetic_policy_links_with_downloads.csv", index=False)
    print("\n✅ Finished! Results saved to 'genetic_policy_links_with_downloads.csv'")

    # Only the United Healthcare subfolder is compared here, even though
    # four providers are searched above.
    compare_folders(
        manual_folder="/home/cptaswadu/RESCUE-n8n/insurance/insurance_policy",
        llm_folder="/home/cptaswadu/RESCUE-n8n/insurance/llm_searched/United_Healthcare",
    )


if __name__ == "__main__":
    main()
