In [None]:
!pip install pdfplumber pytesseract pdf2image pillow opencv-python
!pip install rapidfuzz usaddress scourgify
!pip install requests beautifulsoup4 lxml
!pip install pandas numpy
!pip install panel rich great-tables

In [None]:
import os
import pandas as pd
import numpy as np
import pdfplumber
import pytesseract
import requests
import cv2
from rapidfuzz import fuzz
from pdf2image import convert_from_path
from scourgify import normalize_address_record
import usaddress
from rich.console import Console
from bs4 import BeautifulSoup
import panel as pn

# Shared Rich console used by log() and by the status banners printed below.
console = Console()
# Initialise Panel's notebook integration before any Panel object is displayed.
pn.extension()

In [None]:
# Read every column as text. Identifiers such as NPI numbers, ZIP codes and
# phone numbers must not go through numeric type inference: a single blank NPI
# would turn the whole column into floats ("1234567890.0"), which then fails
# the isdigit() check in validate_provider, and leading zeros would be lost.
# Missing cells become empty strings so downstream .strip() calls are safe.
df = pd.read_csv("providers.csv", dtype=str).fillna("")
console.print("[bold green]Provider CSV Loaded Successfully[/bold green]")
df.head()

In [None]:
def log(msg):
    """Print ``msg`` on the shared Rich console, wrapped in cyan markup."""
    styled = f"[cyan]{msg}[/cyan]"
    console.print(styled)

def normalize_address(addr):
    """Return a normalized, single-line form of a US address (best effort).

    Parameters
    ----------
    addr : str or None
        Raw address text. ``None`` and blank strings yield ``""``.

    Returns
    -------
    str
        The non-empty components returned by scourgify's
        ``normalize_address_record`` joined with single spaces, or the input
        text unchanged when the address cannot be normalized.
    """
    if addr is None:
        return ""
    addr = str(addr)
    if not addr.strip():
        return ""
    try:
        # Hand scourgify the raw string: it does its own parsing, whereas a
        # usaddress-tagged dict does not carry the address_line_1/city/...
        # keys that scourgify expects from dict input.
        norm = normalize_address_record(addr)
        # Components that are absent (e.g. no second address line) come back
        # as None and must be skipped, otherwise str.join raises TypeError.
        return " ".join(str(part) for part in norm.values() if part)
    except Exception:
        # Deliberate best effort: incomplete or unparseable addresses are
        # compared in their raw form. `Exception` (not a bare except) keeps
        # KeyboardInterrupt working while the notebook loops over providers.
        return addr

def fuzzy_match(a, b):
    """Score the similarity of two values with several rapidfuzz metrics.

    Both inputs are converted with ``str()``, lower-cased and stripped. The
    result is a dict with the keys ``ratio``, ``partial``, ``token`` and
    ``composite`` (the mean of the other three). If either cleaned input is
    empty, every score is ``0``.
    """
    left = str(a).lower().strip()
    right = str(b).lower().strip()

    if not (left and right):
        return dict.fromkeys(("ratio", "partial", "token", "composite"), 0)

    scores = {
        "ratio": fuzz.ratio(left, right),
        "partial": fuzz.partial_ratio(left, right),
        "token": fuzz.token_sort_ratio(left, right),
    }
    # The composite is computed before it is added, so it averages exactly
    # the three individual metrics above.
    scores["composite"] = np.mean(list(scores.values()))
    return scores

In [None]:
def query_npi(npi=None, first=None, last=None):
    """Look up a provider in the NPPES NPI Registry and return the first hit.

    Parameters
    ----------
    npi : str or int, optional
        NPI number. When given, the name arguments are ignored.
    first, last : str, optional
        First / last name used for a name search when ``npi`` is not given.

    Returns
    -------
    dict or None
        The first entry of the response's ``results`` list, or ``None`` when
        there are no search criteria, no results, or the request fails
        (failures are logged, not raised).
    """
    url = "https://npiregistry.cms.hhs.gov/api/"
    params = {"version": "2.1"}

    if npi:
        params["number"] = str(npi)
    else:
        # Send only the name criteria that actually carry text; a request
        # with no criteria at all cannot match anything, so skip the call.
        first = str(first).strip() if first else ""
        last = str(last).strip() if last else ""
        if first:
            params["first_name"] = first
        if last:
            params["last_name"] = last
        if not (first or last):
            return None

    try:
        resp = requests.get(url, params=params, timeout=10)
        # Surface HTTP errors instead of trying to parse an error page.
        resp.raise_for_status()
        data = resp.json()
        results = data.get("results") if isinstance(data, dict) else None
        return results[0] if results else None
    except Exception as e:
        log(f"[red]NPI API error: {e}[/red]")
        return None

In [None]:
def scrape_practice_website(url):
    """Fetch a practice website and report which contact keywords it mentions.

    Parameters
    ----------
    url : str
        Absolute ``http://`` or ``https://`` URL. Anything else (blank text,
        a value that is not a string, a URL without a scheme) is not fetched.

    Returns
    -------
    dict
        ``website_text``: lower-cased visible page text (``""`` on failure).
        ``found_phone`` / ``found_address``: whether the words "phone" /
        "address" occur in that text. These are keyword checks only; they do
        not compare against the provider's own phone number or address.
    """
    def _nothing_found():
        # Fresh dict per call so callers can never share/mutate one result.
        return {"website_text": "", "found_phone": False, "found_address": False}

    if not isinstance(url, str):
        return _nothing_found()
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        return _nothing_found()

    try:
        resp = requests.get(url, timeout=10)
        # Do not scan 4xx/5xx error pages for keywords.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        text = soup.get_text(" ", strip=True).lower()

        return {
            "website_text": text,
            "found_phone": "phone" in text,
            "found_address": "address" in text
        }
    except Exception:
        # Best effort: an unreachable or unparsable site counts as "nothing
        # found". `Exception` keeps KeyboardInterrupt usable in the notebook.
        return _nothing_found()

In [None]:
def ocr_pdf(pdf_path, dpi=300, threshold=140):
    """Rasterise a PDF and return the text recognised by Tesseract.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to convert.
    dpi : int, default 300
        Resolution passed to ``convert_from_path``.
    threshold : int, default 140
        Grey level (0-255) used for the binary threshold applied before OCR.

    Returns
    -------
    str
        Recognised text of all pages joined with newlines and stripped, or
        ``""`` when the PDF cannot be converted (the error is logged).
    """
    try:
        images = convert_from_path(pdf_path, dpi=dpi)
    except Exception as e:
        log(f"[red]OCR Conversion Error: {e}[/red]")
        return ""

    page_texts = []
    for img in images:
        # pdf2image yields PIL images, whose channel order is RGB (OpenCV's
        # own readers use BGR), so the grayscale conversion must use the RGB
        # code or the red/blue weights are swapped.
        arr = np.array(img.convert("RGB"))
        gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
        gray = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)[1]
        page_texts.append(pytesseract.image_to_string(gray))

    return "\n".join(page_texts).strip()

In [None]:
def extract_pdf_text(pdf_path):
    """Extract the embedded (digital) text of a PDF with pdfplumber.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to read.

    Returns
    -------
    str
        Text of all pages joined with newlines and stripped. Pages without
        extractable text contribute an empty line. Returns ``""`` when the
        file cannot be opened or parsed.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may return None for a page; treat that as "".
            page_texts = [(pg.extract_text() or "") for pg in pdf.pages]
        return "\n".join(page_texts).strip()
    except Exception:
        # Best effort: the caller falls back to OCR on empty text. Catching
        # `Exception` rather than everything lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return ""

In [None]:
def read_pdf(pdf_path):
    """Return the text of a PDF, preferring embedded text over OCR.

    The embedded text is extracted first; if it is shorter than 40
    characters the document is passed to ``ocr_pdf`` instead and that
    result is returned.
    """
    log(f"Reading PDF → {pdf_path}")
    digital_text = extract_pdf_text(pdf_path)

    has_enough_text = len(digital_text) >= 40
    if has_enough_text:
        log("Digital extraction successful")
        return digital_text

    log("Low text detected — switching to OCR")
    return ocr_pdf(pdf_path)

In [None]:
# Placeholder so the notebook runs end to end. setdefault keeps a real key that
# is already present in the environment instead of overwriting it; never paste
# an actual key into the notebook.
os.environ.setdefault("API_VALIDATION_KEY", "REPLACE_WITH_YOUR_KEY")

def validate_phone_address(address):
    """Geocode an address with the positionstack forward-geocoding API.

    Despite its name, this function only looks at the address; no phone
    number is involved.

    Parameters
    ----------
    address : str
        Free-text address to geocode.

    Returns
    -------
    dict
        ``{"valid": bool, "lat": ..., "lon": ...}``. ``valid`` is ``True``
        when the API returned at least one match, in which case ``lat`` and
        ``lon`` hold its ``latitude`` / ``longitude``. On a missing or
        placeholder key, a blank address, no match, or any request error the
        result is ``{"valid": False, "lat": None, "lon": None}``.
    """
    def _invalid():
        return {"valid": False, "lat": None, "lon": None}

    key = os.getenv("API_VALIDATION_KEY")
    # The notebook's placeholder value is not a usable key; skip the request.
    if not key or key == "REPLACE_WITH_YOUR_KEY":
        return _invalid()
    if not isinstance(address, str) or not address.strip():
        return _invalid()

    # NOTE(review): confirm the account's plan permits HTTPS for this endpoint.
    url = "https://api.positionstack.com/v1/forward"
    params = {"access_key": key, "query": address.strip(), "limit": 1}

    try:
        # Same 10 s timeout as the other HTTP calls in this notebook, so one
        # stalled request cannot hang the whole validation loop.
        resp = requests.get(url, params=params, timeout=10)
        d = resp.json()

        matches = d.get("data") if isinstance(d, dict) else None
        if not matches:
            return _invalid()

        item = matches[0]
        if not isinstance(item, dict):
            return _invalid()

        return {
            "valid": True,
            "lat": item.get("latitude"),
            "lon": item.get("longitude")
        }

    except Exception:
        # Best effort: network/JSON problems mean "could not validate".
        return _invalid()

In [None]:
def _as_text(value):
    """Coerce a cell value to a stripped string; None and float NaN become ''."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def validate_provider(row):
    """Validate one provider record against the NPI registry and web sources.

    The whole function lives in a single cell: a function body cannot be
    continued in a following notebook cell.

    Parameters
    ----------
    row : mapping
        Record supporting ``.get`` (a pandas row or a dict) with the keys
        ``Name``, ``Address``, ``NPI`` and ``Website``.

    Returns
    -------
    dict
        Provider and registry names, the name/address fuzzy-match composite
        scores, their mean as ``Overall_Confidence``, the normalized registry
        address, the geocoding result and the website keyword flags.
    """
    name = _as_text(row.get("Name", ""))
    address = _as_text(row.get("Address", ""))
    npi = _as_text(row.get("NPI", ""))
    # An NPI column that went through numeric inference renders as
    # "1234567890.0"; drop the artefact so the number is still recognised.
    if npi.endswith(".0"):
        npi = npi[:-2]
    website = _as_text(row.get("Website", ""))

    log(f"\n[bold yellow]Validating Provider → {name}[/bold yellow]")

    if npi.isdigit():
        npi_data = query_npi(npi=npi)
    else:
        # No usable NPI: fall back to a name search on first/last token.
        parts = name.split()
        first = parts[0] if parts else ""
        last = parts[-1] if len(parts) > 1 else ""
        npi_data = query_npi(first=first, last=last)

    # Registry records do not all carry the same fields, so read them
    # defensively instead of indexing: a missing key or an empty address list
    # must lower the score, not abort the whole validation run.
    npi_data = npi_data or {}
    basic = npi_data.get("basic") or {}
    person_name = " ".join(
        part for part in (basic.get("first_name"), basic.get("last_name")) if part
    )
    registry_name = basic.get("name") or basic.get("organization_name") or person_name
    addresses = npi_data.get("addresses") or [{}]
    npi_addr = addresses[0].get("address_1") or ""

    norm_input_addr = normalize_address(address)
    norm_registry_addr = normalize_address(npi_addr)

    name_score = fuzzy_match(name, registry_name)["composite"]
    addr_score = fuzzy_match(norm_input_addr, norm_registry_addr)["composite"]

    web = scrape_practice_website(website)
    api_val = validate_phone_address(address)

    final_score = np.mean([name_score, addr_score])

    return {
        "Provider_Name": name,
        "Registry_Name": registry_name,
        "Name_Score": name_score,
        "Address_Score": addr_score,
        "Overall_Confidence": final_score,
        "Registry_Address": norm_registry_addr,
        "Address_API_Valid": api_val["valid"],
        "Latitude": api_val["lat"],
        "Longitude": api_val["lon"],
        "Website_Found_Phone": web["found_phone"],
        "Website_Found_Address": web["found_address"]
    }

In [None]:
# Validate every provider row in file order; one result record per row.
results = [validate_provider(row) for _, row in df.iterrows()]

results_df = pd.DataFrame(results)
console.print("[bold green]Validation Completed Successfully[/bold green]")
results_df.head()

In [None]:
def priority(score):
    """Map a confidence score to a review priority label.

    Scores below 50 are ``"HIGH"``, scores below 75 are ``"MEDIUM"`` and
    everything else (including values that compare false against both
    ceilings) is ``"LOW"``.
    """
    ceilings = ((50, "HIGH"), (75, "MEDIUM"))
    for ceiling, label in ceilings:
        if score < ceiling:
            return label
    return "LOW"

# Label each provider with a review priority derived from its confidence.
results_df["Priority"] = results_df["Overall_Confidence"].map(priority)
results_df.head()

In [None]:
# Combine the input records with their validation results. Columns are added
# one at a time, so a result column whose name already exists in the input
# replaces the input column.
directory = df.copy()
for column_name, column_values in results_df.items():
    directory[column_name] = column_values

directory.to_csv("validated_provider_directory.csv", index=False)
directory.head()

In [None]:
def generate_email(name, score):
    """Build the outreach email asking a provider to confirm their details.

    Parameters
    ----------
    name : str
        Provider name used in the salutation.
    score : float
        Confidence score; rendered with one decimal place.

    Returns
    -------
    str
        The email body, flush-left and without leading or trailing blank
        lines.
    """
    import textwrap

    # The template is indented here only to fit the code layout. Dedent it
    # *before* substituting values so that indentation never reaches the
    # recipient and the result does not depend on what ``name`` contains.
    template = textwrap.dedent(
        """\
        Dear {name},

        As part of our Provider Directory Accuracy Program,
        we detected discrepancies that require your confirmation.

        Current Data Confidence Score: {score:.1f}%.

        Please reply with updated practice information.

        Regards,
        Provider Data Quality Team
        """
    ).strip()
    return template.format(name=name, score=score)

# Attach a ready-to-send outreach email to every provider record.
directory["Email_Template"] = directory.apply(
    lambda record: generate_email(record["Name"], record["Overall_Confidence"]), axis=1
)

directory.head()

In [None]:
# Interactive table over the merged directory, shown under a short heading.
table = pn.widgets.DataFrame(
    directory,
    name="Provider Directory Validation Results",
)

dashboard = pn.Column(
    "# Provider Data Validation Dashboard",
    "### Review confidence scores, discrepancies, and action flags",
    table,
)
dashboard