In [None]:
# --- Installation, Setup, and Key ---

# 1. Install necessary libraries
!pip install requests pydantic pdfplumber google-genai python-dotenv

# 2. Import core modules
import os
import json
import requests
import pdfplumber
from pydantic import BaseModel, Field  # Used for defining the schema

# 3. Import Google Gemini SDK components
from google import genai
from google.genai import types as gtypes

# --- IMPORTANT: GEMINI API KEY SETUP ---
# 4. Load your API Key from a .env file
from dotenv import load_dotenv
# Load variables from the local .env file so the key can be read via os.getenv.
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast when the key is missing *or* blank (e.g. a bare `GEMINI_API_KEY=`
# line), so the problem surfaces here rather than as an API error in a later cell.
if not GEMINI_API_KEY or not GEMINI_API_KEY.strip():
    raise ValueError("GEMINI_API_KEY not found. Please add it to a .env file")

print("Libraries and imports loaded successfully. Proceed to Cell 2.")




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Libraries and imports loaded successfully. Proceed to Cell 2.


In [2]:
# --- Define the Target JSON Schema ---

# 1. Define the Nested Structure for a single Practice Location (Section 4A of CMS-855B)
class PracticeLocation(BaseModel):
    """Schema for extracting a single practice location address."""
    # One entry of `SupplierData.practice_locations`. No field declares a
    # default, and every value is typed as a plain string (including ZIP,
    # phone and NPI).
    # NOTE(review): pydantic is expected to publish the class docstring and the
    # Field descriptions in the generated JSON schema, so this wording is
    # presumably part of what the model is shown — confirm before rewording.
    location_name: str = Field(description="The 'Doing Business As' Name, if applicable, for this practice location.")
    # Description previously ended with a stray ")" after "CMS-855B".
    address_line_1: str = Field(description="The physical street address line 1 (NOT a P.O. Box) from Section 4A of the CMS-855B.")
    city_town: str = Field(description="City or Town for the practice location.")
    state: str = Field(description="State for the practice location.")
    zip_code: str = Field(description="ZIP Code + 4 for the location.")
    telephone_number: str = Field(description="Telephone Number for this practice location.")
    npi_for_location: str = Field(description="The National Provider Identifier (NPI) for this specific location.")

# 2. Define the Main Supplier Information Structure (Sections 2A1, 2B, 3, and 4A of CMS-855B)
class SupplierData(BaseModel):
    """Main schema for the Medicare supplier (group practice/clinic)."""
    # Top-level extraction record. The class itself is what the extraction cell
    # passes as the response schema.
    # NOTE(review): pydantic is expected to export the docstring and Field
    # descriptions in the generated JSON schema — confirm before rewording them.

    # Identity fields, all described as coming from Section 2A1.
    legal_business_name: str = Field(description="The Legal Business Name as reported in Section 2A1.")
    tax_id_number: str = Field(description="The Tax Identification Number (TIN) as reported in Section 2A1.")
    main_npi: str = Field(description="The organization's primary Type 2 National Provider Identifier (NPI) from Section 2A1.")
    # Checkbox selection from Section 2B, kept as free text.
    type_of_supplier: str = Field(description="The organizational type checked in Section 2B (e.g., Clinic/Group Practice, Pharmacy).")
    # The only non-string scalar: a yes/no summary of Section 3.
    has_final_adverse_legal_action: bool = Field(description="True if Section 3 indicates any final adverse legal action was imposed against the organization, False otherwise.")
    # Zero or more nested PracticeLocation records from Section 4A.
    practice_locations: list[PracticeLocation] = Field(description="A list of all practice locations found in Section 4A.")

# Note: We use the Pydantic class itself (SupplierData) directly in the function call.

print("JSON output schemas defined successfully. Proceed to Cell 3.")

JSON output schemas defined successfully. Proceed to Cell 3.


In [3]:
# --- Raw Text Extraction ---

pdf_path = 'CMS-855B.pdf'
raw_text = ""
target_pages = 11  # Covers up to the end of Section 4A

try:
    with pdfplumber.open(pdf_path) as pdf:
        # Visit page indexes 0..target_pages-1, ignoring indexes past the end
        # of the document and pages that yield no text.
        for page_index in range(target_pages):
            if page_index >= len(pdf.pages):
                continue
            page_text = pdf.pages[page_index].extract_text()
            if not page_text:
                continue
            # Each page's text is followed by a newline separator.
            raw_text = "".join((raw_text, page_text, "\n"))

    print(f"Raw text extracted successfully. Total characters: {len(raw_text)}")

except FileNotFoundError:
    # The PDF is expected to sit next to the notebook.
    print(f"Error: The file {pdf_path} was not found. Please ensure the PDF is in the same directory.")
except Exception as err:
    # Any other failure is reported rather than raised; text gathered so far is kept.
    print(f"An error occurred during PDF processing: {err}")

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBB

Raw text extracted successfully. Total characters: 25308


In [6]:
# --- Execute Native Gemini Extraction ---

def extract_data_native_gemini(document_text: str, schema: "type[BaseModel]", api_key: str) -> dict:
    """Extract structured data from form text using the native google-genai SDK.

    Sends ``document_text`` to the ``gemini-2.5-flash`` model with a JSON
    response MIME type and ``schema`` as the response schema, then decodes the
    returned JSON text.

    Args:
        document_text: Raw text of the CMS-855B document to extract from.
        schema: The Pydantic model *class* (not an instance) that describes the
            expected JSON structure.
        api_key: Gemini API key used to create the client.

    Returns:
        The response text decoded with ``json.loads``.

    Raises:
        ValueError: If ``document_text`` is empty or whitespace-only, or if the
            API response carries no text.
        json.JSONDecodeError: If the response text is not valid JSON.
    """
    # Refuse to call the API with nothing to extract from: the instructions
    # below ask for placeholder values, so a blank document would otherwise
    # come back looking like a successful extraction.
    if not document_text or not document_text.strip():
        raise ValueError("document_text is empty; there is nothing to extract from.")

    # 1. Initialize the client
    client = genai.Client(api_key=api_key)

    # 2. Instructions go in the config's `system_instruction` field rather than
    #    a role="system" content entry or text prepended to the user turn, which
    #    keeps them separate from the (untrusted) document text.
    system_instruction = (
        "You are an expert data extractor for Medicare forms. "
        "Your task is to extract all requested details from the provided CMS-855B form text and return them as a valid JSON object. "
        "The output MUST strictly adhere to the provided JSON schema. "
        "Since the input is a form template, return placeholder values (e.g., 'N/A' or 'Placeholder Name') for fields not explicitly filled out in the template's instructional text."
    )

    # 3. Response configuration: JSON output constrained by the Pydantic class.
    generation_config = gtypes.GenerateContentConfig(
        system_instruction=system_instruction,
        response_mime_type="application/json",
        response_schema=schema,
    )

    # 4. The user turn carries only the document and the extraction request.
    user_prompt = (
        "--- DOCUMENT TEXT TO EXTRACT FROM ---\n\n"
        f"Extract the required data from this CMS-855B document text:\n\n{document_text}"
    )
    contents = [
        gtypes.Content(
            role="user",
            parts=[gtypes.Part(text=user_prompt)],
        ),
    ]

    # 5. Generate content
    response = client.models.generate_content(
        model='gemini-2.5-flash',
        contents=contents,
        config=generation_config,
    )

    # 6. Parse the response; an empty text payload is treated as a failure.
    if response.text:
        return json.loads(response.text)
    raise ValueError("Gemini API returned an empty response, possibly due to safety settings.")


# --- EXECUTION ---
gemini_api_key = os.environ.get("GEMINI_API_KEY")

# Both an API key and some extracted text are required before calling the model.
if not (gemini_api_key and raw_text):
    print("Execution failed. Check if GEMINI_API_KEY is set or if raw_text is empty.")
else:
    print("Executing LLM extraction chain via Native Gemini SDK...")

    # Run the extraction against the SupplierData schema and keep the result
    # for the downstream cells.
    final_output_dict = extract_data_native_gemini(raw_text, SupplierData, gemini_api_key)

    print("\n--- FINAL JSON OUTPUT (Ready for Validation Agent) ---")
    print(json.dumps(final_output_dict, indent=4))

Executing LLM extraction chain via Native Gemini SDK...

--- FINAL JSON OUTPUT (Ready for Validation Agent) ---
{
    "legal_business_name": "N/A",
    "tax_id_number": "N/A",
    "main_npi": "N/A",
    "type_of_supplier": "Clinic/Group Practice",
    "has_final_adverse_legal_action": false,
    "practice_locations": []
}


In [7]:
# --- The Validation Agent - NPI Check ---
import requests
import json

NPI_API_URL = "https://npiregistry.cms.hhs.gov/api/?version=2.1"

# Values that mean "nothing was extracted for this field".
_PLACEHOLDER_VALUES = ("", "N/A")


def _clean_field(value) -> str:
    """Return ``value`` as a stripped string, treating ``None`` as empty."""
    return "" if value is None else str(value).strip()


# --- Define the Validation Function ---
def validate_npi_and_address(extracted_data: dict) -> dict:
    """
    Validate the extracted organization NPI against the CMS NPPES Registry.

    Looks up ``extracted_data["main_npi"]`` as a Type 2 (organization) NPI and,
    when a record is found, compares the extracted legal business name with the
    registry's organization name. Address comparison is not implemented and is
    always reported as ``"SKIPPED_TEMPLATE"``.

    Args:
        extracted_data: Extraction output; only ``main_npi`` and
            ``legal_business_name`` are read.

    Returns:
        A dict whose ``npi_validation`` key is one of:
        - ``"FAIL"`` (with ``reason``): the NPI is missing/placeholder, or the
          registry returned no results.
        - ``"ERROR"`` (with ``reason``): the HTTP request or response
          processing raised.
        - ``"SUCCESS"``: with ``extracted_name``, ``nppes_name``,
          ``name_match_status`` (``True``/``False``/``"SKIPPED_TEMPLATE"``),
          ``address_match_status`` and ``nppes_primary_phone``.
    """
    # Normalise first so a missing key, None, or whitespace-only value is
    # handled like the "N/A" placeholder instead of being sent to the API.
    npi_to_check = _clean_field(extracted_data.get("main_npi"))

    # 1. Handle Placeholder/Missing NPI (from template extraction)
    if npi_to_check in _PLACEHOLDER_VALUES:
        return {"npi_validation": "FAIL", "reason": "NPI was not extracted from document or is missing."}

    # 2. Query the NPPES API for an Organization (Type 2) NPI
    params = {"number": npi_to_check, "enumeration_type": "NPI-2"}

    try:
        response = requests.get(NPI_API_URL, params=params, timeout=10)
        response.raise_for_status()
        nppes_data = response.json()

        # No results means the NPI is unknown or is not a Type 2 record.
        if not nppes_data.get("results"):
            return {"npi_validation": "FAIL", "reason": "NPI not found in NPPES Registry or is not Type 2 (Organization)."}

        nppes_record = nppes_data["results"][0]
        nppes_name = nppes_record["basic"]["organization_name"]

        # --- Data Comparison ---

        # Name check: case-insensitive containment. An empty or placeholder
        # name is skipped, because `"" in anything` is True and would otherwise
        # be reported as a match.
        extracted_name = _clean_field(extracted_data.get("legal_business_name"))
        if extracted_name in _PLACEHOLDER_VALUES:
            is_name_match = "SKIPPED_TEMPLATE"
        else:
            is_name_match = extracted_name.lower() in nppes_name.lower()

        # Address check is not implemented; only the registry phone is surfaced.
        # NOTE(review): NPPES v2.1 responses appear to label the practice
        # address "LOCATION" (and the other one "MAILING"); "PRIMARY" is still
        # accepted for backward compatibility — confirm against the API docs.
        nppes_address = next(
            (
                addr
                for addr in nppes_record.get("addresses", [])
                if addr.get("address_purpose") in ("PRIMARY", "LOCATION")
            ),
            None,
        )
        is_address_match = "SKIPPED_TEMPLATE"
        nppes_phone = nppes_address.get("telephone_number") if nppes_address else "N/A"

        return {
            "npi_validation": "SUCCESS",
            "extracted_name": extracted_name,
            "nppes_name": nppes_name,
            "name_match_status": is_name_match,
            "address_match_status": is_address_match,  # Always skipped for now
            "nppes_primary_phone": nppes_phone
        }

    except requests.exceptions.RequestException as e:
        return {"npi_validation": "ERROR", "reason": f"API request failed: {e}"}
    except Exception as e:
        # Unexpected response shape (e.g. missing "basic") ends up here.
        return {"npi_validation": "ERROR", "reason": f"Processing failed: {e}"}


# --- EXECUTION ---

# Since your extraction output the blank template, we manually simulate the final JSON 
# output for a successful test scenario (a real provider) to demonstrate functionality.

# Replace this with the actual dictionary output from Cell 4 if you filled the form in the PDF.
# Hand-built stand-in for the extraction output. It omits
# `has_final_adverse_legal_action` and several per-location fields, which the
# validation function does not read.
SIMULATED_JSON_INPUT = {
    "legal_business_name": "MAYFIELD CLINIC INC",
    "tax_id_number": "311130490",
    "main_npi": "1710204780", # Intended as a Type 2 NPI for testing. NOTE(review): the recorded run below reports it as not found / not Type 2 — verify the number.
    "type_of_supplier": "Clinic/Group Practice",
    "practice_locations": [
        {"address_line_1": "222 ROOD ROAD", "city_town": "ERLANGER", "state": "KY", "zip_code": "41018-0001"}
    ]
}


# Performs a live lookup against the registry; the result depends on network access.
validation_results = validate_npi_and_address(SIMULATED_JSON_INPUT)

print("\n--- VALIDATION AGENT RESULTS (NPI Check) ---")
print(json.dumps(validation_results, indent=4))


--- VALIDATION AGENT RESULTS (NPI Check) ---
{
    "npi_validation": "FAIL",
    "reason": "NPI not found in NPPES Registry or is not Type 2 (Organization)."
}


In [8]:
# --- The Quality Assurance Agent - Confidence Scoring ---

# Define the function to calculate the score
def _score_match_check(status, match_points: int, conflict_penalty: int, skipped_points: int):
    """Return ``(score_delta, status_text)`` for one tri-state match check."""
    if status is True:
        return match_points, "Match"
    if status is False:
        return -conflict_penalty, "Conflict"
    # Anything else (e.g. "SKIPPED_TEMPLATE" or a missing value) means the
    # check was not performed; the raw status is passed through as the text.
    return skipped_points, status


def calculate_confidence_score(npi_results: dict, review_threshold: int = 75) -> dict:
    """
    Calculates a Confidence Score based on Validation Agent results (Max 100 points).

    Scoring:
        - 30 base points for extraction (assumed successful).
        - NPI validation ``"SUCCESS"``: +40, then
            * name check:    match +15, conflict -5,  not performed +10
            * address check: match +15, conflict -10, not performed +5
        - Any other NPI status: -20, and both checks are reported as ``"N/A"``.
        - The final score is floored at 0.

    Args:
        npi_results: Output of the validation step; reads ``npi_validation``,
            ``name_match_status`` and ``address_match_status``.
        review_threshold: Scores strictly below this value set
            ``review_required``. Defaults to 75.

    Returns:
        A dict with ``final_confidence_score``, ``max_score``, ``npi_status``,
        ``name_check``, ``address_check`` and ``review_required``.
    """
    max_score = 100

    # 1. Base Score for successful PDF Extraction (assumed from previous steps)
    score = 30

    # 2. NPI Validation (most critical check)
    if npi_results.get("npi_validation") == "SUCCESS":
        score += 40

        # 3./4. Name and address checks only count once the NPI itself is valid.
        # NOTE(review): checks that were not performed still earn partial
        # credit, so SUCCESS with both checks skipped scores 85 and clears the
        # default threshold without any name/address verification — confirm
        # this is intended.
        name_delta, name_status_text = _score_match_check(
            npi_results.get("name_match_status"),
            match_points=15, conflict_penalty=5, skipped_points=10,
        )
        address_delta, address_status_text = _score_match_check(
            npi_results.get("address_match_status"),
            match_points=15, conflict_penalty=10, skipped_points=5,
        )
        score += name_delta + address_delta
    else:
        # NPI validation failed or encountered an error (major issue)
        score -= 20
        name_status_text = "N/A"
        address_status_text = "N/A"

    # Ensure score does not fall below zero
    final_score = max(0, score)

    return {
        "final_confidence_score": final_score,
        "max_score": max_score,
        "npi_status": npi_results.get("npi_validation"),
        "name_check": name_status_text,
        "address_check": address_status_text,
        "review_required": final_score < review_threshold  # Threshold for human review
    }


# --- EXECUTION ---
# Input the actual FAIL results from Cell 5
# NOTE(review): this dict is a hand-copied snapshot of the validation output
# printed earlier, not a reference to `validation_results`; re-running the
# validation cell will not change what is scored here.
validation_input = {
    "npi_validation": "FAIL",
    "reason": "NPI not found in NPPES Registry or is not Type 2 (Organization)."
}

# Score the snapshot and keep the result in `qa_results`.
qa_results = calculate_confidence_score(validation_input)

print("\n--- QUALITY ASSURANCE AGENT RESULTS (Confidence Score) ---")
print(json.dumps(qa_results, indent=4))


--- QUALITY ASSURANCE AGENT RESULTS (Confidence Score) ---
{
    "final_confidence_score": 10,
    "max_score": 100,
    "npi_status": "FAIL",
    "name_check": "N/A",
    "address_check": "N/A",
    "review_required": true
}


In [9]:
# --- The Directory Agent (Decision & Routing) ---
import json
import random # Used for simulating directory update actions

# Input the actual results from Cell 6
# NOTE(review): hand-copied snapshot of the QA output (Python `True` where the
# printed JSON shows `true`). It is independent of `qa_results`, so it will not
# reflect a re-run of the scoring cell.
QA_AGENT_RESULTS = {
    "final_confidence_score": 10,
    "max_score": 100,
    "npi_status": "FAIL",
    "name_check": "N/A",
    "address_check": "N/A",
    "review_required": True
}

# Define the function to execute the final decision
def run_directory_agent(qa_results: dict, review_threshold: int = 75) -> dict:
    """
    Decides whether to automatically update the provider directory or route the record
    to the human review queue based on the Confidence Score.

    Args:
        qa_results: Output of the QA step; reads ``final_confidence_score``,
            ``review_required`` and ``npi_status``.
        review_threshold: Threshold quoted in the status message, and used to
            decide routing when ``review_required`` is absent. Defaults to 75.

    Returns:
        A dict with ``action``, ``priority``, ``status_message``,
        ``dashboard_alert`` and ``next_step``. ``action`` is either
        ``"ROUTE_TO_HUMAN_REVIEW"`` or ``"AUTOMATIC_DIRECTORY_UPDATE"``.
    """
    confidence = qa_results.get("final_confidence_score")
    review_flag = qa_results.get("review_required")

    # Fail closed: when the QA step did not supply an explicit decision, derive
    # one from the score, and treat a missing score as needing review. An
    # incomplete record must never fall through to an automatic update.
    if review_flag is None:
        needs_review = confidence is None or confidence < review_threshold
    else:
        needs_review = bool(review_flag)

    if needs_review:
        # Action 1: Low Confidence - Route to Human Review
        # Only blame the NPI check when it did not succeed; a record can also
        # land here because of name/address conflicts after a valid NPI.
        if qa_results.get("npi_status") == "SUCCESS":
            priority = "HIGH - Low Confidence"
            dashboard_alert = "PENDING: Discrepancy detected (name/address check)."
        else:
            priority = "HIGH - NPI Failure"
            dashboard_alert = "PENDING: Discrepancy detected (NPI validation failed)."

        if confidence is None:
            status_message = "Confidence score is missing from the QA results. Record flagged for Payer Operations Staff."
        else:
            status_message = f"Confidence score of {confidence} is below threshold ({review_threshold}). Record flagged for Payer Operations Staff."

        return {
            "action": "ROUTE_TO_HUMAN_REVIEW",
            "priority": priority,
            "status_message": status_message,
            "dashboard_alert": dashboard_alert,
            "next_step": "Human staff must resolve the conflict and trigger the Feedback Loop."
        }

    # Action 2: High Confidence - Automatic Update
    return {
        "action": "AUTOMATIC_DIRECTORY_UPDATE",
        "priority": "LOW",
        "status_message": f"Confidence score of {confidence} is high. Directory updated successfully.",
        "dashboard_alert": "SUCCESS: Validation Complete.",
        "next_step": "Record embedded in Vector DB; scheduled for next 90-day revalidation cycle."
    }

# --- EXECUTION ---
# Route the snapshot record held in QA_AGENT_RESULTS and print the resulting
# disposition as indented JSON.
final_decision = run_directory_agent(QA_AGENT_RESULTS)

print("\n--- DIRECTORY AGENT RESULTS (Final Disposition) ---")
print(json.dumps(final_decision, indent=4))


--- DIRECTORY AGENT RESULTS (Final Disposition) ---
{
    "action": "ROUTE_TO_HUMAN_REVIEW",
    "priority": "HIGH - NPI Failure",
    "status_message": "Confidence score of 10 is below threshold (75). Record flagged for Payer Operations Staff.",
    "dashboard_alert": "PENDING: Discrepancy detected (NPI validation failed).",
    "next_step": "Human staff must resolve the conflict and trigger the Feedback Loop."
}
