# LangExtract Resume Parser

### Imports

In [None]:
import requests
import os
import datetime
import pandas as pd
import pandas_gbq
import json
import webbrowser
import pypdf
import textwrap
import io 
from typing import List
import fitz  # PyMuPDF
import io


from google import genai
from google.cloud import storage
from google.oauth2 import service_account 

import langextract as lx

import streamlit as st

### Env

In [None]:
from dotenv import load_dotenv
# Load the variables from the .env file (it looks for a file named .env in the same directory)
# so that the os.environ lookups further down can see them.
load_dotenv()

### Variables

In [None]:
# Credentials and resource names come from the environment; each is None when unset.
LANGEXTRACT_API_KEY = os.getenv("LANGEXTRACT_API_KEY")
GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
BUCKET_NAME = os.getenv("BUCKET_NAME")
PROJECT_ID = os.getenv("PROJECT_ID")
TABLE_NAME = os.getenv("TABLE_NAME")

# Model identifier used throughout the notebook.
MODEL_NAME = "gemini-2.5-flash"

# Input-token price in USD per one million tokens.
# NOTE(review): this figure was recorded in late 2024 -- confirm it against the
# current Google AI pricing for MODEL_NAME before relying on the estimate.
PRICE_PER_MILLION_INPUT_TOKENS = 1.25

# Signed-URL lifetime, in minutes.
EXPIRATION = 30

# Pure white foreground text as a 24-bit RGB integer (0xFFFFFF == 16777215).
WHITE_COLOR_INT = 0xFFFFFF

### Clients

In [None]:
# genai.Client takes its configuration as keyword-only arguments, so the key
# has to be passed as api_key=...; passing it positionally raises TypeError.
client = genai.Client(api_key=LANGEXTRACT_API_KEY)

In [None]:
# Look up the configured model and display its metadata, leaving out fields
# that still hold their default values.
client.models.get(model=MODEL_NAME).model_dump(exclude_defaults=True)

### GCS

In [None]:
def list_gcs_bucket_contents(bucket_name):
    """Print and return the name of every object in a GCS bucket.

    Args:
        bucket_name: Name of the bucket to enumerate.

    Returns:
        A list of object names, in the order the listing yields them.
    """
    # The storage client is scoped to the module-level PROJECT_ID.
    gcs = storage.Client(project=PROJECT_ID)
    listing = gcs.bucket(bucket_name).list_blobs()

    print("Objects in GCS bucket:")
    names = []
    for entry in listing:
        print(entry.name)
        names.append(entry.name)

    return names

In [None]:
# --- SECURITY NOTE (Best Practice) ---
# For production environments (like Cloud Run or GKE), Google strongly recommends 
# avoiding downloaded Service Account key files and instead using Workload Identity 
# Federation or Service Account impersonation. This script uses the JSON key file 
# approach primarily for simple local testing where the environment is trusted 
# by the developer.

#    Authentication (Local/Testing): V4 signed URLs require a Service Account key file (.json).
#    Credentials provided by 'gcloud auth application-default login' are NOT enough.
#    To run locally, you must set the environment variable to the path of your key file:
#    export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/service-account-key.json"

def generate_gcs_signed_url(bucket_name: str, blob_name: str, expiration_minutes: int = 15, credentials_path: str = None) -> str:
    """
    Generates a V4 signed URL for a Google Cloud Storage object, allowing temporary
    read access. This URL can be passed to the Gemini API for accessing data.

    Args:
        bucket_name: The name of the GCS bucket (e.g., 'my-media-bucket').
        blob_name: The name of the object/file (e.g., 'images/input.png').
        expiration_minutes: The duration (in minutes) the URL will be valid for.

    Returns:
        The generated signed URL string.
    """
    try:
        # V4 signing requires credentials with a private key (Service Account).
        
        if credentials_path:
            # Load explicit Service Account credentials for signing
            credentials = service_account.Credentials.from_service_account_file(credentials_path)
            storage_client = storage.Client(credentials=credentials)
        else:
            # Fallback to default credentials. This will only succeed if running
            # on a Google Cloud service (like a VM or Cloud Run) with a Service 
            # Account already attached, or if default credentials include a key.
            # If running locally without GOOGLE_APPLICATION_CREDENTIALS set, this 
            # will likely result in the 'private key' error.
            storage_client = storage.Client()


        # Get the bucket object
        bucket = storage_client.bucket(bucket_name)

        # Get the blob (file) object
        blob = bucket.blob(blob_name)
        
        # Calculate the expiration time
        expiration_time = datetime.timedelta(minutes=expiration_minutes)

        # Generate the signed URL using V4 signing
        signed_url = blob.generate_signed_url(
            version="v4",
            # The URL will be valid until this expiration time
            expiration=expiration_time,
            # The client needs 'GET' permission to read the file
            method="GET",
        )

        print(f"‚úÖ Generated signed URL for gs://{bucket_name}/{blob_name}")
        print(f"   URL valid for {expiration_minutes} minutes.")
        return signed_url

    except Exception as e:
        # Check for the specific error to give a helpful message
        if "you need a private key to sign credentials" in str(e):
             print("‚ùå ERROR: V4 signing failed. Please ensure you have set the "
                   "GOOGLE_APPLICATION_CREDENTIALS environment variable to the "
                   "path of your Service Account JSON key file.")
             print("   Note: For production, we recommend more secure methods like "
                   "Workload Identity Federation or Service Account impersonation.")
        else:
            print(f"‚ùå An unexpected error occurred during URL generation: {e}")
        return ""


In [None]:
# List the bucket's objects once; the resulting names are used to pick the file to parse.
blob_list = list_gcs_bucket_contents(BUCKET_NAME)

In [None]:
# Pick the second object in the listing as the resume to process.
# NOTE(review): the index is hard-coded; this raises IndexError when the bucket
# holds fewer than two objects -- confirm this is the intended file.
BLOB_NAME = blob_list[1]
BLOB_NAME

In [None]:

# Create a time-limited read URL for the selected object, signed with the
# service-account key file named by GOOGLE_APPLICATION_CREDENTIALS.
TEXT_URL = generate_gcs_signed_url(
    bucket_name=BUCKET_NAME,
    blob_name=BLOB_NAME,
    expiration_minutes=EXPIRATION,
    credentials_path=GOOGLE_APPLICATION_CREDENTIALS,
)

# An empty string means URL generation failed, so there is nothing to show.
if TEXT_URL:
    print(TEXT_URL)

### Extract Text from URL

In [None]:
def extract_text_from_url(pdf_url: str, timeout: float = 30.0) -> str:
    """
    Reads a PDF file from a given URL by downloading its binary content into memory (BytesIO)
    and extracts all text content without saving the file locally.

    Args:
        pdf_url: The public or secured URL path to the PDF document (e.g., 'https://example.com/resume.pdf').
        timeout: Seconds to wait on the server before giving up. Without a
            timeout an unresponsive server would block this call indefinitely.

    Returns:
        A single string containing all text extracted from the PDF, or an empty string on error.
    """

    # --- STEP 1: Fetch the Binary Content from the URL ---
    try:
        print(f"Fetching PDF content from URL: {pdf_url}...")

        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors (4xx or 5xx)
        pdf_bytes = response.content  # Use .content for binary data, NOT .text

        print(f"Successfully fetched {len(pdf_bytes):,} bytes of PDF data.")

    except requests.exceptions.RequestException as e:
        # Covers connection failures, timeouts and the HTTP errors raised above.
        print(f"Error fetching PDF from URL: {e}")
        return ""

    # Check if content was successfully downloaded
    if not pdf_bytes:
        print("Error: Downloaded content was empty.")
        return ""

    # --- STEP 2: Process Bytes in Memory ---
    try:
        # Wrap the bytes in a file-like object so the reader never touches disk
        reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))

        # Collect the text of every page, skipping pages that yield nothing
        page_texts = []
        for page in reader.pages:
            text = page.extract_text()
            if text:
                page_texts.append(text)

        # Join the text from all pages with two newlines for separation
        return "\n\n".join(page_texts)

    except Exception as e:
        # Best-effort: any parsing failure is reported and yields an empty result
        print(f"An error occurred during PDF processing (in memory): {e}")
        return ""

In [None]:

# Download the PDF behind the signed URL and pull out its text.
clean_resume_text = extract_text_from_url(TEXT_URL)

# Show a short preview; the full clean_resume_text is what gets passed to lx.extract.
print(f"\n--- Extracted Text from {TEXT_URL} (Preview) ---")
if not clean_resume_text:
    print("No text was extracted or an error occurred. Ensure GCS authentication is set up and the URL is correct.")
else:
    # Collapse whitespace and cap the preview at 500 characters
    print(textwrap.shorten(clean_resume_text, width=500, placeholder="... [Text Truncated]"))


### Estimate Token Cost

In [None]:
# --- Count the Tokens ---
# Ask the API how many tokens the extracted resume text occupies, then price
# that count as input tokens. Any failure is reported instead of raised.
try:
    print(f"Initializing model: {MODEL_NAME}")
    # The returned metadata is not used below; presumably this lookup serves as
    # an early check that the model is reachable with the configured key.
    model = client.models.get(model=MODEL_NAME)

    print("Counting tokens... (This may take a moment for a large book)")
    token_count = client.models.count_tokens(model=MODEL_NAME, contents=clean_resume_text)

    total_tokens = token_count.total_tokens
    print("\n--- Results ---")
    print(f"✅ Total Tokens: {total_tokens:,}")

    # --- Estimate the Cost ---
    # Cost = (Total Tokens / 1,000,000) * Price per Million
    estimated_cost = (total_tokens / 1_000_000) * PRICE_PER_MILLION_INPUT_TOKENS

    print(f"Price per 1M input tokens: ${PRICE_PER_MILLION_INPUT_TOKENS:.2f}")
    print(f"💰 Estimated Cost (Input): ${estimated_cost:.6f}")  # Use .6f for small fractions of a cent

except Exception as e:
    print(f"\nAn error occurred while counting tokens: {e}")
    print("Please ensure your API key is correct and you have API access.")

### Prompt

In [None]:
# Define the Prompt
# Explicitly request all entities and structured/nested formats for complex ones.
# dedent() removes the source-code indentation and strip() drops the leading and
# trailing blank lines, so the text is sent without extra whitespace.
prompt = textwrap.dedent("""
    Extract the following entities from the text:
    - Candidate Name
    - Email Address
    - Phone Number
    - Skills (list)
    - Education (nested structure)
    - Work Experience (nested structure)
    - Certifications (list)
    - Projects (list)
    - Languages Known (list)
    - Awards and Honors (list)
    
    Provide the results in a structured JSON format. For Work Experience and Education, 
    extract each instance (e.g., a single job or degree) as a separate entity 
    using the 'attributes' field to capture details like dates, titles, and descriptions.
""").strip()

### Examples

In [None]:
# Define a High-Quality Example (Few-Shot Prompting)
# This example covers all requested fields, demonstrating both simple lists
# and the complex, nested structures (Work Experience, Education) using attributes.
#
# The extractions are listed in the order their text appears in the example
# document, and every extraction_text is copied verbatim from it; LangExtract's
# prompt guidance asks for both so the example aligns cleanly with its source.

examples: List[lx.data.ExampleData] = [
    lx.data.ExampleData(
        text="""
        Sam Tritto | sam.tritto@example.com | 555-123-4567

        Experience
        Data Scientist at The Home Depot (2019-2023). Led ML model deployment for inventory optimization.
        Research Associate at University Y (2018-2019). Published 2 papers on NLP model efficiency.

        Education
        M.S. in Data Science, University X, Graduated 2019. Thesis on predictive analysis.

        Skills: Python, TensorFlow, SQL, PyTorch.
        Certifications: AWS Certified Data Analytics, Google Cloud Professional Data Engineer.
        Projects: Recommendation Engine (GitHub link).
        Languages: English (Fluent), Spanish (Conversational).
        Awards: Employee of the Year 2021 (THD).
        """,
        extractions=[
            # --- Simple Contact & Identifier Entities (header line) ---
            lx.data.Extraction(extraction_class="candidate_name", extraction_text="Sam Tritto"),
            lx.data.Extraction(extraction_class="email_address", extraction_text="sam.tritto@example.com"),
            lx.data.Extraction(extraction_class="phone_number", extraction_text="555-123-4567"),

            # --- Complex/Nested Entity: Work Experience (Instance 1) ---
            lx.data.Extraction(
                extraction_class="work_experience",
                extraction_text="Data Scientist at The Home Depot (2019-2023). Led ML model deployment for inventory optimization.",
                attributes={
                    "role": "Data Scientist",
                    "company": "The Home Depot",
                    "start_year": "2019",
                    "end_year": "2023",
                    "description": "Led ML model deployment for inventory optimization.",
                },
            ),
            # --- Complex/Nested Entity: Work Experience (Instance 2) ---
            lx.data.Extraction(
                extraction_class="work_experience",
                extraction_text="Research Associate at University Y (2018-2019). Published 2 papers on NLP model efficiency.",
                attributes={
                    "role": "Research Associate",
                    "company": "University Y",
                    "start_year": "2018",
                    "end_year": "2019",
                    "description": "Published 2 papers on NLP model efficiency.",
                },
            ),

            # --- Complex/Nested Entity: Education ---
            lx.data.Extraction(
                extraction_class="education",
                extraction_text="M.S. in Data Science, University X, Graduated 2019. Thesis on predictive analysis.",
                attributes={
                    "degree": "M.S. in Data Science",
                    "institution": "University X",
                    "graduation_year": "2019",
                    "description": "Thesis on predictive analysis.",
                },
            ),

            # --- List Entities (Repeat extraction_class for each item) ---
            lx.data.Extraction(extraction_class="skill", extraction_text="Python"),
            lx.data.Extraction(extraction_class="skill", extraction_text="TensorFlow"),
            lx.data.Extraction(extraction_class="skill", extraction_text="SQL"),
            lx.data.Extraction(extraction_class="skill", extraction_text="PyTorch"),

            lx.data.Extraction(extraction_class="certification", extraction_text="AWS Certified Data Analytics"),
            lx.data.Extraction(extraction_class="certification", extraction_text="Google Cloud Professional Data Engineer"),

            lx.data.Extraction(extraction_class="project", extraction_text="Recommendation Engine (GitHub link)"),

            lx.data.Extraction(extraction_class="language", extraction_text="English (Fluent)"),
            lx.data.Extraction(extraction_class="language", extraction_text="Spanish (Conversational)"),

            lx.data.Extraction(extraction_class="award", extraction_text="Employee of the Year 2021 (THD)"),
        ],
    )
]


### LangExtract

In [None]:
# Run the extraction over the resume text using the prompt and few-shot example above.
# NOTE(review): no API key is passed here -- presumably LangExtract reads
# LANGEXTRACT_API_KEY from the environment populated by load_dotenv(); confirm.
result = lx.extract(
    text_or_documents=clean_resume_text,
    prompt_description=prompt,
    examples=examples,
    model_id=MODEL_NAME,
    extraction_passes=2,    # Improves recall through multiple passes
    max_workers=20,         # Parallel processing for speed
    max_char_buffer=1000    # Smaller contexts for better accuracy
)

In [None]:
# Save the results to a JSONL file in the current directory; the visualization
# and DataFrame cells read this same file back.
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

### Visualization

In [None]:
file_name = "visualization.html"

# Generate the visualization from the file
html_content = lx.visualize("extraction_results.jsonl")

# Write as UTF-8 explicitly: the markup can contain non-ASCII characters taken
# from the resume, and the platform default encoding may not be able to
# represent them.
with open(file_name, "w", encoding="utf-8") as f:
    if hasattr(html_content, 'data'):
        f.write(html_content.data)  # For Jupyter/Colab
    else:
        f.write(html_content)

In [None]:

def open_html_in_browser(file_path: str):
    """
    Opens a local HTML file in the default web browser.

    Args:
        file_path: The local path to the HTML file (relative or absolute).
    """
    from pathlib import Path  # local import keeps this function self-contained

    # as_uri() builds a well-formed file:// URL on every platform: exactly three
    # slashes on POSIX, forward slashes on Windows, and percent-encoding for
    # spaces and other reserved characters. It requires an absolute path.
    file_uri = Path(os.path.abspath(file_path)).as_uri()
    webbrowser.open_new_tab(file_uri)



In [None]:
# Open the generated file in your browser
open_html_in_browser(file_name)

### White Text Detector

In [None]:

def detect_white_text(pdf_file_path_or_bytes: str | bytes) -> dict:
    """
    Scans a PDF document for text spans where the foreground color is white
    (0xFFFFFF), indicating potentially hidden text against a white background.

    Args:
        pdf_file_path_or_bytes: The file path (str) or file bytes (bytes)
                                of the PDF resume.

    Returns:
        A dictionary containing a list of findings and a summary flag.
    """
    findings = []
    
    # Handle both file paths (for local testing) and bytes (for Streamlit upload)
    try:
        if isinstance(pdf_file_path_or_bytes, bytes):
            # Open from memory (e.g., Streamlit uploaded file)
            doc = fitz.open(stream=pdf_file_path_or_bytes, filetype="pdf")
        else:
            # Open from file path
            doc = fitz.open(pdf_file_path_or_bytes)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return {"flagged": False, "findings": [], "error": str(e)}

    for page_num, page in enumerate(doc):
        # Use 'dict' output for maximum detail, down to the span level
        text_data = page.get_text('dict')
        
        for block in text_data.get('blocks', []):
            if 'lines' in block:
                for line in block['lines']:
                    for span in line['spans']:
                        # PyMuPDF stores color as a 24-bit integer
                        span_color = span.get('color')
                        
                        # Check if the text color is white
                        if span_color == WHITE_COLOR_INT:
                            # Extract the text and its bounding box for reporting
                            hidden_text = span['text'].strip()
                            bbox = span['bbox'] # (x0, y0, x1, y1)
                            
                            if hidden_text: # Only log non-empty strings
                                findings.append({
                                    "page": page_num + 1,
                                    "text": hidden_text,
                                    "bbox": bbox,
                                    "reason": "Foreground text color is white (0xFFFFFF)."
                                })

    doc.close()

    return {
        "flagged": len(findings) > 0,
        "findings": findings,
        "total_findings": len(findings)
    }


In [None]:
# Example usage with a local file (for testing)
# NOTE(review): this scans a fixed local file rather than the object selected
# from the bucket above -- confirm both refer to the same resume.
white_text_results = detect_white_text('Resume (white text).pdf')
print(white_text_results)

### DataFrame

In [None]:
def load_lx_results_from_jsonl(filepath: str) -> list[dict]:
    """
    Read LangExtract document results from a JSON Lines (.jsonl) file.

    Every non-blank line is parsed as one JSON document. Lines that fail to
    parse are reported and skipped; a missing or unreadable file is reported
    and whatever was collected so far (possibly nothing) is returned.
    """
    documents = []
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                # Blank lines carry no document
                if not raw_line.strip():
                    continue
                try:
                    parsed = json.loads(raw_line)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON line: {e}")
                else:
                    documents.append(parsed)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")

    return documents


In [None]:

def process_lx_output_to_dataframe(lx_result: dict, file_name: str, white_text_results: dict) -> pd.DataFrame:
    """
    Takes the structured JSON output from a LangExtract (lx.extract) call 
    and converts it into a clean pandas DataFrame.

    This function handles both simple fields (like name, email, skills) and 
    nested fields (like work_experience with attributes).
    
    Args:
        lx_result: The dictionary output representing the full document extraction.

    Returns:
        A pandas DataFrame where each row is a single extracted entity.
    """
    
    records = []
    
    # ---------------------------------------------------
    # Process the LangExtract data
    # ---------------------------------------------------

    # Iterate through all extracted entities
    for extraction in lx_result.get("extractions", []):
        class_name = extraction.get("extraction_class")
        text_value = extraction.get("extraction_text")
        attributes = extraction.get("attributes", {})
        
        # Start the record with basic details
        record = {
            "entity_type": class_name,
            "raw_text_value": text_value,
        }
        
        # If there are nested attributes, add them to the record.
        # This is where we flatten the nested structure.
        if attributes:
            # For complex entities, the desired value is usually in the attributes
            # rather than the raw text. We merge the attributes directly.
            record.update(attributes)
        else:
            # For simple entities (Name, Email, Skills), the value is the raw_text_value itself.
            record["value"] = text_value

        records.append(record)

    # ---------------------------------------------------
    # Process the hidden white text findings
    # ---------------------------------------------------

    try:
        # parsed_json = json.loads(white_text_results)
        parsed_json = white_text_results
        findings = parsed_json.get("findings", [])
        for finding in findings:
            # Create a standardized record for each finding
            record = {
                # New entity type for the final structured finding
                "entity_type": "hidden_text_finding",
                "raw_text_value": finding.get("text", "N/A"),
                "hidden_page": finding.get("page", "N/A"),
                "hidden_reason": finding.get("reason", "N/A"),
                # Convert list bbox to string for easier display in DataFrame
                "hidden_bbox": str(finding.get("bbox", "N/A"))
            }
            records.append(record)
        print(f"Successfully parsed {len(findings)} structured hidden text findings.")
    except json.JSONDecodeError as e:
        # Handle case where LLM or mock injected malformed JSON
        print(f"‚ùå Error parsing Raw Hidden Text JSON: {e}")
        records.append({
            "entity_type": "raw_hidden_text_json_error", 
            "raw_text_value": white_text_results, 
            "error": str(e)
        })


    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(records)
    
    # Optional: Clean up and reorder columns for a clearer view
    # This ensures columns like 'role', 'company', etc., show up nicely.
    df = df.sort_values(by='entity_type').reset_index(drop=True)

    df['file_name'] = file_name 
    
    return df


In [None]:
# Reload the first saved document and flatten it, together with the white-text scan, into one table.
# NOTE(review): load_lx_results_from_jsonl returns an empty list when the file is
# missing or unreadable, in which case the [0] below raises IndexError.
lx_data = load_lx_results_from_jsonl("extraction_results.jsonl")[0]
extraction_df = process_lx_output_to_dataframe(lx_data, BLOB_NAME, white_text_results)
extraction_df

### BigQuery

In [None]:
# DataFrame.to_gbq is deprecated in recent pandas releases in favour of calling
# pandas-gbq directly; the arguments carry over unchanged.
# Rows are appended so earlier uploads are kept.
pandas_gbq.to_gbq(
    extraction_df,
    destination_table=TABLE_NAME,
    project_id=PROJECT_ID,
    if_exists="append",
)