# Resume Extractor with Instructor

This notebook demonstrates extracting structured data from PDF resumes using the Instructor library with a LiteLLM proxy.

## Setup Path

Add the backend directory to the Python path for proper imports. This assumes the notebook is run from a directory directly inside `backend`.

In [None]:
import sys
from pathlib import Path

# Add the backend directory to the Python path so that `src.*` imports resolve.
# NOTE: this assumes the current working directory sits directly under backend/.
backend_dir = Path.cwd().parent
backend_path = str(backend_dir)
# Guard the insert: re-running this cell must not stack duplicate entries.
if backend_path not in sys.path:
    sys.path.insert(0, backend_path)

## Configure Proxy Settings

Ensure localhost connections bypass proxy settings.

In [None]:
import os

# Exempt loopback addresses from proxying; both the upper-case and the
# lower-case spelling of the variable are set to the same host list.
for _proxy_var in ('NO_PROXY', 'no_proxy'):
    os.environ[_proxy_var] = 'localhost,127.0.0.1'

## Import and Setup Instructor with LiteLLM

Configure Instructor to work with your LiteLLM proxy using the OpenAI client wrapper.

For the `.env` file, use the template in `.env.example` under the `backend` directory.

- `LITELLM_BASE_URL`: Should be `http://127.0.0.1:4000` (use 127.0.0.1, not 0.0.0.0)
- `LITELLM_API_KEY`: Virtual key from LiteLLM UI at http://127.0.0.1:4000/ui

In [None]:
import instructor
from openai import AsyncOpenAI
from pathlib import Path
from dotenv import load_dotenv
from typing import List, Optional
from pydantic import BaseModel, Field

# Load .env from the parent (backend) directory
env_path = Path.cwd().parent / '.env'
load_dotenv(env_path)

# Both settings are required: fail fast with an explicit message when either
# one is missing or empty.
base_url = os.environ.get('LITELLM_BASE_URL')
api_key = os.environ.get('LITELLM_API_KEY')

if not base_url:
    raise ValueError("LITELLM_BASE_URL not found in environment variables")
if not api_key:
    raise ValueError("LITELLM_API_KEY not found in environment variables")

# Add /v1 to the base URL for OpenAI API compatibility.
# Trailing slashes are dropped first; otherwise "http://host:4000/" would
# become "http://host:4000//v1" and "http://host:4000/v1/" would become
# "http://host:4000/v1//v1".
base_url = base_url.rstrip('/')
if not base_url.endswith('/v1'):
    base_url = f"{base_url}/v1"

print(f"Connecting to LiteLLM at: {base_url}")

# Build the async OpenAI-compatible client that targets the LiteLLM proxy,
# then hand it to Instructor; `client` is what the extraction cells call.
_proxy_client = AsyncOpenAI(base_url=base_url, api_key=api_key)
client = instructor.from_openai(_proxy_client)

## Define Extraction Schemas

Import the existing EducationLLMSchema and create a wrapper for multiple education entries.

In [None]:
from src.schemas.llm.education import EducationLLMSchema

# Response model for "extract everything" requests: it wraps a list of
# EducationLLMSchema items so that one call can return several entries, plus
# an optional free-text note.
# NOTE(review): pydantic normally exposes the class docstring and the Field
# descriptions in the generated JSON schema, so they presumably reach the
# model as prompt text -- confirm before rewording them.
class EducationList(BaseModel):
    """Wrapper for extracting multiple education experiences from a resume."""
    
    # Defaults to an empty list when no entries are returned.
    education_entries: List[EducationLLMSchema] = Field(
        default_factory=list,
        description="List of all education experiences found in the resume"
    )
    
    # Optional free-text remarks; None when nothing is provided.
    extraction_notes: Optional[str] = Field(
        None,
        description="Any notes or observations about the extraction process"
    )

## Extract Single Education Entry

Simple extraction of a single education experience using Instructor's clean API.

In [None]:
from instructor.multimodal import PDF

# Extract single education entry
single_education = await client.chat.completions.create(
    model='gpt-4o-mini',  # or any model available in your LiteLLM proxy
    response_model=EducationLLMSchema,
    messages=[
        {
            "role": "user",
            "content": [
                "Extract the most recent education experience from this resume.",
                PDF.from_path("./files/sample1.pdf")
            ]
        }
    ]
)

# Display the extracted data
print("Single Education Entry:")
print(single_education.model_dump_json(indent=2))

## Extract Multiple Education Entries

Extract all education experiences from the resume.

In [None]:
# Extract all education entries
all_education = await client.chat.completions.create(
    model='gpt-4o-mini',
    response_model=EducationList,
    messages=[
        {
            "role": "user",
            "content": [
                "Extract ALL education experiences from this resume. Include degrees, certifications, and relevant training programs.",
                PDF.from_path("./files/sample1.pdf")
            ]
        }
    ]
)

# Display all extracted education entries
print(f"Found {len(all_education.education_entries)} education entries:\n")
for i, edu in enumerate(all_education.education_entries, 1):
    print(f"Entry {i}:")
    print(f"  Institution: {edu.institution_name}")
    print(f"  Degree: {edu.degree}")
    print(f"  Field: {edu.field_of_study}")
    print(f"  Duration: {edu.start_date} - {edu.end_date}")
    if edu.gpa and edu.max_gpa:
        print(f"  GPA: {edu.gpa}/{edu.max_gpa}")
    print()

## Extract from URL

Instructor also supports extracting from PDFs hosted at URLs.

In [None]:
# Example with URL (replace with actual URL)
# pdf_url = "https://example.com/resume.pdf"
# 
# education_from_url = await client.chat.completions.create(
#     model='gpt-4o-mini',
#     response_model=EducationLLMSchema,
#     messages=[
#         {
#             "role": "user",
#             "content": [
#                 "Extract the education information from this resume.",
#                 PDF.from_url(pdf_url)
#             ]
#         }
#     ]
# )
# 
# print(education_from_url.model_dump_json(indent=2))

## Error Handling and Retries

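Instructor provides built-in retry logic for failed extractions via the `max_retries` argument. The example below additionally wraps the call in a `tenacity` retry policy, which re-runs the whole request when any exception is raised.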

In [None]:
from tenacity import retry, stop_after_attempt, wait_exponential

# Configure custom retry logic
@retry(
    wait=wait_exponential(multiplier=1, min=4, max=10),
    stop=stop_after_attempt(3),
)
async def extract_with_retries(pdf_path: str) -> EducationList:
    """Extract all education entries from the PDF at ``pdf_path``.

    Two retry layers are configured: the tenacity decorator re-invokes this
    coroutine (stopping after 3 attempts, with an exponential wait configured
    with ``min=4`` and ``max=10``), and ``max_retries=2`` is forwarded to
    Instructor on every attempt.
    """
    user_message = {
        "role": "user",
        "content": [
            "Extract all education experiences from this resume.",
            PDF.from_path(pdf_path),
        ],
    }
    return await client.chat.completions.create(
        model='gpt-4o-mini',
        response_model=EducationList,
        messages=[user_message],
        max_retries=2,  # Instructor's own retries, separate from the decorator
    )

# Use the function with retries
# result = await extract_with_retries("./files/sample1.pdf")
# print(f"Successfully extracted {len(result.education_entries)} entries")

## Check Extraction Completeness

Use the completeness calculation from your base schema.

In [None]:
# Report how complete the single extracted entry is
if single_education:
    completeness = single_education.calculate_completeness()
    print(f"Data completeness: {completeness:.1f}%")

    # Keep the schema field names whose extracted value is None (or absent)
    missing_fields = list(
        filter(
            lambda name: getattr(single_education, name, None) is None,
            EducationLLMSchema.model_fields,
        )
    )
    if missing_fields:
        print(f"Missing fields: {', '.join(missing_fields)}")

## Batch Processing Multiple PDFs

Process multiple resume PDFs efficiently.

In [None]:
import asyncio
from typing import Dict, List

async def process_resume(pdf_path: Path) -> Dict:
    """Extract education data from one resume PDF.

    Never raises for extraction problems: any exception is caught and
    reported in the returned dict instead.

    Returns:
        On success: ``file``, ``success`` (True), ``data`` (the EducationList)
        and ``entry_count``. On failure: ``file``, ``success`` (False) and
        ``error`` (the exception text).
    """
    try:
        create = client.chat.completions.create
        user_message = {
            "role": "user",
            "content": [
                "Extract all education information from this resume.",
                PDF.from_path(str(pdf_path)),
            ],
        }
        extraction = await create(
            model='gpt-4o-mini',
            response_model=EducationList,
            messages=[user_message],
        )
        return {
            "file": pdf_path.name,
            "success": True,
            "data": extraction,
            "entry_count": len(extraction.education_entries),
        }
    except Exception as exc:
        # Best-effort by design: one bad file is reported, not raised.
        return {
            "file": pdf_path.name,
            "success": False,
            "error": str(exc),
        }

# Process multiple PDFs concurrently
async def batch_process_resumes(
    pdf_dir: Path,
    max_concurrency: Optional[int] = None,
) -> List[Dict]:
    """Process every ``*.pdf`` file located directly inside ``pdf_dir``.

    Args:
        pdf_dir: Directory to scan (non-recursive).
        max_concurrency: Upper bound on the number of extractions in flight
            at once. ``None`` (the default) starts all of them together.

    Returns:
        One result dict per PDF, as produced by ``process_resume``, in
        sorted path order.

    Raises:
        ValueError: If ``max_concurrency`` is given and is less than 1.
    """
    if max_concurrency is not None and max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Sort so that the result order does not depend on filesystem listing order.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")

    if max_concurrency is None:
        jobs = [process_resume(pdf) for pdf in pdf_files]
    else:
        # Cap the number of simultaneous requests sent to the proxy.
        gate = asyncio.Semaphore(max_concurrency)

        async def _bounded(pdf: Path) -> Dict:
            async with gate:
                return await process_resume(pdf)

        jobs = [_bounded(pdf) for pdf in pdf_files]

    # gather() preserves input order, so results line up with pdf_files.
    results = await asyncio.gather(*jobs)

    # Summary
    successful = sum(1 for r in results if r["success"])
    print(f"\nProcessing complete: {successful}/{len(pdf_files)} successful")

    return results

# Example usage (uncomment to run)
# results = await batch_process_resumes(Path("./files"))
# for result in results:
#     if result["success"]:
#         print(f"{result['file']}: {result['entry_count']} education entries")
#     else:
#         print(f"{result['file']}: Failed - {result['error']}")

## Key Advantages of Using Instructor

1. **Simplified Code**: No manual file upload/deletion or complex parsing
2. **Type Safety**: Direct Pydantic model returns with validation
3. **Provider Agnostic**: Same code works with OpenAI, Anthropic, or any LiteLLM-supported model
4. **Built-in Retries**: Automatic retry logic for failed extractions
5. **Clean API**: Intuitive interface with `response_model` parameter
6. **No File Management**: No need to track file IDs or clean up uploads
7. **Flexible Input**: Support for local files, URLs, and base64 strings