## Overview
This notebook demonstrates a generic Resume Parser solution that:
- Supports PDF and Word document formats
- Extracts structured information (name, email, skills)
- Uses latest Google GenAI SDK (google-genai package)
- Returns data in JSON format

## Architecture
1. **Document Processing**: Extract text from PDF/Word files
2. **LLM Parsing**: Use Google's latest GenAI SDK to extract structured data
3. **Validation**: Ensure extracted data meets quality standards
4. **Evaluation**: Measure parser performance

## 1. Environment Setup & Dependencies

In [2]:
# Install required packages
!pip install -q pypdf2 python-docx google-genai python-dotenv pandas numpy scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Import required libraries
import os
import json
import re
from typing import Dict, List, Optional, Union
from pathlib import Path
from dataclasses import dataclass, asdict
import warnings
warnings.filterwarnings('ignore')

# Document processing
import PyPDF2
from docx import Document

# LLM and API - Using latest Google GenAI SDK
from google import genai
from dotenv import load_dotenv

# Data processing and evaluation
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Load environment variables
load_dotenv()

print("Dependencies loaded successfully!")
print("Using latest Google GenAI SDK")

Dependencies loaded successfully!
Using latest Google GenAI SDK


## 2. Data Models & Configuration

In [3]:
@dataclass
class ResumeData:
    """Data model for parsed resume information"""
    name: str
    email: str
    skills: List[str]
    
    def to_json(self) -> str:
        """Convert to JSON string"""
        return json.dumps(asdict(self), indent=2)
    
    def validate(self) -> bool:
        """Validate extracted data"""
        # Basic email validation
        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        email_valid = bool(re.match(email_pattern, self.email)) if self.email else False
        
        # Name validation (at least 2 characters)
        name_valid = len(self.name) >= 2 if self.name else False
        
        # Skills validation (at least one skill)
        skills_valid = len(self.skills) > 0
        
        return email_valid and name_valid and skills_valid

class Config:
    """Configuration settings"""
    # API Configuration - Using GOOGLE_API_KEY for latest SDK
    GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', os.getenv('GEMINI_API_KEY', ''))
    MODEL_NAME = 'gemini-2.0-flash-exp'
    
    # Parsing Configuration
    MAX_RETRIES = 3
    TEMPERATURE = 0.1  # Low temperature for consistent parsing,
    
    # File paths
    SAMPLE_DIR = Path('sample_resumes')
    
config = Config()
print(f"Configuration loaded. Using model: {config.MODEL_NAME}")
print(f"API Key configured: {'Yes' if config.GOOGLE_API_KEY else 'No (will use fallback parser)'}")

Configuration loaded. Using model: gemini-2.0-flash-exp
API Key configured: Yes


## 3. Document Text Extraction

In [4]:
class DocumentExtractor:
    """Extract text from various document formats"""
    
    @staticmethod
    def extract_from_pdf(file_path: Union[str, Path]) -> str:
        """
        Extract text from PDF file
        
        Args:
            file_path: Path to PDF file
            
        Returns:
            Extracted text as string
        """
        try:
            text = ""
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                num_pages = len(pdf_reader.pages)
                
                for page_num in range(num_pages):
                    page = pdf_reader.pages[page_num]
                    text += page.extract_text()
                    
            return text.strip()
        except Exception as e:
            print(f"Error extracting PDF {file_path}: {str(e)}")
            return ""
    
    @staticmethod
    def extract_from_docx(file_path: Union[str, Path]) -> str:
        """
        Extract text from Word document
        
        Args:
            file_path: Path to Word document
            
        Returns:
            Extracted text as string
        """
        try:
            doc = Document(file_path)
            text = ""
            
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
                
            # Also extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        text += cell.text + " "
                    text += "\n"
                    
            return text.strip()
        except Exception as e:
            print(f"Error extracting Word document {file_path}: {str(e)}")
            return ""
    
    @staticmethod
    def extract_text(file_path: Union[str, Path]) -> str:
        """
        Extract text from file based on extension
        
        Args:
            file_path: Path to document
            
        Returns:
            Extracted text as string
        """
        file_path = Path(file_path)
        
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        
        extension = file_path.suffix.lower()
        
        if extension == '.pdf':
            return DocumentExtractor.extract_from_pdf(file_path)
        elif extension in ['.docx', '.doc']:
            return DocumentExtractor.extract_from_docx(file_path)
        else:
            raise ValueError(f"Unsupported file format: {extension}")

# Test the extractor
extractor = DocumentExtractor()
print("Document extractor initialized successfully!")

Document extractor initialized successfully!


## 4. Gemini LLM-Based Resume Parser

In [5]:
class GeminiResumeParser:
    """Parse resume text using latest Google GenAI SDK"""
    
    def __init__(self, api_key: str = None, model: str = 'gemini-2.0-flash-exp'):
        """
        Initialize the Gemini parser with latest SDK
        
        Args:
            api_key: Google API key (or use environment variable)
            model: Model name to use
        """
        self.api_key = api_key or config.GOOGLE_API_KEY
        self.model_name = model
        self.client = None
        
        if self.api_key:
            # Set API key in environment for the SDK
            os.environ['GOOGLE_API_KEY'] = self.api_key
            # Initialize the client with latest SDK pattern
            self.client = genai.Client()
            print(f"Google GenAI client initialized with model: {self.model_name}")
        else:
            print("No API key provided. Will use fallback parser.")
    
    def create_prompt(self, resume_text: str) -> str:
        """
        Create a structured prompt for Gemini
        
        Args:
            resume_text: Raw resume text
            
        Returns:
            Formatted prompt string
        """
        prompt = f"""You are a resume parsing expert. Extract the following information from the resume text below:

1. Full name of the candidate (not job titles)
2. Email address
3. List of technical and professional skills

Return the information in this EXACT JSON format only:
{{
    "name": "Full Name",
    "email": "email@example.com",
    "skills": ["skill1", "skill2", "skill3"]
}}

Important guidelines:
- Extract the actual person's name, not their job title or position
- Only include valid email addresses with @ symbol
- For skills, include programming languages, frameworks, tools, technologies, and relevant soft skills
- If any field is not found, use empty string for name/email or empty list for skills
- Return ONLY the JSON object, no additional text, no markdown formatting
- Do not include ```json or ``` markers

Resume text:
{resume_text[:3000]}  
"""
        return prompt
    
    def parse_with_gemini(self, resume_text: str) -> Optional[ResumeData]:
        """
        Parse resume using latest Google GenAI SDK
        
        Args:
            resume_text: Raw resume text
            
        Returns:
            ResumeData object or None if parsing fails
        """
        if not self.client:
            print("GenAI client not configured. Using fallback parser.")
            return self.fallback_parser(resume_text)
        
        try:
            prompt = self.create_prompt(resume_text)
            
            # Use the latest SDK pattern from pmi-logistics
            response = self.client.models.generate_content(
                model=self.model_name,
                contents=prompt,
                config={
                    "temperature": config.TEMPERATURE,
                    "max_output_tokens": 500,
                    "top_p": 0.9,
                    "top_k": 1
                }
            )
            
            # Extract text from response
            response_text = response.text.strip() if hasattr(response, 'text') else str(response).strip()
            
            # Clean up response text to extract JSON
            # Remove markdown code blocks if present
            if '```json' in response_text:
                response_text = response_text.split('```json')[1].split('```')[0].strip()
            elif '```' in response_text:
                response_text = response_text.split('```')[1].split('```')[0].strip()
            
            # Parse JSON
            data = json.loads(response_text)
            
            return ResumeData(
                name=data.get('name', ''),
                email=data.get('email', ''),
                skills=data.get('skills', [])
            )
            
        except Exception as e:
            print(f"Gemini parsing error: {str(e)}")
            print("Falling back to regex parser...")
            return self.fallback_parser(resume_text)
    
    def fallback_parser(self, resume_text: str) -> ResumeData:
        """
        Fallback regex-based parser for when Gemini is unavailable
        
        Args:
            resume_text: Raw resume text
            
        Returns:
            ResumeData object with extracted information
        """
        # Extract email
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails = re.findall(email_pattern, resume_text)
        email = emails[0] if emails else ''
        
        # Extract name (heuristic: first line or line before email)
        lines = resume_text.split('\n')
        name = ''
        for i, line in enumerate(lines[:10]):  # Check first 10 lines
            line = line.strip()
            if line and not any(char.isdigit() for char in line[:5]):
                # Likely a name if it's not empty and doesn't start with numbers
                if len(line.split()) <= 4:  # Names usually have 2-4 parts
                    name = line
                    break
        
        # Extract skills (common programming languages and tools)
        skill_keywords = [
            'Python', 'Java', 'JavaScript', 'TypeScript', 'C++', 'C#', 'Ruby', 'Go', 'Rust', 'Swift', 'Kotlin',
            'HTML', 'CSS', 'SQL', 'NoSQL', 'MongoDB', 'PostgreSQL', 'MySQL', 'Redis', 'Elasticsearch',
            'React', 'Angular', 'Vue.js', 'Node.js', 'Express', 'Django', 'Flask', 'FastAPI', 'Spring Boot',
            'Docker', 'Kubernetes', 'AWS', 'Azure', 'GCP', 'Terraform', 'Ansible',
            'Machine Learning', 'Deep Learning', 'Data Science', 'AI', 'NLP', 'Computer Vision',
            'Git', 'Jenkins', 'CI/CD', 'Agile', 'Scrum', 'REST API', 'GraphQL',
            'TensorFlow', 'PyTorch', 'Scikit-learn', 'Pandas', 'NumPy',
            'Linux', 'Bash', 'PowerShell', 'Microservices', 'DevOps'
        ]
        
        skills = []
        resume_lower = resume_text.lower()
        for skill in skill_keywords:
            if skill.lower() in resume_lower:
                skills.append(skill)
        
        return ResumeData(name=name, email=email, skills=skills)

# Initialize parser
parser = GeminiResumeParser()
print("Resume Parser initialized with latest Google GenAI SDK!")

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


Google GenAI client initialized with model: gemini-2.0-flash-exp
Resume Parser initialized with latest Google GenAI SDK!


## 5. Main Resume Parser Pipeline

In [6]:
class ResumeParser:
    """Main resume parser combining all components"""
    
    def __init__(self):
        self.extractor = DocumentExtractor()
        self.parser = GeminiResumeParser()
        self.results = []
    
    def parse_resume(self, file_path: Union[str, Path]) -> Dict:
        """
        Parse a single resume file
        
        Args:
            file_path: Path to resume file
            
        Returns:
            Dictionary with parsed data and metadata
        """
        file_path = Path(file_path)
        result = {
            'file_name': file_path.name,
            'file_path': str(file_path),
            'success': False,
            'data': None,
            'error': None
        }
        
        try:
            # Extract text
            print(f"Processing: {file_path.name}")
            text = self.extractor.extract_text(file_path)
            
            if not text:
                raise ValueError("No text extracted from document")
            
            # Parse with Gemini
            resume_data = self.parser.parse_with_gemini(text)
            
            if resume_data and resume_data.validate():
                result['success'] = True
                result['data'] = asdict(resume_data)
                print(f"✓ Successfully parsed: {file_path.name}")
            else:
                result['error'] = "Validation failed"
                if resume_data:
                    result['data'] = asdict(resume_data)  # Store partial data
                print(f"✗ Validation failed: {file_path.name}")
                
        except Exception as e:
            result['error'] = str(e)
            print(f"✗ Error processing {file_path.name}: {str(e)}")
        
        self.results.append(result)
        return result
    
    def parse_directory(self, directory: Union[str, Path]) -> List[Dict]:
        """
        Parse all resumes in a directory
        
        Args:
            directory: Path to directory containing resumes
            
        Returns:
            List of parsing results
        """
        directory = Path(directory)
        results = []
        
        # Find all PDF and Word files
        files = list(directory.glob('*.pdf')) + list(directory.glob('*.docx'))
        
        print(f"Found {len(files)} resume files in {directory}\n")
        
        for file_path in files:
            result = self.parse_resume(file_path)
            results.append(result)
            print()
        
        return results
    
    def get_statistics(self) -> Dict:
        """
        Get parsing statistics
        
        Returns:
            Dictionary with statistics
        """
        total = len(self.results)
        successful = sum(1 for r in self.results if r['success'])
        
        return {
            'total_files': total,
            'successful': successful,
            'failed': total - successful,
            'success_rate': (successful / total * 100) if total > 0 else 0
        }

# Initialize main parser
main_parser = ResumeParser()
print("Main Resume Parser initialized!")

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


Google GenAI client initialized with model: gemini-2.0-flash-exp
Main Resume Parser initialized!


## 6. Testing with sample resumes

In [7]:
# Parse all resumes in the sample directory
print("Parsing resumes from sample_resumes directory...\n")
print("="*50)

if config.SAMPLE_DIR.exists():
    results = main_parser.parse_directory(config.SAMPLE_DIR)
    
    # Display results
    print("="*50)
    print("\nParsing Results Summary:")
    stats = main_parser.get_statistics()
    for key, value in stats.items():
        print(f"{key}: {value}")
else:
    print(f"Directory {config.SAMPLE_DIR} not found.")
    print("Please add resume files (PDF or DOCX) to the 'sample_resumes' directory.")

Parsing resumes from sample_resumes directory...

Found 4 resume files in sample_resumes

Processing: MathangiC_Resume.pdf
✓ Successfully parsed: MathangiC_Resume.pdf

Processing: N_Anusha_Resume.pdf
✓ Successfully parsed: N_Anusha_Resume.pdf

Processing: dan_bulldog.pdf
✓ Successfully parsed: dan_bulldog.pdf

Processing: RahulRam_Chandrasekaran-Resume-2022.pdf
✓ Successfully parsed: RahulRam_Chandrasekaran-Resume-2022.pdf


Parsing Results Summary:
total_files: 4
successful: 4
failed: 0
success_rate: 100.0


## 7. Evaluation Metrics & Performance Analysis

In [8]:
class ParserEvaluator:
    """Evaluate parser performance"""
    
    def __init__(self, results: List[Dict]):
        self.results = results
        self.df = self._create_dataframe()
    
    def _create_dataframe(self) -> pd.DataFrame:
        """Convert results to DataFrame for analysis"""
        data = []
        for result in self.results:
            row = {
                'file_name': result['file_name'],
                'success': result['success'],
                'has_name': False,
                'has_email': False,
                'has_skills': False,
                'num_skills': 0,
                'error': result.get('error', '')
            }
            
            if result['data']:
                row['has_name'] = bool(result['data'].get('name'))
                row['has_email'] = bool(result['data'].get('email'))
                skills = result['data'].get('skills', [])
                row['has_skills'] = len(skills) > 0
                row['num_skills'] = len(skills)
            
            data.append(row)
        
        return pd.DataFrame(data)
    
    def get_metrics(self) -> Dict:
        """Calculate evaluation metrics"""
        if len(self.df) == 0:
            return {}
        
        metrics = {
            'total_files': len(self.df),
            'parse_success_rate': self.df['success'].mean() * 100,
            'name_extraction_rate': self.df['has_name'].mean() * 100,
            'email_extraction_rate': self.df['has_email'].mean() * 100,
            'skills_extraction_rate': self.df['has_skills'].mean() * 100,
            'avg_skills_per_resume': self.df['num_skills'].mean(),
            'max_skills_extracted': self.df['num_skills'].max(),
            'min_skills_extracted': self.df['num_skills'].min()
        }
        
        return metrics
    
    def display_report(self):
        """Display comprehensive evaluation report"""
        print("\n" + "="*60)
        print("PARSER EVALUATION REPORT")
        print("="*60)
        
        metrics = self.get_metrics()
        
        if not metrics:
            print("No data to evaluate.")
            return
        
        print("\n Overall Performance:")
        print(f"  • Total Files Processed: {metrics['total_files']}")
        print(f"  • Parse Success Rate: {metrics['parse_success_rate']:.1f}%")
        
        print("\n Field Extraction Rates:")
        print(f"  • Name Extraction: {metrics['name_extraction_rate']:.1f}%")
        print(f"  • Email Extraction: {metrics['email_extraction_rate']:.1f}%")
        print(f"  • Skills Extraction: {metrics['skills_extraction_rate']:.1f}%")
        
        print("\n Skills Analysis:")
        print(f"  • Average Skills per Resume: {metrics['avg_skills_per_resume']:.1f}")
        print(f"  • Maximum Skills Extracted: {metrics['max_skills_extracted']}")
        print(f"  • Minimum Skills Extracted: {metrics['min_skills_extracted']}")
        
        # Error analysis
        errors = self.df[self.df['success'] == False]['error'].value_counts()
        if len(errors) > 0:
            print("\n Error Analysis:")
            for error, count in errors.items():
                if error:
                    print(f"  • {error}: {count} occurrence(s)")
        
        print("\n" + "="*60)

# Evaluate results
if main_parser.results:
    evaluator = ParserEvaluator(main_parser.results)
    evaluator.display_report()
    
    # Show detailed results
    print("\n Detailed Results:")
    display_df = evaluator.df[['file_name', 'success', 'has_name', 'has_email', 'num_skills']]
    print(display_df.to_string(index=False))
else:
    print("No results to evaluate. Please run the parser first.")


PARSER EVALUATION REPORT

 Overall Performance:
  • Total Files Processed: 4
  • Parse Success Rate: 100.0%

 Field Extraction Rates:
  • Name Extraction: 100.0%
  • Email Extraction: 100.0%
  • Skills Extraction: 100.0%

 Skills Analysis:
  • Average Skills per Resume: 28.2
  • Maximum Skills Extracted: 41
  • Minimum Skills Extracted: 15


 Detailed Results:
                              file_name  success  has_name  has_email  num_skills
                   MathangiC_Resume.pdf     True      True       True          38
                    N_Anusha_Resume.pdf     True      True       True          41
                        dan_bulldog.pdf     True      True       True          15
RahulRam_Chandrasekaran-Resume-2022.pdf     True      True       True          19


## 8. API Interface & Usage Examples

In [9]:
def parse_resume_file(file_path: str) -> Dict:
    """
    Simple API function to parse a single resume
    
    Args:
        file_path: Path to resume file (PDF or DOCX)
        
    Returns:
        Dictionary with name, email, and skills
    """
    parser = ResumeParser()
    result = parser.parse_resume(file_path)
    
    if result['success']:
        return result['data']
    else:
        # Return partial data if available
        if result['data']:
            return result['data']
        else:
            return {
                'name': '',
                'email': '',
                'skills': [],
                'error': result.get('error', 'Parsing failed')
            }

print("API Usage Example:")
print("="*40)
print("\nTo parse a resume, use:")
print("result = parse_resume_file('path/to/resume.pdf')")
print("print(json.dumps(result, indent=2))")

# Demonstrate with a test if files exist
test_files = list(config.SAMPLE_DIR.glob('*.pdf')) + list(config.SAMPLE_DIR.glob('*.docx'))
if test_files:
    print(f"\nTesting with: {test_files[0].name}")
    test_result = parse_resume_file(test_files[0])
    print(json.dumps(test_result, indent=2))

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


API Usage Example:

To parse a resume, use:
result = parse_resume_file('path/to/resume.pdf')
print(json.dumps(result, indent=2))

Testing with: MathangiC_Resume.pdf
Google GenAI client initialized with model: gemini-2.0-flash-exp
Processing: MathangiC_Resume.pdf
✓ Successfully parsed: MathangiC_Resume.pdf
{
  "name": "Mathangi Chand",
  "email": "chand53@uwo.ca",
  "skills": [
    "Python",
    "SQL",
    "Java",
    "Spring",
    "JavaScript",
    "React",
    "Angular",
    "PostgreSQL",
    "DB2",
    "Salesforce CRM",
    "Splunk",
    "GCP",
    "Docker",
    "Kubernetes",
    "Git",
    "Linux",
    "Scrum",
    "Jira",
    "Agile",
    "ETL",
    "CI/CD",
    "Maven",
    "Jenkins",
    "Postman",
    "RESTful APIs",
    "HTML",
    "CSS",
    "HTTP",
    "MVC",
    "Microservices",
    "Unit Testing",
    "Hadoop",
    "Kafka",
    "Apache Spark",
    "SDLC",
    "Application Integration",
    "Data Modeling",
    "System Monitoring"
  ]
}


## 9. Best Practices & Production Considerations

In [11]:
# Best practices implementation
class ProductionResumeParser(ResumeParser):
    """
    Production-ready resume parser with additional features:
    - Caching for repeated parses
    - Rate limiting for API calls
    - Enhanced error handling
    - Logging support
    """
    
    def __init__(self):
        super().__init__()
        self.cache = {}
        self.api_calls = 0
        self.max_api_calls = 1500  # Gemini free tier limit per minute
    
    def parse_with_cache(self, file_path: Union[str, Path]) -> Dict:
        """Parse with caching support"""
        file_path = Path(file_path)
        cache_key = f"{file_path.name}_{file_path.stat().st_mtime}"
        
        if cache_key in self.cache:
            print(f"Using cached result for {file_path.name}")
            return self.cache[cache_key]
        
        result = self.parse_resume(file_path)
        self.cache[cache_key] = result
        self.api_calls += 1
        return result
    
    def validate_api_limits(self) -> bool:
        """Check if API rate limits are respected"""
        return self.api_calls < self.max_api_calls



## Best practices Considerations

* Implement caching for repeated parses: Saves time and resources by storing results of previous API calls.

* Add rate limiting for API calls (Gemini: 1500 req/min free tier): Prevents exceeding API limits and ensures fair usage.

* Implement comprehensive error handling: Makes the application robust by gracefully managing unexpected issues.

* Add logging for debugging: Helps developers track the application's behavior and diagnose problems.

* Validate and sanitize all inputs: Crucial for security and preventing malicious data from being processed.

* Consider async processing for bulk operations: Improves performance by allowing simultaneous API calls.

* Implement retry logic for API failures: Ensures reliability by automatically retrying failed requests.

* Add monitoring and metrics collection: Provides insights into the application's health and performance.