In [1]:
import json
import os
import re
from typing import List, Optional, TypeVar, Type

import PyPDF2
from kamiwaza_client import KamiwazaClient
from pydantic import BaseModel, Field

# Initialize Kamiwaza client
# The API endpoint can be overridden with the KAMIWAZA_API_URL environment
# variable; the default is the address this notebook was originally run against.
KAMIWAZA_API_URL = os.environ.get("KAMIWAZA_API_URL", "http://34.230.49.204:7777/api/")
client = KamiwazaClient(KAMIWAZA_API_URL)
# The return value is discarded (and, not being the last expression of the cell,
# is not displayed either). NOTE(review): presumably kept as a connectivity /
# deployment sanity check -- confirm, or move it to its own cell to see the list.
client.serving.list_active_deployments()

# Get OpenAI client for Qwen3
# `openai_client` is the module-level handle that parse_resume() passes on to the
# qwen_structured helpers.
openai_client = client.openai.get_client('Qwen3-32B-AWQ')


In [2]:
# Define structured output models using Pydantic
# Contact block of a resume. Only `full_name` is required; every other field
# defaults to None when the resume does not provide it.
# The docstring and Field descriptions are emitted by model_json_schema(), and the
# qwen_structured helpers paste that schema into the system prompt, so they act
# as instructions to the model -- treat them as prompt text, not just comments.
class ContactInfo(BaseModel):
    """Contact information extracted from resume"""
    full_name: str = Field(..., description="Full name of the candidate")
    email: Optional[str] = Field(None, description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    linkedin: Optional[str] = Field(None, description="LinkedIn profile URL")
    github: Optional[str] = Field(None, description="GitHub profile URL")
    location: Optional[str] = Field(None, description="City, State/Country")

# One entry of the education section. `institution` and `degree` are required;
# a response that omits either fails validation for the whole resume.
# Years are integers and GPA is a float, so free-form values such as
# "Fall 2019" are not representable in these fields.
# The docstring and descriptions feed the JSON schema sent to the model.
class Education(BaseModel):
    """Educational background entry"""
    institution: str = Field(..., description="Name of educational institution")
    degree: str = Field(..., description="Degree obtained (e.g., Bachelor of Science)")
    field_of_study: Optional[str] = Field(None, description="Major or field of study")
    start_year: Optional[int] = Field(None, description="Year started")
    end_year: Optional[int] = Field(None, description="Year completed or expected")
    gpa: Optional[float] = Field(None, description="GPA if mentioned")
    # default_factory gives every instance its own empty list.
    achievements: List[str] = Field(default_factory=list, description="Notable achievements, honors, or relevant coursework")

# One entry of the work-experience section. `company` and `position` are
# required; everything else is optional or defaults to an empty list.
# Dates are kept as free-form strings (no parsing or normalisation happens here),
# which is what allows values like 'Present' in `end_date`.
# The docstring and descriptions feed the JSON schema sent to the model.
class WorkExperience(BaseModel):
    """Professional work experience entry"""
    company: str = Field(..., description="Company name")
    position: str = Field(..., description="Job title/position")
    location: Optional[str] = Field(None, description="Job location")
    start_date: Optional[str] = Field(None, description="Start date (any format)")
    end_date: Optional[str] = Field(None, description="End date or 'Present'")
    responsibilities: List[str] = Field(default_factory=list, description="Key responsibilities and achievements")
    technologies: List[str] = Field(default_factory=list, description="Technologies/tools used")

# A named group of skills (one category plus its members). Both fields are
# required, so a category without a `skills` list fails validation.
# The docstring and descriptions feed the JSON schema sent to the model.
class Skill(BaseModel):
    """Technical or professional skill"""
    category: str = Field(..., description="Skill category (e.g., Programming Languages, Frameworks)")
    skills: List[str] = Field(..., description="List of specific skills in this category")

# One entry of the projects section. `name` and `description` are required;
# `url` defaults to None and the list fields default to empty lists.
# The docstring and descriptions feed the JSON schema sent to the model.
class Project(BaseModel):
    """Personal or professional project"""
    name: str = Field(..., description="Project name")
    description: str = Field(..., description="Brief project description")
    technologies: List[str] = Field(default_factory=list, description="Technologies used")
    url: Optional[str] = Field(None, description="Project URL if available")
    highlights: List[str] = Field(default_factory=list, description="Key achievements or features")

# Top-level model: this is the class parse_resume() hands to the qwen_structured
# helpers as `response_format`, and the type display_resume() expects.
# `contact` has no default and is therefore required; `summary` defaults to None
# and every list section defaults to an empty list, so a sparse resume still
# validates as long as the contact block (with a full name) is present.
# The docstring and descriptions feed the JSON schema sent to the model.
class StructuredResume(BaseModel):
    """Complete structured representation of a resume"""
    contact: ContactInfo
    summary: Optional[str] = Field(None, description="Professional summary or objective")
    education: List[Education] = Field(default_factory=list, description="Educational background")
    experience: List[WorkExperience] = Field(default_factory=list, description="Work experience")
    skills: List[Skill] = Field(default_factory=list, description="Technical and professional skills")
    projects: List[Project] = Field(default_factory=list, description="Notable projects")
    certifications: List[str] = Field(default_factory=list, description="Professional certifications")
    languages: List[str] = Field(default_factory=list, description="Languages spoken")
    interests: List[str] = Field(default_factory=list, description="Personal interests or hobbies")

In [3]:
# Helper functions for Qwen3 structured output
T = TypeVar('T')


def _strip_code_fence(text: str) -> str:
    """Return the payload of the first Markdown code fence in *text*, or *text* unchanged."""
    if '```json' in text:
        return text.split('```json')[1].split('```')[0].strip()
    if text.startswith('```'):
        # Everything between the opening fence and the next one.
        return text.split('```')[1].strip()
    return text


def _iter_json_objects(text: str):
    """Yield each JSON object that can be decoded starting at a '{' in *text*, left to right."""
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            candidate, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a decodable object at this brace; try the next one.
            start = text.find('{', start + 1)
            continue
        yield candidate
        # Resume after the decoded object so its nested objects are not re-yielded.
        start = text.find('{', end)


def _parse_structured_json(raw_text: str, response_format: Type[T]) -> T:
    """Turn raw model output into a *response_format* instance or raise ValueError."""
    json_text = _strip_code_fence(raw_text.strip())
    try:
        return response_format(**json.loads(json_text))
    except (ValueError, TypeError) as first_error:
        # ValueError covers both json.JSONDecodeError and validation failures;
        # TypeError covers a payload that is valid JSON but not an object.
        # Fall back to scanning for an embedded object that validates.
        for candidate in _iter_json_objects(json_text):
            try:
                return response_format(**candidate)
            except (ValueError, TypeError):
                continue
        raise ValueError(
            f"Failed to parse JSON response: {first_error}\nRaw response: {json_text}"
        ) from first_error


def qwen_structured(client, prompt: str, response_format: Type[T], model="model", enable_thinking=True) -> T:
    """
    Helper to get structured output from Qwen3.

    Args:
        client: OpenAI client instance (from KamiwazaClient)
        prompt: Your user prompt
        response_format: Pydantic model class; its JSON schema is embedded in
            the system prompt and the reply is validated against it
        model: Model name (default "model")
        enable_thinking: Whether to enable thinking mode

    Returns:
        Instance of your Pydantic model

    Raises:
        ValueError: if no JSON object in the reply can be parsed and validated.
    """
    # Get JSON schema
    schema = response_format.model_json_schema()

    # Create system message with clear instructions
    system_message = f"""You are an expert resume parser. Extract structured information from the resume text provided.
Be thorough and capture all relevant details. If information is not present, leave those fields as null or empty arrays.

Think through the extraction step by step, then provide your final answer as valid JSON matching this exact schema:

{json.dumps(schema, indent=2)}

Put ONLY the JSON in your final response, no explanatory text."""

    # Always send the flag explicitly: an empty extra_body leaves the decision to
    # the server default, so enable_thinking=False previously had no effect.
    # NOTE(review): some OpenAI-compatible servers read this switch from
    # chat_template_kwargs instead of the top level -- confirm for this backend.
    extra_body = {"enable_thinking": bool(enable_thinking)}

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt}
        ],
        temperature=0.1,  # Low temperature for consistent extraction
        extra_body=extra_body
    )

    message = response.choices[0].message
    content = message.content or ""
    reasoning_content = getattr(message, 'reasoning_content', '') or ""

    # Prefer the final answer; fall back to the reasoning text when it is empty
    return _parse_structured_json(content.strip() or reasoning_content.strip(), response_format)


# Streaming version for seeing the thinking process
def qwen_structured_stream(client, prompt: str, response_format: Type[T], model="model") -> T:
    """
    Streaming version that shows thinking process.

    Reasoning tokens are echoed in cyan and answer tokens in the default colour
    as they arrive; the collected answer is then parsed exactly like in
    qwen_structured (code-fence stripping plus embedded-object fallback).

    Args:
        client: OpenAI client instance (from KamiwazaClient)
        prompt: Your user prompt
        response_format: Pydantic model class
        model: Model name (default "model")

    Returns:
        Instance of your Pydantic model

    Raises:
        ValueError: if no JSON object in the reply can be parsed and validated
            (the original decode/validation error is chained as the cause).
    """
    schema = response_format.model_json_schema()
    system_message = f"""You are an expert resume parser. Think through the extraction step by step, then provide your final answer as valid JSON matching this exact schema:

{json.dumps(schema, indent=2)}

Put ONLY the JSON in your final response, no explanatory text."""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt}
        ],
        temperature=0.1,
        extra_body={"enable_thinking": True},
        stream=True
    )

    content_parts = []
    reasoning_parts = []

    print("🤔 Thinking process:\n")

    # Stream the response
    for chunk in response:
        # Some chunks carry no choices at all; indexing [0] on them would
        # raise IndexError, so skip them.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta

        # Show thinking in cyan
        reasoning_piece = getattr(delta, "reasoning_content", None)
        if reasoning_piece:
            print("\033[36m" + reasoning_piece + "\033[0m", end="", flush=True)
            reasoning_parts.append(reasoning_piece)

        # Show final answer in default color
        content_piece = getattr(delta, "content", None)
        if content_piece:
            print(content_piece, end="", flush=True)
            content_parts.append(content_piece)

    print("\n")  # New lines

    collected_content = "".join(content_parts)
    collected_reasoning = "".join(reasoning_parts)

    # Parse the result, preferring the final answer over the reasoning text
    return _parse_structured_json(collected_content.strip() or collected_reasoning.strip(), response_format)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text content from a PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of every readable page, each followed by a newline, in page
        order. Returns "" when the file cannot be opened or parsed at all (the
        error is printed rather than raised, so callers must check for an empty
        result).
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            for page_number, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # Guard against a page yielding None instead of a string.
                    page_text = page.extract_text() or ""
                except Exception as page_error:
                    # One unreadable page should not throw away the text that
                    # was (or can still be) extracted from the other pages.
                    print(f"Error reading PDF page {page_number}: {page_error}")
                    continue
                page_texts.append(page_text + "\n")

    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

    return "".join(page_texts)

# Function to clean and prepare text for processing
def clean_text(text: str) -> str:
    """Clean extracted text for better processing.

    The text is flattened: NUL bytes are dropped, every run of whitespace
    (including the original line breaks) becomes a single space, and a line
    break is then re-inserted after each '.', '!' or '?' that is followed by
    whitespace. Leading and trailing whitespace is stripped.

    Note that the original line layout is NOT preserved, and that abbreviations
    ending in a period followed by a space (e.g. "B.S. in ...") are split too.
    """
    # Drop NUL bytes first: they are not whitespace, so removing them after the
    # collapse below would leave a double space where "a \x00 b" used to be.
    text = text.replace('\x00', '')
    # Collapse all whitespace runs (spaces, tabs, newlines) to one space
    text = re.sub(r'\s+', ' ', text)
    # Start a new line after sentence-ending punctuation
    text = re.sub(r'(?<=[.!?])\s+', '\n', text)
    return text.strip()

# Main function to parse resume
def parse_resume(pdf_path: str, use_streaming: bool = False) -> StructuredResume:
    """Parse a resume PDF and extract structured data.

    Args:
        pdf_path: Path of the resume PDF.
        use_streaming: When True, use qwen_structured_stream (echoes the model's
            reasoning while it arrives); otherwise use qwen_structured.

    Returns:
        The validated StructuredResume built from the model's reply.

    Raises:
        ValueError: if no usable text could be extracted from the PDF, or if the
            model's reply cannot be parsed (raised by the qwen_structured helpers).
    """
    print("📄 Extracting text from PDF...")
    raw_text = extract_text_from_pdf(pdf_path)

    # Check the *cleaned* text: extraction appends a newline per page, so an
    # image-only PDF yields a non-empty but blank string that would otherwise
    # slip through and send an empty resume to the model.
    cleaned_text = clean_text(raw_text) if raw_text else ""
    if not cleaned_text:
        raise ValueError("Could not extract text from PDF")

    print(f"✅ Extracted {len(cleaned_text)} characters of text\n")

    # Create the user prompt
    user_prompt = f"""Parse this resume and extract all information according to the schema:

{cleaned_text}"""

    print("🤖 Sending to Qwen3 for structured extraction...\n")

    # Both helpers take the same arguments, so pick one and call it once
    extract = qwen_structured_stream if use_streaming else qwen_structured
    return extract(
        openai_client,
        user_prompt,
        StructuredResume,
        model="model"
    )

def _format_span(start, end):
    """Return 'start - end', the single known endpoint, or None when both are missing."""
    if start and end:
        return f"{start} - {end}"
    known = start or end
    return str(known) if known else None


# Function to display structured resume nicely
def display_resume(resume: StructuredResume):
    """Pretty print the structured resume data.

    Prints one section per populated part of *resume* (contact, summary,
    education, experience, skills, projects, certifications, languages,
    interests); empty sections are skipped. Returns None.

    Date/year ranges are shown even when only one endpoint is known (e.g. a
    graduation year without a start year), and a GPA of 0.0 is still printed.
    """
    print("\n" + "="*60)
    print("📋 STRUCTURED RESUME DATA")
    print("="*60)

    # Contact Information
    contact = resume.contact
    print(f"\n👤 {contact.full_name}")
    if contact.email:
        print(f"   📧 {contact.email}")
    if contact.phone:
        print(f"   📱 {contact.phone}")
    if contact.location:
        print(f"   📍 {contact.location}")
    if contact.linkedin:
        print(f"   💼 {contact.linkedin}")
    if contact.github:
        print(f"   🐙 {contact.github}")

    # Summary
    if resume.summary:
        print(f"\n📝 SUMMARY\n{resume.summary}")

    # Education
    if resume.education:
        print("\n🎓 EDUCATION")
        for edu in resume.education:
            print(f"\n   {edu.institution}")
            print(f"   {edu.degree}" + (f" in {edu.field_of_study}" if edu.field_of_study else ""))
            years = _format_span(edu.start_year, edu.end_year)
            if years:
                print(f"   {years}")
            # `is not None` rather than truthiness so a 0.0 GPA is not hidden
            if edu.gpa is not None:
                print(f"   GPA: {edu.gpa}")
            if edu.achievements:
                print("   Achievements:")
                for achievement in edu.achievements:
                    print(f"   • {achievement}")

    # Experience
    if resume.experience:
        print("\n💼 WORK EXPERIENCE")
        for exp in resume.experience:
            print(f"\n   {exp.position} at {exp.company}")
            if exp.location:
                print(f"   {exp.location}")
            dates = _format_span(exp.start_date, exp.end_date)
            if dates:
                print(f"   {dates}")
            if exp.responsibilities:
                print("   Responsibilities:")
                for resp in exp.responsibilities:
                    print(f"   • {resp}")
            if exp.technologies:
                print(f"   Technologies: {', '.join(exp.technologies)}")

    # Skills
    if resume.skills:
        print("\n🛠️ SKILLS")
        for skill_group in resume.skills:
            print(f"\n   {skill_group.category}:")
            print(f"   {', '.join(skill_group.skills)}")

    # Projects
    if resume.projects:
        print("\n🚀 PROJECTS")
        for project in resume.projects:
            print(f"\n   {project.name}")
            print(f"   {project.description}")
            if project.technologies:
                print(f"   Technologies: {', '.join(project.technologies)}")
            if project.url:
                print(f"   URL: {project.url}")
            for highlight in project.highlights:
                print(f"   • {highlight}")

    # Other sections
    if resume.certifications:
        print("\n🏆 CERTIFICATIONS")
        for cert in resume.certifications:
            print(f"   • {cert}")

    if resume.languages:
        print("\n🗣️ LANGUAGES")
        print(f"   {', '.join(resume.languages)}")

    if resume.interests:
        print("\n🎯 INTERESTS")
        print(f"   {', '.join(resume.interests)}")



In [4]:
# Resume to parse, relative to the notebook's working directory.
pdf_path = "example_resume.pdf"

# Run the whole pipeline; streaming echoes the model's reasoning as it arrives.
structured_data = parse_resume(pdf_path, use_streaming=True)

# Human-readable rendering of the parsed resume.
display_resume(structured_data)

# You can also convert to JSON for storage or further processing
print(
    "\n\n📊 JSON Output:",
    json.dumps(structured_data.model_dump(), indent=2),
    sep="\n",
)

# Or access specific fields programmatically
print(
    "\n\n✨ Quick Summary:",
    f"Candidate: {structured_data.contact.full_name}",
    f"Total Experience: {len(structured_data.experience)} positions",
    f"Education: {len(structured_data.education)} degrees",
    f"Projects: {len(structured_data.projects)} projects",
    sep="\n",
)

2025-06-11 12:17:55,304 - httpx - INFO - HTTP Request: POST http://34.230.49.204:51105/v1/chat/completions "HTTP/1.1 200 OK"


📄 Extracting text from PDF...
✅ Extracted 1027 characters of text

🤖 Sending to Qwen3 for structured extraction...

🤔 Thinking process:

[36m
[0m[36mOkay[0m[36m,[0m[36m let[0m[36m's[0m[36m start[0m[36m parsing[0m[36m this[0m[36m resume[0m[36m.[0m[36m The[0m[36m user[0m[36m wants[0m[36m all[0m[36m information[0m[36m extracted[0m[36m according[0m[36m to[0m[36m the[0m[36m provided[0m[36m schema[0m[36m.[0m[36m First[0m[36m,[0m[36m I[0m[36m'll[0m[36m look[0m[36m at[0m[36m the[0m[36m contact[0m[36m information[0m[36m.[0m[36m The[0m[36m name[0m[36m is[0m[36m Jane[0m[36m Doe[0m[36m,[0m[36m and[0m[36m the[0m[36m contact[0m[36m details[0m[36m are[0m[36m listed[0m[36m as[0m[36m Software[0m[36m Engineer[0m[36m,[0m[36m email[0m[36m,[0m[36m GitHub[0m[36m,[0m[36m and[0m[36m location[0m[36m.[0m[36m The[0m[36m contact[0m[36m section[0m[36m should[0m[36m include[0m[36m full[0m[36m_n