In [1]:
import os
import sys
sys.path.append('../code')

In [2]:
from fpdf import FPDF
from dotenv import load_dotenv
from llm_connect import get_response
from context import save_candidate_context, get_job_context, save_employee_context, get_all_employees, save_team_summary, get_team_summary
import pyrsm as rsm
import fitz
import random
import markdown2
import pdfkit
import pandas as pd
import json
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import uuid
import re

load_dotenv()

True

# Baseline Team Analysis

In [3]:
# Extracts raw text from a PDF resume using PyMuPDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

# Ask LLM to extract structured fields from the resume based on the job description
def parse_employee_with_llm(resume_text):
    prompt = (
        f"You are evaluating an Software Engineer team member currently in the Cloud Infrastructure Team at Firm XYZ:\n\n"
        f"Here is the candidate's resume:\n\n"
        f"{resume_text}\n\n"
        "We already know that this person is a great fit for the company because he/she works here."
        "Generate a detailed profile for this person so that we can compare it to potential candidates."
    )

    # Send the prompt to LLaMA and parse the raw JSON response
    response_text = get_response(
        input=prompt,
        template=lambda x: x,
        llm="llama",
        md=False,
        temperature=0.7,
        max_tokens=700,
    )

    clean_response = response_text.strip()
    return clean_response

def review_llama_summary(resume_text, llama_review):
    prompt = (
        f"You are now receiving the summary and opinion for a Cloud Infrastructure SWE team member for firm XYZ"
        f"Resume Text: \n {resume_text}\n\n"
        f"LLama Review: \n {llama_review}\n\n"
        "Review the LLama input, and create your own detailed judgment on this team member."
        "We need this review to evaluate applicants for the SWE team we are hiring for."
    )

    response = get_response(
        input = prompt,
        template=lambda x: x,
        llm='gemini',
        md=False,
        temperature=0.7,
        max_tokens=700,
    )
    return response.strip()

def generate_team_summary(job_description):
    employees = get_all_employees()
    prompt = (
        f"You are evaluating a Cloud Infrastructure Software Engineer Team at Firm XYZ:\n\n"
        "Here is the data for all employees:\n\n"
        f"{employees}:\n\n"
        "I want you to generate a detailed summary of this team for recruiting purposes."
        f"Identify missing skills and needs relative to this following job description: {job_description}"
    )

    # Send the prompt to LLaMA and parse the raw JSON response
    response_text = get_response(
        input=prompt,
        template=lambda x: x,
        llm="llama",
        md=False,
        temperature=0.7,
        max_tokens=700,
    )

    clean_response = response_text.strip()
    save_team_summary(clean_response)

In [4]:
team_folder = 'data/resumes_team'

counter = 1

for filename in os.listdir(team_folder):
    if not filename.endswith(".pdf"):
        continue

    filepath = os.path.join(team_folder, filename)
    print(f"📄 Processing resume {counter}...")
    counter += 1

    try:
        # 1. Extract plain text from the PDF resume
        resume_text = extract_text_from_pdf(filepath)

        # 2. Use LLaMA & Gemini to parse the resume into structured fields
        llama_employee_profile = parse_employee_with_llm(resume_text)
        gemini_employee_profile = review_llama_summary(resume_text, llama_employee_profile)

        employee_id = os.path.splitext(filename)[0]
        team_dat = {
            'llama_profile': llama_employee_profile,
            'gemini_profile': gemini_employee_profile
        }

        save_employee_context(employee_id, team_dat)

    except Exception as e:
        print(f"❌ Failed to process {filename}: {e}")

📄 Processing resume 1...
📄 Processing resume 2...
📄 Processing resume 3...
📄 Processing resume 4...
📄 Processing resume 5...
📄 Processing resume 6...
📄 Processing resume 7...
📄 Processing resume 8...
📄 Processing resume 9...
📄 Processing resume 10...


In [5]:
with open("data/mcp_context.json", "r") as f:
        mcp_data = json.load(f)
    
#Job Data
job_id = next(iter(mcp_data['jobs']))

# Retrieve the job description text using the job ID
job_description = get_job_context(job_id)['job_description']
generate_team_summary(job_description)

## Baseline Team Analysis

The Baseline Team Analysis module provides a structured evaluation of the current engineering team’s capabilities in the context of a specific job opening. This analysis is stored in `mcp_context.json` under the `team_summary` key and is generated by aggregating detailed profiles of existing employees.

### Purpose

The goal of this analysis is to:
- Establish a benchmark for evaluating new applicants
- Identify skill gaps in the team
- Highlight priority recruitment areas

### Implementation

This section is built using the following logic:
- Team member profiles are generated using LLM summarization based on internal employee resumes.
- A team-wide summary is created using a prompt-driven LLM call that synthesizes individual capabilities into a coherent overview.
- The resulting summary includes strengths, missing competencies, and suggested hiring priorities.

### Example Output

The team summary typically includes:

**Key Strengths**
- Deep experience in cloud infrastructure, including AWS, GCP, and Azure
- Strong backend programming proficiency (Java, Python, C++)
- Familiarity with infrastructure as code tools like Terraform and CloudFormation

**Missing Skills and Needs**
- Limited containerization experience (e.g., Docker, Kubernetes)
- Inconsistent use of agile development methodologies
- Gaps in CI/CD tooling, observability (Prometheus/Grafana), and open-source contributions

**Hiring Recommendations**
- Prioritize candidates with production-level Kubernetes experience
- Recruit engineers familiar with Jenkins/GitLab CI/CD
- Fill team knowledge gaps in networking, monitoring, and agile delivery

### Integration with Evaluation Pipeline

The `team_summary` and individual employee profiles are programmatically injected into the LLM-based resume evaluation prompts. This allows the model to compare candidates directly against the existing team profile, simulating a contextual awareness mechanism similar to a human hiring manager.


# Profile Creation

In [6]:
# Ask LLM to extract structured fields from the resume based on the job description
def parse_resume_with_llm(resume_text, job_description_text, team_profiles, team_summary):
    prompt = (
        f"You are evaluating a candidate for the following job posting:\n\n"
        f"{job_description_text}\n\n"
        f"Here is the candidate's resume:\n\n"
        f"{resume_text}\n\n"
        "Here are the profiles of the current team members:\n\n"
        f"{team_profiles}\n\n"
        "Here is the team summary:\n\n"
        f"{team_summary}\n\n"
        "Extract the following fields into a valid JSON object:\n"
        "- Name\n"
        "- Email\n"
        "- Years of Experience\n"
        "- Key Skills (as a list)\n"
        "- Llama Score (judge the candidate's overall fit for the job on a scale of 1–10)\n\n"
        "⚠️ IMPORTANT:\n"
        "- This is for ONE candidate only. Do NOT return multiple candidates.\n"
        "- Do NOT include a `candidates` array or any explanations.\n"
        "- Return ONLY a single valid JSON object, and nothing else.\n"
    )

    response_text = get_response(
        input=prompt,
        template=lambda x: x,
        llm="llama",
        md=False,
        temperature=0.0,
        max_tokens=700,
    )

    # Strip formatting artifacts
    response_text = response_text.strip().replace("```json", "").replace("```", "").strip()

    # Extract first valid JSON object (avoid partial arrays or junk)
    match = re.search(r'\{\s*".+?"\s*:.+?\}', response_text, re.DOTALL)
    if not match:
        raise ValueError("No valid JSON object found in LLM response.")
    
    raw_json = match.group(0)

    try:
        parsed = json.loads(raw_json)
    except json.JSONDecodeError as e:
        print("LLM JSON decode error:", e)
        print("Raw response was:", raw_json)
        raise e

    # Minimal sanity check
    if not isinstance(parsed, dict) or "Name" not in parsed or "Llama Score" not in parsed:
        raise ValueError("Parsed JSON is missing required fields or is not a dict.")

    return parsed




# Use Gemini to independently review the candidate and give a numeric score (1–10)
def review_llama_score(resume_text, job_description_text, score, team_profiles, team_summary):
    prompt = (
        f"You are evaluating a candidate for the following posting: \n\n"
        f"{job_description_text}\n\n"
        f"Here is the resume: \n\n"
        f"{resume_text}\n\n"
        "Here are the profiles of the current team members:\n\n"
        f"{team_profiles}\n\n"
        "Here is the team summary:\n\n"
        f"{team_summary}\n\n"     
        f"The score llama game for this candidate is: {score}."
        "I want you to judge the candidate's fit for the job on a scale of 1-10."
        "Meeting the year minimum should not be a huge determinant of a person's assessment; it cane be made up via projects/experience."
        "Consider the llama score and give your own judgement."
        "IMPORTANT: Do NOT return any explanations, just the number."
    )

    # Send prompt to Gemini with numeric-only output
    response = get_response(
        input=prompt,
        template=lambda x: x,
        llm='gemini',
        md=False,
        temperature=0.0,
        max_tokens = 10,
        model_name ='gemini-2.0-flash-lite'
    )

    return response

# Ask LLaMA to summarize a candidate’s background and fit in paragraph form
def summarize_entire_resume(resume_text, job_description_text, score, team_profiles, team_summary):
    prompt = (
        f"You are a recruiter reviewing the resume below for the job described.\n\n"
        f"Job Description:\n{job_description_text}\n\n"
        f"Resume Text:\n{resume_text}\n\n"
        "Here are the profiles of the current team members:\n\n"
        f"{team_profiles}\n\n"
        "Here is the team summary:\n\n"
        f"{team_summary}\n\n"     
        f"You have judged this candidate to be a {score} out of 10"
        "Write a detailed, 1 paragraph summary of this candidate's background, skills, and general fit. "
        "Meeting the year minimum should not be a huge determinant of a person's assessment; it cane be made up via projects/experience."
        'If a candidate is qualified, be sure to clearly explain why this person is a good fit.'
        "If a candidate is unqualified, be sure to clearly explain why this person is a bad fit."
        "Be brutally honest with their review."
    )

    response = get_response(
        input=prompt,
        template=lambda x: x,
        llm="llama",
        md=False,
        temperature=0.7,
        max_tokens=500,
    )

    return response.strip()

# Ask Gemini to evaluate the accuracy and fairness of LLaMA's review
def review_llama_summary(resume_text, job_description_text, score, llama_review, team_profiles, team_summary):
    prompt = (
        f"You are now receiving the summary and opinion for a specific candidate for the following job generated by the LLM Llama."
        f" Job Description: \n{job_description_text} \n\n"
        f"Resume Text: \n {resume_text}\n\n"
        f"LLama Review: \n {llama_review}\n\n"
        "Here are the profiles of the current team members:\n\n"
        f"{team_profiles}\n\n"
        "Here is the team summary:\n\n"
        f"{team_summary}\n\n"     
        f"LLama judged this candidate to be a {score} out of 10."
        "Write a detailed, 1 paragraph summary of this candidate and determine if llama's assessment is correct."
        "Meeting the year minimum should not be a huge determinant of a person's assessment; it cane be made up via projects/experience."
        'If a candidate is thought to be qualified but actually is not, clearly explain the difference in opinion.'
        'If a candidate is not qualified but actually is, clearly explain the difference in opinion.'
        "Be brutally honest with this assessment."
    )

    response = get_response(
        input = prompt,
        template=lambda x: x,
        llm='gemini',
        md=False,
        temperature=0.7,
        max_tokens=500,
    )

    return response.strip()

# Process each resume PDF in a folder, extract information, evaluate, and categorize candidates
def process_resumes_with_summaries(resume_folder, job_description_text, score_threshold=7):
    qualified = []      # List to hold qualified candidate profiles
    unqualified = []    # List to hold rejected candidate profiles

    counter = 1         # Resume counter for logging

    # Iterate through all PDF files in the given folder
    for filename in os.listdir(resume_folder):
        if not filename.endswith(".pdf"):
            continue

        filepath = os.path.join(resume_folder, filename)
        print(f"📄 Processing resume {counter}...")
        counter += 1

        try:
            # 1. Extract plain text from the PDF resume
            resume_text = extract_text_from_pdf(filepath)
            team_profiles = get_all_employees()
            team_summary = get_team_summary()


            # 2. Use LLaMA to parse the resume into structured fields
            profile = parse_resume_with_llm(resume_text, job_description_text, team_profiles, team_summary)

            score_raw = profile.get("Llama Score", 0)
            if isinstance(score_raw, int):
                score = score_raw
            elif isinstance(score_raw, str):
                match = re.search(r'\d+', score_raw)
                if match:
                    score = int(match.group())
                else:
                    raise ValueError(f"Invalid score string format: {score_raw}")
            else:
                raise TypeError(f"Unexpected type for score: {type(score_raw)}")


            name = profile.get("Name", "Unnamed")

            # 3. Add internal tracking fields
            profile['Candidate ID'] = filename.replace('.pdf', "")
            profile['Application ID'] = str(uuid.uuid4())
            profile['Name'] = name
            profile["Resume File"] = filename

            # 4. Ask LLaMA to summarize the resume in natural language
            llama_summary = summarize_entire_resume(resume_text, job_description_text, score, team_profiles, team_summary)
            profile['Llama Summary'] = llama_summary

            # 5. Ask Gemini to review LLaMA’s summary and score
            gemini_summary = review_llama_summary(resume_text, job_description_text, score, llama_summary, team_profiles, team_summary)
            profile['Gemini Summary'] = gemini_summary

            # 6. Ask Gemini to independently score the candidate
            gemini_score_raw = review_llama_score(resume_text, job_description_text, score, team_profiles, team_summary)
            try:
                gemini_score = int(re.search(r'\d+', gemini_score_raw).group())
            except Exception:
                print(f"⚠️ Could not parse Gemini score from: {gemini_score_raw}")
                gemini_score = 0

            # 7. Compute the average score across both models
            profile['Gemini Score'] = gemini_score
            avg_score = (int(score) + gemini_score) / 2

            profile['avg_score'] = avg_score

            # 8. Classify the candidate based on the average score
            if float(avg_score) >= score_threshold:
                qualified.append(profile)
            else:
                profile["Note"] = "Below threshold"
                unqualified.append(profile)

        except Exception as e:
            print(f"❌ Failed to process {filename}: {e}")

    # Save to CSVs
    pd.DataFrame(qualified).to_csv("data/candidates/qualified_candidates.csv", index=False)
    pd.DataFrame(unqualified).to_csv("data/candidates/unqualified_candidates.csv", index=False)

    print(f"\n✅ Qualified: {len(qualified)} → data/candidates/qualified_candidates.csv")
    print(f"⚠️ Unqualified: {len(unqualified)} → data/candidates/unqualified_candidates.csv")

if __name__ == "__main__":
    resumes_folder = "data/resumes/"

    with open("data/mcp_context.json", "r") as f:
        mcp_data = json.load(f)
    
     #Job Data
    job_id = next(iter(mcp_data['jobs']))

     # Retrieve the job description text using the job ID
    job_description = get_job_context(job_id)['job_description']


    os.makedirs("data", exist_ok=True)

    # Run the full resume parsing and evaluation pipeline
    process_resumes_with_summaries(resumes_folder, job_description)


📄 Processing resume 1...
📄 Processing resume 2...
📄 Processing resume 3...
📄 Processing resume 4...
📄 Processing resume 5...
📄 Processing resume 6...
📄 Processing resume 7...
📄 Processing resume 8...
📄 Processing resume 9...
📄 Processing resume 10...
📄 Processing resume 11...
📄 Processing resume 12...
📄 Processing resume 13...
📄 Processing resume 14...
📄 Processing resume 15...
📄 Processing resume 16...
📄 Processing resume 17...
📄 Processing resume 18...
📄 Processing resume 19...
📄 Processing resume 20...
📄 Processing resume 21...
📄 Processing resume 22...
📄 Processing resume 23...
📄 Processing resume 24...
📄 Processing resume 25...
📄 Processing resume 26...
📄 Processing resume 27...
📄 Processing resume 28...
📄 Processing resume 29...
📄 Processing resume 30...

✅ Qualified: 9 → data/candidates/qualified_candidates.csv
⚠️ Unqualified: 21 → data/candidates/unqualified_candidates.csv


In [7]:
qualified = pd.read_csv('data/candidates/qualified_candidates.csv')
qualified.head()

Unnamed: 0,Name,Email,Years of Experience,Key Skills,Llama Score,Candidate ID,Application ID,Resume File,Llama Summary,Gemini Summary,Gemini Score,avg_score
0,Alexander Prince,alexanderprince@example.com,8,"['Cloud Infrastructure', 'Security Protocols',...",9,c95788ab-42df-4277-b45c-102d4529df36,37f26b6e-3efd-40a0-870b-66ac104c1c68,c95788ab-42df-4277-b45c-102d4529df36.pdf,Alexander Prince is an exceptionally strong ca...,Alexander Prince presents a strong profile wit...,7,8.0
1,Dawn Osborn,dawnosborn@example.com,9,"['Java', 'Python', 'JavaScript', 'C++', 'Cloud...",8,40157b49-1976-4efb-bd27-3e7bf93c94e3,6d4e867b-376d-44aa-a9cc-91d3f92c9bf1,40157b49-1976-4efb-bd27-3e7bf93c94e3.pdf,Dawn Osborn is a highly skilled and experience...,"Dawn Osborn presents a solid profile, exceedin...",7,7.5
2,Cindy Lambert,cindylambert@example.com,7,"['Java', 'Python', 'C++', 'JavaScript', 'AWS',...",9,096946f9-8ffa-4238-9297-9c0052599845,1e7f4d54-82e5-4ab0-9b01-92c5b67a516b,096946f9-8ffa-4238-9297-9c0052599845.pdf,Cindy Lambert is an exceptionally qualified ca...,Cindy Lambert presents a compelling profile as...,9,9.0
3,Nathan Rodriguez,nathanrodriguez@example.com,8,"['Cloud Infrastructure', 'Microservice Archite...",9,02401398-e3b5-4309-a235-b8aa164ba85f,65738a3b-7240-4678-be43-3eb1696f6eeb,02401398-e3b5-4309-a235-b8aa164ba85f.pdf,Nathan Rodriguez is an exceptionally strong ca...,Nathan Rodriguez presents a very strong profil...,9,9.0
4,Lauren Powers,laurenpowers@example.com,5,"['JavaScript', 'Python', 'Java', 'C++', 'React...",8,c12322b3-d288-4edd-afca-0f5f82bcfd18,8b067655-8cb5-4e45-aefc-8c7c623b23dd,c12322b3-d288-4edd-afca-0f5f82bcfd18.pdf,Lauren Powers is a highly motivated Senior Sof...,"Lauren Powers presents as a promising, albeit ...",6,7.0


In [8]:
unqualified = pd.read_csv('data/candidates/unqualified_candidates.csv')
unqualified.head()

Unnamed: 0,Name,Email,Years of Experience,Key Skills,Llama Score,Candidate ID,Application ID,Resume File,Llama Summary,Gemini Summary,Gemini Score,avg_score,Note
0,Katie Mckenzie,katiemckenzie@example.com,9,"['Java', 'Swift', 'Kotlin', 'Python', 'React N...",6,b823daf8-4d51-4dd0-adcb-8f990e77dd40,5e801293-f233-4e61-acfc-07edc8525b78,b823daf8-4d51-4dd0-adcb-8f990e77dd40.pdf,Katie Mckenzie is a highly experienced Staff S...,Katie McKenzie presents a mixed profile for th...,6,6.0,Below threshold
1,Jorge Padilla,jorgepadilla@example.com,3,"['Cloud Infrastructure', 'Backend Development'...",6,710b3861-c06c-41bc-8df1-b05524abc855,e528bf36-6672-4a57-8c54-c7984fed1c98,710b3861-c06c-41bc-8df1-b05524abc855.pdf,Jorge Padilla is a mid-level software engineer...,"Jorge Padilla, while holding a PhD in Computer...",4,5.0,Below threshold
2,Luis Williams,luiswilliams@example.com,0,"['Java', 'Swift', 'React Native', 'AWS', 'GCP'...",2,020bb618-c3f1-47b8-8a2d-a25879b01173,9643a5e3-9863-4ce5-853e-07813c06f17a,020bb618-c3f1-47b8-8a2d-a25879b01173.pdf,Luis Williams is a highly unqualified candidat...,"Luis Williams, a recent Computer Science gradu...",3,2.5,Below threshold
3,Brian Flores,brianflores@example.com,0,"['JavaScript', 'HTML', 'CSS', 'React', 'Angula...",2,6160ec22-4d63-4a89-89e1-4bf8089bc2d2,c7dfa781-d13f-4f94-9a72-1e04fde304fb,6160ec22-4d63-4a89-89e1-4bf8089bc2d2.pdf,"This candidate, Brian Flores, is a recent grad...",Brian Flores is a recent Computer Science grad...,1,1.5,Below threshold
4,Jennifer Rodriguez,jenniferrodriguez@example.com,0,"['Python', 'Java', 'C++', 'TensorFlow', 'PyTor...",2,f7d40321-23cb-4287-9918-0bb96bec9a01,eef11c20-6ee7-46d9-9f2a-4aca466d07dd,f7d40321-23cb-4287-9918-0bb96bec9a01.pdf,"This candidate, Jennifer Rodriguez, is a recen...","Jennifer Rodriguez, a recent graduate with a M...",3,2.5,Below threshold


## Profile Generation and Evaluation Using LLMs

This section automates the process of extracting structured candidate profiles from raw resumes using large language models (LLMs). The pipeline is designed to mimic how a recruiter would screen and summarize resumes in the context of a specific job description. The key functionality of this section is to allow the LLM to generate its own judgment on each candidate they see. Not only will it give a score, it creates a description of each candidate after parsing each resume.

### Process Overview

1. **Resume Text Extraction**: We use `PyMuPDF (fitz)` to extract text content from PDF resumes stored in the `data/resumes/` directory.

2. **Profile Extraction via LLM**: The extracted resume text is passed into a templated prompt along with the target job description. The LLM returns a structured JSON object containing:
   - Candidate name and email
   - Years of experience
   - A list of key skills
   - A numeric fit score (LLama Score)
   - A team level summary & employee level summary

3. **Score Validation and Review**: A second LLM (Gemini) re-evaluates the candidate using the resume and the first model’s assessment. This acts as a secondary opinion:
   - Gemini produces its own score
   - Both models provide free-text summaries explaining their reasoning
   - An average score is calculated to determine qualification

In part 2 and 3, there is a methodology which LLM is used to generate each profile and score. The first is via LLama as there is no usage limit for this. However, we want to cross check it using Gemini to see if the data and score makes sense. So, to validate the description we use Gemini 2.0 and allow it to create a new description via prior LLama context. Then, the score is validated via Gemini 2.0 Lite as it has a higher token limit and it is will equipped to do simple tasks such as create a score. This ensures that there is less LLM biases that is created by one single model.  

4. **Qualification Filtering**: Candidates with an average score above a defined threshold (default 7) are marked as "qualified." The rest are flagged as "unqualified" and annotated accordingly.

5. **Profile Enrichment**: Additional metadata such as:
   - `Candidate ID`
   - `Application ID`
   - Resume file name
   - Generated summaries
   is included to support downstream analytics and storage.

6. **Output and Storage**: Final profiles are saved as CSVs:
   - `qualified_candidates.csv`
   - `unqualified_candidates.csv`

This setup enables scalable and consistent evaluation of resumes using GenAI, forming a foundation for automated recruiting workflows.


# Data Checks

In [9]:
def validate_and_clean_profiles(df):
    issues = []         # Store rows with validation issues
    clean_rows = []     # Store rows that pass all checks

    for index, row in df.iterrows():
        row_issues = [] # Track issues for this row

        # Clean: trim whitespace in Name
        row["Name"] = str(row.get("Name", "")).strip().title()

        # Validation: check required Name field
        if pd.isna(row.get("Name")) or row["Name"] == "":
            row_issues.append("Missing name")

        # Validation: Years of Experience must be numeric
        if not isinstance(row.get("Years of Experience"), (int, float, np.number)):
            try:
                row["Years of Experience"] = float(row["Years of Experience"])
            except:
                row_issues.append("Years of Experience is not a number")


        # Validation: LLM score must be a number between 1 and 10
        if not isinstance(row.get("Llama Score"), (int, float, np.number)) or not (1 <= float(row["Llama Score"]) <= 10):
            row_issues.append("LLM Score must be between 1–10")

        # Validation: convert Key Skills string to list if needed
        skills = row.get("Key Skills")
        if isinstance(skills, str):
            try:
                row["Key Skills"] = eval(skills) if skills.strip().startswith("[") else [skills.strip()]
            except:
                row["Key Skills"] = [skills.strip()]

        # Ensure Key Skills is a non-empty list
        if not isinstance(row["Key Skills"], list) or len(row["Key Skills"]) == 0:
            row_issues.append("Key Skills must be a non-empty list")

        # If row has any issues, log them
        if row_issues:
            issues.append({"row": index, "issues": row_issues, "data": row.to_dict()})
        else:
            clean_rows.append(row)

    clean_df = pd.DataFrame(clean_rows)
    return clean_df, issues

# Load candidate CSVs, validate, save cleaned data and issue reports
def validate_profiles_from_csvs(qualified_csv, unqualified_csv):
    os.makedirs("data/validated", exist_ok=True)

    print("🔍 Validating qualified candidates...")
    qualified = pd.read_csv(qualified_csv)
    clean_qualified, issues_qualified = validate_and_clean_profiles(qualified)
    clean_qualified.to_csv("data/validated/clean_qualified_candidates.csv", index=False)

    # Save validation issues for qualified candidates
    with open("data/validated/invalid_qualified_profiles.txt", "w") as f:
        for issue in issues_qualified:
            f.write(f"Row {issue['row']} issues: {issue['issues']}\n")
            f.write(f"{issue['data']}\n\n")

    print("🔍 Validating unqualified candidates...")
    unqualified = pd.read_csv(unqualified_csv)
    clean_unqualified, issues_unqualified = validate_and_clean_profiles(unqualified)
    clean_unqualified.to_csv("data/validated/clean_unqualified_candidates.csv", index=False)

    # Save validation issues for unqualified candidates
    with open("data/validated/invalid_unqualified_profiles.txt", "w") as f:
        for issue in issues_unqualified:
            f.write(f"Row {issue['row']} issues: {issue['issues']}\n")
            f.write(f"{issue['data']}\n\n")

    print(f"\n✅ Clean qualified profiles: {len(clean_qualified)}")
    print(f"⚠️ Invalid qualified profiles: {len(issues_qualified)}")
    print(f"✅ Clean sunqualified profiles: {len(clean_unqualified)}")
    print(f"⚠️ Invalid unqualified profiles: {len(issues_unqualified)}")

if __name__ == "__main__":
    validate_profiles_from_csvs(
        qualified_csv="data/candidates/qualified_candidates.csv",
        unqualified_csv="data/candidates/unqualified_candidates.csv"
    )


🔍 Validating qualified candidates...
🔍 Validating unqualified candidates...

✅ Clean qualified profiles: 9
⚠️ Invalid qualified profiles: 0
✅ Clean sunqualified profiles: 21
⚠️ Invalid unqualified profiles: 0


In [10]:
qualified_cleaned_profiles = pd.read_csv('data/validated/clean_qualified_candidates.csv')
qualified_cleaned_profiles.head()

Unnamed: 0,Name,Email,Years of Experience,Key Skills,Llama Score,Candidate ID,Application ID,Resume File,Llama Summary,Gemini Summary,Gemini Score,avg_score
0,Alexander Prince,alexanderprince@example.com,8,"['Cloud Infrastructure', 'Security Protocols',...",9,c95788ab-42df-4277-b45c-102d4529df36,37f26b6e-3efd-40a0-870b-66ac104c1c68,c95788ab-42df-4277-b45c-102d4529df36.pdf,Alexander Prince is an exceptionally strong ca...,Alexander Prince presents a strong profile wit...,7,8.0
1,Dawn Osborn,dawnosborn@example.com,9,"['Java', 'Python', 'JavaScript', 'C++', 'Cloud...",8,40157b49-1976-4efb-bd27-3e7bf93c94e3,6d4e867b-376d-44aa-a9cc-91d3f92c9bf1,40157b49-1976-4efb-bd27-3e7bf93c94e3.pdf,Dawn Osborn is a highly skilled and experience...,"Dawn Osborn presents a solid profile, exceedin...",7,7.5
2,Cindy Lambert,cindylambert@example.com,7,"['Java', 'Python', 'C++', 'JavaScript', 'AWS',...",9,096946f9-8ffa-4238-9297-9c0052599845,1e7f4d54-82e5-4ab0-9b01-92c5b67a516b,096946f9-8ffa-4238-9297-9c0052599845.pdf,Cindy Lambert is an exceptionally qualified ca...,Cindy Lambert presents a compelling profile as...,9,9.0
3,Nathan Rodriguez,nathanrodriguez@example.com,8,"['Cloud Infrastructure', 'Microservice Archite...",9,02401398-e3b5-4309-a235-b8aa164ba85f,65738a3b-7240-4678-be43-3eb1696f6eeb,02401398-e3b5-4309-a235-b8aa164ba85f.pdf,Nathan Rodriguez is an exceptionally strong ca...,Nathan Rodriguez presents a very strong profil...,9,9.0
4,Lauren Powers,laurenpowers@example.com,5,"['JavaScript', 'Python', 'Java', 'C++', 'React...",8,c12322b3-d288-4edd-afca-0f5f82bcfd18,8b067655-8cb5-4e45-aefc-8c7c623b23dd,c12322b3-d288-4edd-afca-0f5f82bcfd18.pdf,Lauren Powers is a highly motivated Senior Sof...,"Lauren Powers presents as a promising, albeit ...",6,7.0


In [11]:
unqualified_cleaned_profiles = pd.read_csv('data/validated/clean_unqualified_candidates.csv')
unqualified_cleaned_profiles.head()

Unnamed: 0,Name,Email,Years of Experience,Key Skills,Llama Score,Candidate ID,Application ID,Resume File,Llama Summary,Gemini Summary,Gemini Score,avg_score,Note
0,Katie Mckenzie,katiemckenzie@example.com,9,"['Java', 'Swift', 'Kotlin', 'Python', 'React N...",6,b823daf8-4d51-4dd0-adcb-8f990e77dd40,5e801293-f233-4e61-acfc-07edc8525b78,b823daf8-4d51-4dd0-adcb-8f990e77dd40.pdf,Katie Mckenzie is a highly experienced Staff S...,Katie McKenzie presents a mixed profile for th...,6,6.0,Below threshold
1,Jorge Padilla,jorgepadilla@example.com,3,"['Cloud Infrastructure', 'Backend Development'...",6,710b3861-c06c-41bc-8df1-b05524abc855,e528bf36-6672-4a57-8c54-c7984fed1c98,710b3861-c06c-41bc-8df1-b05524abc855.pdf,Jorge Padilla is a mid-level software engineer...,"Jorge Padilla, while holding a PhD in Computer...",4,5.0,Below threshold
2,Luis Williams,luiswilliams@example.com,0,"['Java', 'Swift', 'React Native', 'AWS', 'GCP'...",2,020bb618-c3f1-47b8-8a2d-a25879b01173,9643a5e3-9863-4ce5-853e-07813c06f17a,020bb618-c3f1-47b8-8a2d-a25879b01173.pdf,Luis Williams is a highly unqualified candidat...,"Luis Williams, a recent Computer Science gradu...",3,2.5,Below threshold
3,Brian Flores,brianflores@example.com,0,"['JavaScript', 'HTML', 'CSS', 'React', 'Angula...",2,6160ec22-4d63-4a89-89e1-4bf8089bc2d2,c7dfa781-d13f-4f94-9a72-1e04fde304fb,6160ec22-4d63-4a89-89e1-4bf8089bc2d2.pdf,"This candidate, Brian Flores, is a recent grad...",Brian Flores is a recent Computer Science grad...,1,1.5,Below threshold
4,Jennifer Rodriguez,jenniferrodriguez@example.com,0,"['Python', 'Java', 'C++', 'TensorFlow', 'PyTor...",2,f7d40321-23cb-4287-9918-0bb96bec9a01,eef11c20-6ee7-46d9-9f2a-4aca466d07dd,f7d40321-23cb-4287-9918-0bb96bec9a01.pdf,"This candidate, Jennifer Rodriguez, is a recen...","Jennifer Rodriguez, a recent graduate with a M...",3,2.5,Below threshold


## Data Validation and Quality Control

Although no separate cleaning script is used, data validation is tightly integrated into the resume processing and profile extraction pipeline. Here’s how validation is enforced at multiple stages:

1. **Structured JSON Parsing**: The resume parser prompts the LLM to return structured JSON containing expected fields (e.g., Name, Email, Skills). If the response is malformed or missing required keys, the parsing step fails early, preventing corrupted data from being saved.

2. **Exception Handling**: All resume processing occurs within try/except blocks. Resumes that cannot be parsed correctly (e.g., malformed PDFs or unexpected output) are gracefully skipped with error logging.

3. **Score Thresholding**: Profiles with inconsistent or low-quality data tend to receive lower LLama and Gemini scores. These are automatically filtered into an “unqualified” category and stored separately for analysis.

4. **Consistent Field Assignment**: Each profile is enriched with standard metadata (candidate ID, resume filename, score fields) and validated against the job description context, ensuring comparability across profiles.

This validation strategy minimizes noisy data and supports trustworthy downstream analysis, without requiring manual data cleaning at this stage. Where this will be the most practical is when resume or application generation fails by the LLM. But faulty resumes or broken applications will be common amongst real world data. So, the best case usage of this data validation for this example is to manually create faulty data for the validator to catch.


# Context Saving

In [12]:
full_dat = pd.concat([qualified_cleaned_profiles, unqualified_cleaned_profiles], axis=0)
application_dat = pd.read_csv('data/applications.csv')
full_dat = full_dat.merge(application_dat, left_on='Candidate ID', right_on='candidate_id', how='inner')
full_dat = full_dat.drop(columns=['resume_file', 'email', 'years_experience'])
full_dat.to_csv('data/validated/all_candidates.csv', index=False)
full_dat.head()

Unnamed: 0,Name,Email,Years of Experience,Key Skills,Llama Score,Candidate ID,Application ID,Resume File,Llama Summary,Gemini Summary,Gemini Score,avg_score,Note,candidate_id,location,education,job_id,application_date,source
0,Alexander Prince,alexanderprince@example.com,8,"['Cloud Infrastructure', 'Security Protocols',...",9,c95788ab-42df-4277-b45c-102d4529df36,37f26b6e-3efd-40a0-870b-66ac104c1c68,c95788ab-42df-4277-b45c-102d4529df36.pdf,Alexander Prince is an exceptionally strong ca...,Alexander Prince presents a strong profile wit...,7,8.0,,c95788ab-42df-4277-b45c-102d4529df36,"New York, NY",Bachelor's Degree,c143e4c4-0444-45b9-b03f-31c48a310cd0,2025-04-01,Company Website
1,Dawn Osborn,dawnosborn@example.com,9,"['Java', 'Python', 'JavaScript', 'C++', 'Cloud...",8,40157b49-1976-4efb-bd27-3e7bf93c94e3,6d4e867b-376d-44aa-a9cc-91d3f92c9bf1,40157b49-1976-4efb-bd27-3e7bf93c94e3.pdf,Dawn Osborn is a highly skilled and experience...,"Dawn Osborn presents a solid profile, exceedin...",7,7.5,,40157b49-1976-4efb-bd27-3e7bf93c94e3,"Chicago, IL",Bachelor's Degree,c143e4c4-0444-45b9-b03f-31c48a310cd0,2025-03-14,Referral
2,Cindy Lambert,cindylambert@example.com,7,"['Java', 'Python', 'C++', 'JavaScript', 'AWS',...",9,096946f9-8ffa-4238-9297-9c0052599845,1e7f4d54-82e5-4ab0-9b01-92c5b67a516b,096946f9-8ffa-4238-9297-9c0052599845.pdf,Cindy Lambert is an exceptionally qualified ca...,Cindy Lambert presents a compelling profile as...,9,9.0,,096946f9-8ffa-4238-9297-9c0052599845,"San Francisco, CA",PhD,c143e4c4-0444-45b9-b03f-31c48a310cd0,2025-03-08,Indeed
3,Nathan Rodriguez,nathanrodriguez@example.com,8,"['Cloud Infrastructure', 'Microservice Archite...",9,02401398-e3b5-4309-a235-b8aa164ba85f,65738a3b-7240-4678-be43-3eb1696f6eeb,02401398-e3b5-4309-a235-b8aa164ba85f.pdf,Nathan Rodriguez is an exceptionally strong ca...,Nathan Rodriguez presents a very strong profil...,9,9.0,,02401398-e3b5-4309-a235-b8aa164ba85f,"Austin, TX",Bachelor's Degree,c143e4c4-0444-45b9-b03f-31c48a310cd0,2025-04-21,Company Website
4,Lauren Powers,laurenpowers@example.com,5,"['JavaScript', 'Python', 'Java', 'C++', 'React...",8,c12322b3-d288-4edd-afca-0f5f82bcfd18,8b067655-8cb5-4e45-aefc-8c7c623b23dd,c12322b3-d288-4edd-afca-0f5f82bcfd18.pdf,Lauren Powers is a highly motivated Senior Sof...,"Lauren Powers presents as a promising, albeit ...",6,7.0,,c12322b3-d288-4edd-afca-0f5f82bcfd18,"Chicago, IL",Master's Degree,c143e4c4-0444-45b9-b03f-31c48a310cd0,2025-03-12,Indeed


In [13]:
# Loop through each row in the full dataset (combined qualified + unqualified candidates)
for _, row in full_dat.iterrows():
    # Convert the row into a dictionary for easier manipulation
    profile = row.to_dict()

    # Use candidate_id or fallback to email
    candidate_id = profile.get("candidate_id") or profile.get("Email")

    # Save the candidate profile to the context store (mcp_context.json)
    # This allows other parts of the system to retrieve candidate data
    # for tasks like interview scheduling, ranking, or profile enrichment
    save_candidate_context(candidate_id, profile)

## Context Generation and Storage

After extracting and evaluating candidate profiles, we persist them using a shared context store to support future retrieval, reasoning, and GenAI interactions.

### What Is the Context?

The `context` module maintains a centralized file (`mcp_context.json`) that holds two major sections:
- `"jobs"`: Metadata and descriptions for each job role
- `"candidates"`: Profile information for every evaluated candidate

### How It Works

Each candidate is saved into the context file using the `save_candidate_context()` function. This function:
- Loads the existing context file from disk
- Inserts or updates the profile associated with the given candidate ID
- Writes the updated context back to disk

This process creates a durable, structured memory that can be used by downstream workflows—for example:
- Matching candidates to job descriptions
- Powering LLM-based comparisons or search
- Rehydrating user state across notebook sessions

By standardizing the storage of both jobs and candidates, we ensure consistent access to relevant information for inference, analysis, or UI-based review. The point of this section is to ensure the entire process can have access to candidate level data at any point in time through the `json` file.
