## load candidate dataset

In [258]:
!uv add pandas
!uv add numpy
!uv add rapidfuzz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2mResolved [1m143 packages[0m [2min 9ms[0m[0m
[2mAudited [1m97 packages[0m [2min 0.53ms[0m[0m
[2mResolved [1m143 packages[0m [2min 0.47ms[0m[0m
[2mAudited [1m97 packages[0m [2min 0.03ms[0m[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2mResolved [1m143 packages[0m [2min 0.62ms[0m[0m
[2mAudited [1m97 packages[0m [2min 0.04ms[0m[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### load and pre-process data

In [259]:
import pandas as pd
import ast

df = pd.read_csv('resume_data.csv', encoding='utf-8-sig')

# Remove column name surrounding whitespace - this is done since `job_position_name` has a prefix in its column name
df.columns = df.columns.str.replace('\ufeff', '').str.strip()

# custome candidate id index
df["candidate_id"] = ["C" + str(i).zfill(3) for i in range(1, len(df) + 1)]
df.set_index("candidate_id", drop=False) 

def parse_list_str(list_str):
    if isinstance(list_str, str):
        try:
            return ast.literal_eval(list_str)
        except (ValueError, SyntaxError):
            return []
    return list_str if isinstance(list_str, list) else []

### process skills

In [260]:
def process_skills():
    # cleaning and parsing data
    df['skills'] = df['skills'].apply(parse_list_str)
    df['related_skils_in_job'] = df['related_skils_in_job'].apply(parse_list_str)
    df['certification_skills'] = df['certification_skills'].apply(parse_list_str)

    # print(df['skills'].apply(type).value_counts())
    # print(df['related_skils_in_job'].apply(type).value_counts())

    # df[df['skills'].apply(lambda x: isinstance(x, float))]

    def combine_skills(row):
        skills = row['skills'] if isinstance(row['skills'], list) else []
        related_skills = [x for sublist in row['related_skils_in_job'] if isinstance(sublist, list) for x in sublist]
        certification_skills = [x for sublist in row['certification_skills'] if isinstance(sublist, list) for x in sublist]

        all_skills = skills + related_skills + certification_skills

        # print(all_skills)

        return list(set(all_skills)) 

    # combine skills and related skills
    df['all_skills'] = df.apply(combine_skills, axis=1)


process_skills()

### process qualifications

In [261]:
from rapidfuzz import process, fuzz

DEGREE_MAP = {
    "bachelor": [
        "b.e", "be", "b.tech", "btech", "b.sc", "bsc", "b.a", "ba", "b.com", "bcom",
        "bba", "bca", "b.arch", "barch", "b.pharm", "bpharm", "b.ed", "bed", "bfa",
        "b.des", "bdes", "b.lit", "blit", "b.s", "bs", "b.eng", "beng", "b.engg", "bengg",
        "bachelors", "bachelor", "undergraduate", "ug", "licentiate", "bacharelado"
    ],
    "master": [
        "m.e", "me", "m.tech", "mtech", "m.sc", "msc", "m.a", "ma", "m.com", "mcom",
        "mba", "mca", "m.arch", "march", "m.pharm", "mpharm", "m.ed", "med", "mfa",
        "m.des", "mdes", "m.lit", "mlit", "m.s", "ms", "m.eng", "meng", "m.engg", "mengg",
        "masters", "master", "postgraduate", "pg", "post grad", "post-graduate", "magister"
    ],
    "phd": [
        "ph.d", "phd", "d.phil", "dphil", "doctorate", "doctoral", "dr.", "dr",
        "sc.d", "scd", "eng.d", "engd", "edd", "dba"
    ],
    "associate": [
        "associate", "a.a", "aa", "a.s", "as", "a.sc", "asc", "a.a.s", "aas"
    ],
    "diploma": [
        "diploma", "advanced diploma", "postgraduate diploma", "pg diploma", "pgd", "post diploma",
        "polytechnic", "certificate", "post-baccalaureate diploma", "post baccalaureate diploma"
    ],
    "highschool": [
        "high school", "secondary school", "hsc", "ssc", "10th", "12th", "intermediate", "matriculation",
        "matric", "senior secondary", "pre-university", "pu", "pu college", "a-levels", "alevels", "o-levels", "olevels"
    ],
    "other": [
        "postdoc", "post doctoral", "post-doctoral", "post doctorate", "post-doctorate",
        "certificate course", "vocational", "trade school", "training", "bootcamp"
    ]
}

TERM_TO_CATEGORY = {
    term: category
    for category, terms in DEGREE_MAP.items()
    for term in terms
}

all_terms = list(TERM_TO_CATEGORY.keys())

def normalize_degree(degree: str, threshold=80):
    match, score, _ = process.extractOne(degree, all_terms, scorer=fuzz.partial_ratio)

    if match and score >= threshold:
        return TERM_TO_CATEGORY[match]

    return degree

def normalize_degree_list(degrees: list[str], threshold=80):
        degree_texts = [degree_text.lower().strip() for degree_text in degrees if isinstance(degree_text, str)]
        
        return [normalize_degree(deg, threshold) for deg in degree_texts]

def process_qualifications(): 

    df['degree_names'] = df['degree_names'].apply(parse_list_str)
    df['degree_names_norm'] = df['degree_names'].apply(normalize_degree_list)

    df['major_field_of_studies'] = df['major_field_of_studies'].apply(parse_list_str)
    
    # print(df['degree_names'].apply(type).value_counts())
    # print(df['major_field_of_studies'].apply(type).value_counts())

    df["qualifications"] = df.apply(lambda row: [f"{a} in {b}" if b != 'N/A' else a for a, b in zip(row["degree_names_norm"], row["major_field_of_studies"])], axis=1)



process_qualifications()

## jd extractor

In [None]:
!uv add pydantic
!uv add google-genai
!uv add "instructor[google-genai]"
!uv add python-dotenv

### define JD pydantic models

In [91]:
# define JD extraction model
from typing import List, Optional, Literal
from pydantic import BaseModel, Field
from enum import Enum


class ImportanceLevel(Enum):
    ESSENTIAL = "essential" 
    IMPORTANT = "important" 
    VALUABLE = "valuable" 
    SUPPLEMENTARY = "supplementary"

ProficiencyLevel = Literal["beginner", "intermediate", "advanced", "expert"]
LanguageLevel = Literal["basic", "conversational", "professional", "native", "fluent"]
SeniorityLevel = Literal["entry", "mid", "senior", "executive", "c_level"]
RemoteOption = Literal["remote", "hybrid", "on_site", "flexible"]
EmploymentType = Literal["full_time", "part_time", "contract", "contract_to_hire", "internship"]
UrgencyLevel = Literal["immediate", "within_30_days", "within_60_days", "flexible"]
PayFrequency = Literal["hourly", "annually", "monthly"]
CompanySize = Literal["startup", "small", "medium", "large", "enterprise"]
CompanyStage = Literal["early_stage", "growth", "mature", "public"]
EnvironmentType = Literal["cloud", "on_premise", "hybrid"]


class Skill(BaseModel):
    skill: str = Field(..., description="Name of the skill", example="System Architecture")
    priority: ImportanceLevel = Field(..., description="Importance level of the skill")
    proficiency_level: Optional[ProficiencyLevel] = Field(None, description="Required proficiency level")


class Technology(BaseModel):
    technology: str = Field(..., description="Technology name", example="AWS")
    priority: ImportanceLevel = Field(..., description="Importance level of the technology")
    version: Optional[str] = Field(None, description="Specific version if applicable", example="Python 3.9+")


class LanguageProficiency(BaseModel):
    language: str = Field(..., description="Language name", example="English")
    level: LanguageLevel = Field(..., description="Required proficiency level")
    priority: ImportanceLevel = Field(..., description="Importance level of the language")


class Education(BaseModel):
    degree: str = Field(..., description="Degree name", example="Bachelor's in Computer Science")
    field: Optional[str] = Field(None, description="Field of study", example="Computer Science")
    priority: ImportanceLevel = Field(..., description="Importance level of the degree")


class Certification(BaseModel):
    certification: str = Field(..., description="Certification name", example="CISSP")
    priority: ImportanceLevel = Field(..., description="Importance level of the certification")
    timeline: Optional[str] = Field(None, description="Timeframe to obtain certification", example="within 6 months")


class Qualifications(BaseModel):
    education: Optional[List[Education]] = None
    certifications: Optional[List[Certification]] = None


class ExperienceRange(BaseModel):
    min: Optional[int] = Field(None, description="Minimum years")
    max: Optional[int] = Field(None, description="Maximum years")


class IndustryExperience(BaseModel):
    industry: str = Field(..., description="Industry name", example="fintech")
    priority: ImportanceLevel = Field(..., description="Importance level")


class Leadership(BaseModel):
    required: Optional[bool] = Field(None, description="Whether leadership experience is required")
    team_size: Optional[ExperienceRange] = None
    priority: Optional[ImportanceLevel] = Field(None, description="Importance level of leadership experience")


class Experience(BaseModel):
    level: Optional[SeniorityLevel] = None
    years_total: Optional[ExperienceRange] = None
    years_relevant: Optional[ExperienceRange] = None
    industry_experience: Optional[List[IndustryExperience]] = None
    leadership: Optional[Leadership] = None


class LocationTravelRequirements(BaseModel):
    percentage: Optional[int] = Field(None, description="Travel percentage required", example=25)
    frequency: Optional[str] = Field(None, description="Travel frequency", example="quarterly client visits")


class Relocation(BaseModel):
    assistance_available: Optional[bool] = Field(None, description="Whether relocation assistance is available")
    required: Optional[bool] = Field(None, description="Whether relocation is mandatory")


class Location(BaseModel):
    cities: Optional[List[str]] = Field(None, description="Acceptable cities")
    countries: Optional[List[str]] = Field(None, description="Acceptable countries")
    remote_options: Optional[RemoteOption] = Field(None, description="Work arrangement options")
    travel_requirements: Optional[LocationTravelRequirements] = None
    relocation: Optional[Relocation] = None


class SalaryRange(BaseModel):
    min: Optional[float] = Field(None, description="Minimum salary")
    max: Optional[float] = Field(None, description="Maximum salary")
    currency: Optional[str] = Field(None, description="Currency code", example="USD")
    frequency: Optional[PayFrequency] = Field(None, description="Pay frequency")


class EmploymentDetails(BaseModel):
    type: Optional[EmploymentType] = None
    urgency: Optional[UrgencyLevel] = None
    salary_range: Optional[SalaryRange] = None
    benefits: Optional[List[str]] = Field(None, description="Benefits and perks")


class SoftSkill(BaseModel):
    skill: str = Field(..., description="Soft skill name", example="Communication")
    priority: ImportanceLevel = Field(..., description="Importance level")
    context: Optional[str] = Field(None, description="Context or application", example="client-facing communication")


class ProjectContext(BaseModel):
    types: Optional[List[str]] = Field(None, description="Types of projects", example=["migration", "compliance"])
    methodologies: Optional[List[str]] = Field(None, description="Work methodologies", example=["agile", "devops"])
    environment: Optional[EnvironmentType] = Field(None, description="Technical environment")


class TeamContext(BaseModel):
    size: Optional[int] = Field(None, description="Team size")
    structure: Optional[str] = Field(None, description="Team structure", example="cross-functional agile team")
    reporting_to: Optional[str] = Field(None, description="Reporting role", example="Director of Security")


class SecurityClearance(BaseModel):
    required: Optional[bool] = Field(None, description="Whether clearance is required")
    level: Optional[str] = Field(None, description="Clearance level", example="Secret")


class ComplianceLegal(BaseModel):
    visa_sponsorship: Optional[bool] = Field(None, description="Whether visa sponsorship is available")
    security_clearance: Optional[SecurityClearance] = None
    background_check: Optional[bool] = Field(None, description="Whether background check is required")


class CulturalFit(BaseModel):
    company_values: Optional[List[str]] = Field(None, description="Company values", example=["innovation"])
    work_style: Optional[List[str]] = Field(None, description="Preferred work styles", example=["detail-oriented"])


class Company(BaseModel):
    name: str = Field(..., description="Company name", example="TechCorp Inc.")
    size: Optional[CompanySize] = Field(None, description="Company size category")
    stage: Optional[CompanyStage] = Field(None, description="Company maturity stage")


class RoleSchema(BaseModel):
    role: str = Field(..., description="Job title or role name", example="Senior Security Administrator")
    company: Company
    industry: Optional[List[str]] = Field(None, description="Industry sectors", example=["technology", "healthcare"])
    role_objectives: Optional[List[str]] = Field(None, description="High-level role objectives")
    responsibilities: List[str] = Field(..., description="Specific job responsibilities")
    skills: List[Skill] = Field(..., description="Technical and soft skills required")
    technologies: Optional[List[Technology]] = Field(None, description="Required technologies or tools")
    language_proficiency: Optional[List[LanguageProficiency]] = None
    qualifications: Optional[Qualifications] = None
    experience: Optional[Experience] = None
    location: Optional[Location] = None
    employment_details: Optional[EmploymentDetails] = None
    soft_skills: Optional[List[SoftSkill]] = None
    project_context: Optional[ProjectContext] = None
    team_context: Optional[TeamContext] = None
    compliance_legal: Optional[ComplianceLegal] = None
    cultural_fit: Optional[CulturalFit] = None
    growth_opportunities: Optional[List[str]] = Field(None, description="Mentioned career growth opportunities")


### initialise instructor and google genai

In [93]:
# prompts

system_prompt = """You are a specialized job description analyzer that extracts structured information from job postings. Your task is to parse job descriptions and extract key information into a standardized JSON format.

EXTRACTION GUIDELINES:

**Priority Classification:**
- "essential": Must-have requirements, deal-breakers, uses language like "required", "must have", "essential", "mandatory"
- "important": Strongly preferred, significant impact, uses language like "preferred", "strongly desired", "should have"
- "valuable": Nice-to-have, adds value, uses language like "plus", "bonus", "advantage", "would be great"
- "supplementary": Extra credit, differentiators, uses language like "additional", "extra", "optional"

**Experience Level Classification:**
- "entry": 0-2 years, junior, associate, entry-level positions
- "mid": 3-5 years, mid-level, standard individual contributor
- "senior": 6-10 years, senior, lead, principal roles
- "executive": 10+ years, director, VP, C-level positions

**Company Size Classification:**
- "startup": <50 employees, early-stage, seed/Series A
- "small": 50-200 employees, Series B/C
- "medium": 200-1000 employees, established but growing
- "large": 1000-5000 employees, mature company
- "enterprise": 5000+ employees, Fortune 500, multinational

**Language Proficiency Levels:**
- "basic": Can understand simple phrases, basic communication
- "conversational": Can hold basic conversations, daily interactions
- "professional": Can handle work-related communication effectively
- "fluent": Near-native proficiency, complex discussions
- "native": Native speaker level

**Extraction Rules:**
1. Extract skills from responsibilities, requirements, and qualifications sections
2. Infer priority levels from the language used (required vs preferred vs nice-to-have)
3. Separate technical skills from soft skills
4. Extract technologies, tools, and platforms separately from general skills
5. Look for experience requirements in years, seniority levels, and leadership mentions
6. Identify location preferences, remote work options, and travel requirements
7. Extract educational requirements and certifications
8. Capture salary information if mentioned (even ranges or hints)
9. Identify company culture indicators and work style preferences
10. Extract compliance requirements (visa, security clearance, background checks)

**Common Patterns to Look For:**
- "X+ years of experience" → extract min years
- "Bachelor's degree required" → essential education
- "Preferred: Master's degree" → important education
- "Must be authorized to work in US" → visa sponsorship false
- "Remote-first company" → remote work arrangement
- "Travel 25% of time" → travel requirements
- "Secret clearance required" → security clearance
- "Competitive salary" → salary info even if no numbers

**Inference Guidelines:**
- If a skill appears multiple times or in different contexts, mark as higher priority
- If role title contains "Senior/Lead/Principal" → senior experience level
- If mentions "startup environment" → startup company size
- If no remote work mentioned explicitly → assume "on_site"
- If salary not mentioned → omit salary_range entirely
- If no team size mentioned → omit team context

**Quality Checks:**
- Ensure at least one skill is marked as "essential"
- Verify experience level matches role title seniority
- Check that priority distributions make sense (not everything should be essential)
- Ensure location and remote work options are consistent
- Validate that required fields are populated

Return only the JSON object without additional commentary or explanations.
"""

user_prompt = """Please extract and structure the information from the following job description into the specified JSON format. Focus on accurately identifying priorities, experience levels, and all relevant details.

Pay special attention to:
- Language indicating requirement levels (must have vs nice to have)
- Experience requirements and seniority indicators
- Technical skills vs soft skills separation
- Location and work arrangement details
- Educational and certification requirements

Job Description:
```
{job_desc}
```

Extract the information into the JSON schema format, ensuring all priority levels are accurately assigned based on the language used in the job description.
"""

In [94]:
from google import genai
from google.genai import types
# import instructor
from dotenv import load_dotenv

load_dotenv('./.env')

client = genai.Client()
# instructor_client = instructor.from_genai(client, mode=instructor.Mode.GENAI_TOOLS)

def process_jd(jd: str):
    if not jd.strip():
        raise ValueError("No Job description provided")

    try:
        response = client.models.generate_content(
            model="gemini-2.5-pro",
            contents=user_prompt.format(**{"job_desc": jd}),
            config=types.GenerateContentConfig(
                temperature=0.2,
                system_instruction=system_prompt,
                response_mime_type="application/json",
                response_schema=RoleSchema,
            )
        )
        if response and response.text:
            output = RoleSchema.model_validate_json(response.text)
            return output
        else:
            raise ValueError("response.text is empty")
    except:
        raise

    
def load_sample_jd():
    job_desc = ""
    with open('./sample_jd.txt', 'r') as file:
        job_desc = file.read()

    return job_desc

# def load_sample_output()


sample_jd = load_sample_jd()
processed_jd = process_jd(sample_jd)



## match candidates

In [None]:
!uv add sentence_transformers

### load embed model

In [None]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def similarity_match(item1: str, item2: str):
    item1_embedding = model.encode(item1, convert_to_tensor=True)
    item2_embedding = model.encode(item2, convert_to_tensor=True)

    return util.cos_sim(item1_embedding, item2_embedding).item()

### filter candidates by job title

In [None]:
from rapidfuzz import fuzz, process, utils
import numpy as np

def fuzzy_match(set1, set2, scorer=fuzz.token_set_ratio):
    return process.cdist(
        set1, 
        set2, 
        scorer=scorer,
        processor=utils.default_process
    )

def fuzzy_job_title_score(threshold=0.8):
    role_scores = fuzzy_match(df['job_position_name'], [processed_jd.role])
    df['title_score'] = role_scores[:, 0] / 100
    
    return df[df['title_score'] >= threshold]


def similarity_job_title_score(threshold=0.6):
    df['title_score'] = df.apply(lambda x: similarity_match(x['job_position_name'], processed_jd.role), axis=1)
    
    return df[df['title_score'] >= threshold]


df_filtered = fuzzy_job_title_score() # TODO: improve filter by switching to semantic similarity
# df_filtered = similarity_job_title_score() 

print(f"filtered candidates: {len(df_filtered)}")
print("")
print(df[['candidate_id', 'job_position_name', 'title_score']])

filtered candidates: 1024

     candidate_id                                                                        job_position_name  title_score
0            C001                                                                 Senior Software Engineer     1.000000
1            C002                                                           Machine Learning (ML) Engineer     0.640000
2            C003                           Executive/ Senior Executive- Trade Marketing, Hygiene Products     0.393939
3            C004                                                           Business Development Executive     0.297872
4            C005                                                                      Senior iOS Engineer     0.666667
5            C006                                                                              AI Engineer     0.842105
6            C007                                                                      Senior iOS Engineer     0.666667
7            

### score candidates by skills

In [None]:
importance_multipliers = {
    ImportanceLevel.ESSENTIAL: 1.0,
    ImportanceLevel.IMPORTANT: 0.7,
    ImportanceLevel.VALUABLE: 0.4,
    ImportanceLevel.SUPPLEMENTARY: 0.2
}

def weighted_fuzzy_skill_score(candidate_id: str, jd_skills: list[Skill], candidate_skills: list[str], score_threshold=80):
    if not jd_skills or not candidate_skills:
        return 0.0

    candidate_score = 0.0
    total_possible_score = sum(importance_multipliers[s.priority] for s in jd_skills)

    for jd_skill in jd_skills:
        jd_skill_name = jd_skill.skill.lower().strip()
        jd_weight = importance_multipliers.get(jd_skill.priority, 0.0)

        # Compare JD skill against all candidate skills
        match_scores = fuzzy_match([jd_skill_name], candidate_skills)
        best_score = np.max(match_scores)

        if best_score >= score_threshold:
            candidate_score += jd_weight
            
            # print matched skills:
            # print("JD Skill: " + jd_skill_name)
            # print("Candidate: " + candidate_id)
            # matched_skills_indices = np.where(match_scores[0] > score_threshold)[0]
            # for idx in matched_skills_indices:
            #     print(f"Skill: {candidate_skills[idx]}, Score: {match_scores[0][idx]}")

            # print("")

    return candidate_score / total_possible_score


if processed_jd:
    jd_skills = processed_jd.skills
    
    if processed_jd.technologies:
        for t in processed_jd.technologies:
            jd_skills.append(
                Skill(
                    skill=t.technology,
                    priority=t.priority,
                    proficiency_level=None
                )
            )

    print(jd_skills)

    df_filtered['skill_score'] = df_filtered.apply(lambda x: weighted_fuzzy_skill_score(x['candidate_id'], jd_skills, x['all_skills']), axis=1)
    
    # filter by skill
    SKILL_SCORE_THRESHOLD = 0.25
    # df_filtered = df_filtered[df_filtered['skill_score'] >= SKILL_SCORE_THRESHOLD]
    df_filtered[['candidate_id', 'job_position_name', 'skill_score']]
    print(f"total candidates: {len(df_filtered[df_filtered['skill_score'] >= SKILL_SCORE_THRESHOLD])}")
else: 
    raise ValueError('JD is corrupted or not processed')

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', 1000)
# pd.set_option('display.max_colwidth', None)
# df_filtered

### score candidates by qualifications

In [284]:
def weighted_fuzzy_qualification_score(candidate_id: str, jd_qualifications: list[Education] | None, candidate_qualifications, score_threshold=70):
    if not jd_qualifications or not candidate_qualifications:
        return 0.0

    candidate_score = 0.0
    total_possible_score = sum(importance_multipliers[s.priority] for s in jd_qualifications)

    candidate_degrees = candidate_qualifications["degrees"]
    candidate_fields = candidate_qualifications["fields"]

    for jd_qualification in jd_qualifications:
        jd_degree_norm = normalize_degree(jd_qualification.degree)

        # Soft match degrees using token_set_ratio
        degree_match = any(
            fuzz.token_set_ratio(jd_degree_norm, cand_deg, processor=utils.default_process) >= 80
            for cand_deg in candidate_degrees
        )

        if not degree_match:
            continue

        jd_weight = importance_multipliers.get(jd_qualification.priority, 0.0)

        # Compare JD skill against all candidate skills
        match_scores = fuzzy_match([jd_qualification.field], candidate_qualifications["fields"])
        best_score = np.max(match_scores)

        if best_score >= score_threshold:
            candidate_score += jd_weight
            
            # print matched skills:
            # print("Candidate: " + candidate_id)
            # matched_qualification_indices = np.where(match_scores[0] > score_threshold)[0]
            # for idx in matched_qualification_indices:
            #     print(f"Qualification: {candidate_degrees[idx]} in {candidate_fields[idx]}, Score: {match_scores[0][idx]}")

    return candidate_score / total_possible_score

if processed_jd and processed_jd.qualifications and processed_jd.qualifications.education:
    df_filtered['qualification_score'] = df.apply(
        lambda x: weighted_fuzzy_qualification_score(
            x['candidate_id'], 
            processed_jd.qualifications.education, 
            { "degrees": x['degree_names_norm'],  "fields": x['major_field_of_studies']} 
        ), axis=1)
    
    # filter by qualification
    # QUALIFICATION_SCORE_THRESHOLD = 0.2
    # df_filtered = df_filtered[df_filtered['qualification_score'] >= QUALIFICATION_SCORE_THRESHOLD]

    # print(len(df_filtered))
    print(df_filtered[['candidate_id', 'job_position_name', 'qualification_score']])
else:
    df_filtered['qualification_score'] = None

     candidate_id         job_position_name  qualification_score
0            C001  Senior Software Engineer                  0.0
5            C006               AI Engineer                  0.0
29           C030               AI Engineer                  0.0
36           C037               AI Engineer                  0.0
39           C040               AI Engineer                  0.0
52           C053  Senior Software Engineer                  0.0
63           C064               AI Engineer                  0.0
114          C115               AI Engineer                  0.0
135          C136             Site Engineer                  0.0
138          C139  Senior Software Engineer                  0.0
142          C143             Site Engineer                  0.0
150          C151             Site Engineer                  0.0
151          C152             Site Engineer                  0.0
164          C165  Senior Software Engineer                  0.0
165          C166        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['qualification_score'] = df.apply(


### calculate total score

In [None]:
df_filtered['total_score'] = (
    0.4 * df_filtered['title_score'] + 
    0.4 * df_filtered['skill_score'] + 
    0.2 * df_filtered['qualification_score']
)

df_filtered[['candidate_id', 'job_position_name', 'total_score', 'title_score', 'skill_score', 'qualification_score']]

### initialize similarity score