In [3]:
from pathlib import Path

import pandas as pd

# Load datasets
# Both paths are relative, so they resolve against the notebook's working
# directory. Later cells read "Hard Skills", "Soft Skills", "Desired Role",
# "CandidateID" and "Experience" from `resumes`, and "Required Skills",
# "Job Title", "JobID" and "Min Experience" from `jobs`.
resumes = pd.read_csv("../data/resumes_dataset_powerful.csv")
jobs = pd.read_csv("../data/job_descriptions_powerful.csv")

# Function to safely convert skills from string to list
def safe_convert_skills(skill_str):
    """Convert a raw skills value into a clean list of skill names.

    Args:
        skill_str: A comma-separated string of skills, an already-parsed
            list of skills, or a missing value (e.g. NaN / None).

    Returns:
        list: Lower-cased, whitespace-stripped skill names in their original
        order. Blank entries (from an empty string, or doubled / trailing
        commas) are dropped so they cannot count as a "skill" when match
        percentages are computed. Non-string entries inside a list are
        dropped as well. Any input that is neither a string nor a list
        yields an empty list.
    """
    if isinstance(skill_str, str):
        raw_skills = skill_str.split(",")
    elif isinstance(skill_str, list):
        # Already parsed (e.g. the cell was re-run): apply the same cleaning
        # so string and list inputs end up in the same normalized form.
        raw_skills = [s for s in skill_str if isinstance(s, str)]
    else:
        return []  # NaN, None, or any other unexpected type

    cleaned = (s.strip().lower() for s in raw_skills)
    return [s for s in cleaned if s]

# Normalize every skills column so downstream cells always receive lists
for frame, column in (
    (resumes, "Hard Skills"),
    (resumes, "Soft Skills"),
    (jobs, "Required Skills"),
):
    frame[column] = frame[column].apply(safe_convert_skills)

# Spot-check the first rows of each frame to confirm the conversion
resume_preview = resumes[["Desired Role", "Hard Skills"]].head(10)
job_preview = jobs[["Job Title", "Required Skills"]].head(10)
display(resume_preview)
display(job_preview)


Unnamed: 0,Desired Role,Hard Skills
0,Marketing Analyst,"[tableau, stakeholder communication, scikit-le..."
1,BI Analyst,"[business strategy, sql, scikit-learn, power b..."
2,Data Scientist,"[google analytics, machine learning, scikit-le..."
3,Marketing Analyst,"[business strategy, tensorflow, sql, pytorch, ..."
4,Business Analyst,"[nlp, tensorflow, excel, business strategy, ma..."
5,Operations Analyst,"[excel, google analytics, sql, business strate..."
6,Data Scientist,"[machine learning, tableau, power bi, business..."
7,Operations Analyst,"[stakeholder communication, sql, nlp, machine ..."
8,BI Analyst,"[google analytics, tableau, nlp, python, pytor..."
9,Data Analyst,"[machine learning, python, pytorch, business s..."


Unnamed: 0,Job Title,Required Skills
0,Business Analyst,"[tableau, tensorflow, pytorch, scikit-learn, g..."
1,Data Engineer,"[machine learning, scikit-learn, statistics, p..."
2,Marketing Analyst,"[python, business strategy, stakeholder commun..."
3,AI Engineer,"[power bi, nlp, pytorch, sql, tableau]"
4,HR Analytics Specialist,"[business strategy, tableau, deep learning, st..."
5,HR Analytics Specialist,"[deep learning, statistics, power bi, excel, n..."
6,Operations Analyst,"[pytorch, statistics, power bi, stakeholder co..."
7,Business Analyst,"[machine learning, tensorflow, python, excel, ..."
8,HR Analytics Specialist,"[google analytics, tensorflow, sql, statistics..."
9,Data Analyst,"[statistics, python, google analytics, power b..."


In [4]:
def calculate_skill_match(candidate_skills, job_skills):
    """Return the share of a job's required skills that the candidate holds.

    Both arguments are collections of skill names; duplicates are ignored
    because the comparison is done on sets. The result is a percentage
    rounded to two decimals, or 0 when either collection is empty.
    """
    if candidate_skills and job_skills:
        required = set(job_skills)
        matched = set(candidate_skills) & required
        # Share of the distinct required skills that the candidate covers
        return round((len(matched) / len(required)) * 100, 2)

    return 0  # Nothing to compare on at least one side

# Score every candidate against every job and collect one record per pair.
# The needed columns are pulled out once up front: DataFrame.iterrows() would
# otherwise rebuild a Series for each resume row on every job iteration, and
# it does not preserve column dtypes across a row.
candidate_records = list(zip(resumes["CandidateID"], resumes["Hard Skills"]))
job_records = list(zip(jobs["JobID"], jobs["Required Skills"]))

# Jobs form the outer loop, so rows are grouped by job, with candidates in
# resume order inside each group.
match_results = [
    {
        "CandidateID": candidate_id,
        "JobID": job_id,
        "Skill Match %": calculate_skill_match(hard_skills, required_skills),
    }
    for job_id, required_skills in job_records
    for candidate_id, hard_skills in candidate_records
]

# Convert results to DataFrame; naming the columns keeps the schema intact
# even when there are no candidate-job pairs at all.
match_df = pd.DataFrame(
    match_results, columns=["CandidateID", "JobID", "Skill Match %"]
)

# Display the first few pairs (not yet sorted by score)
display(match_df.head(10))


Unnamed: 0,CandidateID,JobID,Skill Match %
0,1,1,37.5
1,2,1,62.5
2,3,1,50.0
3,4,1,50.0
4,5,1,50.0
5,6,1,50.0
6,7,1,37.5
7,8,1,0.0
8,9,1,50.0
9,10,1,25.0


In [5]:
def experience_fit(candidate_exp, job_min_exp):
    """Calculate experience fit score (out of 100).

    Args:
        candidate_exp: The candidate's experience, in the same unit as
            ``job_min_exp``. May be missing (None / NaN).
        job_min_exp: The minimum experience the job asks for. May be
            missing (None / NaN).

    Returns:
        100 when the candidate meets or exceeds the minimum; otherwise the
        ratio candidate_exp / job_min_exp as a percentage rounded to two
        decimals, never below 0. Returns 0 when either value is missing or
        when the minimum is not positive and the candidate is below it.
    """
    # NaN is the only value that is not equal to itself; treating missing
    # data as "no fit" keeps NaN out of the downstream weighted score.
    for value in (candidate_exp, job_min_exp):
        if value is None or value != value:
            return 0

    if candidate_exp >= job_min_exp:
        return 100  # Perfect match or overqualified

    if not job_min_exp > 0:
        return 0  # Avoid dividing by a zero / negative requirement

    # Clamp at 0 so a negative experience value cannot yield a negative score
    return max(round((candidate_exp / job_min_exp) * 100, 2), 0)

# Add experience match to match_df
# Build ID -> value lookups once and map them onto the pairs, instead of
# scanning `resumes` and `jobs` with a boolean mask for every single row.
# drop_duplicates keeps the first row per ID so the lookup index is unique.
experience_by_candidate = (
    resumes.drop_duplicates("CandidateID").set_index("CandidateID")["Experience"]
)
min_experience_by_job = (
    jobs.drop_duplicates("JobID").set_index("JobID")["Min Experience"]
)

candidate_experience = match_df["CandidateID"].map(experience_by_candidate)
job_min_experience = match_df["JobID"].map(min_experience_by_job)

# The list is assigned positionally, one score per row of match_df.
match_df["Experience Fit %"] = [
    experience_fit(candidate_exp, job_min_exp)
    for candidate_exp, job_min_exp in zip(
        candidate_experience.to_numpy(), job_min_experience.to_numpy()
    )
]

# Display updated match results
display(match_df.head(10))


Unnamed: 0,CandidateID,JobID,Skill Match %,Experience Fit %
0,1,1,37.5,100.0
1,2,1,62.5,100.0
2,3,1,50.0,0.0
3,4,1,50.0,100.0
4,5,1,50.0,100.0
5,6,1,50.0,0.0
6,7,1,37.5,100.0
7,8,1,0.0,100.0
8,9,1,50.0,100.0
9,10,1,25.0,50.0


In [6]:
def calculate_final_score(skill_match, experience_fit, skill_weight=0.7, exp_weight=0.3):
    """Blend the skill and experience scores into one weighted score.

    Each score is multiplied by its weight, the two products are summed,
    and the total is rounded to two decimal places.
    """
    weighted_skill = skill_match * skill_weight
    weighted_experience = experience_fit * exp_weight
    return round(weighted_skill + weighted_experience, 2)

# Compute the weighted score for each candidate-job pair
def _score_row(row):
    """Feed one row's skill and experience percentages to calculate_final_score."""
    return calculate_final_score(row["Skill Match %"], row["Experience Fit %"])

match_df["Final Match Score"] = match_df.apply(_score_row, axis=1)

# Highest-scoring pairs first
match_df = match_df.sort_values("Final Match Score", ascending=False)

# Show the ten best matches
display(match_df.head(10))


Unnamed: 0,CandidateID,JobID,Skill Match %,Experience Fit %,Final Match Score
54782,783,55,100.0,100.0,100.0
26679,680,27,100.0,100.0,100.0
5557,558,6,100.0,100.0,100.0
60544,545,61,100.0,100.0,100.0
7694,695,8,100.0,100.0,100.0
60164,165,61,100.0,100.0,100.0
1182,183,2,100.0,100.0,100.0
101030,31,102,100.0,100.0,100.0
26863,864,27,100.0,100.0,100.0
149657,658,150,100.0,100.0,100.0


In [7]:
# Save the final match results as a CSV
output_path = Path("../output/match_results.csv")

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails on this cell.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file
match_df.to_csv(output_path, index=False)

print("✅ Match results saved to output/match_results.csv")


✅ Match results saved to output/match_results.csv
