In [1]:
import pandas as pd

# Load datasets
resumes = pd.read_csv("../data/resumes_dataset_powerful.csv")
jobs = pd.read_csv("../data/job_descriptions_powerful.csv")

# Display first few rows
display(resumes.head())
display(jobs.head())


Unnamed: 0,CandidateID,Name,Email,Phone,Location,Experience,Desired Role,Hard Skills,Soft Skills,Education,Certifications,Last Job Title,Resume Text,Resume Length,Salary Expectation,Availability,Work Authorization,Remote Preference,LinkedIn,GitHub
0,1,Sherry Townsend,sherry.townsend21@email.com,015-348-0921x66934,"Jameston, GA",5,Marketing Analyst,"Tableau, Stakeholder Communication, Scikit-lea...","Teamwork, Communication",BSc Data Science,AWS ML,Cybersecurity Analyst,Sherry Townsend has 5 years of experience as a...,21,$70k-$90k,More than 1 Month,Citizen,Remote,https://linkedin.com/in/sherrytownsend,https://github.com/sherry816
1,2,Crystal Riddle,crystal.riddle5@email.com,759-733-3741,"Heathland, ME",9,BI Analyst,"Business Strategy, SQL, Scikit-learn, Power BI...","Communication, Time Management",BSc Computer Science,Microsoft AI-900,Data Analyst,Crystal Riddle has 9 years of experience as a ...,26,$60k-$80k,1 Month,H1B,Hybrid,https://linkedin.com/in/crystalriddle,https://github.com/crystal633
2,3,Elizabeth Gonzalez,elizabeth.gonzalez23@email.com,001-105-909-4690,"Williamsfort, MI",0,Data Scientist,"Google Analytics, Machine Learning, Scikit-lea...","Teamwork, Time Management",MBA Business Analytics,AWS ML,Operations Analyst,Elizabeth Gonzalez has 0 years of experience a...,25,$80k-$100k,2 Weeks,OPT,Hybrid,https://linkedin.com/in/elizabethgonzalez,https://github.com/elizabeth783
3,4,Marcus Beasley,marcus.beasley56@email.com,(071)904-3549x86437,"West Lisa, CT",4,Marketing Analyst,"Business Strategy, TensorFlow, SQL, PyTorch, T...","Communication, Problem Solving",BSc Data Science,IBM Data Science,Business Analyst,Marcus Beasley has 4 years of experience as a ...,22,$80k-$100k,2 Weeks,H1B,On-site,https://linkedin.com/in/marcusbeasley,https://github.com/marcus291
4,5,Brandon Harris,brandon.harris89@email.com,+1-562-174-8857x0999,"Butlerton, WA",8,Business Analyst,"NLP, TensorFlow, Excel, Business Strategy, Mac...","Time Management, Problem Solving",BSc Data Science,,BI Analyst,Brandon Harris has 8 years of experience as a ...,29,$70k-$90k,1 Month,H1B,Hybrid,https://linkedin.com/in/brandonharris,https://github.com/brandon604


Unnamed: 0,JobID,Job Title,Company,Location,Industry,Role Category,Job Type,Work Mode,Min Experience,Required Education,Salary Range,Required Skills,Description,Posted Date,Urgency
0,1,Business Analyst,Mcdonald-Nelson,"Los Angeles, CA",Finance,AI/ML,Full-time,Hybrid,2,PhD,$60k-$80k,"Tableau, TensorFlow, PyTorch, Scikit-learn, Go...",Mcdonald-Nelson is hiring a Business Analyst i...,2025-01-18,Within 30 Days
1,2,Data Engineer,"Williams, Henderson and Williams","Atlanta, GA",Retail,AI/ML,Part-time,Hybrid,0,Master's,$70k-$90k,"Machine Learning, Scikit-learn, Statistics, Py...","Williams, Henderson and Williams is hiring a D...",2025-02-11,Rolling
2,3,Marketing Analyst,"Elliott, Jackson and Rodriguez","Boston, MA",Healthcare,Finance,Full-time,Hybrid,0,Bachelor's,$80k-$100k,"Python, Business Strategy, Stakeholder Communi...","Elliott, Jackson and Rodriguez is hiring a Mar...",2024-11-08,Within 30 Days
3,4,AI Engineer,"Maldonado, Anderson and Tanner","San Francisco, CA",Consulting,AI/ML,Contract,On-site,3,PhD,$80k-$100k,"Power BI, NLP, PyTorch, SQL, Tableau","Maldonado, Anderson and Tanner is hiring a AI ...",2024-11-16,Rolling
4,5,HR Analytics Specialist,"Bauer, Thomas and Harrison","Seattle, WA",Finance,Finance,Internship,Remote,0,Bachelor's,$100k-$150k,"Business Strategy, Tableau, Deep Learning, Sta...","Bauer, Thomas and Harrison is hiring a HR Anal...",2025-02-13,Within 30 Days


In [2]:
# Check for missing values
print("🔍 Missing Values in Resumes Dataset:")
print(resumes.isnull().sum())

print("\n🔍 Missing Values in Jobs Dataset:")
print(jobs.isnull().sum())


🔍 Missing Values in Resumes Dataset:
CandidateID             0
Name                    0
Email                   0
Phone                   0
Location                0
Experience              0
Desired Role            0
Hard Skills             0
Soft Skills             0
Education               0
Certifications        157
Last Job Title          0
Resume Text             0
Resume Length           0
Salary Expectation      0
Availability            0
Work Authorization      0
Remote Preference       0
LinkedIn                0
GitHub                  0
dtype: int64

🔍 Missing Values in Jobs Dataset:
JobID                 0
Job Title             0
Company               0
Location              0
Industry              0
Role Category         0
Job Type              0
Work Mode             0
Min Experience        0
Required Education    0
Salary Range          0
Required Skills       0
Description           0
Posted Date           0
Urgency               0
dtype: int64


In [3]:
# Define role-based certification mapping
certification_map = {
    "Data Analyst": "Google Data Analytics",
    "BI Analyst": "Tableau Desktop Specialist",
    "AI Engineer": "AWS Certified ML",
    "Machine Learning Engineer": "IBM AI Engineering",
    "Software Engineer": "Microsoft AI-900",
}

# Apply mapping: If missing, assign relevant certification
resumes["Certifications"] = resumes.apply(
    lambda row: certification_map.get(row["Desired Role"], "None") if pd.isnull(row["Certifications"]) else row["Certifications"],
    axis=1
)

# Check updates
display(resumes[["Desired Role", "Certifications"]].head(10))


Unnamed: 0,Desired Role,Certifications
0,Marketing Analyst,AWS ML
1,BI Analyst,Microsoft AI-900
2,Data Scientist,AWS ML
3,Marketing Analyst,IBM Data Science
4,Business Analyst,
5,Operations Analyst,Microsoft AI-900
6,Data Scientist,Microsoft AI-900
7,Operations Analyst,IBM Data Science
8,BI Analyst,Google Data Analytics
9,Data Analyst,Microsoft AI-900


In [4]:
import re

def clean_skills(skill_str):
    if pd.isnull(skill_str):  
        return []  # Return empty list if missing
    
    # Convert to lowercase, remove spaces, split into a list
    skill_list = [s.strip().lower() for s in skill_str.split(",")]
    
    # Ensure no duplicates & return clean list
    return list(set(skill_list))

# Apply function to normalize skills
resumes["Hard Skills"] = resumes["Hard Skills"].apply(clean_skills)
resumes["Soft Skills"] = resumes["Soft Skills"].apply(clean_skills)
jobs["Required Skills"] = jobs["Required Skills"].apply(clean_skills)

# Check updates
display(resumes[["Desired Role", "Hard Skills"]].head(10))
display(jobs[["Job Title", "Required Skills"]].head(10))


Unnamed: 0,Desired Role,Hard Skills
0,Marketing Analyst,"[business strategy, stakeholder communication,..."
1,BI Analyst,"[sql, excel, business strategy, scikit-learn, ..."
2,Data Scientist,"[business strategy, scikit-learn, nlp, pytorch..."
3,Marketing Analyst,"[sql, tensorflow, business strategy, pytorch, ..."
4,Business Analyst,"[sql, tensorflow, excel, business strategy, nl..."
5,Operations Analyst,"[sql, excel, business strategy, power bi, goog..."
6,Data Scientist,"[business strategy, python, machine learning, ..."
7,Operations Analyst,"[stakeholder communication, sql, nlp, machine ..."
8,BI Analyst,"[business strategy, python, nlp, machine learn..."
9,Data Analyst,"[business strategy, python, pytorch, machine l..."


Unnamed: 0,Job Title,Required Skills
0,Business Analyst,"[tensorflow, excel, business strategy, scikit-..."
1,Data Engineer,"[scikit-learn, python, nlp, statistics, machin..."
2,Marketing Analyst,"[tensorflow, excel, business strategy, python,..."
3,AI Engineer,"[sql, nlp, power bi, pytorch, tableau]"
4,HR Analytics Specialist,"[tensorflow, excel, business strategy, python,..."
5,HR Analytics Specialist,"[excel, nlp, power bi, statistics, tableau, de..."
6,Operations Analyst,"[business strategy, python, scikit-learn, powe..."
7,Business Analyst,"[tensorflow, excel, python, statistics, machin..."
8,HR Analytics Specialist,"[sql, tensorflow, excel, scikit-learn, pytorch..."
9,Data Analyst,"[python, nlp, power bi, statistics, tableau, g..."
