In [3]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
# Replace 'Resume.csv' with the actual file name from the dataset if it's different.
file_path = "Resume/Resume.csv"

# The previous attempt to load 'Resume.csv' failed.
# Let's list the files available in the dataset directory to find the correct one.
import os
# print(os.listdir('/kaggle/input/resume-dataset/Resume'))

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "snehaanbhawal/resume-dataset",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/snehaanbhawal/resume-dataset?dataset_version_number=1&file_name=Resume/Resume.csv...


100%|██████████| 7.97M/7.97M [00:00<00:00, 64.2MB/s]

Extracting zip of Resume.csv...





First 5 records:          ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [4]:
print(df.shape)
print(df.columns)


(2484, 4)
Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')


In [5]:
df = df.rename(columns={
    'Resume_str': 'resume_text',
    'Category': 'role'
})

In [6]:
df.isnull().sum()


Unnamed: 0,0
ID,0
resume_text,0
Resume_html,0
role,0


In [7]:
df = df.dropna(subset=['resume_text'])
df['role'].value_counts().head(10)



Unnamed: 0_level_0,count
role,Unnamed: 1_level_1
INFORMATION-TECHNOLOGY,120
BUSINESS-DEVELOPMENT,120
ADVOCATE,118
CHEF,118
ENGINEERING,118
ACCOUNTANT,118
FINANCE,118
FITNESS,117
AVIATION,117
SALES,116


In [8]:
import re

#Create a Resume-Safe Cleaning Function
def clean_resume(text):
    # lowercase
    text = text.lower()

    # remove URLs
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)

    # remove non-technical special chars (keep + and #)
    text = re.sub(r'[^a-z0-9+# ]', ' ', text)

    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [9]:
#Apply Cleaning to All Resumes
df['clean_resume'] = df['resume_text'].apply(clean_resume)
df[['resume_text', 'clean_resume']].head(2)

Unnamed: 0,resume_text,clean_resume
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,hr administrator marketing associate hr admini...
1,"HR SPECIALIST, US HR OPERATIONS ...",hr specialist us hr operations summary versati...


In [10]:
#Clean Job Description (Same Logic)
job_description = """
Looking for a Machine Learning Engineer with experience in Python, SQL,
Machine Learning, Deep Learning, scikit-learn, NLP, and data analysis.
"""
clean_job_desc = clean_resume(job_description)
print(clean_job_desc)


looking for a machine learning engineer with experience in python sql machine learning deep learning scikit learn nlp and data analysis


In [12]:
import os
# print(os.listdir('/kaggle/input/resume-dataset/data/data'))

# Original code (commented out for diagnosis):
# with open("skills/skills_list.txt", "r") as f:
#     skill_list = [skill.strip().lower() for skill in f.readlines()]

In [13]:
skill_list = [
    "python", "java", "c++", "sql",
    "machine learning", "deep learning", "nlp",
    "data analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "pytorch",
    "excel", "power bi", "tableau",
    "statistics", "linux", "git",
    "docker", "aws", "flask", "django"
]
print(len(skill_list))
print(skill_list)


23
['python', 'java', 'c++', 'sql', 'machine learning', 'deep learning', 'nlp', 'data analysis', 'pandas', 'numpy', 'scikit-learn', 'tensorflow', 'pytorch', 'excel', 'power bi', 'tableau', 'statistics', 'linux', 'git', 'docker', 'aws', 'flask', 'django']


In [14]:
#Create TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1,2)
)
all_text = df['clean_resume'].tolist() + [clean_job_desc]

tfidf_matrix = tfidf.fit_transform(all_text)



In [15]:
#Separate Resume & Job Vectors
resume_vectors = tfidf_matrix[:-1]
job_vector = tfidf_matrix[-1]


In [16]:
#Compute Similarity Scores
similarity_scores = cosine_similarity(resume_vectors, job_vector)
df['similarity_score'] = similarity_scores.flatten()



In [17]:
#View Top Matching Resumes
df[['role', 'similarity_score']].sort_values(
    by='similarity_score',
    ascending=False
).head(10)


Unnamed: 0,role,similarity_score
2291,ARTS,0.259547
1762,ENGINEERING,0.254655
194,DESIGNER,0.239213
1348,AUTOMOBILE,0.229012
2153,BANKING,0.228858
1218,CONSULTANT,0.211623
337,TEACHER,0.198914
374,TEACHER,0.191626
349,TEACHER,0.19086
436,TEACHER,0.185768


In [18]:
#Compute Skill Match Percentage
def skill_match_score(resume_skills, job_skills):
    if len(job_skills) == 0:
        return 0.0

    matched = set(resume_skills).intersection(set(job_skills))
    return len(matched) / len(job_skills)


In [19]:
# Helper function to extract skills from a text
def extract_skills(text, skills_list):
    found_skills = []
    for skill in skills_list:
        # Check if the skill (as a whole word) is present in the text
        if f" {skill} " in f" {text} ":
            found_skills.append(skill)
    return found_skills

# Extract skills from each clean resume and create a new 'skills' column
df['skills'] = df['clean_resume'].apply(lambda x: extract_skills(x, skill_list))

# Extract skills from the cleaned job description to define job_skills
job_skills = extract_skills(clean_job_desc, skill_list)

#Apply Skill Scoring
df['skill_match_score'] = df['skills'].apply(
    lambda x: skill_match_score(x, job_skills)
)

In [20]:
#Identify Missing Skills
def missing_skills(resume_skills, job_skills):
    return list(set(job_skills) - set(resume_skills))
df['missing_skills'] = df['skills'].apply(
    lambda x: missing_skills(x, job_skills)
)


In [21]:
#Final Score (Weighted)
df['final_score'] = (
    0.6 * df['similarity_score'] +
    0.4 * df['skill_match_score']
)


In [22]:
#Rank Candidates
ranked_candidates = df.sort_values(
    by='final_score',
    ascending=False
)
ranked_candidates[
    ['role', 'final_score', 'skill_match_score', 'missing_skills']
].head(5)


Unnamed: 0,role,final_score,skill_match_score,missing_skills
1762,ENGINEERING,0.41946,0.666667,"[nlp, deep learning]"
1348,AUTOMOBILE,0.404074,0.666667,"[nlp, deep learning]"
1218,CONSULTANT,0.39364,0.666667,"[nlp, deep learning]"
2153,BANKING,0.337315,0.5,"[nlp, data analysis, deep learning]"
1339,AUTOMOBILE,0.297354,0.5,"[nlp, machine learning, deep learning]"


In [23]:
import pandas as pd

#Final Screening Function
def screen_resumes(df, clean_job_desc, job_skills, top_n=5):
    result = df.copy()

    result = result.sort_values(
        by='final_score',
        ascending=False
    ).head(top_n)

    output = []

    for idx, row in result.iterrows():
        output.append({
            "Candidate Role": row['role'],
            "Final Score": round(row['final_score'], 3),
            "Skill Match %": round(row['skill_match_score'] * 100, 1),
            "Missing Skills": row['missing_skills']
        })

    return pd.DataFrame(output)

In [24]:
#Run the Screening System
final_results = screen_resumes(
    df,
    clean_job_desc,

    job_skills,
    top_n=10
)

final_results

Unnamed: 0,Candidate Role,Final Score,Skill Match %,Missing Skills
0,ENGINEERING,0.419,66.7,"[nlp, deep learning]"
1,AUTOMOBILE,0.404,66.7,"[nlp, deep learning]"
2,CONSULTANT,0.394,66.7,"[nlp, deep learning]"
3,BANKING,0.337,50.0,"[nlp, data analysis, deep learning]"
4,AUTOMOBILE,0.297,50.0,"[nlp, machine learning, deep learning]"
5,INFORMATION-TECHNOLOGY,0.288,50.0,"[nlp, machine learning, deep learning]"
6,ENGINEERING,0.275,50.0,"[nlp, data analysis, deep learning]"
7,BANKING,0.25,50.0,"[python, machine learning, data analysis]"
8,BANKING,0.239,33.3,"[python, nlp, machine learning, deep learning]"
9,ENGINEERING,0.233,33.3,"[python, nlp, machine learning, deep learning]"


In [25]:
# take user input
job_description = input("Enter Job Description:\n")

# clean input
clean_job_desc = clean_resume(job_description)

# extract required skills
job_skills = extract_skills(clean_job_desc, skill_list)

print("\nExtracted Job Skills:")
print(job_skills)


Enter Job Description:
python

Extracted Job Skills:
['python']


test_inputs = [
    # ✅ VALID INPUTS
    """We are hiring a Machine Learning Engineer.
    Required skills include Python, SQL, Machine Learning,
    Deep Learning, NLP, scikit-learn, TensorFlow,
    and data analysis.""",

    """Looking for a Data Analyst with strong skills in
    Python, SQL, Excel, Power BI, Tableau,
    statistics, and data analysis.""",

    """Hiring a Software Developer with experience in
    Java, Python, SQL, Git, Linux, Docker,
    and Flask or Django.""",

    """Seeking an intern with basic knowledge of
    Python, SQL, data analysis, statistics,
    and interest in learning machine learning.""",

    # ❌ WRONG / EDGE INPUTS
    "",

    "Need someone good.",

    """Looking for a friendly person with good communication
    skills and positive attitude.""",

    "asdfghjkl qwerty 12345 !!!???",

    """Looking for experience in SAP HANA,
    COBOL, and mainframe systems."""
]
