In [1]:
# Core libraries
import pandas as pd
import numpy as np
import re

# NLP & ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load resume dataset
df = pd.read_csv(r"C:\Users\user\Music\RESUME PROJECT\UpdatedResumeDataSet.csv")

print("Dataset shape:", df.shape)
print("Columns:", df.columns)


Dataset shape: (962, 2)
Columns: Index(['Category', 'Resume'], dtype='object')


In [3]:
# TEXT CLEANING FUNCTION
def clean_text(text):
    """
    Cleans resume text by:
    - Lowercasing
    - Removing punctuation
    - Removing extra spaces
    """
    text = str(text).lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [4]:
# Apply cleaning
df['Cleaned_Resume'] = df['Resume'].apply(clean_text)

print(df['Cleaned_Resume'][0][:500])


skills programming languages python pandas numpy scipy scikitlearn matplotlib sql java javascriptjquery machine learning regression svm na√£ve bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch d3js dcjs plotly kibana matplotlib ggplot tableau others regular expression html css 


In [5]:
#DEFINE SKILL SET
skills_list = [
    'python', 'sql', 'machine learning', 'ml', 'deep learning',
    'nlp', 'natural language processing',
    'pandas', 'numpy', 'scikit learn',
    'tensorflow', 'keras',
    'matplotlib', 'seaborn'
]


In [6]:
# SKILL EXTRACTION FUNCTION
def extract_skills(text):
    """
    Extracts matching skills from resume text
    """
    found_skills = []
    for skill in skills_list:
        if skill in text:
            found_skills.append(skill)
    return list(set(found_skills))


In [7]:
df['Skills_Found'] = df['Cleaned_Resume'].apply(extract_skills)
df['Skill_Score'] = df['Skills_Found'].apply(len)


In [8]:
# JOB DESCRIPTION INPUT
job_description = """
Looking for a Data Scientist with strong Python, SQL,
Machine Learning, NLP, Pandas, NumPy and data visualization skills.
"""

job_description_clean = clean_text(job_description)


In [9]:
# TF-IDF + COSINE SIMILARITY
# Combine resumes + job description
all_texts = df['Cleaned_Resume'].tolist() + [job_description_clean]

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(all_texts)

# Similarity between resumes & JD
similarity_scores = cosine_similarity(
    tfidf_matrix[:-1], tfidf_matrix[-1]
).flatten()

df['Text_Similarity'] = similarity_scores


In [10]:
#COMBINED SCORE
df['Combined_Score'] = (
    0.6 * df['Text_Similarity'] +
    0.4 * (df['Skill_Score'] / df['Skill_Score'].max())
)


In [12]:
#FINAL RANKING
final_df = df[['Category', 'Skills_Found', 'Skill_Score',
               'Text_Similarity', 'Combined_Score']].copy()

final_df['Rank'] = final_df['Combined_Score'].rank(
    ascending=False, method='dense'
)

final_df = final_df.sort_values(by='Combined_Score', ascending=False)

final_df.head(30)


Unnamed: 0,Category,Skills_Found,Skill_Score,Text_Similarity,Combined_Score,Rank
37,Data Science,"[numpy, keras, pandas, machine learning, matpl...",13,0.244761,0.546857,1.0
7,Data Science,"[numpy, keras, pandas, machine learning, matpl...",13,0.244761,0.546857,1.0
27,Data Science,"[numpy, keras, pandas, machine learning, matpl...",13,0.244761,0.546857,1.0
17,Data Science,"[numpy, keras, pandas, machine learning, matpl...",13,0.244761,0.546857,1.0
8,Data Science,"[numpy, pandas, machine learning, deep learnin...",8,0.260321,0.402347,2.0
38,Data Science,"[numpy, pandas, machine learning, deep learnin...",8,0.260321,0.402347,2.0
18,Data Science,"[numpy, pandas, machine learning, deep learnin...",8,0.260321,0.402347,2.0
28,Data Science,"[numpy, pandas, machine learning, deep learnin...",8,0.260321,0.402347,2.0
10,Data Science,"[numpy, pandas, machine learning, matplotlib, ...",9,0.096855,0.335036,3.0
20,Data Science,"[numpy, pandas, machine learning, matplotlib, ...",9,0.096855,0.335036,3.0
