In [5]:
import os
import re
import nltk
import pandas as pd

In [7]:
# Make sure the NLTK stop-word corpus is available locally before importing
# the corpus reader that exposes it.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# English stop words held in a set; clean_text() tests each token for
# membership against this collection.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize raw text into a space-separated string of keywords.

    The text is lower-cased, every character that is not an ASCII letter or
    a space is replaced by a space, and tokens found in the module-level
    ``stop_words`` collection are dropped.

    Args:
        text: Raw text to normalize.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z ]', ' ', lowered)
    kept_tokens = filter(lambda token: token not in stop_words, letters_only.split())
    return ' '.join(kept_tokens)

In [11]:
# Read the sample resume with an explicit encoding so that decoding does not
# depend on the platform's default code page.
with open("../data/resumes/resume_sample.txt", "r", encoding="utf-8") as f:
    resume_text = f.read()

# Normalized resume text; displayed as the cell output.
clean_resume = clean_text(resume_text)
clean_resume

'skills python sql pandas numpy excel data analysis experience worked data analyst intern performed data cleaning exploratory data analysis reporting business teams education b tech electronics communication engineering'

In [13]:
jd_folder = "../data/job_descriptions/"
clean_jds = {}

# os.listdir() returns entries in arbitrary order, so sort them: the order of
# this dict determines the row order of every score computed from it.
for file in sorted(os.listdir(jd_folder)):
    jd_path = os.path.join(jd_folder, file)
    # Skip anything that is not a plain .txt file (sub-directories, OS
    # metadata files, ...) instead of failing or adding junk to the corpus.
    if not (file.endswith(".txt") and os.path.isfile(jd_path)):
        continue
    # Explicit encoding keeps decoding independent of the platform default.
    with open(jd_path, "r", encoding="utf-8") as f:
        clean_jds[file] = clean_text(f.read())

clean_jds

{'jd_backend_dev.txt': 'skills python java apis databases backend development responsibilities develop server side logic apis manage databases',
 'jd_business_dev.txt': 'skills sales lead generation client communication negotiation crm responsibilities acquire clients manage partnerships grow business revenue',
 'jd_data_analyst.txt': 'skills python sql pandas numpy excel data analysis statistics responsibilities analyze datasets generate insights build reports support business decisions',
 'jd_data_scientist.txt': 'skills python machine learning statistics data analysis model building responsibilities build predictive models analyze data trends communicate insights',
 'jd_hr.txt': 'skills recruitment communication talent acquisition hr operations responsibilities hire candidates manage interviews support hr processes',
 'jd_ml_engineer.txt': 'skills python machine learning deep learning model deployment natural language processing mlops apis responsibilities develop ml models deploy p

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Corpus for vectorization: the resume comes first, followed by every cleaned
# job description in the iteration order of clean_jds.
documents = [clean_resume, *clean_jds.values()]
documents

['skills python sql pandas numpy excel data analysis experience worked data analyst intern performed data cleaning exploratory data analysis reporting business teams education b tech electronics communication engineering',
 'skills python java apis databases backend development responsibilities develop server side logic apis manage databases',
 'skills sales lead generation client communication negotiation crm responsibilities acquire clients manage partnerships grow business revenue',
 'skills python sql pandas numpy excel data analysis statistics responsibilities analyze datasets generate insights build reports support business decisions',
 'skills python machine learning statistics data analysis model building responsibilities build predictive models analyze data trends communicate insights',
 'skills recruitment communication talent acquisition hr operations responsibilities hire candidates manage interviews support hr processes',
 'skills python machine learning deep learning mode

In [19]:
# Fit TF-IDF on the resume and the job descriptions together so that all rows
# share one vocabulary; row i of the matrix corresponds to documents[i].
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# (number of documents, vocabulary size)
tfidf_matrix.shape

(8, 88)

In [21]:
# Row 0 is the resume (it was placed first in `documents`); the remaining rows
# are the job descriptions.
resume_vector, jd_vectors = tfidf_matrix[0], tfidf_matrix[1:]

# cosine_similarity returns a 1 x n_jds matrix here; keep its single row.
similarity_scores = cosine_similarity(resume_vector, jd_vectors)[0]
similarity_scores

array([0.02374633, 0.05820195, 0.37450617, 0.30349215, 0.02882472,
       0.02200025, 0.06265865])

In [23]:
# One row per job description: its file name and its similarity to the resume.
results = pd.DataFrame(
    {"Job_Role_File": list(clean_jds), "Match_Score": similarity_scores}
)

results

Unnamed: 0,Job_Role_File,Match_Score
0,jd_backend_dev.txt,0.023746
1,jd_business_dev.txt,0.058202
2,jd_data_analyst.txt,0.374506
3,jd_data_scientist.txt,0.303492
4,jd_hr.txt,0.028825
5,jd_ml_engineer.txt,0.022
6,jd_project_coordinator.txt,0.062659


In [25]:
# Rank the roles from best to worst match.
results_sorted = results.sort_values("Match_Score", ascending=False)
results_sorted

Unnamed: 0,Job_Role_File,Match_Score
2,jd_data_analyst.txt,0.374506
3,jd_data_scientist.txt,0.303492
6,jd_project_coordinator.txt,0.062659
1,jd_business_dev.txt,0.058202
4,jd_hr.txt,0.028825
0,jd_backend_dev.txt,0.023746
5,jd_ml_engineer.txt,0.022


In [27]:
# The first row of the ranked table is the best-matching role.
top_match = results_sorted.iloc[0]
best_role = top_match["Job_Role_File"]
best_score = top_match["Match_Score"]

best_role, best_score

('jd_data_analyst.txt', np.float64(0.3745061733121536))

In [29]:
def extract_skills(text, skill_list):
    """Return the skills from ``skill_list`` that occur in ``text``.

    Matching is done on whole, whitespace-delimited words. Multi-word skills
    (e.g. "natural language processing") match when their words appear
    consecutively in ``text``; a plain word-set intersection could never
    detect them.

    Args:
        text: Whitespace-separated text to search.
        skill_list: Iterable of skill strings (single words or phrases).

    Returns:
        A set containing every entry of ``skill_list`` found in ``text``.
    """
    # Collapse runs of whitespace and pad both ends with a space so that a
    # substring test on " skill " only succeeds on whole-word boundaries.
    padded_text = f" {' '.join(text.split())} "

    found = set()
    for skill in skill_list:
        normalized_skill = ' '.join(skill.split())
        # Blank entries can never be a skill; skip them explicitly.
        if normalized_skill and f" {normalized_skill} " in padded_text:
            found.add(skill)
    return found

In [31]:
# Skill vocabulary that resumes and job descriptions are checked against.
# NOTE(review): "api" and "database" are singular while the cleaned job
# descriptions shown earlier contain "apis" / "databases"; whole-word matching
# will not equate the two forms -- confirm whether plurals should be listed.
master_skills = [
    # Tech
    "python", "sql", "pandas", "numpy", "excel", "statistics",
    "machine", "learning", "deep", "deployment", "api",
    # The trailing comma after the last tech entry is required: without it
    # Python concatenates that string with "sales" into one bogus skill.
    "java", "database", "backend", "natural language processing",

    # Non-tech
    "sales", "communication", "crm", "negotiation",
    "recruitment", "talent", "hr",
    "coordination", "planning", "documentation"
]


In [33]:
# Entries of master_skills that appear in the cleaned resume text.
resume_skills = extract_skills(clean_resume, master_skills)
resume_skills

{'communication', 'excel', 'numpy', 'pandas', 'python', 'sql'}

In [35]:
# Entries of master_skills that appear in the best-matching job description
# (looked up by the file name stored in best_role).
best_jd_text = clean_jds[best_role]
jd_skills = extract_skills(best_jd_text, master_skills)

jd_skills

{'excel', 'numpy', 'pandas', 'python', 'sql', 'statistics'}

In [37]:
# Skills the resume shares with the job description, and skills the job
# description asks for that the resume does not show.
matched_skills = resume_skills & jd_skills
missing_skills = jd_skills.difference(resume_skills)

matched_skills, missing_skills

({'excel', 'numpy', 'pandas', 'python', 'sql'}, {'statistics'})

In [41]:
# Build the human-readable summary. The skill sets are sorted before joining
# because set iteration order is not stable between runs, which would make
# the report text change from one execution to the next.
recommendation = f"""
Best Matched Role: {best_role.replace('jd_', '').replace('.txt','').replace('_',' ').title()}

Match Score: {round(best_score * 100, 2)} %

Matched Skills:
{', '.join(sorted(matched_skills)) if matched_skills else 'None'}

Missing Skills:
{', '.join(sorted(missing_skills)) if missing_skills else 'None'}

Recommendation:
Candidate is suitable for this role with scope to improve missing skills.
"""

print(recommendation)


Best Matched Role: Data Analyst

Match Score: 37.45 %

Matched Skills:
python, sql, excel, numpy, pandas

Missing Skills:
statistics

Recommendation:
Candidate is suitable for this role with scope to improve missing skills.

