In [28]:
pip install python-docx


Note: you may need to restart the kernel to use updated packages.


In [29]:
!pip install pandas





In [30]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [31]:
from docx import Document

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return " ".join(full_text)



In [32]:
import os
import pandas as pd

resume_folder = "resumes"

resumes = []

for file in os.listdir(resume_folder):
    if file.endswith(".docx"):
        text = extract_text_from_docx(os.path.join(resume_folder, file))
        resumes.append({
            "name": file,
            "resume_text": text
        })

df = pd.DataFrame(resumes)
df.head()



Unnamed: 0,name,resume_text
0,Venkat_BA.docx,Venkat N Phone: 314-662-2902 Email: vhealthba@...
1,Shashank.docx,\t SHASHANK TIWARI Shashank.tiwari44@gmail.com...
2,Anudeep N_Sr Java Developer.docx,Anudeep Sr Java Programmer anudeepreddynallama...
3,ram nandyala.docx,...
4,Neha Mugghala.docx,Sourya Senior Java/J2EE Developer sny.java@gma...


In [33]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", " ", text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df["clean_resume"] = df["resume_text"].apply(clean_text)


[nltk_data] Downloading package stopwords to /Users/bp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
job_description = """
Looking for an AI/ML Engineer with strong Python skills,
experience in machine learning, NLP, data analysis, and SQL.
"""

jd_clean = clean_text(job_description)


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000)
resume_vectors = vectorizer.fit_transform(df["clean_resume"])
jd_vector = vectorizer.transform([jd_clean])


In [36]:
from sklearn.metrics.pairwise import cosine_similarity

scores = cosine_similarity(resume_vectors, jd_vector).flatten()
df["match_percentage"] = scores * 100


In [37]:
ranked_df = df.sort_values(by="match_percentage", ascending=False)
ranked_df[["name", "match_percentage"]]


Unnamed: 0,name,match_percentage
102,Kashyap K. Vora resume.docx,14.149594
90,Vijay Bhargav.docx,13.832719
31,Rajesh_k.docx,12.302564
50,Vivek Joshi_CV.docx,10.753763
179,vema reddy.docx,10.671277
...,...,...
107,Adelina_Erimia_PMP1.docx,0.392641
86,employer_mounika details.docx,0.000000
185,Raju Goduguchinta_Technical Program Manager.docx,0.000000
125,SAURABH_PM.docx,0.000000


In [38]:
SKILLS = [
    "python", "machine learning", "deep learning",
    "nlp", "sql", "data analysis", "tensorflow",
    "pytorch", "excel"
]

def extract_skills(text):
    return [skill for skill in SKILLS if skill in text]

df["skills"] = df["clean_resume"].apply(extract_skills)


In [39]:
jd_skills = extract_skills(jd_clean)

def explain_resume(resume_skills):
    matched = list(set(resume_skills) & set(jd_skills))
    missing = list(set(jd_skills) - set(resume_skills))
    return matched, missing

df["explanation"] = df["skills"].apply(explain_resume)
