<a href="https://colab.research.google.com/github/sangeeta05071997/pdf-image/blob/main/resume_parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#Step 1: Install Required Packages
!pip install pdfminer.six python-docx spacy
!python -m spacy download en_core_web_sm

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, pdfminer.six
Successfully installed pdfminer.six-20250506 python-docx-1.2.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and in

In [5]:
import os
import re
from pdfminer.high_level import extract_text as extract_text_from_pdf
from docx import Document
import spacy

In [6]:
# Step 2: Load spaCy NLP model
# "en_core_web_sm" is the pipeline downloaded by the install cell above.
# The loaded pipeline is stored in the notebook-global `nlp`, which the name
# extraction and text preprocessing functions in later cells call directly.
nlp = spacy.load("en_core_web_sm")

# ---------- Extract Text from DOCX ----------
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        docx_path: Location of the .docx file, passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, joined
        with newline characters (empty paragraphs become empty lines).
    """
    paragraph_texts = []
    for paragraph in Document(docx_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

# ---------- Unified Text Extractor ----------
def extract_resume_text(file_path):
    """Extract plain text from a resume, choosing the reader by file extension.

    Args:
        file_path: Path to the resume; the extension is compared
            case-insensitively.

    Returns:
        The text produced by ``extract_text_from_pdf`` for ``.pdf`` files or
        by ``extract_text_from_docx`` for ``.docx`` files.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Reject unknown formats up front so the happy paths read as plain returns.
    if suffix not in (".pdf", ".docx"):
        raise ValueError("Unsupported file type. Only PDF and DOCX are supported.")

    if suffix == ".pdf":
        return extract_text_from_pdf(file_path)
    return extract_text_from_docx(file_path)

In [7]:
# Step 3: Extract Text from Resume (PDF)
from pdfminer.high_level import extract_text

def extract_resume_text(pdf_path):
    """Extract plain text from a resume file (PDF, or DOCX by extension).

    This definition replaces any earlier ``extract_resume_text`` in the
    notebook, so it must keep DOCX support: a path ending in ``.docx``
    (case-insensitive) is routed to ``extract_text_from_docx``. Everything
    else, including file-like objects, is handed to pdfminer's
    ``extract_text`` exactly as before.

    Args:
        pdf_path: Path to the resume. The name is kept for backward
            compatibility; DOCX paths are accepted too.

    Returns:
        The extracted text as returned by the selected reader.
    """
    # Only inspect the extension for real paths; pdfminer also accepts
    # file-like objects, which os.path.splitext cannot handle.
    if isinstance(pdf_path, (str, os.PathLike)):
        ext = os.path.splitext(pdf_path)[-1].lower()
        if ext == ".docx":
            return extract_text_from_docx(pdf_path)
    return extract_text(pdf_path)

In [8]:
# Step 4: Extract Name (Top of Resume or NER)
def extract_name(text):
    """Guess the candidate's name from resume text.

    The text is run through the notebook-global spaCy pipeline ``nlp``; the
    first entity labelled ``PERSON`` wins. If there is none, the first pair of
    capitalised words (e.g. "Jane Doe") found by a regex is used instead.

    Args:
        text: Raw resume text.

    Returns:
        The name as a string, or ``None`` when neither strategy finds one.
    """
    people = [entity for entity in nlp(text).ents if entity.label_ == "PERSON"]
    if people:
        return people[0].text

    # Fallback: two adjacent Capitalised words.
    fallback = re.search(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', text)
    if fallback is None:
        return None
    return fallback.group()

In [9]:
# Step 5: Extract Email
# Uses the `re` module imported at the top of the notebook.

def extract_email(text):
    """Return the first e-mail address found in ``text``.

    Args:
        text: Raw resume text.

    Returns:
        The first substring that looks like ``local@domain.tld``, or ``None``
        if the text contains no such substring.
    """
    hit = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
    if hit is None:
        return None
    return hit.group(0)

In [10]:
# Step 6: Extract Phone Number
def extract_phone(text):
    """Return the first phone number found in ``text``.

    The pattern accepts an optional country code (1-3 digits, optional ``+``)
    followed by a 3-3-4 digit number whose groups may be separated by a dash,
    dot or whitespace, with optional parentheses around the first group.

    Args:
        text: Raw resume text.

    Returns:
        The matched phone number exactly as written, or ``None`` if nothing
        matches.
    """
    hit = re.search(r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)
    if hit is None:
        return None
    return hit.group(0)

In [11]:
# Step 7: Extract Skills (Using List)
# skills = ["Python", "Java", "Machine Learning", "Data Analysis", "Project Management"]
def extract_skills(text, skill_list):
    """Return the skills from ``skill_list`` that are mentioned in ``text``.

    Matching is case-insensitive and on whole terms: a skill only counts when
    it is not embedded in a longer word, so "Java" is not reported just
    because the text says "JavaScript". Words of a multi-word skill may be
    separated by any whitespace, which tolerates line wraps in extracted
    PDF text.

    Args:
        text: Raw resume text.
        skill_list: Iterable of skill names to look for.

    Returns:
        The matching skills, spelled as in ``skill_list``, in the order they
        first appear there, with exact duplicates removed.
    """
    haystack = text.lower()
    found = []
    for skill in skill_list:
        words = skill.lower().split()
        if not words:
            # A blank entry would otherwise "match" every text.
            continue
        # Lookarounds instead of \b so skills that start or end with
        # punctuation (e.g. "C++", "C#") still get a boundary check.
        pattern = r"(?<!\w)" + r"\s+".join(re.escape(word) for word in words) + r"(?!\w)"
        if re.search(pattern, haystack):
            found.append(skill)
    # dict.fromkeys drops exact duplicates while keeping first-seen order,
    # so the result is deterministic (unlike list(set(...))).
    return list(dict.fromkeys(found))

In [12]:
# Step 8: Extract Education
def extract_education(text):
    """Collect education lines that name a degree and contain a year.

    For each known degree keyword, every stretch of text that starts at the
    keyword and runs to the last four-digit number on the same line is
    collected. A line mentioning several keywords therefore yields one
    (overlapping) entry per keyword.

    Args:
        text: Raw resume text.

    Returns:
        A list of matched strings, grouped by keyword in the order the
        keywords are listed below; empty if nothing matches.
    """
    edu_keywords = ['Bachelor', 'Master', 'B.Tech', 'M.Tech', 'B.Sc', 'M.Sc', 'Ph.D', 'B.E']
    degrees = []
    for keyword in edu_keywords:
        # re.escape keeps the dots in abbreviations literal; unescaped,
        # "B.Tech" would also match unrelated text such as "BxTech".
        pattern = re.escape(keyword) + r".*\d{4}"
        degrees.extend(re.findall(pattern, text))
    return degrees

In [13]:
# Step 9: Extract Experience
def extract_experience(text):
    """Extract job entries from the experience section of a resume.

    The experience section is the text after the first "Experience",
    "Employment" or "Work History" heading (case-insensitive), up to the next
    "Education" or "Skills" mention or the end of the text. The heading itself
    is excluded so it does not end up glued to the first entry.

    Within that section an entry starts at a capital letter and ends at a
    year range. Ranges may be written with or without spaces around the dash
    ("2018-2021", "2018 - 2021") and may end in "Present" or "Current".

    Args:
        text: Raw resume text.

    Returns:
        A list of entry strings in document order; empty if there is no
        experience section or no entry with a year range.
    """
    section = re.search(
        r'(?:Experience|Employment|Work History)(.*?)(?=Education|Skills|$)',
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if section is None:
        return []

    # (?i:...) makes only the Present/Current words case-insensitive; the
    # leading [A-Z] must stay case-sensitive to anchor the start of an entry.
    job_pattern = r'[A-Z][a-zA-Z\s]+.*?\d{4}\s*[-–—]\s*(?:\d{4}|(?i:present|current))'
    return re.findall(job_pattern, section.group(1))

In [14]:
# Step 10: Wrap Everything in a Function
def parse_resume(file_path, skills_list):
    """Parse a resume file into a dictionary of extracted fields.

    The file is converted to text once with ``extract_resume_text`` and that
    text is passed to each field extractor in turn.

    Args:
        file_path: Path to the resume file.
        skills_list: Skill names forwarded to ``extract_skills``.

    Returns:
        A dict with the keys "Name", "Email", "Phone", "Skills", "Education"
        and "Experience" (in that order), each holding the corresponding
        extractor's result.
    """
    resume_text = extract_resume_text(file_path)

    parsed = {}
    parsed["Name"] = extract_name(resume_text)
    parsed["Email"] = extract_email(resume_text)
    parsed["Phone"] = extract_phone(resume_text)
    parsed["Skills"] = extract_skills(resume_text, skills_list)
    parsed["Education"] = extract_education(resume_text)
    parsed["Experience"] = extract_experience(resume_text)
    return parsed

In [15]:
# resume_data = parse_resume("sample_resume.pdf", skills)
#print(resume_data)

Step 3: Rank resumes against a job description (TF-IDF + cosine similarity)

In [25]:
!pip install scikit-learn spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd



In [26]:
# --------- Sample Extracted Resume Skills (Simulating Step 2 Output) ---------
# Hard-coded stand-ins for parsed resumes: each entry is one resume's skills
# as a single comma-separated string. List order defines the "Resume 1..N"
# labels used when the ranking table is built.
resumes = [
    "Excel, Python, Tableau, SQL, Data Visualization, Pandas",
    "Power BI, SQL, Data Analysis, Excel, Python",
    "Python, NumPy, Scikit-learn, Matplotlib, Machine Learning",
]

# Data Analyst Job Description
# Free-text posting that every resume is scored against. The triple-quoted
# literal starts and ends with a newline and has line breaks inside it.
job_description = """
We are seeking a Data Analyst with expertise in Python, SQL, Excel, and data visualization tools like Tableau or Power BI.
The ideal candidate should be familiar with data cleaning, statistical analysis, and reporting dashboards.
Skills in Pandas, NumPy, or Scikit-learn are a plus.
"""

In [27]:
# Preprocessing Function
def preprocess(text):
    """Normalise text into a space-separated string of lemmas.

    The text is lower-cased and run through the notebook-global spaCy
    pipeline ``nlp``. Stop words, punctuation and whitespace tokens (such as
    the newlines in a multi-line job description) are dropped; the lemma of
    every remaining token is kept.

    Args:
        text: Raw text of a job description or resume.

    Returns:
        The kept lemmas joined by single spaces.
    """
    doc = nlp(text.lower())
    lemmas = [
        token.lemma_
        for token in doc
        # is_space filters newline/blank tokens, which are neither stop words
        # nor punctuation and would otherwise leak into the output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)

# Preprocess JD and resumes
# The job description goes first (index 0), followed by the resumes in their
# original order; the vectorization cell relies on this layout when slicing.
documents = [preprocess(job_description)] + [preprocess(resume) for resume in resumes]

TF-IDF Vectorization + Cosine Similarity

In [28]:
# TF-IDF Vectorizer
# Fitted on the job description and the resumes together, so every row of
# tfidf_matrix is expressed over the same vocabulary.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# First vector is Job Description
# documents[0] is the job description, so row 0 is its vector (the 0:1 slice
# keeps it as a one-row matrix); the remaining rows are the resumes.
jd_vector = tfidf_matrix[0:1]
resume_vectors = tfidf_matrix[1:]

# Cosine Similarity
# One score per resume, in the same order as `resumes`.
similarity_scores = cosine_similarity(jd_vector, resume_vectors).flatten()

# Build the ranking table
# Resumes are labelled by their 1-based position in `resumes`, then sorted so
# the best match comes first; the index is reset to reflect the new order.
df = pd.DataFrame({
    'Resume': [f"Resume {i+1}" for i in range(len(resumes))],
    'Similarity Score': similarity_scores
}).sort_values(by='Similarity Score', ascending=False).reset_index(drop=True)

In [29]:
# Plain-text dump of the ranking table built in the previous cell
# (already sorted with the highest similarity score first).
print("Ranked Resumes for Data Analyst Role:")
print(df)