<a href="https://colab.research.google.com/github/sangeeta05071997/python-machine-learning/blob/main/resume-parsing_NLP-SpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Step 1: Install required packages.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install pdfminer.six python-docx spacy
# Download the small English spaCy pipeline used by spacy.load("en_core_web_sm").
# The downloader itself warns that a kernel/runtime restart may be needed before
# the freshly installed model can be loaded.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [19]:
import os
import re
from pdfminer.high_level import extract_text as extract_text_from_pdf
from docx import Document
import spacy

In [20]:
# Step 2: Load spaCy NLP model
# The "en_core_web_sm" pipeline is loaded once into the notebook-level `nlp`
# object; extract_name() calls it to look for PERSON entities.
nlp = spacy.load("en_core_web_sm")

# ---------- Extract Text from DOCX ----------
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file as a single newline-separated string.

    Args:
        docx_path: Path (or whatever ``Document`` accepts) of the DOCX file.

    Returns:
        The ``text`` of every entry in ``Document(docx_path).paragraphs``,
        joined with ``'\\n'`` in the order the library yields them. An empty
        paragraph list produces an empty string.

    NOTE(review): only ``paragraphs`` is read; content that python-docx does
    not expose through that attribute (presumably table cells) is not
    returned - confirm against the python-docx documentation.
    """
    paragraph_texts = []
    for paragraph in Document(docx_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return '\n'.join(paragraph_texts)

# ---------- Unified Text Extractor ----------
def extract_resume_text(file_path):
    """Extract the text of a resume, choosing the reader from the file extension.

    Args:
        file_path: Path of the resume file. The extension is compared
            case-insensitively.

    Returns:
        The result of ``extract_text_from_pdf`` for ``.pdf`` files or of
        ``extract_text_from_docx`` for ``.docx`` files.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.

    NOTE: a later cell of this notebook defines another function with the
    same name; once that cell runs, it replaces this definition.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Reject unknown types first so the two supported cases read as plain returns.
    if suffix != ".pdf" and suffix != ".docx":
        raise ValueError("Unsupported file type. Only PDF and DOCX are supported.")

    if suffix == ".pdf":
        return extract_text_from_pdf(file_path)
    return extract_text_from_docx(file_path)

In [21]:
# Step 3: Extract Text from Resume (PDF)
from pdfminer.high_level import extract_text

def extract_resume_text(pdf_path):
    """Extract the raw text of a resume file.

    This definition replaces the earlier extension-aware
    ``extract_resume_text`` (same name, defined in a previous cell), so it is
    the one ``parse_resume`` actually calls. It previously sent every file to
    pdfminer, which silently removed DOCX support; ``.docx`` files are now
    routed to ``extract_text_from_docx`` again.

    Args:
        pdf_path: Path of the resume file. The name is kept for backward
            compatibility; a ``.docx`` path is accepted as well.

    Returns:
        The extracted text as returned by the selected reader.

    Any path that does not end in ``.docx`` (case-insensitive) is still
    handed to pdfminer's ``extract_text``, exactly as before.
    """
    extension = os.path.splitext(pdf_path)[-1].lower()
    if extension == ".docx":
        return extract_text_from_docx(pdf_path)
    return extract_text(pdf_path)

In [22]:
# Step 4: Extract Name (Top of Resume or NER)
def extract_name(text):
    """Guess the candidate's name from resume text.

    Args:
        text: Full resume text.

    Returns:
        The text of the first entity labelled ``"PERSON"`` by the shared
        ``nlp`` pipeline. If there is none, the first run of two capitalised
        words separated by a single whitespace character (e.g. ``"Mary
        Smith"``) is returned instead, or ``None`` when that fails as well.
    """
    first_person = next(
        (entity for entity in nlp(text).ents if entity.label_ == "PERSON"),
        None,
    )
    if first_person is not None:
        return first_person.text

    # Fallback: "Firstname Lastname"-shaped pair anywhere in the text.
    capitalised_pair = re.search(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', text)
    if capitalised_pair is None:
        return None
    return capitalised_pair.group()

In [23]:
# Step 5: Extract Email
# Uses the `re` module imported at the top of the notebook.

def extract_email(text):
    """Return the first e-mail address found in ``text``.

    Args:
        text: Text to scan.

    Returns:
        The first substring shaped like ``local@domain.tld`` (TLD of at least
        two letters), or ``None`` if no such substring exists.
    """
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    first_hit = next(re.finditer(email_pattern, text), None)
    if first_hit is None:
        return None
    return first_hit.group()

In [24]:
# Step 6: Extract Phone Number
def extract_phone(text):
    """Return the first phone number found in ``text``.

    The accepted shape is an optional country code (``+`` and 1-3 digits)
    followed by ten digits grouped 3-3-4, where groups may be separated by a
    dash, dot or whitespace and the first group may be parenthesised.

    The pattern is anchored with digit boundaries so that it cannot start or
    stop in the middle of a longer digit run. Without them a long ID number
    was reported as a phone, and text such as ``"2019-2020 9876543210"``
    yielded ``"2020 9876543"`` instead of the real number.

    Args:
        text: Text to scan.

    Returns:
        The matched phone number exactly as written in ``text``, or ``None``
        if nothing matches.
    """
    phone_pattern = (
        r'(?<!\d)'                                  # must not start inside a number
        r'(\+?\d{1,3}[-.\s]?)?'                     # optional country code
        r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'      # 3-3-4 national number
        r'(?!\d)'                                   # must not stop inside a number
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else None

In [25]:
# Step 7: Extract Skills (Using List)
# Example skill list: skills = ["Python", "Java", "Machine Learning", "Data Analysis", "Project Management"]
def extract_skills(text, skill_list):
    """Return the skills from ``skill_list`` that are mentioned in ``text``.

    Matching is case-insensitive and works on whole terms: a skill only
    counts when it is not glued to other word characters, so ``"Java"`` is
    not reported just because the text says ``"JavaScript"``. Words of a
    multi-word skill may be separated by any whitespace, which lets
    ``"Machine Learning"`` match across a line break in extracted text.

    Args:
        text: Resume text to search.
        skill_list: Iterable of skill names (strings).

    Returns:
        A list of the matching skills, spelled as in ``skill_list``, without
        duplicates and in the order they first appear in ``skill_list``
        (the result order is therefore deterministic). Blank skill names are
        ignored.
    """
    found = []
    # dict.fromkeys drops repeated skills while keeping their first position.
    for skill in dict.fromkeys(skill_list):
        words = skill.split()
        if not words:
            continue  # an empty pattern would match every text
        phrase = r'\s+'.join(re.escape(word) for word in words)
        # Lookarounds instead of \b so skills ending in symbols ("C++", "C#") work.
        if re.search(rf'(?<!\w){phrase}(?!\w)', text, re.IGNORECASE):
            found.append(skill)
    return found

In [26]:
# Step 8: Extract Education
def extract_education(text):
    """Return the education entries found in ``text``.

    An entry is the part of a line that starts at a degree keyword and runs
    up to the last four-digit number on that line (typically the year).

    Keywords are escaped before being placed in the pattern, so the dot in
    ``"B.E"`` or ``"Ph.D"`` only matches a literal dot; unescaped, ``B.E``
    matched unrelated text such as ``"BREAKING"``. All keywords are searched
    in one pass, so a line that mentions two of them (``"Bachelor of
    Technology (B.Tech) ... 2019"``) yields one entry instead of the entry
    plus an overlapping fragment of it.

    Args:
        text: Resume text to search.

    Returns:
        A list of matched strings in the order they occur in ``text``;
        empty if nothing matches.
    """
    edu_keywords = ['Bachelor', 'Master', 'B.Tech', 'M.Tech', 'B.Sc', 'M.Sc', 'Ph.D', 'B.E']
    keyword_alternation = '|'.join(re.escape(keyword) for keyword in edu_keywords)
    # (?<!\w): a keyword must not begin in the middle of another word.
    pattern = rf"(?<!\w)(?:{keyword_alternation}).*\d{{4}}"
    return re.findall(pattern, text)

In [27]:
# Step 9: Extract Experience
def extract_experience(text):
    """Return the job entries listed in the experience section of ``text``.

    The section starts right after the first occurrence of "Experience",
    "Employment" or "Work History" (case-insensitive) and ends before the
    next "Education" or "Skills", or at the end of the text. The heading
    word itself is excluded, so it no longer ends up glued to the first
    job entry.

    Inside the section, an entry starts at a capital letter and ends at a
    date range. Ranges may have spaces or tabs around the dash
    (``2015 - 2019``), may use a hyphen, en dash or em dash, and may end in
    ``Present``/``Current`` instead of a year. Previously only the compact
    ``2015-2019`` form was recognised.

    Args:
        text: Resume text to search.

    Returns:
        A list of matched entry strings in document order; empty when there
        is no experience section or no dated entry inside it.
    """
    section = re.search(
        r'(?:Experience|Employment|Work History)(.*?)(?=Education|Skills|$)',
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if section is None:
        return []

    # [ \t]* (not \s*) keeps a date range from spanning a line break.
    job_pattern = (
        r'[A-Z][a-zA-Z\s]+.*?'
        r'\d{4}[ \t]*[-–—][ \t]*(?:\d{4}|[Pp]resent|[Cc]urrent)'
    )
    return re.findall(job_pattern, section.group(1))

In [28]:
# Step 10: Wrap Everything in a Function
def parse_resume(file_path, skills_list):
    """Run every extractor on one resume file and collect the results.

    Args:
        file_path: Path of the resume, passed to ``extract_resume_text``.
        skills_list: Skill names forwarded to ``extract_skills``.

    Returns:
        A dict with the keys ``"Name"``, ``"Email"``, ``"Phone"``,
        ``"Skills"``, ``"Education"`` and ``"Experience"`` (in that order),
        each holding the return value of the corresponding ``extract_*``
        helper applied to the extracted text.
    """
    resume_text = extract_resume_text(file_path)

    parsed = {}
    parsed["Name"] = extract_name(resume_text)
    parsed["Email"] = extract_email(resume_text)
    parsed["Phone"] = extract_phone(resume_text)
    parsed["Skills"] = extract_skills(resume_text, skills_list)
    parsed["Education"] = extract_education(resume_text)
    parsed["Experience"] = extract_experience(resume_text)
    return parsed

In [29]:
# Example usage (define a `skills` list first, then uncomment):
# resume_data = parse_resume("sample_resume.pdf", skills)
# print(resume_data)