In [7]:
# Install required libraries
!pip install spacy nltk PyMuPDF pdfminer.six python-docx pandas

# Download spaCy model
!python -m spacy download en_core_web_sm

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # Add this line to download the missing resource

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [17]:
from google.colab import files
import fitz
import docx
import re
import spacy
import json

# Upload file
print("Upload your resume file...")
uploaded = files.upload()
file_path = list(uploaded.keys())[0]

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# ---------- File Reading ----------
def read_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([p.text for p in doc.paragraphs])

def read_txt(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def extract_text(file_path):
    if file_path.lower().endswith(".pdf"):
        return read_pdf(file_path)
    elif file_path.lower().endswith(".docx"):
        return read_docx(file_path)
    elif file_path.lower().endswith(".txt"):
        return read_txt(file_path)
    else:
        raise ValueError("Unsupported file format")

# ---------- Regex Helpers ----------
def extract_email(text):
    match = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    return match[0] if match else None

def extract_phone(text):
    match = re.findall(r"\+?\d[\d\s-]{8,}\d", text)
    return match[0] if match else None

# ---------- Name Extraction ----------
def extract_name(text):
    # Try first non-empty line
    first_line = text.split("\n")[0].strip()
    if len(first_line.split()) <= 4 and first_line.replace(" ", "").isalpha():
        return first_line
    # Fallback to spaCy
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text
    return None

# ---------- Section Extractor ----------
def extract_sections(text):
    headings = [
        "professional summary", "education", "projects", "skills & abilities",
        "skills", "certifications", "certification", "achievements",
        "volunteering work", "extra-curricular activities"
    ]
    lines = text.split("\n")
    sections = {}
    current_section = None

    for line in lines:
        clean_line = line.strip()
        if not clean_line:
            continue

        lower_line = clean_line.lower()
        if any(h in lower_line for h in headings):
            current_section = lower_line
            sections[current_section] = []
        elif current_section:
            sections[current_section].append(clean_line)

    return sections

# ---------- Cleaning Functions ----------
def clean_projects(lines):
    titles = []
    for line in lines:
        title = re.split(r" - |:|;", line)[0]
        words = title.strip("•- ").strip().split()
        if len(words) > 10:
            title = " ".join(words[:8])  # shorten long sentences
        if 2 <= len(title.split()) <= 12:
            titles.append(title.strip())
    return list(dict.fromkeys(titles))  # remove duplicates

def clean_certificates(lines):
    certs = []
    for line in lines:
        if len(line.strip()) > 2:
            certs.append(line.strip("•- ").strip())
    return certs

def clean_skills(lines):
    return [line.strip("•- ").strip() for line in lines if len(line.strip()) > 2]

def clean_achievements(lines):
    return [line.strip("•- ").strip() for line in lines if len(line.strip()) > 2]

def clean_education_from_section(lines):
    degree_pattern = r"(B\.?Tech|M\.?Tech|Bachelors?|Masters?|MBA|Ph\.?D|Diploma|HSC|SSC|Board|Institute|College|University|20\d{2}|19\d{2})"
    filtered = []
    for line in lines:
        if re.search(degree_pattern, line, re.IGNORECASE):
            filtered.append(line.strip("•- ").strip())
    return filtered

def clean_education_from_text(text):
    pattern = r"(B\.?Tech.*|M\.?Tech.*|HSC.*|SSC.*|Diploma.*|Bachelors?.*|Masters?.*)"
    matches = re.findall(pattern, text, re.IGNORECASE)
    return list(dict.fromkeys([m.strip("•- ").strip() for m in matches]))

# ---------- Main Parser ----------
def parse_resume(file_path):
    text = extract_text(file_path)
    sections = extract_sections(text)

    parsed = {
        "name": extract_name(text),
        "email": extract_email(text),
        "phone": extract_phone(text),
        "achievements": [],
        "projects": [],
        "certificates": [],
        "skills": [],
        "education": []
    }

    for sec, content in sections.items():
        if "project" in sec:
            parsed["projects"] = clean_projects(content)
        elif "skill" in sec:
            parsed["skills"] = clean_skills(content)
        elif "certification" in sec:
            parsed["certificates"] = clean_certificates(content)
        elif "achievement" in sec:
            parsed["achievements"] = clean_achievements(content)
        elif "education" in sec:
            parsed["education"] = clean_education_from_section(content)

    # Fallback for education if section-based detection failed
    if not parsed["education"]:
        parsed["education"] = clean_education_from_text(text)

    # Fallback for certificates if section-based detection failed
    if not parsed["certificates"] and "certification" not in sections:
        cert_matches = re.findall(r"(Certification.*|Certified.*)", text, re.IGNORECASE)
        parsed["certificates"] = [c.strip("•- ").strip() for c in cert_matches]

    return parsed

# ---------- Run ----------
parsed_resume = parse_resume(file_path)
print(json.dumps(parsed_resume, indent=2))


Upload your resume file...


Saving ARYAN RESUME for college.pdf to ARYAN RESUME for college.pdf
{
  "name": "Internships",
  "email": "aryanrawool1674@gmail.com",
  "phone": "9324598889",
  "achievements": [],
  "projects": [
    "REGENRATIVE BRAKING IN BLDC MOTOR",
    "A C PATIL COLLEGE \u2022 January 2024 -July 2025",
    "Utilizes the motor's kinetic energy to slow it",
    "Played a helpful role in component selection and",
    "\u2022 Scope"
  ],
  "certificates": [],
  "skills": [],
  "education": [
    "Bachelor of Engineering - BE, Electrical engineering",
    "A.C PATIL College of Engineering",
    "HSC, PCM",
    "Atomic Energy Junior College \u2022 2022 \u2022 65%",
    "SSC, Science",
    "Atomic Energy Central Schoo-2 \u2022 2020 \u2022 69%",
    "December 2024 - January 2025, Navi Mumbai"
  ]
}
