In [3]:
import re
import hashlib
from collections import defaultdict
import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

# -------------------- CONFIG --------------------

# Maps each section key to a natural-language description of that section.
# The keys are passed to the zero-shot classifier as candidate labels; the
# descriptions are embedded below and compared against each block's embedding
# in classify_block.
SECTION_LABELS = {
    "education":  "Academic history, degrees, universities, schools, coursework, graduation years.",
    "experience": "Work history, job titles, internships, responsibilities, achievements at companies.",
    "skills":     "Technical skills, programming languages, tools, frameworks, platforms, soft skills.",
    "projects":   "Descriptions of projects, systems built, responsibilities within projects, outcomes.",
    "awards":     "Honors, certifications, distinctions, recognitions, prizes.",
    "hobbies":    "Personal interests, extracurricular activities, volunteering, non-work hobbies."
}

# Load models once, at import time, so every classify_block call reuses them.
SEM_MODEL_NAME = "all-MiniLM-L6-v2"
semantic_model = SentenceTransformer(SEM_MODEL_NAME)
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# section_keys[i] and section_embeddings[i] refer to the same section: both are
# derived from SECTION_LABELS in the same (insertion) order.
section_keys = list(SECTION_LABELS.keys())
section_embeddings = semantic_model.encode(list(SECTION_LABELS.values()))

# -------------------- UTILITIES --------------------

def normalize_text(s: str) -> str:
    """Return a canonical form of *s* for hashing and comparison.

    The text is lowercased, split on every run of non-word characters
    (anything outside ``\\w``, so underscores are kept), and the remaining
    tokens are joined with single spaces. The result has no leading or
    trailing whitespace.
    """
    pieces = re.split(r"\W+", s.lower())
    # Splitting leaves empty strings where the text starts or ends with a
    # separator; drop them so the result has no padding.
    return " ".join(piece for piece in pieces if piece)

def jaccard(a: str, b: str) -> float:
    """Return the Jaccard similarity of the word sets of *a* and *b*.

    Both strings are passed through ``normalize_text`` and split on
    whitespace. The score is ``|A & B| / |A | B|``; it is ``0.0`` when either
    string yields no words.
    """
    words_a, words_b = (set(normalize_text(text).split()) for text in (a, b))
    if not (words_a and words_b):
        return 0.0
    shared = words_a.intersection(words_b)
    combined = words_a.union(words_b)
    return len(shared) / len(combined)

def is_near_duplicate(existing_texts, candidate, jaccard_threshold=0.85):
    """Report whether *candidate* duplicates a text that was already kept.

    Args:
        existing_texts: Registry dict with a ``"hashes"`` set (MD5 hex digests
            of normalized kept texts) and a ``"samples"`` list (the kept texts
            themselves), as maintained by ``mark_kept``.
        candidate: Text to test.
        jaccard_threshold: Minimum word-set Jaccard score at which two texts
            count as near-duplicates.

    Returns:
        True when the normalized candidate hashes to a known digest, or when
        its Jaccard score against any of the 50 most recently kept samples
        reaches the threshold; False otherwise.
    """
    digest = hashlib.md5(normalize_text(candidate).encode("utf-8")).hexdigest()
    if digest in existing_texts["hashes"]:
        return True
    # Only the latest 50 kept texts are compared, which bounds the cost of the
    # fuzzy pass regardless of how many texts have been kept.
    recent = existing_texts["samples"][-50:]
    return any(jaccard(earlier, candidate) >= jaccard_threshold for earlier in recent)

def mark_kept(existing_texts, text):
    """Record *text* as kept so later candidates can be checked against it.

    Adds the MD5 hex digest of the normalized text to
    ``existing_texts["hashes"]`` and appends the original, un-normalized text
    to ``existing_texts["samples"]``. Mutates *existing_texts* in place and
    returns None.
    """
    canonical = normalize_text(text).encode("utf-8")
    existing_texts["hashes"].add(hashlib.md5(canonical).hexdigest())
    existing_texts["samples"].append(text)

# -------------------- PDF → BLOCKS --------------------

def extract_text_blocks(pdf_path, min_words=4):
    """Extract cleaned text chunks from a PDF using PyMuPDF's "blocks" mode.

    Args:
        pdf_path: Path of the PDF file to open.
        min_words: Blocks with fewer whitespace-separated tokens than this are
            dropped as noise (e.g. single-token footers). Defaults to 4.

    Returns:
        A list of block strings in page order. Within each block, runs of
        spaces/tabs are collapsed to one space and runs of consecutive
        newlines to a single newline.
    """
    blocks = []
    # The context manager closes the document (and its file handle) even when
    # extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Each entry is (x0, y0, x1, y1, text, block_no, block_type).
            for *_bbox, text, _block_no, block_type in page.get_text("blocks"):
                # block_type 0 is text; other types (images) carry a
                # placeholder description instead of document text.
                if block_type != 0:
                    continue
                t = (text or "").strip()
                if not t:
                    continue
                # Filter tiny/noisy blocks (e.g., single token footers)
                if len(t.split()) < min_words:
                    continue
                # Compact internal whitespace
                t = re.sub(r"[ \t]+", " ", t)
                t = re.sub(r"\n{2,}", "\n", t)
                blocks.append(t)
    return blocks

# -------------------- CONTACT INFO --------------------

# Patterns used by extract_contact, compiled once at import time.
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Starts with an optional "+" and a digit, ends on a digit, and may contain
# digits, hyphens and same-line whitespace in between (at least 9 characters
# after the optional "+"). Ending on a digit keeps trailing spaces/hyphens out
# of the match; excluding \r and \n keeps it from running into the next line.
_PHONE_RE = re.compile(r"\+?\d(?:[^\S\r\n]|[\d\-]){7,}\d")
# The scheme and "www." are optional; \b stops matches inside longer host names.
_LINKEDIN_RE = re.compile(
    r"(?:https?://)?(?:www\.)?\blinkedin\.com/[^\s]+", re.IGNORECASE
)
_HOUSE_NUMBER_RE = re.compile(r"\d{1,5}\s+\S+")
# English street words and place tokens must be whole words (so "focus" or
# "find" do not count as "us"/"ind"); the Dutch street words may also be the
# suffix of a compound such as "Marconistraat".
_ADDRESS_HINT_RE = re.compile(
    r"(?:\b(?:street|road|lane|avenue|city|amsterdam|nl|ind|uk|usa|us)"
    r"|(?:straat|weg|plein|laan))\b",
    re.IGNORECASE,
)


def extract_contact(full_text):
    """Pull contact details out of raw resume text with regex heuristics.

    Args:
        full_text: The resume text, lines separated by newlines.

    Returns:
        A dict with the keys ``"email"``, ``"phone"``, ``"linkedin"`` and
        ``"address"``. Each value is the first match found in *full_text*, or
        None when nothing matched. The address is the first stripped line that
        contains both a 1-5 digit number followed by whitespace and a token,
        and a street/place hint word.
    """
    email = _EMAIL_RE.search(full_text)
    phone = _PHONE_RE.search(full_text)
    linkedin = _LINKEDIN_RE.search(full_text)

    # Heuristic address: first line with a number + a street-like / place token.
    address = None
    for line in full_text.splitlines():
        stripped = line.strip()
        if _HOUSE_NUMBER_RE.search(stripped) and _ADDRESS_HINT_RE.search(stripped):
            address = stripped
            break

    return {
        "email": email.group() if email else None,
        "phone": phone.group() if phone else None,
        "linkedin": linkedin.group() if linkedin else None,
        "address": address,
    }

# -------------------- CLASSIFICATION --------------------

def classify_block(block: str) -> str:
    """Pick a section label for *block* from two independent classifiers.

    The zero-shot pipeline scores the block against ``section_keys``, and the
    sentence-embedding model scores it by cosine similarity against
    ``section_embeddings``. The label with the higher top score is returned;
    when the two top scores differ by less than 0.05 the zero-shot label wins.
    """
    # Candidate 1: top-ranked label from the zero-shot classifier.
    prediction = zero_shot(block, section_keys)
    zs_label, zs_score = prediction["labels"][0], prediction["scores"][0]

    # Candidate 2: section whose description embedding is closest to the block.
    block_vector = semantic_model.encode([block])
    similarities = cosine_similarity(block_vector, section_embeddings)[0]
    best = int(np.argmax(similarities))
    sem_label, sem_score = section_keys[best], similarities[best]

    # Near-ties go to zero-shot; otherwise the higher score decides.
    if abs(zs_score - sem_score) < 0.05:
        return zs_label
    if zs_score > sem_score:
        return zs_label
    return sem_label

# Keyword patterns used by post_rule, compiled once. All are applied to
# lowercased text.
_EDUCATION_RE = re.compile(
    r"\b(bsc|msc|phd|b\.tech|m\.tech|bachelor|master|university|universiteit|degree|diploma|coursework)\b"
)
_EXPERIENCE_RE = re.compile(
    r"\b(intern|assistant|engineer|developer|analyst|associate|consultant|manager|present|worked at|company)\b"
)
_DATE_RANGE_RE = re.compile(r"\b(20\d{2}\s*[-–]\s*(present|20\d{2}))\b")
# The alternatives are grouped so the word boundaries apply to every keyword,
# not just the first and last one; otherwise "aws" matches inside "laws".
# "c++" is handled separately because "+" is not a word character, so a
# trailing \b would never match after it.
_SKILL_RE = re.compile(
    r"\b(?:python|java(?:script)?|(?:my|postgre|no)?sql|power bi|azure|gcp|aws"
    r"|pandas|numpy|sklearn|tensorflow|pytorch|tableau)\b"
    r"|\bc\+\+"
)
_PROJECT_RE = re.compile(r"\bproject(s)?\b")
_AWARD_RE = re.compile(r"\b(award|honor|prize|certification|certified|recognition)\b")
_HOBBY_RE = re.compile(r"\b(hobbies|interests|extracurricular|volunteer)\b")


def post_rule(block: str, label: str) -> str:
    """Override the classifier's *label* when *block* contains a strong cue.

    Rules are checked in a fixed order and the first one that fires wins:
    education keywords, experience keywords or a ``20xx - 20xx/present`` date
    range, skills (four or more commas/semicolons, or a known tool name),
    projects (the word "project(s)" in a block longer than 60 characters),
    awards keywords, hobbies keywords.

    Args:
        block: Text of the block being labelled.
        label: Label proposed by the classifier.

    Returns:
        The section name of the first matching rule, or *label* unchanged
        when no rule matches.
    """
    low = block.lower()

    # Strong education cues
    if _EDUCATION_RE.search(low):
        return "education"

    # Strong experience cues (titles, companies, date ranges)
    if _EXPERIENCE_RE.search(low) or _DATE_RANGE_RE.search(low):
        return "experience"

    # Skills: long comma/semicolon lists, tool stacks
    if (low.count(",") + low.count(";")) >= 4 or _SKILL_RE.search(low):
        return "skills"

    # Projects
    if _PROJECT_RE.search(low) and len(low) > 60:
        return "projects"

    # Awards/certs
    if _AWARD_RE.search(low):
        return "awards"

    # Hobbies
    if _HOBBY_RE.search(low):
        return "hobbies"

    return label

# -------------------- DEDUP / OVERLAP PREVENTION --------------------

def deduplicate_sections(sections: dict, jaccard_threshold: float = 0.85) -> dict:
    """Drop exact and near-duplicate blocks across all sections.

    Sections are visited in a fixed priority order (education, experience,
    skills, projects, awards, hobbies), followed by any other keys in the
    order they appear in *sections*. A block is kept only if it is at least
    5 characters long after stripping and ``is_near_duplicate`` does not match
    it against a block kept earlier, so the first occurrence always wins.

    Args:
        sections: Mapping of section name to a list of text blocks.
        jaccard_threshold: Near-duplicate threshold passed to
            ``is_near_duplicate``.

    Returns:
        A new dict of section name to surviving blocks, in visiting order.
        Sections left with no blocks are omitted.
    """
    priority = ("education", "experience", "skills", "projects", "awards", "hobbies")
    seen = {"hashes": set(), "samples": []}

    # Known sections first (in priority order), then anything unexpected.
    known = [name for name in priority if name in sections]
    extras = [name for name in sections if name not in priority]

    result = {}
    for name in known + extras:
        survivors = []
        for text in sections[name]:
            if not text or len(text.strip()) < 5:
                continue
            if is_near_duplicate(seen, text, jaccard_threshold=jaccard_threshold):
                continue
            survivors.append(text)
            mark_kept(seen, text)
        # Sections with nothing left are left out of the result.
        if survivors:
            result[name] = survivors
    return result

# -------------------- MAIN PIPELINE --------------------

def parse_resume(pdf_path: str, dedupe_jaccard: float = 0.85):
    """Parse a resume PDF and print its contact info and sections.

    Steps: extract text blocks, pull contact details from the joined text,
    label each block with ``classify_block`` corrected by ``post_rule``,
    de-duplicate across sections, then print the result to stdout.

    Args:
        pdf_path: Path of the resume PDF.
        dedupe_jaccard: Near-duplicate threshold handed to
            ``deduplicate_sections``.

    Returns:
        None; the output is printed.
    """
    # 1) Blocks from PDF, and 2) contact info from the joined text
    text_blocks = extract_text_blocks(pdf_path)
    contact_info = extract_contact("\n".join(text_blocks))

    # 3) Group blocks under their (rule-corrected) section label
    grouped = defaultdict(list)
    for chunk in text_blocks:
        section = post_rule(chunk, classify_block(chunk))
        grouped[section].append(chunk)

    # 4) De-duplicate across sections
    deduped = deduplicate_sections(grouped, jaccard_threshold=dedupe_jaccard)

    # 5) Report: contact fields that were found, then each section
    print("\n=== CONTACT INFO ===")
    for field, value in contact_info.items():
        if not value:
            continue
        print(f"{field.capitalize()}: {value}")

    for name, entries in deduped.items():
        print(f"\n=== {name.upper()} SECTION ===")
        for entry in entries:
            # Each entry is followed by one blank line.
            print(entry, end="\n\n")

# -------------------- RUN --------------------

# Entry point when run as a script: parse the sample resume, using a
# near-duplicate threshold of 0.86 instead of parse_resume's default of 0.85.
if __name__ == "__main__":
    parse_resume("Rushil-CV.pdf", dedupe_jaccard=0.86)


Device set to use cpu



=== CONTACT INFO ===
Email: nalekar.rushil23@gmail.com
Phone: +31 639607724 
Address: Marconistraat 5,Aalsmeer • +31 639607724 • nalekar.rushil23@gmail.com

=== EDUCATION SECTION ===
Bsc in Artificial Intelligence | Vrije Universitiet Amsterdam


=== EXPERIENCE SECTION ===
Teaching Assistant
Vrije Universitiet Amsterdam

Feb 2025 - Present

Sep 2024 - Present

Data Science Intern
ReadLer

Jul 2024- Sep 2024

Marconistraat 5,Aalsmeer • +31 639607724 • nalekar.rushil23@gmail.com

Software Enginerring Intern
SHARP ENGINEERING


=== SKILLS SECTION ===
Specialization in Artificial Intelligence and Data Science.
Projects on NLP, AI Agents, and Predictive Modeling.
Strong foundation in Mathematics: Probability & Statistics, Calculus, and Linear Algebra.

Assisted students with assignments, group sessions, and model-building projects. 
Clarified concepts in process modeling, causal graphs, and agent-based modeling
Facilitating study groups to encourage peer learning and engagement.

Technical