In [63]:
# Read the sample resume text file (decoded as UTF-8) into `resume_text`;
# the run cell near the end of the notebook passes it to parse_resume().
with open('../data/parsed_output/sample_resume.txt', 'r', encoding='utf-8') as f:
    resume_text = f.read()

In [64]:
import os
# Sanity check: prints whether the sample resume path exists relative to the
# notebook's working directory.
# NOTE(review): this check sits after the cell that already opened the same
# file, so it cannot guard that read; consider running it first.
print(os.path.exists('../data/parsed_output/sample_resume.txt'))

True


## Block 1: Import libraries and define domain skills

In [65]:
import re
import spacy
from transformers import pipeline

# Domain-specific skills dictionary.
# Maps a domain name (the `domain` argument of extract_skills/parse_resume)
# to the list of skill keywords searched for in the resume text.
# Keywords are written in lowercase; extract_skills() lowercases both the
# text and each keyword before matching, so matching is case-insensitive.
DOMAIN_SKILLS = {
    "IT": ["python", "sql", "docker", "aws", "pytorch", "javascript", "agile"],
    "Accounting": ["quickbooks", "audit", "tally", "reconciliation", "ledger", "gaap"],
    "Education": ["lesson planning", "curriculum", "classroom management"],
    "Healthcare": ["patient care", "emr", "icu", "diagnostics", "nursing"],
    "HR": ["recruitment", "onboarding", "payroll", "compliance", "employee relations"],
    "Sales": ["crm", "negotiation", "lead generation", "branding"]
}

## Block 2: Load spaCy model

In [66]:
# Load the spaCy "en_core_web_sm" pipeline once; extract_name() calls this
# shared `nlp` object to obtain named entities.
nlp = spacy.load("en_core_web_sm")

## Block 3: Extract email

In [67]:
def extract_email(text):
    """Return the first email-like substring of ``text``.

    Args:
        text: The string to scan.

    Returns:
        The first substring matching the email pattern, or ``None`` when the
        text contains no match.
    """
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    first_hit = re.search(email_regex, text)
    if first_hit is None:
        return None
    return first_hit.group(0)

## Block 4: Phone Extraction

In [68]:
def extract_phone(text):
    """Return the first Indian mobile number found in ``text``.

    A number is ten digits whose first digit is 6-9, optionally preceded by
    ``+91`` and at most one hyphen or whitespace character.

    Args:
        text: The string to scan.

    Returns:
        The matched number exactly as written in ``text`` (including the
        ``+91`` prefix when present), or ``None`` when nothing matches.
    """
    # The country-code group is non-capturing on purpose: with a capturing
    # group, ``findall`` yields only that group's text ('+91' or ''), never
    # the number itself, so every candidate failed the digit-count check and
    # the function always returned None.
    # The digit lookarounds keep the pattern from carving ten digits out of a
    # longer digit run (e.g. an account or ID number).
    pattern = re.compile(r'(?<!\d)(?:\+91[\-\s]?)?[6-9]\d{9}(?!\d)')
    match = pattern.search(text)
    # A match always holds 10 or 12 digits, so no further length check is needed.
    return match.group(0) if match else None

## Block 5: Name Extraction (spaCy + HuggingFace fallback)

In [69]:
def extract_name(text):
    """Return the first PERSON entity spaCy finds in ``text``.

    Entities whose lowercased text is one of a few known resume headings are
    skipped.

    Args:
        text: The string to run through the shared ``nlp`` pipeline.

    Returns:
        The text of the first acceptable PERSON entity, or ``None`` when
        there is none.
    """
    non_name_phrases = ["resume", "cv", "curriculum vitae", "mission statement"]
    candidate_names = (
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "PERSON" and entity.text.lower() not in non_name_phrases
    )
    # Lazy generator: stops at the first qualifying entity, like an early return.
    return next(candidate_names, None)

# HuggingFace NER fallback, loaded once when this cell runs.
# extract_name_bert() checks `ner_model` and returns None when it is unset.
try:
    # `aggregation_strategy="simple"` is the supported replacement for the
    # deprecated `grouped_entities=True` flag: word pieces are still merged
    # into whole entities reported under the 'entity_group' key.
    ner_model = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
except Exception as e:
    # Deliberately best-effort: if the model cannot be loaded, report the
    # error and continue with the spaCy-only path.
    print("⚠️ HuggingFace NER failed to load:", e)
    ner_model = None

def extract_name_bert(text):
    """Return the first PER entity reported by the HuggingFace NER pipeline.

    Only the first 512 characters of ``text`` are passed to the model.

    Args:
        text: The string to analyse.

    Returns:
        The ``'word'`` of the first entity whose ``'entity_group'`` is
        ``"PER"``, or ``None`` when no such entity exists or ``ner_model``
        is unavailable.
    """
    if ner_model:
        person_words = (
            item['word']
            for item in ner_model(text[:512])
            if item['entity_group'] == "PER"
        )
        # Lazy generator: later entities are not inspected once a PER is found.
        return next(person_words, None)
    return None

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


## Block 6: Extract Skills (Domain-aware version)

In [70]:
def extract_skills(text, domain="IT"):  # Default domain is IT
    """Return the ``DOMAIN_SKILLS`` keywords for ``domain`` that occur in ``text``.

    Matching is case-insensitive and whole-word: a keyword embedded in a
    longer word (e.g. "sql" inside "mysql") is not counted.

    Args:
        text: The resume text to search.
        domain: Key into ``DOMAIN_SKILLS``; an unknown domain yields ``[]``.

    Returns:
        A list of the matched keywords, without duplicates, in the order they
        are listed in ``DOMAIN_SKILLS[domain]``.
    """
    haystack = text.lower()
    found_skills = []

    for skill in DOMAIN_SKILLS.get(domain, []):
        needle = re.escape(skill.lower())
        # Lookarounds instead of \b: identical for keywords that begin and end
        # with a word character, but they also work for keywords such as
        # "c++" where \b can never match after the trailing "+".
        if re.search(rf'(?<!\w){needle}(?!\w)', haystack):
            found_skills.append(skill)

    # dict.fromkeys de-duplicates while keeping first-seen order; list(set(...))
    # returned the skills in an order that changes between interpreter runs.
    return list(dict.fromkeys(found_skills))

## Block 8: Parse Resume

In [71]:
def parse_resume(text, domain="IT"):
    """Run every extractor on ``text`` and collect the results in one dict.

    Args:
        text: The resume text.
        domain: Forwarded to ``extract_skills`` to select the keyword list.

    Returns:
        A dict with keys ``"Name"``, ``"Email"``, ``"Phone"`` and
        ``"Skills"``. The first three fall back to ``"[Not found]"`` when the
        extractor returns a falsy value; ``"Skills"`` is the list returned by
        ``extract_skills``.
    """
    placeholder = "[Not found]"
    # The BERT fallback is consulted only when the spaCy extractor finds nothing.
    name = extract_name(text) or extract_name_bert(text) or placeholder
    email = extract_email(text) or placeholder
    phone = extract_phone(text) or placeholder
    skills = extract_skills(text, domain)
    return {"Name": name, "Email": email, "Phone": phone, "Skills": skills}

## Block 9: Run + View Result

In [72]:
# Run all extractors on the loaded resume text and keep the result dict in
# `parsed_data`.
parsed_data = parse_resume(resume_text, domain="IT")  # Change domain here
# NOTE(review): pandas is imported here rather than in the import cell near
# the top of the notebook; consider moving it there.
import pandas as pd
# Wrap the dict in a list so it renders as a one-row table.
pd.DataFrame([parsed_data])

Unnamed: 0,Name,Email,Phone,Skills
0,Accomplishments Created,[Not found],[Not found],[]


In [73]:
# Display the raw dictionary returned by parse_resume().
parsed_data

{'Name': 'Accomplishments Created',
 'Email': '[Not found]',
 'Phone': '[Not found]',
 'Skills': []}