In [44]:
with open('../data/parsed_output/sample_resume.txt', 'r', encoding='utf-8') as f:
    resume_text = f.read()

## Block 0: Import Dependencies

In [51]:
import re
import spacy
from transformers import pipeline
from collections import Counter
import pandas as pd

## Block 1: Load spaCy NLP Model

In [46]:
nlp = spacy.load("en_core_web_sm")

## Block 2A: Extract Email

In [47]:
def extract_email(text):
    """
    Returns the first email address found in the text using regex.
    """
    match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return match.group(0) if match else None
print(re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', resume_text))

[]


## Block 2B: Extract Phone Number

In [52]:
def extract_phone(text):
    matches = re.findall(r'\+?\d[\d\-\(\) ]{7,}\d', text)
    
    for number in matches:
        if not any(year in number for year in ['2003', '2008', '1996', '1997']):
            digits = re.sub(r'\D', '', number)
            if 10 <= len(digits) <= 13:
                return number
    return None

## Block 2C: Extract Name (NER)

In [53]:
def extract_name(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            if ent.text.lower() not in ['accomplishments created', 'accomplishments']:
                if 2 <= len(ent.text.split()) <= 3:
                    return ent.text
    return None

## Block 2D: Extract Skills (from list)

In [40]:
# Example skill list – expand this based on your domain
SKILL_KEYWORDS = [
    "python", "machine learning", "data analysis", "excel", "sql", "nlp",
    "deep learning", "keras", "pytorch", "communication", "teaching"
]

def extract_skills(text):
    """
    Matches lowercase keywords from the skill list found in the resume text.
    """
    text_lower = text.lower()
    found_skills = [skill for skill in SKILL_KEYWORDS if skill in text_lower]
    return list(set(found_skills))

## Block 3: Run All Extraction Functions

In [41]:
# Load the extracted resume text saved from Notebook 1
with open('../data/parsed_output/sample_resume.txt', 'r', encoding='utf-8') as f:
    resume_text = f.read()

# Run all parsing functions
parsed_data = {
    "Name": extract_name(resume_text),
    "Email": extract_email(resume_text),
    "Phone": extract_phone(resume_text),
    "Skills": extract_skills(resume_text)
}

# Display output neatly
import pandas as pd
pd.DataFrame([parsed_data])

Unnamed: 0,Name,Email,Phone,Skills
0,Accomplishments Created,,,"[communication, excel]"
