# **Stage 2: Resume Parser**

This notebook focuses on extracting skills, education, and experience from resumes.

In [None]:
!pip install nltk
!pip install spacy==2.3.5
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
!pip install numpy==1.26.4  pyyaml
!pip install pyresparser

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. PyResparser (Rule-Based Extraction)

A simple resume parser used for extracting information from resumes. pyresparser supports parsing PDF and DOCX files. It extracts the following features:

*  Name
*  Email
*  Mobile numbers
*  Skills
*  Total experience
*  College name
*  Degree
*  Designation
*  Company names

In [None]:
from pyresparser import ResumeParser
import pprint

# Parse the resume PDF and collect the extracted fields
resume_data = ResumeParser('/content/16852973.pdf').get_extracted_data()

# Pretty-print the extracted fields
pprint.pprint(resume_data)



{'college_name': None,
 'company_names': None,
 'degree': None,
 'designation': ['Assistant General Manager',
                 'HR Administrator/Marketing Associate HR Administrator',
                 'Executive Support / Marketing Assistant Jul'],
 'email': None,
 'experience': ['HR Administrator/Marketing Associate HR Administrator Dec '
                '2013 to Current',
                'Company Name ï¼\u200b City , State',
                'Helps to develop policies, directs and coordinates activities '
                'such as employment, compensation, labor relations, benefits, '
                'training, and',
                'employee services.',
                'Prepares employee separation notices and related '
                'documentation',
                'Keeps records of benefits plans participation such as '
                'insurance and pension plan, personnel transactions such as '
                'hires, promotions,',
                'transfers, performance reviews

## 2. SpaCy Custom NER Model

This model supports PDF, DOCX, and TXT files, as well as the resume dataset from Kaggle. We will load the resume dataset from Kaggle and extract entities (hard skills, soft skills, education, and experience).

In [None]:
import pandas as pd
import kagglehub
import pandas as pd
import os
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
import spacy

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Extract SpaCy custom NER model
#!pip install spacy-transformers
#!python -m spacy download en_core_web_trf

import zipfile
import os

zip_file_path = '/content/model-best1.zip'  # archive holding the custom spaCy NER model
extract_dir = '/content/model-best/'  # directory the archive is unpacked into

# Create the extraction directory; exist_ok=True keeps re-runs of this cell from failing
os.makedirs(extract_dir, exist_ok=True)

# Unpack every archive member into extract_dir (the with-block closes the archive)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Extracted model to: {extract_dir}")

Extracted model to: /content/model-best/


In [None]:
# Load the Kaggle resume dataset ("snehaanbhawal/resume-dataset")

# dataset_download returns the local path of the dataset files (printed below)
path_for_resume = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
print("Path to resume dataset files:", path_for_resume)
# The resumes are read from Resume/Resume.csv underneath that path
resume_df = pd.read_csv(os.path.join(path_for_resume, 'Resume', 'Resume.csv'))

Path to resume dataset files: /kaggle/input/resume-dataset


In [None]:
# Drop the raw-HTML column and duplicate resume texts without mutating in place.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError because 'Resume_html' has already been removed.
resume_df = (
    resume_df
    .drop(columns=['Resume_html'], errors='ignore')
    .drop_duplicates(subset=['Resume_str'])
)
print(f"Resumes after duplicate removal: {len(resume_df)}")

Resumes after duplicate removal: 2482


In [None]:
def preprocess_resume_text(raw_text):
    """Normalize dates, page markers, bullets and whitespace in one resume text.

    Steps, in order:
      1. remove ISO-style dates (YYYY-MM-DD) and "Page <n>" markers;
      2. replace bullet glyphs (U+2022, U+00B7, U+F0B7) with "-";
      3. collapse runs of two or more non-newline whitespace characters
         into a single space;
      4. collapse every whitespace run that contains a line break into a
         single newline, so blank lines disappear but line structure is kept.

    Args:
        raw_text: The resume text. Non-string values (e.g. a missing value
            coming from pandas) are treated as an empty resume.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or ""
        for non-string input.
    """
    if not isinstance(raw_text, str):
        return ""

    text = re.sub(r'\d{4}-\d{2}-\d{2}', '', raw_text)
    text = re.sub(r'Page\s+\d+', '', text)
    text = re.sub(r'[•·\uf0b7]', '-', text)
    # Collapse horizontal whitespace only. Using \s here would also swallow
    # newlines and make the line-break rule below unreachable.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    # Any whitespace run containing a newline becomes exactly one newline.
    text = re.sub(r'[^\S\n]*\n\s*', '\n', text)
    return text.strip()

# Overwrite Resume_str with its cleaned version (the uncleaned text is not kept)
resume_df['Resume_str'] = resume_df['Resume_str'].apply(preprocess_resume_text)

In [None]:
import spacy
import pandas as pd

# Load the custom NER model from the directory the archive was extracted to.
# Using extract_dir (instead of the relative "model-best") removes the hidden
# dependency on the kernel's working directory being /content.
nlp = spacy.load(extract_dir)

def extract_custom_ner(text):
    """Run the custom NER model on one resume and group entities by label.

    Args:
        text: Resume text. Line breaks are flattened to spaces before the
            text is passed to the module-level ``nlp`` pipeline. Non-string
            input (e.g. a missing value) is not sent to the model and yields
            empty lists.

    Returns:
        dict with the keys ``custom_ner_hard_skills``,
        ``custom_ner_soft_skills``, ``custom_ner_education`` and
        ``custom_ner_experience``. Each value is a sorted list of unique,
        lower-cased entity texts; sorting makes the output reproducible
        across runs (plain set-to-list conversion is not).
    """
    # Entity label (lower-cased) -> output key. Other labels are ignored.
    label_to_key = {
        "hard_skill": "custom_ner_hard_skills",
        "soft_skill": "custom_ner_soft_skills",
        "education": "custom_ner_education",
        "experience": "custom_ner_experience",
    }
    grouped = {key: set() for key in label_to_key.values()}

    if isinstance(text, str):
        doc = nlp(text.replace('\n', ' '))  # Flatten line breaks
        for ent in doc.ents:
            key = label_to_key.get(ent.label_.lower())
            if key is None:
                continue
            cleaned = ent.text.strip().lower().rstrip(",. ")
            # Entities that are only punctuation/whitespace clean down to ""
            # and carry no information, so they are dropped.
            if cleaned:
                grouped[key].add(cleaned)

    return {key: sorted(values) for key, values in grouped.items()}



In [None]:
import spacy
import pandas as pd

# Apply to first 2 rows
resume_subset = resume_df.iloc[:2].copy()
ner_outputs = resume_subset['Resume_str'].apply(extract_custom_ner)

# Build the entity table from the per-resume dicts. It gets a fresh 0..n-1
# index so that it lines up with the reset index of resume_subset below:
# concat(axis=1) aligns on index labels, so keeping the original labels on one
# side only would misalign rows whenever resume_df's index is not 0..n-1
# (e.g. after drop_duplicates removed an earlier row).
ner_outputs_df = pd.DataFrame(ner_outputs.tolist())

# Merge results back with original subset
result_df = pd.concat([resume_subset.reset_index(drop=True), ner_outputs_df], axis=1)

# Display results
display(result_df[['Resume_str', 'custom_ner_hard_skills', 'custom_ner_soft_skills', 'custom_ner_education', 'custom_ner_experience']])

Unnamed: 0,Resume_str,custom_ner_hard_skills,custom_ner_soft_skills,custom_ner_education,custom_ner_experience
0,HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMINI...,[designed and created marketing collateral for...,"[strives to instill a shared, enthusiastic com...",[high school diploma],[15+ years of experience in hospitality and cu...
1,"HR SPECIALIST, US HR OPERATIONS Summary Versat...","[increase product awareness, maintained the my...","[collaborated with communication team, coordin...",[],[]


In [None]:
# Print in Structured Json Format
import json

# One dict per resume, keyed by the NER output column names
ner_outputs_list = ner_outputs_df.to_dict(orient='records')

# Print every resume's entities as indented JSON under a numbered heading
separator = "-" * 20  # visual divider between resumes
for i, resume_output in enumerate(ner_outputs_list):
    pretty = json.dumps(resume_output, indent=4)
    print(f"Resume {i+1}:")
    print(pretty)
    print(separator)

Resume 1:
{
    "custom_ner_hard_skills": [
        "designed and created marketing collateral for sales meetings, trade shows and company executives",
        "customer service",
        "completed courses and seminars in",
        "statistics",
        "managed front-end operations",
        "marketing and advertising",
        "accounting",
        "reviewed medical bills for the accuracy of the treatments, tests, and hospital stays",
        "trained to interpret the codes (icd-9, cpt) and terminology commonly used in medical billing",
        "budgeting",
        "sales strategies",
        "marketing savvy",
        "helps to develop policies, directs and coordinates activities such as employment, compensation, labor relations, benefits, training, and employee services",
        "keeps records of benefits plans participation such as insurance and pension plan, personnel transactions such as hires, promotions, transfers, performance reviews, and terminations, and employee statisti

## 3. LLM-Based Resume Parsing (Gemini 2.5 Flash-Lite API)

In [None]:
# .copy() makes this an independent frame, so adding a column to it later does
# not trigger pandas' SettingWithCopyWarning on a slice of resume_df.
resume_subset = resume_df.iloc[:2].copy()

In [None]:
import json
import base64
import os
import pandas as pd
from google import genai
from google.genai import types
import time

# Read the API key from the GEMINI_API_KEY environment variable so that no
# credential has to be stored in the notebook. The placeholder is only a
# fallback and must be replaced (or the variable set) before real requests.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "Your Gemini API Key")

# Initialize Gemini Client
client = genai.Client(api_key=GEMINI_API_KEY)
model = "gemini-2.5-flash-lite-preview-06-17"
# NOTE(review): temperature=0.8 makes the extraction non-deterministic between
# runs — confirm this is intended for a parsing task.
generate_content_config = types.GenerateContentConfig(
    temperature=0.8,
    thinking_config=types.ThinkingConfig(thinking_budget=0),
    response_mime_type="text/plain",
)

def build_prompt(resume_text):
    """Build the extraction prompt sent to the LLM for one resume.

    The prompt asks for a JSON dictionary with the fields ``Education``,
    ``Experience``, ``Total_experience_years`` and ``Skills`` (with
    ``hard_skills`` and ``soft_skills``), followed by the resume wrapped in
    triple double quotes.

    Args:
        resume_text: The resume text to embed at the end of the prompt.

    Returns:
        The prompt string with surrounding whitespace stripped.
    """
    # The wording refers to the resume throughout; earlier text spoke of a
    # "job description", which contradicts the input actually supplied.
    return f"""
You are an expert resume parser. Read the resume text below and extract structured information in JSON format with the following fields:

1. Education: [Exact degrees, certifications, or academic qualifications mentioned in the resume (e.g., "Bachelor's in Computer Science", "Master's in Business Administration", "MBA", "Certified Public Accountant")]

2. Experience: [Specific statements about years of experience, industry experience, or role-based experience mentioned in the resume (e.g., "5+ years in project management", "experience in healthcare")]

3. Total_experience_years: [total years of professional experience, e.g. 5+ years]

4. Skills: {{
    "hard_skills": [Extract all hard/technical skills mentioned in the resume and its job responsibilities],
    "soft_skills": [Extract all soft/interpersonal/communication/managerial skills mentioned or implied in the resume and its job responsibilities]
}}

Only return a clean JSON dictionary with the above fields.
Exclude any field if the information is missing or unclear.
Use double quotes around all field values.
Do not include generic skills, vague summaries, or industry norms unless they are stated directly in the Resume.

Resume:
\"\"\"{resume_text}\"\"\"
""".strip()


def clean_json_fences(s: str) -> str:
    """Remove Markdown code fences and a leading 'json' marker from a reply.

    Non-string values are handed back unchanged. For strings, an opening
    "```json" and/or "```", a closing "```", and then a leading
    case-insensitive "json" token are removed in that order, trimming
    surrounding whitespace after every removal.
    """
    if not isinstance(s, str):
        return s

    cleaned = s.strip()

    # Opening fence: the language-tagged form is checked before the bare one
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:].strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:].strip()

    # Closing fence
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3].strip()

    # Bare language marker left in front of the payload
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:].strip()

    return cleaned

def query_gemini(resume_text):
    """Send one resume to Gemini and return the fence-stripped response text.

    The request wraps ``build_prompt(resume_text)`` in a single user message
    and is streamed through the module-level ``client`` using ``model`` and
    ``generate_content_config``. Non-empty chunk texts are concatenated and
    passed through ``clean_json_fences``.

    Returns:
        The cleaned response string, or ``None`` when an exception occurred
        while streaming (the error is printed, not re-raised).
    """
    user_message = types.Content(
        role="user",
        parts=[{"text": build_prompt(resume_text)}],
    )
    try:
        stream = client.models.generate_content_stream(
            model=model,
            contents=[user_message],
            config=generate_content_config,
        )
        # Chunks without text are skipped
        pieces = [chunk.text for chunk in stream if chunk.text]
        return clean_json_fences("".join(pieces))
    except Exception as e:
        print(f"Error: {e}")
        return None

# Create a list to store responses
llm_responses = []

# The dataset loaded in this notebook names the text column 'Resume_str';
# fall back to 'resume_text' for frames that already use that column name.
text_column = "Resume_str" if "Resume_str" in resume_subset.columns else "resume_text"

for idx, row in resume_subset.iterrows():
    print(f"Processing index {idx}...")
    job_text = row[text_column]
    result = query_gemini(job_text)
    llm_responses.append(result)
    time.sleep(4.1)  # pause between requests — presumably to respect the API rate limit

# Add the responses as a new column. assign() returns a new DataFrame, which
# avoids SettingWithCopyWarning when resume_subset is a slice of resume_df.
resume_subset = resume_subset.assign(LLM_response=llm_responses)

Processing index 0...
Processing index 1...


In [None]:
resume_subset

Unnamed: 0,ID,Category,resume_text,LLM_response
0,16852973,HR,HR ADMINISTRATOR/MARKETING ASSOCIATE HR ADMINI...,"{\n ""Education"": [\n ""Business Administrat..."
1,22323967,HR,"HR SPECIALIST, US HR OPERATIONS Summary Versat...","{\n ""Education"": [\n ""Master of Arts : Cor..."


In [None]:
import pandas as pd
import json

# Function to safely parse the LLM response
def safe_parse(text):
    """Parse one LLM response into a dict, falling back to an empty template.

    Args:
        text: A dict (returned unchanged), a JSON string, or anything else
            (e.g. ``None`` for a failed request).

    Returns:
        The parsed dict when ``text`` is a dict or a JSON string encoding an
        object. Otherwise a template with empty ``Experience``, ``Education``,
        ``Skills`` (``hard_skills`` / ``soft_skills``) and
        ``Total_experience_years`` entries. The template keys match the field
        names requested in the prompt, so failed rows land in the same columns
        as successful ones.
    """
    # If already a dictionary, return as-is
    if isinstance(text, dict):
        return text

    if isinstance(text, str):
        try:
            parsed = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        # Valid JSON that is not an object (list, number, ...) cannot be
        # expanded into named columns, so it is treated like a parse failure.
        if isinstance(parsed, dict):
            return parsed

    # Fallback default in case of error or invalid format
    return {
        "Experience": [],
        "Education": [],
        "Skills": {
            "hard_skills": [],
            "soft_skills": []
        },
        "Total_experience_years": ""
    }

# Parse every LLM response, then expand the resulting dicts into columns
parsed_records = resume_subset["LLM_response"].apply(safe_parse)
parsed_df = parsed_records.apply(pd.Series)

# Place the structured columns next to the original resume columns
final = pd.concat([resume_subset, parsed_df], axis=1)

# Show the first resume's parsed response as indented JSON
first_response = resume_subset["LLM_response"].iloc[0]
sample_json = safe_parse(first_response)
print(json.dumps(sample_json, indent=4))


{
    "Education": [
        "Business Administration",
        "Marketing / Advertising",
        "High School Diploma"
    ],
    "Experience": [
        "15+ years of experience in Hospitality and Customer Service Management"
    ],
    "Total_experience_years": "15+ years",
    "Skills": {
        "hard_skills": [
            "HR Administrator",
            "Marketing Associate",
            "Medical Claims Analyst",
            "Reservation & Front Office Manager",
            "Price Integrity Coordinator",
            "HR policies",
            "compensation",
            "labor relations",
            "benefits administration",
            "training",
            "employee services",
            "employee separation",
            "personnel records",
            "government reporting",
            "employee relations",
            "insurance",
            "pension plans",
            "vacation",
            "sick leave",
            "leave of absence",
            "employee assi

In [None]:
# Same check for the second resume: parse its LLM response and pretty-print it
sample_json = safe_parse(resume_subset["LLM_response"].iloc[1])
print(json.dumps(sample_json, indent=4))

{
    "Education": [
        "Master of Arts : Corporate Communication & Public Relations",
        "Bachelor of Arts : Relational Communication"
    ],
    "Experience": [
        "HR Specialist, US HR Operations",
        "IT, Marketing and Communications Co - op",
        "Relationship Coordinator/Marketing Specialist",
        "Assistant Head Teller",
        "Senior Producer"
    ],
    "Total_experience_years": "9+ years",
    "Skills": {
        "hard_skills": [
            "Adobe Photoshop",
            "ADP",
            "Asset Management",
            "branding",
            "brochures",
            "content",
            "Final Cut Pro",
            "graphics",
            "graphic",
            "Illustrator",
            "In Design",
            "inventory",
            "Lotus Notes",
            "marketing",
            "marketing materials",
            "marketing material",
            "materials",
            "Microsoft Office",
            "Share Point",
            "n