# Resume Parser using spaCy

In [1]:
# !pip install pdfminer.six

In [2]:
import spacy
import random
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from pathlib import Path

2024-01-27 17:33:58.934766: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-27 17:34:03.072863: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-27 17:34:03.074410: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

In [3]:
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    """Return the text content of the PDF located at ``pdf_path``.

    Delegates to ``extract_text`` (imported from ``pdfminer.high_level``)
    and hands back whatever that call returns.
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text

In [4]:
def parse_resume(text):
    """Run the notebook-level ``nlp`` pipeline on ``text`` and collect its entities.

    Returns a list of ``(entity_text, entity_label)`` tuples, one per item of
    ``doc.ents``, in the order the pipeline reports them.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

In [5]:
# Load the "en_core_web_sm" pipeline and keep a handle on its 'ner' component.
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe('ner')
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
# `unaffected_pipes` is every pipeline component whose name is NOT listed in
# `pipe_exceptions`; `trainer` disables these components while it updates the model.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
unaffected_pipes

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']

In [7]:
# Custom entity labels to register on the NER component.
Label1 = "TENURE"
Label2 = "SKILLS"
Label3 = "COURSE"
Label4 = "INVOLVEMENT"
Label5 = "EXPERIENCE"
Label6 = "SOCIALS"
Label7 = "EDUCATION"
# Label8 = "ORG" # for education and work institutions

# Explicit list instead of globals()[f"Label{i}"] lookups over a hard-coded
# range, so adding or removing a label cannot silently fall out of sync.
NEW_LABELS = [Label1, Label2, Label3, Label4, Label5, Label6, Label7]

for new_label in NEW_LABELS:
    ner.add_label(new_label)

# Obtain the optimizer used to continue training the loaded pipeline.
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

In [8]:
# Display the labels registered on the NER component.
ner.labels

('CARDINAL',
 'COURSE',
 'DATE',
 'EDUCATION',
 'EVENT',
 'EXPERIENCE',
 'FAC',
 'GPE',
 'INVOLVEMENT',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'SKILLS',
 'SOCIALS',
 'TENURE',
 'TIME',
 'WORK_OF_ART')

In [9]:
resume = """Rupesh Ghimire
rupeshghimire007@gmail.com | +977-9868155925

EDUCATION
Pashchimanchal Campus, IOE, TU
Bachelor in Computer Engineering
2019-Present
Fusemachines
Micro-Degree in Artificial Intelligence
2023-Present
Fellowship Scholar
LINKS
Github:// rupeshghimire7
LinkedIn:// rupesh-ghimire7
Medium:// rupeshghimire7
LeetCode:// rupeshghimire7
INVOLVEMENTS
NAAMI | ANAIS-Student Ambassador
Apr 2023 - Jun 2023
i-CES | Django Mentor
Jan 2023 - Feb 2023 | Pokhara
Made attendees familiar with Python and Django's MVC
architecture, ORM, Templates and Rest Framework.
Coding Competition (GCES) - 2023
Code with Coffee (i-CES) - 2022
PROJECTS
LIVER CIRRHOSIS PREDICTION
Fusemachines ML Final Project | (Feb 2023- Apr 2023)
Worked on the multiclass classification problem with
various classifiers to predict the stage of patients' liver.
Deployed on Flask.
BCT Study Room
Software Engineering Project | (May 2022- Jul 2022)
A django web-app to enable discussion in groups.
BookSuggestor
COURSEWORKSDBMS Project
Leveraged RAW SQL for database management using
MySQL in Django bypassing Django's ORM.
UNDERGRADUATETelegram Chat Bot
Data Structures & Algorithms
Operating Systems
Database Management System
Software Engineering
Artificial Intelligence
Computer NetworksPersonal Project
Used Telegram API for configuration and python for
implementation.
Machine Learning AlgorithmsACHIEVEMENTS
FUSEMACHINES
PROGRAMMING SKILLS
EXPERIENCED
Python | Django
Tailwind CSS | HTML5 | CSS3
INTERMEDIATE
Pytorch
MySQL
Pandas | Numpy
Matplotlib | Seaborn | Scikit-Learn
RESTful API
C/C++
FAMILIAR
Tensorflow/ Keras
JavaScript | ReactJS
ML Projects
Personal Project
Collection of Regression and Classification projects.
HultPrize 2021
To build viable food enterprises to create jobs, stimulate
economies, reimagine supply chains, and improve outcomes for
10,000,000 people by 2030.
OnCampus: 1st Runner Up
Regional Summit: Participant via Wildcard
Golden Jubilee Scholarship Scheme
Awarded from Embassy of India, Kathmandu for
Undergrad Studies.
LANGUAGES
Nepali - Native Proficiency
English - Professional Working Proficiency"""

In [10]:
# Flatten the resume to one line: newlines become spaces and '|' separators are removed.
resume = resume.replace('\n', ' ').replace('|', '')

### TRAINER FUNCTION

Takes a list of training data of the form:
        
        [("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
        ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]})]
        
Resume Doc is the spaCy `Doc` object of the resume:

In [11]:
def trainer(DATA, n_iter=10, drop=0.5):
    """Update the notebook-level ``nlp`` pipeline on ``DATA``.

    Relies on the notebook globals ``nlp``, ``optimizer`` and
    ``unaffected_pipes``; the components named in ``unaffected_pipes`` are
    disabled for the duration of training.

    Args:
        DATA: list of ``(text, annotation)`` pairs, e.g.
            ``("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]})``.
            The list itself is left untouched.
        n_iter: number of passes over ``DATA`` (default 10).
        drop: dropout rate forwarded to ``nlp.update`` (default 0.5).

    Prints the accumulated ``losses`` dict after every pass; returns ``None``.
    """
    # Shuffle a private copy so the caller's list keeps its original order
    # (shuffling DATA directly reordered e.g. TRAIN_DATA behind the caller's back).
    shuffled = list(DATA)

    # `select_pipes(disable=...)` is the spaCy v3 replacement for the
    # deprecated `disable_pipes(*names)`.
    with nlp.select_pipes(disable=unaffected_pipes):
        for _ in range(n_iter):
            # Re-shuffle before every pass.
            random.shuffle(shuffled)
            losses = {}

            # Batch sizes come from the compounding(4.0, 32.0, 1.001) schedule.
            batches = minibatch(shuffled, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotation)
                    for text, annotation in batch
                ]
                nlp.update(
                    examples,
                    sgd=optimizer,
                    drop=drop,
                    losses=losses,
                )

            print("Losses", losses)

### Data Generator function

Label -> "SKILLS"

List of data -> ['React', 'Django', 'Machine Learning', 'AI']  

In [12]:
def data_generator(label, data_list, level=None):
    """Build NER training examples from standalone phrases.

    Each phrase becomes its own training text, annotated as one entity that
    spans the whole string.

    Args:
        label: entity label to assign, e.g. ``"SKILLS"``.
        data_list: iterable of phrases, e.g. ``['React', 'Django']``.
        level: accepted for backward compatibility; not used.

    Returns:
        A list of ``(phrase, {"entities": [(0, len(phrase), label)]})`` tuples
        in input order. Empty phrases are skipped, since they would produce a
        zero-length entity span.
    """
    return [
        (phrase, {"entities": [(0, len(phrase), label)]})
        for phrase in data_list
        if phrase
    ]

### View Results

In [13]:
def view_results(doc):
    """Print one ``LABEL: text`` line for every entity in ``doc.ents``."""
    for entity in doc.ents:
        print("{}: {}".format(entity.label_, entity.text))

### Creating data for technical skills i.e. Entity Label : SKILLS

In [14]:
# Phrases used as standalone SKILLS examples, grouped by category.
# NOTE: several entries (e.g. "React", "Django", "Pandas", "Flutter") are
# listed under more than one category, so they occur multiple times in this
# list and therefore in any training data built from it.
technical_skills = [
    # Programming Languages
    "Python", "Java", "C++", "JavaScript", "HTML", "CSS", "Ruby", "Go", "Swift", "Kotlin",
    "TypeScript", "Rust", "Scala", "PHP", "C#", "Objective-C",

    # Web Development
    "Django", "Flask", "Node.js", "Express.js", "React", "Angular", "Vue.js", "Next.js",
    "Spring Boot", "Ruby on Rails", "ASP.NET", "Meteor", "HTML5", "CSS3", "Bootstrap",

    # Databases
    "SQL", "MySQL", "PostgreSQL", "MongoDB", "Redis", "SQLite", "Firebase", "Cassandra",

    # Machine Learning / Data Science
    "TensorFlow", "PyTorch", "Scikit-learn", "Keras", "Pandas", "NumPy", "Matplotlib",
    "Seaborn", "NLTK", "Spacy", "Scrapy", "Beautiful Soup",

    # Cloud Computing
    "AWS", "Azure", "Google Cloud Platform (GCP)", "Docker", "Kubernetes", "Heroku",

    # DevOps
    "Jenkins", "Travis CI", "GitLab CI", "Ansible", "Terraform", "Docker Compose",

    # Version Control
    "Git", "GitHub", "Bitbucket", "GitLab",

    # Mobile Development
    "React Native", "Flutter", "Xamarin", "SwiftUI", "Android SDK",

    # Frameworks
    "React", "Angular", "Vue.js", "Django", "Flask", "Ruby on Rails", "Spring Boot", "Express.js",

    # Libraries
    "Pandas", "NumPy", "Matplotlib", "Seaborn", "NLTK", "Spacy", "Scrapy", "Beautiful Soup",

    # Frontend Technologies
    "React", "Angular", "Vue.js", "Next.js", "TypeScript", "Webpack", "Babel", "SASS",

    # Backend Technologies
    "Node.js", "Django", "Flask", "Spring Boot", "Ruby on Rails", "Express.js", "PHP",

    # Mobile Frameworks
    "React Native", "Flutter", "Xamarin", "SwiftUI", "Android SDK",

    # Networking
    "TCP/IP", "HTTP/HTTPS", "DNS", "Load Balancing", "Firewalls", "Proxy Servers",

    # Security
    "Cybersecurity", "Penetration Testing", "Cryptography", "OWASP", "SSL/TLS",

    # Operating Systems
    "Linux", "Windows", "macOS", "Unix",

    # Other Technologies
    "Blockchain", "Serverless", "Microservices", "RESTful API", "GraphQL",

    # Project Management / Agile
    "Scrum", "Kanban", "Agile", "JIRA", "Trello",

    # Miscellaneous
    "Data Manipulation", "Natural Language Processing (NLP)", "Computer Vision",
]
# Passed as the `level` argument of data_generator, which does not use it.
level = ['beginner', 'intermediate', 'expert']

In [15]:
# Build the SKILLS examples, then preview the first and last five of them.
TRAIN_DATA = data_generator("SKILLS", technical_skills, level)

for data in TRAIN_DATA[:5] + TRAIN_DATA[-5:]:
    print(data)
print("TRAIN_DATA length", len(TRAIN_DATA))

('Python', {'entities': [(0, 6, 'SKILLS')]})
('Java', {'entities': [(0, 4, 'SKILLS')]})
('C++', {'entities': [(0, 3, 'SKILLS')]})
('JavaScript', {'entities': [(0, 10, 'SKILLS')]})
('HTML', {'entities': [(0, 4, 'SKILLS')]})
('JIRA', {'entities': [(0, 4, 'SKILLS')]})
('Trello', {'entities': [(0, 6, 'SKILLS')]})
('Data Manipulation', {'entities': [(0, 17, 'SKILLS')]})
('Natural Language Processing (NLP)', {'entities': [(0, 33, 'SKILLS')]})
('Computer Vision', {'entities': [(0, 15, 'SKILLS')]})
TRAIN_DATA length 136


In [16]:
# Baseline: entities the pipeline predicts for the resume before any trainer() call.
view_results(nlp(resume))

PERSON: Rupesh Ghimire rupeshghimire007@gmail.com
PERSON: Pashchimanchal Campus
ORG: IOE
PERSON: TU Bachelor
DATE: 2019
PERSON: rupeshghimire7
ORG: rupeshghimire7 INVOLVEMENTS NAAMI  ANAIS-Student
DATE: 2023 - Jun 2023
DATE: Jan 2023 - Feb 2023
PERSON: Django
ORG: MVC
PERSON: Templates
FAC: Rest Framework
GPE: Flask
DATE: May 2022- Jul 2022
ORG: BookSuggestor COURSEWORKSDBMS Project Leveraged RAW SQL
GPE: Django
PERSON: Django
ORG: UNDERGRADUATETelegram Chat Bot Data Structures & Algorithms Operating Systems Database Management System Software Engineering Artificial Intelligence Computer NetworksPersonal Project
PERSON: Machine Learning
PERSON: Numpy Matplotlib
PERSON: Seaborn  Scikit-Learn
PERSON: Keras JavaScript  ReactJS
ORG: ML Projects Personal Project Collection of Regression and Classification
CARDINAL: 10,000,000
DATE: 2030
PERSON: OnCampus
CARDINAL: 1st
PERSON: Participant
PERSON: Wildcard Golden Jubilee Scholarship Scheme Awarded
GPE: Embassy
GPE: India
GPE: Kathmandu
PERSON:

In [17]:
# Train on the SKILLS examples built by data_generator.
trainer(DATA= TRAIN_DATA)

Losses {'ner': 207.89096086251678}
Losses {'ner': 100.04769532120554}
Losses {'ner': 72.37185845894317}
Losses {'ner': 66.0976101893466}
Losses {'ner': 42.52660317358459}
Losses {'ner': 28.068101785829885}
Losses {'ner': 15.049703359775833}
Losses {'ner': 12.398957576528375}
Losses {'ner': 7.972060559081265}
Losses {'ner': 10.40278832094074}


In [18]:
# Re-run the resume through the pipeline to inspect predictions after the SKILLS training.
view_results(nlp(resume))

SKILLS: Rupesh Ghimire rupeshghimire007@gmail.com
SKILLS: +977-9868155925  
SKILLS: EDUCATION Pashchimanchal Campus
SKILLS: TU Bachelor
SKILLS: in Computer Engineering 2019-Present Fusemachines Micro-Degree
SKILLS: Artificial Intelligence 2023-Present Fellowship Scholar
SKILLS: rupeshghimire7
SKILLS: rupesh-ghimire7 Medium:// rupeshghimire7
SKILLS: LeetCode:// rupeshghimire7 INVOLVEMENTS NAAMI  ANAIS-Student Ambassador Apr 2023 - Jun 2023
SKILLS: i-CES
SKILLS: Django Mentor
SKILLS: 2023 - Feb 2023
SKILLS: Pokhara Made
SKILLS: familiar
SKILLS: with
SKILLS: Python and Django's MVC
SKILLS: ,
SKILLS: ORM, Templates
SKILLS: Rest Framework
SKILLS: Coding Competition
SKILLS: 2023 Code with Coffee
SKILLS: i-CES) - 2022 PROJECTS LIVER CIRRHOSIS PREDICTION Fusemachines ML Final Project  (Feb 2023- Apr 2023) Worked
SKILLS: classifiers
SKILLS: .
SKILLS: Deployed on Flask
SKILLS: BCT Study Room Software Engineering Project
SKILLS: 2022- Jul 2022
SKILLS: web-app
SKILLS: .
SKILLS: BookSuggestor COURS

**The model now appears to label almost everything as a skill. We will still train the model further on more data and labels and see how it performs. If the final result isn't good, we will train the model directly on annotated resume data instead of on separate, standalone examples.**

### Train Tenure (e.g., Jan 2023 - Jan 2024)

In [19]:
# Date and date-range phrases in assorted formats, used as standalone TENURE examples.
date_phrases = [
    "January 2020 - Present",
    "2018 - 2022",
    "Dec 2015 - Mar 2018",
    "July 2021 - August 2023",
    "September 2019 - Nov 2020",
    "2020 - Present",
     "Oct 2018 - Apr 2020",
    "May 2016 - September 2019",
    "June 2019 - Dec 2021",
    "2005 - 2010",
    "Mar 2017 - Present",
    "July 2022",
    "Jan 2014 - Jun 2015",
    "August 2020 - Current",
    "Fall 2018 - Spring 2022",
    "Q3 2019 - Q1 2021",
    "Dec '19 - Feb '22",
    "3/2015 - 5/2018",
    "Apr 2008",
    "Summer 2016"
]

In [20]:
# Build the TENURE examples, then preview the first and last five of them.
TENURE_DATA = data_generator("TENURE", date_phrases, level)

for data in TENURE_DATA[:5] + TENURE_DATA[-5:]:
    print(data)
print("TENURE_DATA length", len(TENURE_DATA))

('Computer Vision', {'entities': [(0, 15, 'SKILLS')]})
('Android SDK', {'entities': [(0, 11, 'SKILLS')]})
('Android SDK', {'entities': [(0, 11, 'SKILLS')]})
('Unix', {'entities': [(0, 4, 'SKILLS')]})
('NumPy', {'entities': [(0, 5, 'SKILLS')]})
('macOS', {'entities': [(0, 5, 'SKILLS')]})
('CSS', {'entities': [(0, 3, 'SKILLS')]})
('Docker', {'entities': [(0, 6, 'SKILLS')]})
('Azure', {'entities': [(0, 5, 'SKILLS')]})
('Kubernetes', {'entities': [(0, 10, 'SKILLS')]})
TRAIN_DATA length 136


In [21]:
# Train on the TENURE examples built by data_generator.
trainer(TENURE_DATA)

Losses {'ner': 69.92621201228422}
Losses {'ner': 59.01772405664411}
Losses {'ner': 51.14020713698119}
Losses {'ner': 59.6613085282479}
Losses {'ner': 69.73185156157706}
Losses {'ner': 47.085694319714094}
Losses {'ner': 23.607129734009504}
Losses {'ner': 19.894113314076094}
Losses {'ner': 15.208815962891094}
Losses {'ner': 10.083870277058534}


In [22]:
# Re-run the resume through the pipeline to inspect predictions after the TENURE training.
view_results(nlp(resume))

TENURE: Rupesh Ghimire rupeshghimire007@gmail.com  
TENURE: Pashchimanchal Campus, IOE, TU Bachelor in Computer Engineering 2019-Present Fusemachines Micro-Degree in Artificial Intelligence 2023-Present Fellowship Scholar LINKS Github:// rupeshghimire7
TENURE: rupesh-ghimire7 Medium:// rupeshghimire7 LeetCode:// rupeshghimire7 INVOLVEMENTS NAAMI  ANAIS-Student Ambassador Apr 2023 - Jun 2023 i-CES  Django Mentor Jan 2023 - Feb 2023  
TENURE: Pokhara Made
TENURE: Python and Django's MVC
TENURE: ORM, Templates and Rest Framework.
TENURE: Coding Competition (GCES) - 2023 Code with Coffee (i-CES) - 2022 PROJECTS LIVER CIRRHOSIS PREDICTION Fusemachines ML Final Project  (Feb 2023- Apr 2023) Worked on the multiclass classification problem
TENURE: Deployed on Flask
TENURE: BCT Study Room Software Engineering Project  (May 2022- Jul 2022)
TENURE: BookSuggestor COURSEWORKSDBMS Project Leveraged RAW SQL for database management using MySQL in Django bypassing Django's ORM
TENURE: UNDERGRADUATETele