RESUME CLASSIFIER

1. Tokenization
2. StopWords
3. Rule Based Matching

    1. Token Based Matching
    2. Phrase Matcher
    3. Dependency Matcher

In [1]:
# import the libraries
import os
import sys
import spacy
#import docx
from tqdm import tqdm
import pandas as pd
import numpy as np
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import json 
import random
import re
from spacy.matcher import Matcher, PhraseMatcher
from os import listdir
from os.path import isfile, join 
from io import StringIO
from collections import Counter
import pickle
import plotly.express as px
import plotly.graph_objects as go
print("All libraries are imported")

All libraries are imported


In [2]:
# Load the labelled resume dataset from the working directory and preview the first rows.
# NOTE: `df` is rebound later in this notebook to the resumes read from `batch_1`.
df = pd.read_csv('UpdatedResumeDataSet.csv')
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [3]:
# Keep the first resume text as a small sample and print its first 100 characters.
first_resume = df['Resume'][0]
print(first_resume[:100])

Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, 


In [4]:
# Load spaCy's small pretrained English pipeline.
# get_name, get_education and get_skills all read this module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [5]:
# initiate the matcher with vocabulary

### extract the names

In [6]:
def get_name(resume_text):
    """Guess a candidate name from raw resume text.

    The text is parsed with the module-level ``nlp`` pipeline and scanned for
    two adjacent tokens tagged as proper nouns (``PROPN``). This is only a
    heuristic: any pair of proper nouns qualifies, not just a person's name.

    Parameters
    ----------
    resume_text : str
        Raw text of a resume.

    Returns
    -------
    str or None
        Text of the first match reported by the matcher, or ``None`` when
        the text contains no such pair.
    """
    name_matcher = Matcher(nlp.vocab)
    name_matcher.add('RESUME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])

    parsed = nlp(resume_text)
    hits = name_matcher(parsed)
    if not hits:
        return None

    # Only the first reported match is used.
    _, first, last = hits[0]
    return parsed[first:last].text

In [7]:
# Try the name extractor on a hand-written sentence and on the sample resume.
for sample in ('My name is Prabhavith Reddy', first_resume):
    print(get_name(sample))

Prabhavith Reddy
Programming Languages


In [8]:
# Read up to 100 resume files from the `batch_1` folder into {'resume': [text, ...]}.
data = {'resume': []}
# os.listdir returns names in arbitrary order; sorting keeps the 100-file sample
# the same from one run (and one machine) to the next.
for file in sorted(os.listdir('batch_1'))[:100]:
    # os.path.join instead of a hard-coded '\\' so the cell also runs outside Windows.
    # The `with` block closes the file, so no explicit close() is needed.
    with open(os.path.join('batch_1', file), 'r', encoding='utf-8') as f:
        data['resume'].append(f.read().strip())

In [9]:
# Wrap the collected texts in a DataFrame with a single 'resume' column and preview it.
# NOTE: this rebinds `df`, which previously held the contents of UpdatedResumeDataSet.csv.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,resume
0,"Arghya Das\nMy Profile\n\nPROJECT ENGINEER, Ba..."
1,CURRICULUM VITAE ...
2,VALLABH KALAGI\n\nSachin Jadhav\nE-mail: sachi...
3,RESUME\n\nNITIN JAIN \n\n\n\n\n\n \n\n...
4,SANTOSH KUMAR DHAL\n\n\nMobile No: +8553782862...


In [10]:
# Spot-check the name extractor on the first ten resumes.
for idx in range(10):
    print(get_name(df['resume'][idx]))

Arghya Das
CURRICULUM VITAE
VALLABH KALAGI
NITIN JAIN
SANTOSH KUMAR
Prem Prakash
CURRICULUM VITAE
CURRICULUM VITAE
Sunil Kumar
ALKA TIWARI


### extract contact numbers

In [11]:
def get_contact(text):
    """Return the first phone-number-like string found in ``text``.

    Recognised shapes are 10-digit numbers (optionally split 3-3-4 by ``-``,
    ``.`` or whitespace), ``(ddd) ddd-dddd`` style numbers, and 7-digit
    numbers split 3-4.

    Parameters
    ----------
    text : str
        Text to search. Empty or ``None`` input yields ``None``.

    Returns
    -------
    str or None
        The number exactly as it appears in the text, prefixed with ``+``
        only when it contains more than 10 digits; ``None`` when nothing
        matches.
    """
    if not text:
        return None

    pattern = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    match = pattern.search(text)
    if match is None:
        return None

    number = match.group(0)
    # Count digits only: separators and brackets used to inflate the length,
    # so "123-456-7890" was wrongly returned as "+123-456-7890".
    digit_count = len(re.sub(r'\D', '', number))
    if digit_count > 10:
        return '+' + number
    return number

In [12]:
# Spot-check the phone-number extractor on the first ten resumes.
for idx in range(10):
    print(get_contact(df['resume'][idx]))

9972364371
8734897900
9561199772
8010475082
8553782862
9654454943
9664269288
9748636525
9743784702
9599200393


### extract the email address

In [13]:
def get_email(text):
    """Return the first e-mail address found in ``text``.

    Parameters
    ----------
    text : str
        Text to search. Empty or ``None`` input yields ``None``.

    Returns
    -------
    str or None
        The first substring shaped like ``local@domain.tld`` (TLD of at least
        two letters), or ``None`` when there is none.
    """
    if not text:
        return None

    # The TLD class is [A-Za-z]: inside a character class '|' is a literal
    # pipe, so the former [A-Z|a-z] accepted things like "x@y.c|m".
    pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    match = pattern.search(text)
    return match.group(0) if match else None

In [14]:
# Spot-check the e-mail extractor on the first ten resumes.
for idx in range(10):
    print(get_email(df['resume'][idx]))

arg234@gmail.com
Darjipooja3638@gmail.com
sachinjadhav58@gmail.com
nitin9482@gmail.com
santoshkumardhal1993@gmail.com
premgautam958@gmail.com
Prajaktashinde2211@gmail.com
saikatdutta030@gmail.com
sunilkumarbt01@gmail.com
alkatiwari124@gmail.com


### extract educational qualifications

In [28]:
# Degree and school-board abbreviations looked up by get_education.
# Membership is tested with `in`, so matching is case-sensitive.
EDUCATION = [
    'BE', 'B.E.', 'B.E',
    'BS', 'B.S',
    'ME', 'M.E', 'M.E.',
    'MS', 'M.S',
    'BTECH', 'B.TECH',
    'M.TECH', 'MTECH',
    'SSC', 'HSC', 'CBSE', 'ICSE',
    'X', 'XII',
]

In [29]:
def get_education(text):
    """Extract education qualifications (and a year, when present) from text.

    The text is split into sentences with the module-level ``nlp`` pipeline.
    Each whitespace-separated word has the characters ``? | ! , . $`` removed
    and is then looked up, case-sensitively, in ``EDUCATION``; spaCy stop
    words are skipped. For every qualification found, the first run of four
    digits in the sentence that mentions it is reported as the year.

    Parameters
    ----------
    text : str
        Raw resume text.

    Returns
    -------
    list
        One entry per distinct qualification, in order of first appearance:
        a ``(qualification, year)`` tuple when a four-digit number occurs in
        the sentence, otherwise the bare qualification string. If a
        qualification appears in several sentences, the last one is used.
    """
    parsed = nlp(text)
    # Span.text is available in spaCy 2 and 3; Span.string was removed in v3.
    sentences = [sent.text.strip() for sent in parsed.sents]

    # Map each qualification to the sentence that mentions it.
    found = {}
    for sentence in sentences:
        for word in sentence.split():
            # Strip punctuation so that e.g. "B.E." or "MS," matches "BE" / "MS".
            word = re.sub(r'[?|!,.$]', '', word)
            if word in EDUCATION and not nlp.vocab[word].is_stop:
                # Store the sentence once. It used to be concatenated with
                # itself, which could fabricate a 4-digit "year" across the
                # seam (e.g. "...12" + "34...").
                found[word] = sentence

    year_pattern = re.compile(r'\d{4}')
    education = []
    for degree, sentence in found.items():
        year = year_pattern.search(sentence)
        if year:
            education.append((degree, year.group(0)))
        else:
            education.append(degree)
    return education

In [30]:
# Spot-check the education extractor on the first ten resumes.
for idx in range(10):
    print(get_education(df['resume'][idx]))

[('MS', '2005')]
[('HSC', '2012'), ('SSC', '2012')]
[]
['MS']
[]
[]
[('HSC', '2012'), 'SSC']
[]
[]
[('XII', '2015'), ('CBSE', '2009'), ('X', '2008')]


### extract the skills


In [31]:
# Load the skills table; get_skills() looks terms up in its 'skill_name' column.
# NOTE(review): the CSV is fetched over the network on every run, and this rebinds
# `data`, which earlier in the notebook held the dict of resume texts.
data = pd.read_csv('https://raw.githubusercontent.com/robinsones/Freelancer-Shiny-App/master/skills.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,ciphertext,skill_name,skill_rank
0,10,~01d3ad50155896e4ae,econometrics,2
1,11,~01d3ad50155896e4ae,excel-vba,10
2,12,~01d3ad50155896e4ae,financial-analysis,9
3,13,~01d3ad50155896e4ae,financial-modeling,8
4,14,~01d3ad50155896e4ae,financial-writing,5


In [45]:
def get_skills(text):
    """Return the known skills mentioned in ``text``.

    Candidates are the lower-cased non-stop-word tokens (single words) and
    the lower-cased, stripped noun chunks (multi-word phrases) produced by
    the module-level ``nlp`` pipeline. A candidate counts as a skill when it
    appears in the ``skill_name`` column of the module-level ``data`` table.

    Parameters
    ----------
    text : str
        Raw resume text.

    Returns
    -------
    set of str
        Lower-cased skills found in the text; empty when none match.
    """
    parsed = nlp(text)

    # A set gives O(1) lookups; the former list was scanned once per token.
    known_skills = set(data['skill_name'].values)

    # Single-word candidates: every token that is not a stop word.
    unigrams = {tok.text.lower() for tok in parsed if not tok.is_stop}
    # Multi-word candidates: noun chunks cover bigrams, trigrams, etc.
    phrases = {chunk.text.lower().strip() for chunk in parsed.noun_chunks}

    return (unigrams | phrases) & known_skills

In [46]:
# Spot-check the skills extractor on the first ten resumes.
for idx in range(10):
    print(get_skills(df['resume'][idx]))

{'research', 'html', 'sql', 'javascript', 'asp.net', 'css', 'ado.net', 'english', 'analytics'}
{'cooking', 'html', 'sql', 'writing', 'hindi', 'c', 'xhtml', 'asp.net', 'vb.net', 'english', 'sales', 'analytics', 'seo'}
{'scripting', 'sales', 'com', 'html', 'sql', 'glassfish', 'crm', 'xml', 'splunk', 'java', 'jboss', 'english', 'itil', 'analytics', 'hindi'}
{'accounting', 'research', 'animation'}
{'scripting', 'animation', 'transcription', 'javascript', 'html', 'jquery', 'sql', 'english', 'bootstrap', 'c', 'asp.net', 'css', 'ajax', 'iis', 'hindi'}
{'s', 'english', 'cooking', 'hindi'}
{'english', 'lms', 'scheme', 'music', 'hindi'}
{'accounting', 'filing', 'basic', 'bengali', 'english', 'sales', 'hindi'}
{'research', 'html', 'writing', 'indexing', 'selling', 'hindi', 'english', 'sales', 'analytics', 'seo'}
{'cics', 'robotics', 'jcl', 'sketching', 'cooking', 'sql', 'mysql', 'cobol', 'sas'}


### extract roles

In [47]:
# No dataset listing industry roles is available yet; with one, roles could be extracted the same way as skills.

### COMPILE ALL INFORMATION

In [48]:
def compile_info(text):
    """Run every extractor on one resume and collect the results.

    Parameters
    ----------
    text : str
        Raw resume text.

    Returns
    -------
    dict
        Keys ``'Name'``, ``'Contact'``, ``'Email'``, ``'Education'`` and
        ``'Skills'``, holding the results of ``get_name``, ``get_contact``,
        ``get_email``, ``get_education`` and ``get_skills`` respectively.
    """
    return {
        'Name': get_name(text),
        'Contact': get_contact(text),
        'Email': get_email(text),
        'Education': get_education(text),
        'Skills': get_skills(text),
    }

In [50]:
# Build one info dict per resume; tqdm shows progress while the rows are processed.
all_docs = [
    compile_info(df['resume'][row])
    for row in tqdm(range(df.shape[0]))
]

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:03<00:00,  1.83s/it]


In [53]:
# Tabulate the per-resume info dicts (one row per resume) and preview the first rows.
pd.DataFrame(all_docs).head()

Unnamed: 0,Name,Contact,Email,Education,Skills
0,Arghya Das,9972364371,arg234@gmail.com,"[(MS, 2005)]","{research, html, sql, javascript, asp.net, css..."
1,CURRICULUM VITAE,8734897900,Darjipooja3638@gmail.com,"[(HSC, 2012), (SSC, 2012)]","{cooking, html, sql, writing, hindi, c, xhtml,..."
2,VALLABH KALAGI,9561199772,sachinjadhav58@gmail.com,[],"{scripting, sales, com, html, sql, glassfish, ..."
3,NITIN JAIN,8010475082,nitin9482@gmail.com,[MS],"{accounting, research, animation}"
4,SANTOSH KUMAR,8553782862,santoshkumardhal1993@gmail.com,[],"{scripting, animation, transcription, javascri..."


### END