# Resume Parser with Spacy

This notebook parses resumes and extracts their universities, companies, skills, and total years of work experience, and predicts whether each candidate has programming experience and database experience.

## Libraries

In [1]:
import math
import os
import pickle
import random
import re
from collections import defaultdict
from datetime import date

import natsort
import numpy
import pandas
import spacy
from pdfminer.high_level import extract_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from spacy.training.example import Example

import configurations as regex

## Data Preparation

In [2]:
# Load the spreadsheet of information already extracted from the resumes.
# Columns read later in this notebook include 'UniversityofUSAttendees',
# 'UniversityofIndianAttendees', 'ProgrammingExperience' and 'DatabaseExperience'.
test_data = pandas.read_excel('input.xlsx')

In [3]:
# Keep the rows from position 100 onwards as the test set (expected to be the last
# 117 resumes). The original index labels are preserved, so the first test row is
# still labelled 100.
# .copy() makes the test set an independent frame: new columns are added to it later,
# and writing into a plain slice raises pandas' SettingWithCopyWarning.
test_data = test_data[100:].copy()

In [4]:
# directory holding the resume files, and the names of every entry in it
# (os.listdir returns them in arbitrary order)
path = './train'
files = os.listdir(path)

In [6]:
# natural sort of the filenames, so numeric parts are ordered by value ("2" before "10")
# NOTE(review): presumably this makes the file order match the spreadsheet row order - confirm
files = natsort.natsorted(files)

In [7]:
# keep only the files from position 100 onwards (expected to be the last 117 files),
# mirroring the row slice applied to test_data.
# Re-running this cell on its own slices the list again.
files = files[100:]

In [9]:
# read the text of every selected resume file, in the same order as `files`
resume_text_list = [extract_text(path + '/' + file_name) for file_name in files]

In [10]:
# attach the extracted text to the test rows; the list must contain exactly one
# entry per row of test_data, in row order
test_data['resume_text'] = resume_text_list

In [12]:
def cleanResume(resumeText):
    '''Clean raw resume text for downstream parsing.

    URLs and ASCII punctuation characters are each replaced by a space, then every
    run of whitespace (spaces, tabs, newlines) is collapsed into a single space.

    Parameters
    ----------
    resumeText : str
        Raw text extracted from a resume.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space, not stripped.
    '''
    # URLs are removed first, while ':', '/' and '.' are still present to hold them together.
    # Raw string: \S and \s must reach the regex engine as character classes.
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # character class built from the ASCII punctuation set, backslash included
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    # \s also matches '\n', so newlines are handled here and need no separate pass
    resumeText = re.sub(r'\s+', ' ', resumeText)  # collapse whitespace and newlines

    return resumeText

In [13]:
# clean every extracted resume and store the result in a new column
test_data['cleaned_resume'] = test_data['resume_text'].apply(cleanResume)

In [15]:
# single-column DataFrame (note the double brackets) holding only the cleaned resume text
cleaned_resume = test_data[['cleaned_resume']]

## Load the pre-trained model

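Now let us load the trained spaCy model. It was trained on 300 resumes (note: the saved model name, `nlp_model_with_100_resumes_tuned`, mentions 100 resumes; confirm which figure is current).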

In [23]:
# load the saved spaCy pipeline; its entities are used below to extract
# 'College Name', 'Companies worked at' and 'Skills'
nlp_model = spacy.load('nlp_model_with_100_resumes_tuned')

In [24]:
# display the loaded pipeline object to confirm the load succeeded
nlp_model

<spacy.lang.en.English at 0x1abeb8c72b0>

## University Parsing

First, let us parse the universities from each resume from the test set

In [255]:
# Run the model on every test resume and keep the text of each entity labelled
# 'College Name'. Keys are resume numbers, starting at 101 for the first test resume.
prediction_college_dict = defaultdict(list)
index = 100
for index, resume in enumerate(cleaned_resume['cleaned_resume'], start=index + 1):
    parsed = nlp_model(resume)  # passing the resume text to the model
    # the model might return more than one match (or none), hence a list per resume
    prediction_college_dict[index] = [
        entity.text for entity in parsed.ents if entity.label_ == 'College Name'
    ]

In [257]:
# De-duplicate the predicted universities of each resume, keeping first-seen order.
# dict.fromkeys is used instead of set() because set iteration order for strings
# changes from run to run (hash randomisation), which would make the exported
# predictions differ between otherwise identical runs.
for key, value in prediction_college_dict.items():
    prediction_college_dict[key] = list(dict.fromkeys(value))

In [259]:
# Getting the actual universities for each resume from the test data for comparing with the model results.
# The US university column is preferred; the Indian university column is the fallback
# when the US value is missing (NaN).

actual_college_dict = {}
for idx in range(100, 217):
    index = idx + 1  # dictionary keys are resume numbers (101..217), one more than the row label
    us_college = test_data.loc[idx, 'UniversityofUSAttendees']
    # the missing-value test must look at the current row's value, not at a fixed row
    if isinstance(us_college, float) and numpy.isnan(us_college):
        actual_college_dict[index] = test_data.loc[idx, 'UniversityofIndianAttendees']
    else:
        actual_college_dict[index] = us_college

In [261]:
# Flatten both dictionaries into parallel lists ordered by resume number (101..217);
# the lists are zipped into a dataframe in the next step.
actual = [actual_college_dict[resume_no] for resume_no in range(101, 218)]
prediction = [prediction_college_dict[resume_no] for resume_no in range(101, 218)]

In [263]:
# creating a college dataframe with one row per resume: the actual university next to
# the list of predicted universities
college_df = pandas.DataFrame(list(zip(actual,prediction)),columns=['Actual College Name','Predicted College Name'])

In [265]:
# exporting the college dataframe to an Excel file in the working directory
# (header row written, index column omitted)
college_df.to_excel('College Predictions.xlsx',header=True,index=False)

## Company parsing

Now, let us parse the companies from each resume from the test set

In [266]:
# Run the model on every test resume and keep the text of each entity labelled
# 'Companies worked at'. Keys are resume numbers, starting at 101.
company_dict = defaultdict(list)
index = 100
for index, resume in enumerate(cleaned_resume['cleaned_resume'], start=index + 1):
    parsed = nlp_model(resume)    # passing the resume text to the model
    # the model might return more than one match (or none), hence a list per resume
    company_dict[index] = [
        entity.text for entity in parsed.ents if entity.label_ == 'Companies worked at'
    ]

In [269]:
# accumulator for the per-resume lists of predicted companies
prediction_company_list = []

In [270]:
# append each resume's list of companies, in the dictionary's insertion (resume) order
prediction_company_list.extend(company_dict.values())

In [274]:
# creating a company dataframe containing the predicted companies;
# zip() wraps each resume's list in a 1-tuple so the whole list lands in a single cell
company_df = pandas.DataFrame(list(zip(prediction_company_list)),columns=['Predicted Companies'])

In [276]:
# exporting the company dataframe to an Excel file in the working directory
# (header row written, index column omitted)
company_df.to_excel('Company_Predictions.xlsx',header=True,index=False)

## Skills parsing

Now, let us parse the skills from each resume from the test set

In [23]:
# Run the model on every test resume and keep the text of each entity labelled
# 'Skills'. Keys are resume numbers, starting at 101.
skills_dict = defaultdict(list)
index = 100
for index, resume in enumerate(cleaned_resume['cleaned_resume'], start=index + 1):
    parsed = nlp_model(resume)   # passing the resume text to the model
    # the model might return more than one match (or none), hence a list per resume
    skills_dict[index] = [
        entity.text for entity in parsed.ents if entity.label_ == 'Skills'
    ]

In [27]:
# one list of predicted skills per resume, in the dictionary's insertion (resume) order.
# The source must be skills_dict, the dictionary filled by the skills extraction loop.
skills_list = list(skills_dict.values())

In [30]:
# creating a skills dataframe containing the predicted skills;
# zip() wraps each resume's list in a 1-tuple so the whole list lands in a single cell
skills_df = pandas.DataFrame(list(zip(skills_list)),columns=['Predicted Skills'])

In [33]:
# exporting the skills dataframe to an Excel file in the working directory
# (header row written, index column omitted)
skills_df.to_excel('Skills_Prediction.xlsx',header=True,index=False)

## Work Experience Parsing

Now, let us parse the total years of work experience for each resume

In [121]:
# list of possible section keywords that can be found in a resume.
# All entries are lowercase because they are matched against lowercased resume text;
# each phrase is listed with and without the space between its words.
# Every entry contains 'experience', so the first keyword matches wherever any other one does.
experience = ['experience', 'work experience', 'workexperience', 'professional experience', 'professionalexperience',
             'industry experience','industryexperience','industrial experience','industrialexperience']

In [83]:
# Estimate each resume's total years of work experience as
#     (latest end year) - (earliest start year)
# over all date ranges found in the text that follows an experience section keyword.
# Keys are resume numbers, starting at 101; the value is 0 when nothing usable is found.
experience_dict = defaultdict(list)

# compiled once: the patterns are the same for every resume
date_range_regex = re.compile(regex.date_range, re.IGNORECASE)    # reg expression to check for date range
year_regex = re.compile(regex.year)  # reg expression to check for a year

index = 100
for res_text in cleaned_resume['cleaned_resume']:  # loop over the resume text

    index += 1

    start_year = -1  # earliest start year seen so far (-1 means none yet)
    end_year = -1    # latest end year seen so far (-1 means none yet)

    resume_text = res_text.lower()  # convert the entire resume text to lowercase
    exp_text_found = ''
    for exp in experience:  # loop over all possible experience keywords
        exp_text = resume_text.split(exp)
        if len(exp_text) > 1:  # if any one of the experience section keyword found
            # keep the text after the LAST occurrence of the keyword; a keyword later
            # in the list overrides the text chosen for an earlier one
            exp_text_found = exp_text[-1]

    if len(exp_text_found) == 0:  # if there is no experience section keyword found
        experience_dict[index] = 0   # assigning experience as 0
        continue

    regex_result = date_range_regex.search(exp_text_found)
    while regex_result:   # if a date range is found
        date_range = regex_result.group()

        # a match object is only dereferenced after checking it is not None:
        # a date range is not guaranteed to contain text the year pattern accepts
        first_year = year_regex.search(date_range)
        if first_year is not None:
            year_value = int(first_year.group())
            if (start_year == -1) or (year_value <= start_year):
                start_year = year_value

        if 'present' in date_range.lower():  # if there is a present string in the date range
            end_year = date.today().year # current year
        elif first_year is not None:
            # the end year is searched for only after the start year's position
            second_year = year_regex.search(date_range[first_year.end():])
            if second_year is not None:
                year_value = int(second_year.group())
                if (end_year == -1) or (year_value >= end_year):
                    end_year = year_value

        if regex_result.end() == 0:
            # a zero-length match at the start would never advance the text
            break
        exp_text_found = exp_text_found[regex_result.end():]
        regex_result = date_range_regex.search(exp_text_found)

    if (start_year == -1) or (end_year == -1):
        # no complete start/end pair was found
        experience_dict[index] = 0
    else:
        experience_dict[index] = end_year - start_year   # total experience = end working year - start working year

In [203]:
# predicted years of experience, in the dictionary's insertion (resume) order
pred_exp = list(experience_dict.values())

In [205]:
# creating an experience dataframe containing the total years of work experience,
# one row per resume
experience_df = pandas.DataFrame(pred_exp,columns=['Pred_exp'])

In [207]:
# exporting the experience dataframe to an Excel file in the working directory.
# Unlike the other exports in this notebook, header=False here, so the 'Pred_exp'
# column name is not written to the file.
experience_df.to_excel('Work Experience Prediction.xlsx',index=False,header=False)

## Classifiers

Now, let us build random forest classifiers to predict whether each candidate has programming experience and database experience.
For this, let's read all the resumes, then use the first 100 resumes as the training set and the remaining 117 resumes as the test set.

In [None]:
# reading the full spreadsheet again, this time keeping every row (train and test);
# the 'ProgrammingExperience' and 'DatabaseExperience' columns are used as labels below
input_data = pandas.read_excel('input.xlsx')

In [None]:
# getting the resume filenames: every entry of the resume directory
# (os.listdir returns them in arbitrary order; they are sorted in the next step)
path = './train'
files = os.listdir(path)

In [None]:
# natural sort of the resume filenames, so numeric parts are ordered by value
# NOTE(review): presumably this makes the file order match the spreadsheet row order - confirm
files = natsort.natsorted(files)

In [None]:
# extract the text of every resume file, in the same order as `files`
resume_text_list = [extract_text(path + '/' + file_name) for file_name in files]

In [None]:
# append a new resume content column; the list must contain exactly one entry per
# row of input_data, in row order
input_data['resume_text'] = resume_text_list

In [None]:
# clean every extracted resume and store the result in a new column
input_data['cleaned_resume'] = input_data['resume_text'].apply(cleanResume)

## Programming Experience Prediction

First, let's create a classifier to predict the programming experience

In [None]:
# number of resumes per ProgrammingExperience label value (class balance)
input_data['ProgrammingExperience'].value_counts()

In [None]:
# label column plus cleaned text for the programming-experience classifier.
# Explicit .copy(): the label column of this frame is overwritten during label
# encoding, and assigning into a column subset of another frame triggers pandas'
# SettingWithCopyWarning.
programming_clf = input_data[['ProgrammingExperience','cleaned_resume']].copy()

In [None]:
# Replace the label strings with integer codes. LabelEncoder numbers the classes in
# sorted order, so labels 'No'/'Yes' would become 0/1.
var_mod = ['ProgrammingExperience']
le = LabelEncoder()
for column_name in var_mod:
    encoded_labels = le.fit_transform(programming_clf[column_name])
    programming_clf[column_name] = encoded_labels

In [None]:
# all cleaned resume texts as a NumPy array, in row order (train and test rows together)
requiredText = programming_clf['cleaned_resume'].values

In [None]:
# label-encoded ProgrammingExperience targets, aligned row-for-row with requiredText
requiredTarget = programming_clf['ProgrammingExperience'].values

In [None]:
# first pass: TF-IDF matrix over the full vocabulary of the resume texts
# (sublinear term frequency, English stop words removed, no cap on the feature count)
word_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=None)
WordFeatures = word_vectorizer.fit_transform(requiredText)

In [None]:
# Vocabulary terms, indexed by column of WordFeatures.
# get_feature_names() was removed in scikit-learn 1.2; it is used only as a fallback
# on versions that predate get_feature_names_out().
get_names = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
feature_array = numpy.array(get_names())

# Column indices ordered by total TF-IDF weight over all resumes, highest first.
# The columns are scored on the aggregated matrix because argsort on the 2-D matrix
# sorts every row separately: flattening and reversing that result ranks the terms
# by the last resume alone.
# NOTE(review): the scores use every row, including the rows later used as the test split.
term_scores = numpy.asarray(WordFeatures.sum(axis=0)).ravel()
tfidf_sorting = numpy.argsort(term_scores)[::-1]

In [None]:
# number of vocabulary terms to keep for the second vectorizer
n = 1000
top_n = feature_array[tfidf_sorting][:n]  # the first n terms of feature_array, taken in tfidf_sorting order

In [None]:
# second pass: rebuild the TF-IDF matrix with the vocabulary fixed to the selected
# top_n terms, so the matrix has one column per selected term
word_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', vocabulary=top_n)
WordFeatures = word_vectorizer.fit_transform(requiredText)

In [None]:
# rows 0-99 form the training split; rows 100 onwards (the remaining 117 resumes) form the test split
X_train, y_train = WordFeatures[:100], requiredTarget[:100]
X_test, y_test = WordFeatures[100:], requiredTarget[100:]

In [None]:
# build a random forest classifier for the programming-experience prediction.
# random_state is fixed because a random forest draws bootstrap samples and feature
# subsets at random; without a seed the reported accuracy changes on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)  # model fit with the train data
prediction = clf.predict(X_test)  # predict on the test data

In [None]:
# fraction of test resumes whose programming-experience label was predicted correctly
accuracy_score(y_test,prediction)

## Database Experience Prediction

Now, let us focus on database experience and build a classifier to predict whether each resume shows database experience or not. Here, we reuse the TF-IDF features (`WordFeatures`) produced by the TfidfVectorizer built above.

In [None]:
# number of resumes per DatabaseExperience label value (class balance)
input_data['DatabaseExperience'].value_counts()

In [None]:
# label column plus cleaned text for the database-experience classifier
database_clf = input_data[['DatabaseExperience','cleaned_resume']]

In [None]:
# the target is now DatabaseExperience; this rebinds requiredTarget, replacing the
# programming-experience labels. The values are taken as stored in the spreadsheet
# (no label encoding is applied here).
requiredTarget = database_clf['DatabaseExperience'].values

In [None]:
# same split as before: rows 0-99 for training, rows 100 onwards (117 resumes) for testing;
# the features are unchanged, only the targets differ
X_train, y_train = WordFeatures[:100], requiredTarget[:100]
X_test, y_test = WordFeatures[100:], requiredTarget[100:]

In [None]:
# build a random forest classifier to predict the database experience.
# random_state is fixed because a random forest draws bootstrap samples and feature
# subsets at random; without a seed the reported accuracy changes on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)    # model fit with the train data
prediction = clf.predict(X_test)   # predict on the test data

In [None]:
# fraction of test resumes whose database-experience label was predicted correctly
accuracy_score(y_test,prediction)