# Resume NER
## Extract Information from Resumes using Named Entity Recognition

### Data Exploration and Preselection
In this notebook the dataset is loaded and examined.
Furthermore the features to train are selected and filtered out from the dataset

In [2]:
# import os
import json

dataset_path = "./data/Entity Recognition in Resumes.json"

## open file and convert resume entries to json
with open(dataset_path,encoding="utf8") as f:
    lines = f.readlines()

all_resumes = [json.loads(line) for line in lines]


## data conversion method
def convert_data(data):
    """
    Creates NER training data in Spacy format from JSON dataset
    Outputs the Spacy training data which can be used for Spacy training.
    """
    text = data['content']
    entities = []
    if data['annotation'] is not None:
        for annotation in data['annotation']:
            # only a single point in text annotation.
            point = annotation['points'][0]
            labels = annotation['label']
            # handle both list of labels or a single label.
            if not isinstance(labels, list):
                labels = [labels]
            for label in labels:
                # dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                entities.append((point['start'], point['end'] + 1, label))
    return (text, {"entities": entities})
   
##  convert all resumes to spacy format
converted_resumes = [convert_data(resume) for resume in all_resumes]
print(len(converted_resumes))

## filter out resumes where resume entities list is None
converted_resumes = [i for i in converted_resumes if i[1]['entities']] 
print(len(converted_resumes))

## filter out duplicate resumes tagged differently

unique_resumes = []
unique_resumes_text = set()
for res in converted_resumes:
    if res[0] not in unique_resumes_text:
        unique_resumes.append(res)
        unique_resumes_text.add(res[0])
print(len(unique_resumes)) 
#converted_resumes = unique_resumes

701
690
532


In [3]:
## print one sample resume for better understanding of data
converted_resume = converted_resumes[42]

text = converted_resume[0]
entities_list = converted_resume[1]['entities']
## print label and text for each entity
for entity in entities_list:
    print('\033[1m' + entity[2] + '\033[0m', ": ", text[entity[0]:entity[1]])


[1mSkills[0m :  SKILL SET • ASP.NET, C# • QA tools

• Coding and modularization • Excellent communication skills

• VB, VB.net, ASP • Technical specifications creation

• HTML • System backups

• Sql server 2005, Oracle • System upgrades

• Java/C/C++ • Excellent problem-solving abilities

Navas Najeer Koya 3
[1mLocation[0m :  Mangalore
[1mSkills[0m :  C# (Less than 1 year), .NET, SQL Server, Css, Html5

[1mGraduation Year[0m :   2014
[1mLocation[0m :  Mangalore
[1mLocation[0m :  Mangalore
[1mDegree[0m :  Bachelor of Computer Application
[1mGraduation Year[0m :   2014
[1mCompanies worked at[0m :  Infosys
[1mDesignation[0m :  Test Engineer

[1mCompanies worked at[0m :  Infosys
[1mDesignation[0m :  Test Engineer

[1mGraduation Year[0m :   2014
[1mCompanies worked at[0m :  Infosys
[1mDesignation[0m :  System Engineer
[1mLocation[0m :  Mangalore
[1mLocation[0m :  Mangalore
[1mDesignation[0m :  Test Engineer

[1mName[0m :  Navas Koya


In [4]:
## collect names of all entities in complete resume dataset
all_labels = list()
for res in converted_resumes:
    ## entity list of res
    entity_list = [i[2] for i in res[1]['entities']]
    ## extend all_labels with labels of entities 
    all_labels += entity_list
    
## set of unique values
unique_labels = list(set(all_labels))
## print labels to collect the right labels for later training
x = [print(i) for i in unique_labels]

Graduation Year
Companies worked at
University
abc
Certifications
Can Relocate to
Links
links
Name
Years of Experience
Degree
training
Skills
state
Location
Relocate to
projects
College
des
Rewards and Achievements
UNKNOWN
College Name
Designation
Email Address
Address


In [5]:
chosen_entity_labels = ['Degree','Companies worked at', 'Designation']
## for each chosen entity label, count how many documents have a labeled entity for that label, and how many labeled entities total there are 

for chosen in chosen_entity_labels:
    found_docs_with_entity = 0
    entity_count = 0
    for resume in converted_resumes:
        entity_list = resume[1]["entities"]
        _,_,labels = zip(*entity_list)
        if chosen in labels:
            found_docs_with_entity+=1
            entity_count+=len([l for l in labels if l == chosen])
    print("Docs with {}: {}".format(chosen,found_docs_with_entity))
    print("Total count of {}: {}".format(chosen,entity_count))
print("Docs total: {}".format(len(converted_resumes)))

Docs with Degree: 606
Total count of Degree: 1012
Docs with Companies worked at: 627
Total count of Companies worked at: 2830
Docs with Designation: 650
Total count of Designation: 2842
Docs total: 690


In [6]:
resumes = converted_resumes

## this method gathers all resumes which have all of the chosen entites above.
def gather_candidates(dataset,entity_labels):
    candidates = list()
    for resume in dataset:
        res_ent_labels = list(zip(*resume[1]["entities"]))[2]
        if set(entity_labels).issubset(res_ent_labels):
            candidates.append(resume)
    return candidates

training_data = gather_candidates(resumes, chosen_entity_labels)
print("Gathered {} training examples".format(len(training_data)))

## filter all annotation based on filter list
def filter_ents(ents, filter):
    filtered = [ent for ent in ents if ent[2] in filter]
    return filtered

## remove all but relevant (chosen) entity annotations and store in X variable 
X = [[i[0], dict(entities=filter_ents(i[1]["entities"], chosen_entity_labels))] for i in training_data]



Gathered 547 training examples


In [7]:
from helpers.spacy_train_resume_ner import train_spacy_ner

def remove_bad_data(training_data):
    model, baddocs = train_spacy_ner(training_data, debug=True, n_iter=1)
    # training data is list of lists with each list containing a text and annotations
    # baddocs is a set of strings/resume texts.
    # filter bad docs and store filter result (good docs) in filtered variable
    filtered = [data for data in training_data if data[0] not in baddocs]
    print("Unfiltered training data size: ",len(training_data))
    print("Filtered training data size: ", len(filtered))
    print("Bad data size: ", len(baddocs))
    return filtered

## remove faulty documents that will throw errors in spacy training
X = remove_bad_data(X)

Created blank 'en' model
Exception thrown when processing doc:
('Nida Khan\nTech Support Executive - Teleperformance for Microsoft\n\nJaipur, Rajasthan - Email me on Indeed: indeed.com/r/Nida-Khan/6c9160696f57efd8\n\n• To be an integral part of the organization and enhance my knowledge to utilize it in a productive\nmanner for the growth of the company and the global.\n\nINDUSTRIAL TRAINING\n\n• BHEL, (HEEP) HARIDWAR\nOn CNC System&amp; PLC Programming.\n\nWORK EXPERIENCE\n\nTech Support Executive\n\nTeleperformance for Microsoft -\n\nSeptember 2017 to Present\n\nprocess.\n• 21 months of experience in ADFC as Phone Banker.\n\nEDUCATION\n\nBachelor of Technology in Electronics & communication Engg\n\nGNIT institute of Technology -  Lucknow, Uttar Pradesh\n\n2008 to 2012\n\nClass XII\n\nU.P. Board -  Bareilly, Uttar Pradesh\n\n2007\n\nClass X\n\nU.P. Board -  Bareilly, Uttar Pradesh\n\n2005\n\nSKILLS\n\nMicrosoft office, excel, cisco, c language, cbs. (4 years)\n\nhttps://www.indeed.com/

In [8]:
## split to test train validation data
def train_test_split(X, train_percent, test_percent):
    train_size = int(len(X) * train_percent)
    test_size = int(len(X) * test_percent)
    
    train = X[:train_size]
    tmp = X[train_size:]
    
    test = tmp[:test_size]
    valid = tmp[test_size:]
    
    return train, test, valid
#0.7 train, 0.2 dev, 0,1 test
train, test, valid = train_test_split(X, 0.65, 0.15)
print('all:  ',len(X))
print('test:  ',len(test))
print('valid:',len(valid))
print('train:',len(train))
assert(len(train) + len(test) + len(valid) == len(X)) 

all:   546
test:   81
valid: 111
train: 354


In [39]:
import spacy
import numpy as np
from spacy.gold import biluo_tags_from_offsets
import pandas as pd
from IPython.display import display, HTML

custom_nlp,_= train_spacy_ner(train,n_iter=20)

Created blank 'en' model
Losses {'ner': 28189.021530729293}
Losses {'ner': 13750.925065617488}
Losses {'ner': 24827.47379087636}
Losses {'ner': 18116.78973425692}
Losses {'ner': 26471.958610855974}
Losses {'ner': 20526.388423813136}
Losses {'ner': 18653.082368638366}
Losses {'ner': 16315.142706626793}
Losses {'ner': 12524.617815063197}
Losses {'ner': 8341.22201218992}
Losses {'ner': 6415.795565967112}
Losses {'ner': 6240.065474748248}
Losses {'ner': 5366.414414152686}
Losses {'ner': 4760.398092845442}
Losses {'ner': 4407.362758204549}
Losses {'ner': 4245.000548476351}
Losses {'ner': 3710.4525312784963}
Losses {'ner': 3781.0111048796603}
Losses {'ner': 3385.3498940029613}
Losses {'ner': 3367.0447317343696}


In [40]:
## convert data to bilou format
def make_bilou_df(nlp,resume):
    """
    param nlp - a trained spacy model
    param resume - a resume from our train or test set
    """
    doc = nlp(resume[0])
    bilou_ents_predicted = biluo_tags_from_offsets(doc, [(ent.start_char,ent.end_char,ent.label_)for ent in doc.ents])
    bilou_ents_true = biluo_tags_from_offsets(doc, [(ent[0], ent[1], ent[2]) for ent in resume[1]["entities"]])
    
    doc_tokens = [tok.text for tok in doc]
    bilou_df = pd.DataFrame()
    bilou_df["Tokens"] =doc_tokens
    bilou_df["Tokens"] = bilou_df["Tokens"].str.replace("\\s+","") 
    bilou_df["Predicted"] = bilou_ents_predicted
    bilou_df["True"] = bilou_ents_true
    return bilou_df 

## convet data to flair format
def clear_data(data_as_bilou):
    file = ''
    for idx,df in enumerate(data_as_bilou):
        df2 = pd.DataFrame()
        df2["text"] = df["Tokens"]
        df2["ner"] = df["True"]

        # remove unwanted whitespace and/or newline token rows from dataframe
        df2 = df2[df2.text.str.strip() != '']
        df2 = df2[df2.text.str.strip() != '\n']
        df2 = df2[df2.text.str.strip() != '\r']
        #reset index since columns were removed
        df2 = df2.reset_index(drop=True)

        # insert newlines after each "sentence" and convert to csv
        indexes = df2.index[df2.text.str.strip() == '.'].tolist()
        indexes +=(df2.index[df2.text.str.strip() == '•'].tolist())
        Indexes=sorted(indexes)
        l_mod = [0] + indexes + [max(indexes,  default=0) + 1]
        list_of_dfs = [df2.iloc[l_mod[n]:l_mod[n + 1]] for n in range(len(l_mod)-1)]

        as_csv_s = [i.to_csv(None,sep = " ",encoding = "utf-8",index = False,header = False,line_terminator="\n") for i in list_of_dfs]
        
        for i in as_csv_s:
            file += i + "\n"
    return file

# make .csv strings from train, test, valid for use in flair 
def schema_for_flair(nlp, train, test, valid):    
    # makes a list of pandas dataframes, one for each resume. 
    training_data_as_bilou = [make_bilou_df(nlp,res) for res in train]
    test_data_as_bilou = [make_bilou_df(nlp,res) for res in test]
    valid_data_as_bilou = [make_bilou_df(nlp,res) for res in valid]
    
    
    # return variables
    training_file = clear_data(training_data_as_bilou)
    test_file = clear_data(test_data_as_bilou)
    valid_file = clear_data(valid_data_as_bilou)
   
    return training_file, test_file, valid_file

training,testing,validation = schema_for_flair(custom_nlp,train,test,valid)

with open("./data/train_res_bilou_f.txt",'w+',encoding="utf-8") as f: 
    f.write(training)
with open("./data/test_res_bilou_f.txt",'w+',encoding="utf-8") as f: 
    f.write(testing)
with open("./data/valid_res_bilou_f.txt",'w+',encoding="utf-8") as f: 
    f.write(validation)
print("Done!")

Done!
