# Resume NER
## Extract Information from Resumes using Named Entity Recognition
---
### Training the model
In this part, a model is trained on our data with Flair NLP and the results are evaluated.

Run this code on Google Colab.

In [0]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the data and model paths used in the
# following cells all live under this mount point.
drive.mount('/content/gdrive')

In [0]:
import os
# Work from the project's data directory: the relative 'resources/...' paths
# used by the later cells are resolved against the current working directory.
os.chdir("/content/gdrive/My Drive/SAKI/data")

In [0]:
! pip install flair

In [0]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## column layout of the data files: column 0 holds the text, column 1 the NER tag
columns = {0: 'text', 1: 'ner'}

## folder that contains the train, dev and test files
data_folder = '/content/gdrive/My Drive/SAKI/flair'

## file names of the three splits
## (alternative set: train_res_bilou_nd.txt / valid_res_bilou_nd.txt / test_res_bilou_nd.txt)
train_file, dev_file, test_file = (
    'train_res_bilou_f.txt',
    'valid_res_bilou_f.txt',
    'test_res_bilou_f.txt',
)

## build the column-format corpus from the folder and the three split files
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file=train_file,
    dev_file=dev_file,
    test_file=test_file,
)
print(corpus)

## derive the dictionary of 'ner' tags from the corpus and show its entries
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
print(tag_dictionary.idx2item)


2019-06-18 17:23:16,078 Reading data from /content/gdrive/My Drive/SAKI/flair
2019-06-18 17:23:16,079 Train: /content/gdrive/My Drive/SAKI/flair/train_res_bilou_nd.txt
2019-06-18 17:23:16,085 Dev: /content/gdrive/My Drive/SAKI/flair/valid_res_bilou_nd.txt
2019-06-18 17:23:16,086 Test: /content/gdrive/My Drive/SAKI/flair/test_res_bilou_nd.txt
Corpus: 10199 train + 3120 dev + 2748 test sentences
[b'<unk>', b'O', b'B-Designation', b'I-Designation', b'L-Designation', b'"B-Companies', b'"L-Companies', b'U-Degree', b'U-Designation', b'-', b'B-Degree', b'I-Degree', b'L-Degree', b'"I-Companies', b'"U-Companies', b'<START>', b'<STOP>']


In [0]:
## initialize embeddings and sequence tagger
from typing import List

from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger

## 'glove' word embeddings stacked with the 'news-forward' and 'news-backward' Flair embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

## tagger for the 'ner' tag type, built on the stacked embeddings and the
## tag dictionary created from the corpus; use_crf is switched on
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    use_crf=True,
)

In [0]:
## prepare hyperparameter optimization

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

## the two embedding stacks the search may choose between
glove_only = StackedEmbeddings([WordEmbeddings("glove")])
glove_with_flair = StackedEmbeddings(
    [
        WordEmbeddings("glove"),
        FlairEmbeddings("news-forward"),
        FlairEmbeddings("news-backward"),
    ]
)

## one entry per tuned parameter: (parameter, hyperopt sampler, sampler arguments)
search_dimensions = [
    (Parameter.EMBEDDINGS, hp.choice, {"options": [glove_only, glove_with_flair]}),
    (Parameter.HIDDEN_SIZE, hp.choice, {"options": [32, 64, 128]}),
    (Parameter.RNN_LAYERS, hp.choice, {"options": [1, 2]}),
    (Parameter.DROPOUT, hp.uniform, {"low": 0.0, "high": 0.5}),
    (Parameter.LEARNING_RATE, hp.choice, {"options": [0.05, 0.1, 0.15, 0.2]}),
    (Parameter.MINI_BATCH_SIZE, hp.choice, {"options": [8, 16, 32]}),
]

search_space = SearchSpace()
for parameter, sampler, sampler_args in search_dimensions:
    search_space.add(parameter, sampler, **sampler_args)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
## optimize hyperparameters

from flair.hyperparameter.param_selection import SequenceTaggerParamSelector, OptimizationValue

## parameter selector for the 'ner' tags of the corpus
## ('resources/results' is presumably the output location -- confirm against the Flair API)
optimizer = SequenceTaggerParamSelector(corpus, 'ner', 'resources/results', max_epochs=3)

## run the search over search_space with at most 100 evaluations
optimizer.optimize(search_space, max_evals=100)

In [None]:
## initialize trainer
from flair.trainers import ModelTrainer

## directory passed to the trainer for this run
model_name = 'resources/taggers/resume-ner-1-nd'

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

## start training; checkpoint=True so an aborted run can be resumed from its checkpoint
trainer.train(
    model_name,
    learning_rate=0.1,
    mini_batch_size=32,
    # anneal_with_restarts=True,  (option left disabled)
    max_epochs=75,
    checkpoint=True,
)




In [None]:
from pathlib import Path
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

## continue training if a previous run was aborted
## restore the saved state from the checkpoint file in the model directory
checkpoint_file = Path('resources/taggers/resume-ner-1-nd/checkpoint.pt')
checkpoint = tagger.load_checkpoint(checkpoint_file)

## rebuild the trainer from that state and train on, now up to 150 epochs
trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
trainer.train(
    'resources/taggers/resume-ner-1-nd',
    EvaluationMetric.MICRO_F1_SCORE,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=150,
    checkpoint=True,
)

In [0]:
from flair.visual.training_curves import Plotter

## plot training curves and weights
# Read from 'resume-ner-1-nd', the directory the training cells write to and
# the evaluation cell reads from; the former 'resume-ner-1' path is not
# produced by any cell of this notebook.
plotter = Plotter()
plotter.plot_training_curves('./resources/taggers/resume-ner-1-nd/loss.tsv')
plotter.plot_weights('./resources/taggers/resume-ner-1-nd/weights.txt')

In [38]:
import os
import pandas as pd


## calculate per-label evaluation metrics from the test-set predictions

# Substring searched for in the tag columns -> row name shown in the result
# table. Keeping both in one mapping prevents the labels and the table index
# from drifting apart.
label_names = {
    'Degree': 'Degree',
    'Companies': 'Companies worked at',
    'Designation': 'Designation',
}
search_string = list(label_names)

# sep=r'\s+' replaces the deprecated delim_whitespace=True (same splitting).
# NOTE(review): confirm that the column order below matches what the installed
# Flair version writes to test.tsv. If the gold tag is written before the
# predicted tag, 'Predicted' and 'True' are swapped here and precision and
# recall trade places.
test = pd.read_csv('./resources/taggers/resume-ner-1-nd/test.tsv', sep=r'\s+', engine="python", names=['Text','Predicted','True','val'])


def _count_outcomes(predicted, actual, label):
    """Count true positives, false positives and false negatives for one label.

    Counting is done per row of the prediction file, not per multi-token span:
      * true positive  -- both tags are equal and the predicted tag contains `label`
      * false positive -- the tags differ and the predicted tag contains `label`
      * false negative -- the tags differ and the true tag contains `label`

    Args:
        predicted: Series holding the predicted tag of every row.
        actual: Series holding the true tag of every row.
        label: Substring that identifies the entity type inside a tag.

    Returns:
        Tuple ``(true_positives, false_positives, false_negatives)`` of ints.
    """
    # Compare as strings so that a missing or numeric cell (e.g. from a
    # malformed line) counts as "no label" instead of raising a TypeError.
    predicted = predicted.astype(str)
    actual = actual.astype(str)

    agrees = predicted == actual
    in_predicted = predicted.str.contains(label, regex=False)
    in_actual = actual.str.contains(label, regex=False)

    return (
        int((agrees & in_predicted).sum()),
        int((~agrees & in_predicted).sum()),
        int((~agrees & in_actual).sum()),
    )


data = []
for label in search_string:
    true_positives, false_positives, false_negatives = _count_outcomes(
        test['Predicted'], test['True'], label
    )

    print("For label '{}' tp: {} fp: {} fn: {}".format(label,true_positives,false_positives,false_negatives))

    # With zero true positives every metric is defined as 0.0, which also
    # avoids dividing by zero.
    if true_positives == 0:
        precision = recall = f1 = 0.0
    else:
        precision = float(true_positives) / (true_positives + false_positives)
        recall = float(true_positives) / (true_positives + false_negatives)
        f1 = 2 * ((precision * recall) / (precision + recall))
    data.append([precision, recall, f1])


metric_df = pd.DataFrame(
    data,
    columns=['precision', 'recall', 'f1'],
    index=[label_names[label] for label in search_string],
)
# Unweighted mean over the labels.
metric_df.loc['avg'] = metric_df.mean()

# NOTE(review): the heading says "Entity-Level", but the counts above are
# taken per row of test.tsv; rename it if a span-level score is intended.
print("Entity-Level evaluation:")

display(metric_df)

For label 'Degree' tp: 201 fp: 88 fn: 133
For label 'Companies' tp: 874 fp: 333 fn: 363
For label 'Designation' tp: 665 fp: 258 fn: 353
Entity-Level evaluation:


Unnamed: 0,precision,recall,f1
Degree,0.695502,0.601796,0.645265
Companies worked at,0.724109,0.706548,0.715221
Designation,0.720477,0.653242,0.685214
avg,0.713363,0.653862,0.6819
