# Resume NER
## Extract Information from Resumes using Named Entity Recognition
---
### Training the model
In this part, a model is trained on our data with Flair NLP and the results are evaluated.

Run this notebook on Google Colab: it mounts Google Drive to read the data and to store the trained models.

In [0]:
from google.colab import drive

## make Google Drive available inside the Colab file system under /content/gdrive
drive.mount('/content/gdrive')

In [0]:
import os

## switch the working directory to the project's data folder on Google Drive;
## the relative 'resources/...' paths used by the later cells are resolved against it
os.chdir("/content/gdrive/My Drive/SAKI/data")

In [0]:
! pip install flair

In [0]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

## layout of the data files: column 0 holds the text, column 1 the NER tag
columns = {0: 'text', 1: 'ner'}

## directory that contains the train, dev and test files
data_folder = '/content/gdrive/My Drive/SAKI/flair'

## file names of the "_f" data set
## (alternative "_nd" data set: train_res_bilou_nd.txt, valid_res_bilou_nd.txt, test_res_bilou_nd.txt)
train_file = 'train_res_bilou_f.txt'
dev_file = 'valid_res_bilou_f.txt'
test_file = 'test_res_bilou_f.txt'

## build the corpus from the column-formatted train, dev and test files in the data folder
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file=train_file,
    dev_file=dev_file,
    test_file=test_file,
)
print(corpus)

## derive the dictionary of 'ner' tags from the corpus and show its entries
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
print(tag_dictionary.idx2item)


2019-06-18 17:23:16,078 Reading data from /content/gdrive/My Drive/SAKI/flair
2019-06-18 17:23:16,079 Train: /content/gdrive/My Drive/SAKI/flair/train_res_bilou_nd.txt
2019-06-18 17:23:16,085 Dev: /content/gdrive/My Drive/SAKI/flair/valid_res_bilou_nd.txt
2019-06-18 17:23:16,086 Test: /content/gdrive/My Drive/SAKI/flair/test_res_bilou_nd.txt
Corpus: 10199 train + 3120 dev + 2748 test sentences
[b'<unk>', b'O', b'B-Designation', b'I-Designation', b'L-Designation', b'"B-Companies', b'"L-Companies', b'U-Degree', b'U-Designation', b'-', b'B-Degree', b'I-Degree', b'L-Degree', b'"I-Companies', b'"U-Companies', b'<START>', b'<STOP>']


In [0]:
## initialize embeddings
from typing import List
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings,FlairEmbeddings

## 'glove' word embeddings first, then the 'news-forward' and 'news-backward' Flair embeddings
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('glove')]
embedding_types += [FlairEmbeddings(name) for name in ('news-forward', 'news-backward')]

## combine the individual embeddings into one stacked embedding
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

## initialize sequence tagger
from flair.models import SequenceTagger

## tagger for the 'ner' tag type: stacked embeddings as input, hidden size 256, CRF enabled
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    hidden_size=256,
    use_crf=True,
)

In [0]:
## prepare hyperparameter optimization

from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter

## the two embedding stacks to choose from: 'glove' alone, or 'glove' plus both Flair embeddings
embedding_options = [
    StackedEmbeddings([WordEmbeddings("glove")]),
    StackedEmbeddings([
        WordEmbeddings("glove"),
        FlairEmbeddings("news-forward"),
        FlairEmbeddings("news-backward"),
    ]),
]

## register every tunable parameter together with its sampling function and value range
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=embedding_options)
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
## dropout is the only parameter drawn from a continuous range
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
## optimize hyperparameters

from flair.hyperparameter.param_selection import SequenceTaggerParamSelector , OptimizationValue

## parameter selector for the 'ner' tags of the corpus, limited to 3 epochs per training run
## ('resources/results' is presumably the folder the selector writes its results to -- TODO confirm)
optimizer = SequenceTaggerParamSelector(corpus, 'ner', 'resources/results', max_epochs=3)

## search the prepared search space with at most 100 evaluations
optimizer.optimize(search_space, max_evals=100)

  0%|          | 0/100 [00:00<?, ?it/s, best loss: ?]2019-06-17 21:27:23,100 ----------------------------------------------------------------------------------------------------
2019-06-17 21:27:23,101 Evaluation run: 1
2019-06-17 21:27:23,102 Evaluating parameter combination:
2019-06-17 21:27:23,104 	dropout: 0.15619438301333338
2019-06-17 21:27:23,105 	embeddings: StackedEmbeddings [/root/.flair/embeddings/glove.gensim]
2019-06-17 21:27:23,105 	hidden_size: 128
2019-06-17 21:27:23,106 	learning_rate: 0.1
2019-06-17 21:27:23,107 	mini_batch_size: 8
2019-06-17 21:27:23,108 	rnn_layers: 2
2019-06-17 21:27:23,109 ----------------------------------------------------------------------------------------------------
2019-06-17 21:27:23,291 ----------------------------------------------------------------------------------------------------
2019-06-17 21:27:23,292 Training run: 1
2019-06-17 21:27:23,306 ------------------------------------------------------------------------------------------

AttributeError: ignored

In [0]:
## initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

## output folder of this training run (relative to the current working directory)
model_name = 'resources/taggers/resume-ner-1-nd'

## start training with checkpoints enabled
## (the option anneal_with_restarts=True is not used)
trainer.train(
    model_name,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=75,
    checkpoint=True,
)




2019-06-18 06:45:42,175 ----------------------------------------------------------------------------------------------------
2019-06-18 06:45:42,180 Evaluation method: MICRO_F1_SCORE
2019-06-18 06:45:43,333 ----------------------------------------------------------------------------------------------------
2019-06-18 06:45:47,678 epoch 1 - iter 0/319 - loss 103.13024139
2019-06-18 06:46:44,530 epoch 1 - iter 31/319 - loss 11.70843418
2019-06-18 06:47:27,332 epoch 1 - iter 62/319 - loss 8.28453587
2019-06-18 06:48:19,459 epoch 1 - iter 93/319 - loss 6.88372860
2019-06-18 06:49:18,264 epoch 1 - iter 124/319 - loss 6.14227489
2019-06-18 06:49:59,425 epoch 1 - iter 155/319 - loss 5.52124013
2019-06-18 06:51:03,850 epoch 1 - iter 186/319 - loss 5.24744341
2019-06-18 06:51:55,229 epoch 1 - iter 217/319 - loss 4.93243011
2019-06-18 06:53:08,875 epoch 1 - iter 248/319 - loss 4.65407707
2019-06-18 06:53:54,413 epoch 1 - iter 279/319 - loss 4.40063443
2019-06-18 06:54:30,444 epoch 1 - iter 310/3

In [0]:
from pathlib import Path
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

## continue training if aborted:
## load the checkpoint file of the run and build a trainer from it for the same corpus
checkpoint = tagger.load_checkpoint(Path('resources/taggers/resume-ner-1-nd/checkpoint.pt'))
trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

## train into the same folder again, with checkpoints enabled and up to 150 epochs
trainer.train(
    'resources/taggers/resume-ner-1-nd',
    EvaluationMetric.MICRO_F1_SCORE,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=150,
    checkpoint=True,
)

2019-06-18 17:27:27,687 loading file resources/taggers/resume-ner-1-nd/checkpoint.pt
2019-06-18 17:27:36,446 ----------------------------------------------------------------------------------------------------
2019-06-18 17:27:36,451 Evaluation method: MICRO_F1_SCORE
2019-06-18 17:27:37,661 ----------------------------------------------------------------------------------------------------
2019-06-18 17:27:40,500 epoch 34 - iter 0/319 - loss 0.39117417
2019-06-18 17:28:20,101 epoch 34 - iter 31/319 - loss 0.54501318
2019-06-18 17:29:04,491 epoch 34 - iter 62/319 - loss 0.51542739
2019-06-18 17:29:51,333 epoch 34 - iter 93/319 - loss 0.52494561
2019-06-18 17:30:37,111 epoch 34 - iter 124/319 - loss 0.52957665
2019-06-18 17:31:19,691 epoch 34 - iter 155/319 - loss 0.55303560
2019-06-18 17:31:59,491 epoch 34 - iter 186/319 - loss 0.54449522
2019-06-18 17:32:59,205 epoch 34 - iter 217/319 - loss 0.54191026
2019-06-18 17:34:03,836 epoch 34 - iter 248/319 - loss 0.53935465
2019-06-18 17:34:4

{'dev_loss_history': [tensor(1.0936, device='cuda:0'),
  tensor(1.0961, device='cuda:0'),
  tensor(1.0984, device='cuda:0'),
  tensor(1.0930, device='cuda:0'),
  tensor(1.1016, device='cuda:0'),
  tensor(1.0993, device='cuda:0'),
  tensor(1.0999, device='cuda:0'),
  tensor(1.0983, device='cuda:0'),
  tensor(1.0979, device='cuda:0'),
  tensor(1.0975, device='cuda:0'),
  tensor(1.0974, device='cuda:0'),
  tensor(1.0971, device='cuda:0'),
  tensor(1.0968, device='cuda:0'),
  tensor(1.0969, device='cuda:0'),
  tensor(1.0970, device='cuda:0'),
  tensor(1.0979, device='cuda:0'),
  tensor(1.0976, device='cuda:0')],
 'dev_score_history': [0.6245,
  0.6216,
  0.6192,
  0.621,
  0.6211,
  0.6192,
  0.6184,
  0.6188,
  0.6191,
  0.6208,
  0.6221,
  0.6215,
  0.6208,
  0.6211,
  0.6211,
  0.6201,
  0.6214],
 'test_score': 0.6263,
 'train_loss_history': [0.5428262556608194,
  0.5454841540151255,
  0.5383724374550637,
  0.5435990104013849,
  0.5390514056417262,
  0.5450503251115356,
  0.528659676589

In [0]:
from flair.visual.training_curves import Plotter

## plot training curves and weights
## the files are read from 'resume-ner-1-nd', the same run folder that the training
## and evaluation cells of this notebook use
plotter = Plotter()
plotter.plot_training_curves('./resources/taggers/resume-ner-1-nd/loss.tsv')
plotter.plot_weights('./resources/taggers/resume-ner-1-nd/weights.txt')

In [38]:
import os
import pandas as pd


## calculate precision, recall and F1 per label from the predictions stored in test.tsv

## tag substrings to look for, mapped to the row names shown in the result table
label_names = {
    'Degree': 'Degree',
    'Companies': 'Companies worked at',
    'Designation': 'Designation',
}
search_string = list(label_names)

## whitespace-separated file without header; sep=r'\s+' replaces the deprecated delim_whitespace=True
## NOTE(review): the column order 'Predicted', 'True' is taken over unchanged -- confirm that it
## matches the order in which the gold and the predicted tag are written to test.tsv; if it is
## the other way round, precision and recall are swapped (F1 is not affected)
test = pd.read_csv(
    './resources/taggers/resume-ner-1-nd/test.tsv',
    sep=r'\s+',
    engine="python",
    names=['Text', 'Predicted', 'True', 'val'],
)

## compare the tags as strings; a missing tag becomes '' and therefore matches no label
## (a plain `label in value` check raises a TypeError for missing values)
predicted_tags = test['Predicted'].fillna('').astype(str)
true_tags = test['True'].fillna('').astype(str)
tags_agree = predicted_tags == true_tags

data = []
for label in search_string:
    # rows whose predicted / true tag mentions the current label
    label_predicted = predicted_tags.str.contains(label, regex=False)
    label_expected = true_tags.str.contains(label, regex=False)

    # counts are taken per row of test.tsv, over the whole file, for one label:
    #   tp: tags agree and carry the label
    #   fp: tags differ and the predicted tag carries the label
    #   fn: tags differ and the true tag carries the label
    true_positives = int((tags_agree & label_predicted).sum())
    false_positives = int((~tags_agree & label_predicted).sum())
    false_negatives = int((~tags_agree & label_expected).sum())

    print("For label '{}' tp: {} fp: {} fn: {}".format(label,true_positives,false_positives,false_negatives))

    # without any true positive all three metrics are defined as 0.0 (also avoids division by zero)
    precision = 0.0 if true_positives == 0 else float(true_positives) / (true_positives + false_positives)
    recall =  0.0 if true_positives == 0 else float(true_positives) / (true_positives + false_negatives)
    f1 =  0.0 if true_positives == 0 else 2 * ((precision * recall)/(precision + recall))
    data.append([precision, recall, f1])


## one row per label, named via label_names so that rows and labels cannot get out of sync
metric_df = pd.DataFrame(
    data,
    columns=['precision', 'recall', 'f1'],
    index=[label_names[label] for label in search_string],
)
## unweighted mean of the per-label metrics
metric_df.loc['avg'] = metric_df.mean()

print("Entity-Level evaluation:")

display(metric_df)

For label 'Degree' tp: 201 fp: 88 fn: 133
For label 'Companies' tp: 874 fp: 333 fn: 363
For label 'Designation' tp: 665 fp: 258 fn: 353
Entity-Level evaluation:


Unnamed: 0,precision,recall,f1
Degree,0.695502,0.601796,0.645265
Companies worked at,0.724109,0.706548,0.715221
Designation,0.720477,0.653242,0.685214
avg,0.713363,0.653862,0.6819
