In [1]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-0.0.12.tar.gz (21 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ done
[?25h  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7423 sha256=affdbef1a57032c9d3426db61b3f08e2829f2de5a155a054aafa0a208205b526
  Stored in directory: /root/.cache/pip/wheels/dc/cc/62/a3b81f92d35a80e39eb9b2a9d8b31abac54c02b21b2d466edc
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [2]:
import pandas as pd
import logging
import re
import json
import numpy as np
import torch
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from transformers import BertForTokenClassification, BertTokenizerFast
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix



In [3]:
MAX_LEN = 500
EPOCHS = 6
MODEL_PATH = '../input/bert-base-uncased'
TOKENIZER = BertTokenizerFast('../input/bert-base-uncased/vocab.txt', lowercase=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_json('../input/resume-entities-for-ner/Entity Recognition in Resumes.json', lines=True)

In [5]:
df.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [6]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    #only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        point_start = point['start']
                        point_end = point['end']
                        point_text = point['text']
                        
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        if lstrip_diff != 0:
                            point_start = point_start + lstrip_diff
                        if rstrip_diff != 0:
                            point_end = point_end - rstrip_diff
                        entities.append((point_start, point_end + 1 , label))
            training_data.append((text, {"entities" : entities}))
        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [7]:
data = trim_entity_spans(convert_dataturks_to_spacy('../input/resume-entities-for-ner/Entity Recognition in Resumes.json'))

In [8]:
def get_label(offset, labels):
    if offset[0] == 0 and offset[1] == 0:
        return 'O'
    for label in labels:
        if offset[1] >= label[0] and offset[0] <= label[1]:
            return label[2]
    return 'O'

In [9]:
tags_vals = ["UNKNOWN", "O", "Name", "Degree","Skills","College Name","Email Address","Designation","Companies worked at","Graduation Year","Years of Experience","Location"]
tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tag = {i:t for i, t in enumerate(tags_vals)}

In [10]:
def process_resume(data, tokenizer, tag2idx, max_len, is_test=False):
    tok = tokenizer.encode_plus(data[0], max_length=max_len, return_offsets_mapping=True)
    curr_sent = {'orig_labels':[], 'labels': []}
    
    padding_length = max_len - len(tok['input_ids'])
    
    if not is_test:
        labels = data[1]['entities']
        labels.reverse()
        for off in tok['offset_mapping']:
            label = get_label(off, labels)
            curr_sent['orig_labels'].append(label)
            curr_sent['labels'].append(tag2idx[label])
        curr_sent['labels'] = curr_sent['labels'] + ([0] * padding_length)
    
    curr_sent['input_ids'] = tok['input_ids'] + ([0] * padding_length)
    curr_sent['token_type_ids'] = tok['token_type_ids'] + ([0] * padding_length)
    curr_sent['attention_mask'] = tok['attention_mask'] + ([0] * padding_length)
    return curr_sent

In [11]:
class ResumeDataset(Dataset):
    def __init__(self, resume, tokenizer, tag2idx, max_len, is_test=False):
        self.resume = resume
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.tag2idx = tag2idx
        self.max_len = max_len
    
    def __len__(self):
        return len(self.resume)
    
    def __getitem__(self, idx):
        data = process_resume(self.resume[idx], self.tokenizer, self.tag2idx, self.max_len, self.is_test)
        return {
            'input_ids': torch.tensor(data['input_ids'], dtype=torch.long),
            'token_type_ids': torch.tensor(data['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(data['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(data['labels'], dtype=torch.long),
            'orig_label': data['orig_labels']
        }

In [12]:
total = len(data)
train_data, val_data = data[:180], data[180:]

In [13]:
train_d = ResumeDataset(train_data, TOKENIZER, tag2idx, MAX_LEN)
val_d = ResumeDataset(val_data, TOKENIZER, tag2idx, MAX_LEN)

In [14]:
train_sampler = RandomSampler(train_d)
train_dl = DataLoader(train_d, sampler=train_sampler, batch_size=8)

val_dl = DataLoader(val_d, batch_size=4)

In [15]:
def get_hyperparameters(model, ff):

    # ff: full_finetuning
    if ff:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.0,
            },
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    return optimizer_grouped_parameters

In [16]:
def get_special_tokens(tokenizer, tag2idx):
    vocab = tokenizer.get_vocab()
    pad_tok = vocab["[PAD]"]
    sep_tok = vocab["[SEP]"]
    cls_tok = vocab["[CLS]"]
    o_lab = tag2idx["O"]

    return pad_tok, sep_tok, cls_tok, o_lab

In [17]:
def annot_confusion_matrix(valid_tags, pred_tags):

    """
    Create an annotated confusion matrix by adding label
    annotations and formatting to sklearn's `confusion_matrix`.
    """

    # Create header from unique tags
    header = sorted(list(set(valid_tags + pred_tags)))

    # Calculate the actual confusion matrix
    matrix = confusion_matrix(valid_tags, pred_tags, labels=header)

    # Final formatting touches for the string output
    mat_formatted = [header[i] + "\t\t\t" + str(row) for i, row in enumerate(matrix)]
    content = "\t" + " ".join(header) + "\n" + "\n".join(mat_formatted)

    return content

In [18]:
def flat_accuracy(valid_tags, pred_tags):
    return (np.array(valid_tags) == np.array(pred_tags)).mean()

In [19]:
model = BertForTokenClassification.from_pretrained(MODEL_PATH, num_labels=len(tag2idx))
model.to(DEVICE);
optimizer_grouped_parameters = get_hyperparameters(model, True)
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [20]:
MAX_GRAD_NORM = 1.0

In [21]:
def train_and_save_model(
    model,
    tokenizer,
    optimizer,
    epochs,
    idx2tag,
    tag2idx,
    max_grad_norm,
    device,
    train_dataloader,
    valid_dataloader
):

    pad_tok, sep_tok, cls_tok, o_lab = get_special_tokens(tokenizer, tag2idx)
    
    epoch = 0
    for _ in trange(epochs, desc="Epoch"):
        epoch += 1

        # Training loop
        print("Starting training loop.")
        model.train()
        tr_loss, tr_accuracy = 0, 0
        nb_tr_examples, nb_tr_steps = 0, 0
        tr_preds, tr_labels = [], []

        for step, batch in enumerate(train_dataloader):
            # Add batch to gpu
            
            # batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch['input_ids'], batch['attention_mask'], batch['labels']
            b_input_ids, b_input_mask, b_labels = b_input_ids.to(device), b_input_mask.to(device), b_labels.to(device)

            # Forward pass
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss, tr_logits = outputs[:2]

            # Backward pass
            loss.backward()

            # Compute train loss
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

            # Subset out unwanted predictions on CLS/PAD/SEP tokens
            preds_mask = (
                (b_input_ids != cls_tok)
                & (b_input_ids != pad_tok)
                & (b_input_ids != sep_tok)
            )

            tr_logits = tr_logits.cpu().detach().numpy()
            tr_label_ids = torch.masked_select(b_labels, (preds_mask == 1))
            preds_mask = preds_mask.cpu().detach().numpy()
            tr_batch_preds = np.argmax(tr_logits[preds_mask.squeeze()], axis=1)
            tr_batch_labels = tr_label_ids.to("cpu").numpy()
            tr_preds.extend(tr_batch_preds)
            tr_labels.extend(tr_batch_labels)

            # Compute training accuracy
            tmp_tr_accuracy = flat_accuracy(tr_batch_labels, tr_batch_preds)
            tr_accuracy += tmp_tr_accuracy

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(
                parameters=model.parameters(), max_norm=max_grad_norm
            )

            # Update parameters
            optimizer.step()
            model.zero_grad()

        tr_loss = tr_loss / nb_tr_steps
        tr_accuracy = tr_accuracy / nb_tr_steps

        # Print training loss and accuracy per epoch
        print(f"Train loss: {tr_loss}")
        print(f"Train accuracy: {tr_accuracy}")
        
        
        """
        Validation loop
        """ 
        print("Starting validation loop.")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []

        for batch in valid_dataloader:

            b_input_ids, b_input_mask, b_labels = batch['input_ids'], batch['attention_mask'], batch['labels']
            b_input_ids, b_input_mask, b_labels = b_input_ids.to(device), b_input_mask.to(device), b_labels.to(device)

            with torch.no_grad():
                outputs = model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                )
                tmp_eval_loss, logits = outputs[:2]

            # Subset out unwanted predictions on CLS/PAD/SEP tokens
            preds_mask = (
                (b_input_ids != cls_tok)
                & (b_input_ids != pad_tok)
                & (b_input_ids != sep_tok)
            )

            logits = logits.cpu().detach().numpy()
            label_ids = torch.masked_select(b_labels, (preds_mask == 1))
            preds_mask = preds_mask.cpu().detach().numpy()
            val_batch_preds = np.argmax(logits[preds_mask.squeeze()], axis=1)
            val_batch_labels = label_ids.to("cpu").numpy()
            predictions.extend(val_batch_preds)
            true_labels.extend(val_batch_labels)

            tmp_eval_accuracy = flat_accuracy(val_batch_labels, val_batch_preds)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += b_input_ids.size(0)
            nb_eval_steps += 1

        # Evaluate loss, acc, conf. matrix, and class. report on devset
        pred_tags = [idx2tag[i] for i in predictions]
        valid_tags = [idx2tag[i] for i in true_labels]
        cl_report = classification_report(valid_tags, pred_tags)
        conf_mat = annot_confusion_matrix(valid_tags, pred_tags)
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_steps

        # Report metrics
        print(f"Validation loss: {eval_loss}")
        print(f"Validation Accuracy: {eval_accuracy}")
        print(f"Classification Report:\n {cl_report}")
        print(f"Confusion Matrix:\n {conf_mat}")

In [22]:
train_and_save_model(
    model, 
    TOKENIZER, 
    optimizer, 
    EPOCHS, 
    idx2tag, 
    tag2idx, 
    MAX_GRAD_NORM, 
    DEVICE, 
    train_dl, 
    val_dl
)

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Starting training loop.
Train loss: 0.9121079846568729
Train accuracy: 0.7761088067106691
Starting validation loop.


Epoch:  17%|█▋        | 1/6 [00:12<01:04, 12.90s/it]

Validation loss: 0.591684103012085
Validation Accuracy: 0.8294253597808664
Classification Report:
                      precision    recall  f1-score   support

             Skills       0.00      0.00      0.00       869
        Designation       0.00      0.00      0.00        89
               Name       0.00      0.00      0.00        41
           Location       0.00      0.00      0.00        54
      Email Address       0.76      0.80      0.78      1130
       College Name       0.00      0.00      0.00        33
    Graduation Year       0.00      0.00      0.00        16
Companies worked at       0.00      0.00      0.00        59
             Degree       0.00      0.00      0.00        35
Years of Experience       0.00      0.00      0.00         5

          micro avg       0.76      0.39      0.51      2331
          macro avg       0.37      0.39      0.38      2331

Confusion Matrix:
 	College Name Companies worked at Degree Designation Email Address Graduation Year Loc

Epoch:  33%|███▎      | 2/6 [00:24<00:50, 12.63s/it]

Validation loss: 0.4092428505420685
Validation Accuracy: 0.8697032689668168
Classification Report:
                      precision    recall  f1-score   support

             Skills       0.82      0.44      0.57       869
        Designation       0.11      0.04      0.06        92
               Name       0.34      0.56      0.43        41
           Location       0.02      0.02      0.02        53
      Email Address       0.74      0.91      0.81      1130
       College Name       0.00      0.00      0.00        33
    Graduation Year       0.00      0.00      0.00        16
Companies worked at       0.08      0.03      0.05        60
             Degree       0.00      0.00      0.00        35
Years of Experience       0.00      0.00      0.00         5

          micro avg       0.68      0.62      0.65      2334
          macro avg       0.67      0.62      0.62      2334

Confusion Matrix:
 	College Name Companies worked at Degree Designation Email Address Graduation Year Lo

Epoch:  50%|█████     | 3/6 [00:37<00:37, 12.50s/it]

Validation loss: 0.37221054136753084
Validation Accuracy: 0.8554678214530224
Classification Report:
                      precision    recall  f1-score   support

             Skills       0.41      0.73      0.53       869
        Designation       0.22      0.18      0.20        89
               Name       0.84      0.90      0.87        41
           Location       0.51      0.54      0.52        54
      Email Address       0.77      0.90      0.83      1130
       College Name       0.16      0.24      0.19        33
    Graduation Year       0.00      0.00      0.00        16
Companies worked at       0.14      0.24      0.18        59
             Degree       0.15      0.14      0.14        35
Years of Experience       0.00      0.00      0.00         5

          micro avg       0.55      0.75      0.64      2331
          macro avg       0.57      0.75      0.64      2331

Confusion Matrix:
 	College Name Companies worked at Degree Designation Email Address Graduation Year L

Epoch:  67%|██████▋   | 4/6 [00:49<00:24, 12.43s/it]

Validation loss: 0.3218224748969078
Validation Accuracy: 0.8899422432687878
Classification Report:
                      precision    recall  f1-score   support

             Skills       0.61      0.64      0.62       869
        Designation       0.38      0.36      0.37        92
               Name       0.80      0.90      0.85        41
           Location       0.51      0.60      0.55        53
      Email Address       0.82      0.85      0.84      1130
       College Name       0.19      0.24      0.21        33
    Graduation Year       0.00      0.00      0.00        16
Companies worked at       0.20      0.30      0.24        60
             Degree       0.29      0.23      0.25        35
Years of Experience       0.00      0.00      0.00         5

          micro avg       0.68      0.71      0.69      2334
          macro avg       0.68      0.71      0.69      2334

Confusion Matrix:
 	College Name Companies worked at Degree Designation Email Address Graduation Year Lo

Epoch:  83%|████████▎ | 5/6 [01:01<00:12, 12.34s/it]

Validation loss: 0.3967111185193062
Validation Accuracy: 0.8574283177569489
Classification Report:
                      precision    recall  f1-score   support

             Skills       0.42      0.79      0.55       869
        Designation       0.30      0.35      0.32        89
               Name       0.83      0.93      0.87        41
           Location       0.53      0.72      0.61        54
      Email Address       0.82      0.86      0.84      1130
       College Name       0.25      0.42      0.31        33
    Graduation Year       0.00      0.00      0.00        16
Companies worked at       0.16      0.36      0.22        59
             Degree       0.46      0.54      0.50        35
Years of Experience       0.00      0.00      0.00         5

          micro avg       0.56      0.78      0.65      2331
          macro avg       0.61      0.78      0.67      2331

Confusion Matrix:
 	College Name Companies worked at Degree Designation Email Address Graduation Year Lo

Epoch: 100%|██████████| 6/6 [01:13<00:00, 12.26s/it]

Validation loss: 0.4421268403530121
Validation Accuracy: 0.8616699157194712
Classification Report:
                      precision    recall  f1-score   support

             Skills       0.43      0.73      0.54       869
        Designation       0.30      0.25      0.27        92
               Name       0.88      0.93      0.90        41
           Location       0.64      0.64      0.64        53
      Email Address       0.82      0.80      0.81      1130
       College Name       0.25      0.30      0.27        33
    Graduation Year       1.00      0.06      0.12        16
Companies worked at       0.17      0.32      0.22        60
             Degree       0.57      0.49      0.52        35
Years of Experience       1.00      0.20      0.33         5

          micro avg       0.57      0.72      0.64      2334
          macro avg       0.62      0.72      0.65      2334

Confusion Matrix:
 	College Name Companies worked at Degree Designation Email Address Graduation Year Lo




In [23]:
torch.save(
    {
        "epoch": EPOCHS,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    },
    'model_e6.tar',
)