**Objective:**

Use BERT Tokenizer to build a NER Tagger for Party Recognition from rental agreements

**Results:**

80% + recall for Party1/Party2 tags on provided validation set

**Contents:**


1.   Read the BIO annotated data
2.   Undersample paragraphs without any tags in them to have a ratio of 1:1
3. Determine Max Length of the words in each para/text and truncate/add padding
4. Adapt and preserve the labels to match BERT's word piece tokenization
5. Create attention masks and split data to train and test
6. Use BERT-BASE-UNCASED pretrained model with the help of BERT Tokenizer
7. Finetune all model layers for about 20 epochs
8. Develop classification report
9. Testing on Provided Validation Data



In [1]:
import os
os.getcwd()
from google.colab import drive
import os
drive.mount('/content/gdrive')

#IF YOU ARE CLONING FOR THE FIRST TIME in colab please uncomment all the below 5 lines of code. 
#Put git user name and password in appropriate places

root_path = 'gdrive/My Drive/Colab Notebooks/' #change dir to folder where you want to clone
os.chdir('/content/' + root_path)
##!git clone https://github.com/selfishhari/rental_meta_extraction.git

root_path = 'rental_meta_extraction/' #change dir to your project folder's src

os.chdir(root_path)

!git branch -r | grep -v '\->' | while read remote; do git branch --track "${remote#origin/}" "$remote"; done

os.getcwd()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
fatal: A branch named 'master' already exists.


'/content/gdrive/My Drive/Colab Notebooks/rental_meta_extraction'

In [2]:
!pip install transformers



In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import os, sys
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

Using TensorFlow backend.


'1.5.0+cu101'

In [4]:
annotated_data = pd.read_pickle("data/02_processed/annotated_df_v1.pickle")

annotated_data.sample(5)

Unnamed: 0,fname,paras,para_windows,para_windows_proc,annotations,annot_count
1,62126501-Rental-Agreement,This Rental Agreement is made and executed on ...,RENTAL AGREEMENT This Rental Agreement is made...,rental agreement this rental agreement is made...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
9,47854715-RENTAL-AGREEMENT,The lessor or their authorized agents shall be...,The said deposit shall be refundable to the le...,the said deposit shall be refundable to the le...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
30,116950326-December-2012-Rental-Agreement,Megan’s Law,Fill out a) or b) as it appliesoto youf situat...,fill out a) or b) as it appliesoto youf situat...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
36,323828497-Rental-Agreement-Micky,VEHICLES & GARAGE USE:,"ALTERATIONS: Tenant shall make no alterations,...","alterations: tenant shall make no alterations,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
19,6683129-House-Rental-Contract-Geraldine-Galina...,"Surname, Name:",SUBLETTING: Without first requesting permissio...,subletting: without first requesting permissio...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1


In [5]:
print("size of sentences with annotations", annotated_data.loc[annotated_data.annot_count != 1, :].shape)

print("size of sentences without any annotations", annotated_data.loc[annotated_data.annot_count == 1, :].shape)

print("Severely unbalanced. Hence will resample")

na = annotated_data.loc[annotated_data.annot_count == 1, :].sample(303)

ad = annotated_data.loc[annotated_data.annot_count != 1, :]

annotated_data = pd.concat([ad, na], ignore_index=True)

print("size of sentences with annotations after undersampling no annotations", annotated_data.loc[annotated_data.annot_count != 1, :].shape)

print("size of sentences without any annotations after undersampling", annotated_data.loc[annotated_data.annot_count == 1, :].shape)


size of sentences with annotations (303, 6)
size of sentences without any annotations (1235, 6)
Severely unbalanced. Hence will resample
size of sentences with annotations after undersampling no annotations (303, 6)
size of sentences without any annotations after undersampling (303, 6)


In [6]:
annotated_data["para_windows_proc"] = annotated_data.para_windows_proc.apply(lambda x: x.replace(".", " "))

annotated_data["para_windows_proc"] = annotated_data.para_windows_proc.apply(lambda x: x.replace(",", " "))

annotated_data["ann_length"] = annotated_data["annotations"].apply(lambda x: len(x))

annotated_data["text_length"] = annotated_data["para_windows_proc"].apply(lambda x: len(x.split()))

(~(annotated_data["text_length"] == annotated_data["ann_length"])).sum()

print("all annotations and text lengths are now matching")

all annotations and text lengths are now matching


In [7]:
sentences = annotated_data["para_windows_proc"].apply(lambda x: x.split()).tolist()

labels = annotated_data["annotations"].tolist()

print(annotated_data.shape, len(sentences), len(labels))

(606, 8) 606 606


In [8]:
annotated_data["text_length"].quantile([0.5, 0.7, 0.8, 0.9, 0.99, 1])

0.50     59.00
0.70     82.50
0.80     99.00
0.90    134.50
0.99    233.35
1.00    350.00
Name: text_length, dtype: float64

Since 90% of text length is less than 141, I am going to choose **140** as initial **MAX_LEN**

In [0]:
MAX_LEN = 140
bs = 64


In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [11]:
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels


In [0]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]


In [0]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]

labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [0]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")


In [17]:
tag_values = ["O", "B-P1", "I-P1", "B-P2", "I-P2", "PAD"]

tag2idx = {t: i for i, t in enumerate(tag_values)}

tag2idx

{'B-P1': 1, 'B-P2': 3, 'I-P1': 2, 'I-P2': 4, 'O': 0, 'PAD': 5}

In [0]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")


In [19]:
tags

array([[0, 0, 0, ..., 5, 5, 5],
       [0, 0, 0, ..., 5, 5, 5],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 5, 5, 5],
       [0, 0, 0, ..., 5, 5, 5],
       [0, 0, 0, ..., 5, 5, 5]])

In [0]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [0]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2019, test_size=0.1)

tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2019, test_size=0.1)


In [0]:
tr_inputs = torch.tensor(tr_inputs)

val_inputs = torch.tensor(val_inputs)

tr_tags = torch.tensor(tr_tags)

val_tags = torch.tensor(val_tags)

tr_masks = torch.tensor(tr_masks)

val_masks = torch.tensor(val_masks)

In [0]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


In [24]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__



'2.11.0'

In [0]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
model.cuda();

In [0]:
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)


In [0]:
from transformers import get_linear_schedule_with_warmup

epochs = 25
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


In [28]:
!pip install seqeval
!nvidia-smi

Mon Jun  8 03:36:08 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    35W / 250W |   1267MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
from seqeval.metrics import f1_score, accuracy_score

In [30]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

Epoch:   0%|          | 0/25 [00:00<?, ?it/s]

Average train loss: 0.43401476244131726


Epoch:   4%|▍         | 1/25 [00:06<02:46,  6.95s/it]

Validation loss: 0.1876133680343628
Validation Accuracy: 0.9667355371900826
Validation F1-Score: 0

Average train loss: 0.15541490995221668


Epoch:   8%|▊         | 2/25 [00:13<02:39,  6.93s/it]

Validation loss: 0.16871924698352814
Validation Accuracy: 0.9667355371900826
Validation F1-Score: 0

Average train loss: 0.13554130329026115


Epoch:  12%|█▏        | 3/25 [00:20<02:32,  6.92s/it]

Validation loss: 0.11841312050819397
Validation Accuracy: 0.9667355371900826
Validation F1-Score: 0

Average train loss: 0.0963006400399738


Epoch:  16%|█▌        | 4/25 [00:27<02:25,  6.91s/it]

Validation loss: 0.08685363829135895
Validation Accuracy: 0.9725206611570248
Validation F1-Score: 0.4097560975609757

Average train loss: 0.06808476522564888


Epoch:  20%|██        | 5/25 [00:34<02:18,  6.91s/it]

Validation loss: 0.061242904514074326
Validation Accuracy: 0.9816115702479339
Validation F1-Score: 0.5581395348837209

Average train loss: 0.051277068754037224


Epoch:  24%|██▍       | 6/25 [00:41<02:11,  6.91s/it]

Validation loss: 0.047659922391176224
Validation Accuracy: 0.9855371900826446
Validation F1-Score: 0.633587786259542

Average train loss: 0.040366317249006696


Epoch:  28%|██▊       | 7/25 [00:48<02:04,  6.90s/it]

Validation loss: 0.03937966376543045
Validation Accuracy: 0.9857438016528925
Validation F1-Score: 0.6080586080586081

Average train loss: 0.031646628553668656


Epoch:  32%|███▏      | 8/25 [00:55<01:57,  6.90s/it]

Validation loss: 0.03559859097003937
Validation Accuracy: 0.9880165289256199
Validation F1-Score: 0.6466165413533834

Average train loss: 0.025918257526225515


Epoch:  36%|███▌      | 9/25 [01:02<01:50,  6.90s/it]

Validation loss: 0.029571710154414177
Validation Accuracy: 0.9913223140495868
Validation F1-Score: 0.6942148760330579

Average train loss: 0.022679692341221705


Epoch:  40%|████      | 10/25 [01:09<01:43,  6.89s/it]

Validation loss: 0.02687137760221958
Validation Accuracy: 0.9923553719008265
Validation F1-Score: 0.7563025210084032

Average train loss: 0.018143709955943957


Epoch:  44%|████▍     | 11/25 [01:15<01:36,  6.89s/it]

Validation loss: 0.026477549225091934
Validation Accuracy: 0.9911157024793389
Validation F1-Score: 0.7438016528925621

Average train loss: 0.014570386666390631


Epoch:  48%|████▊     | 12/25 [01:22<01:29,  6.89s/it]

Validation loss: 0.02357451058924198
Validation Accuracy: 0.9921487603305785
Validation F1-Score: 0.7634854771784233

Average train loss: 0.0120726992479629


Epoch:  52%|█████▏    | 13/25 [01:29<01:22,  6.89s/it]

Validation loss: 0.019701460376381874
Validation Accuracy: 0.993801652892562
Validation F1-Score: 0.8068669527896997

Average train loss: 0.010930041575597392


Epoch:  56%|█████▌    | 14/25 [01:36<01:15,  6.90s/it]

Validation loss: 0.017978090792894363
Validation Accuracy: 0.9940082644628099
Validation F1-Score: 0.8240343347639485

Average train loss: 0.009314440863413943


Epoch:  60%|██████    | 15/25 [01:43<01:08,  6.89s/it]

Validation loss: 0.01646634377539158
Validation Accuracy: 0.9950413223140496
Validation F1-Score: 0.8558951965065502

Average train loss: 0.009156979723936982


Epoch:  64%|██████▍   | 16/25 [01:50<01:02,  6.90s/it]

Validation loss: 0.015236816368997097
Validation Accuracy: 0.9960743801652893
Validation F1-Score: 0.8669527896995708

Average train loss: 0.007832939063923227


Epoch:  68%|██████▊   | 17/25 [01:57<00:55,  6.90s/it]

Validation loss: 0.014242805540561676
Validation Accuracy: 0.9958677685950413
Validation F1-Score: 0.8608695652173913

Average train loss: 0.006762658452822102


Epoch:  72%|███████▏  | 18/25 [02:04<00:48,  6.90s/it]

Validation loss: 0.014294332824647427
Validation Accuracy: 0.9960743801652893
Validation F1-Score: 0.8793103448275862

Average train loss: 0.006126289152436786


Epoch:  76%|███████▌  | 19/25 [02:11<00:41,  6.90s/it]

Validation loss: 0.012969590723514557
Validation Accuracy: 0.9962809917355372
Validation F1-Score: 0.9004329004329005

Average train loss: 0.006314619754751523


Epoch:  80%|████████  | 20/25 [02:18<00:34,  6.91s/it]

Validation loss: 0.013515891507267952
Validation Accuracy: 0.9960743801652893
Validation F1-Score: 0.8755364806866952

Average train loss: 0.005694108497765329


Epoch:  84%|████████▍ | 21/25 [02:24<00:27,  6.91s/it]

Validation loss: 0.010902205482125282
Validation Accuracy: 0.9973140495867768
Validation F1-Score: 0.9391304347826088

Average train loss: 0.004636222781199548


Epoch:  88%|████████▊ | 22/25 [02:31<00:20,  6.90s/it]

Validation loss: 0.010930899530649185
Validation Accuracy: 0.996900826446281
Validation F1-Score: 0.9264069264069263

Average train loss: 0.004883849915737907


Epoch:  92%|█████████▏| 23/25 [02:38<00:13,  6.90s/it]

Validation loss: 0.011430526152253151
Validation Accuracy: 0.996694214876033
Validation F1-Score: 0.9137931034482759

Average train loss: 0.004536408103174633


Epoch:  96%|█████████▌| 24/25 [02:45<00:06,  6.89s/it]

Validation loss: 0.011072388850152493
Validation Accuracy: 0.9971074380165289
Validation F1-Score: 0.9396551724137931

Average train loss: 0.004360001094432341


Epoch: 100%|██████████| 25/25 [02:52<00:00,  6.90s/it]

Validation loss: 0.010879842564463615
Validation Accuracy: 0.9971074380165289
Validation F1-Score: 0.9396551724137931






In [31]:
from seqeval.metrics import precision_score, classification_report


print(classification_report(valid_tags, pred_tags))

           precision    recall  f1-score   support

       P2       0.88      0.94      0.91        32
       P1       0.93      0.98      0.95        81

micro avg       0.92      0.96      0.94       113
macro avg       0.92      0.96      0.94       113



In [32]:
!nvidia-smi

Mon Jun  8 03:39:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    44W / 250W |  10559MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

**Testing on validation set**

In [33]:
annotated_data_valid = pd.read_pickle("data/02_processed/annotated_df_valid.pickle")

annotated_data_valid["para_windows_proc"] = annotated_data_valid.para_windows_proc.apply(lambda x: x.replace(".", " "))

annotated_data_valid["para_windows_proc"] = annotated_data_valid.para_windows_proc.apply(lambda x: x.replace(",", " "))

annotated_data_valid["ann_length"] = annotated_data_valid["annotations"].apply(lambda x: len(x))

annotated_data_valid["text_length"] = annotated_data_valid["para_windows_proc"].apply(lambda x: len(x.split()))

print((~(annotated_data_valid["text_length"] == annotated_data_valid["ann_length"])).sum())

print("all annotations and text lengths are now matching")

sentences_valid = annotated_data_valid["para_windows_proc"].apply(lambda x: x.split()).tolist()

labels_valid = annotated_data_valid["annotations"].tolist()

print(annotated_data_valid.shape, len(sentences_valid), len(labels_valid))

tokenized_texts_and_labels_valid = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences_valid, labels_valid)
]

tokenized_texts_valid = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels_valid]

labels_valid = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels_valid]

input_ids_valid = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_valid],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

tags_valid = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_valid],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
attention_masks_valid = [[float(i != 0.0) for i in ii] for ii in input_ids_valid]

vl_inputs = torch.tensor(input_ids_valid)

vl_tags = torch.tensor(tags_valid)

vl_masks = torch.tensor(attention_masks_valid)

vl_data = TensorDataset(vl_inputs, vl_masks, vl_tags)
vl_sampler = SequentialSampler(vl_data)
vl_dataloader = DataLoader(vl_data, sampler=vl_sampler, batch_size=bs)


0
all annotations and text lengths are now matching
(252, 8) 252 252


In [34]:
model.eval()


eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

predictions , true_labels = [], []

for batch in vl_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients,
    # saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have not provided labels.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
    # Move logits and labels to CPU
    logits = outputs[1].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch of test sentences.
    eval_loss += outputs[0].mean().item()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)

eval_loss = eval_loss / len(vl_dataloader)

print("Validation loss: {}".format(eval_loss))

pred_tags_vl = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                              for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]

valid_tags_vl = [tag_values[l_i] for l in true_labels
                              for l_i in l if tag_values[l_i] != "PAD"]

print("Validation Accuracy: {}".format(accuracy_score(pred_tags_vl, valid_tags_vl)))
print("Validation F1-Score: {}".format(f1_score(pred_tags_vl, valid_tags_vl)))
print()

Validation loss: 0.02437988921883516
Validation Accuracy: 0.9963228957196208
Validation F1-Score: 0.8123249299719887



In [35]:
from seqeval.metrics import precision_score, classification_report

#10 epochs
print(classification_report(valid_tags_vl, pred_tags_vl))

           precision    recall  f1-score   support

       P1       0.82      0.88      0.85        93
       P2       0.72      0.82      0.77        77

micro avg       0.78      0.85      0.81       170
macro avg       0.78      0.85      0.81       170

