In [1]:
import preprocess as pp
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW,BartTokenizerFast, BartForSequenceClassification
%matplotlib inline
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix, precision_score,recall_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import tqdm
import torch.nn as nn
import numpy as np
import sys
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [2]:
!nvidia-smi

Sun Jan 10 09:02:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:04:00.0 Off |                    0 |
| N/A   32C    P0    24W / 250W |      8MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   31C    P0    24W / 250W |      8MiB / 16160MiB |      0%      Default |
|       

## Functions

In [3]:
def accuracy(out_logits,labels):
    labels = labels.detach().cpu().numpy()
    predicted = out_logits.detach().cpu().numpy()
    predicted = np.argmax(predicted,axis=1).reshape(labels.shape)
    return np.mean(labels == predicted)

def val_params(model,val_loader):
    temp = model.eval()
    num_batches = 0
    loss_sum = 0
    accuracy_sum =0
    y_true = []
    y_pred = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model.forward(input_ids, attention_mask=attention_mask,labels=labels)
        loss = criterion(outputs.logits.view(-1,2),labels.view(-1))
        y_pred = y_pred + torch.argmax(outputs.logits,axis=1).detach().cpu().tolist()
        y_true = y_true + labels.detach().cpu().tolist()
        loss_sum += loss.item()
        accuracy_sum += accuracy(outputs.logits,labels)
        num_batches +=1
    return loss_sum/num_batches,accuracy_sum/num_batches,f1_score(y_true,y_pred)

class DefinitionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    

def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues,
                         figsize = (10,10)):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm =np.round(cm,2)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    
    plt.rcParams["figure.figsize"] = figsize
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    
    

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

def get_dataloader(dir_path,tokenizer, batch_size=32):
    x,y,tags = pp.get_data_slt(dir_path)
    encodings = tokenizer(x,is_split_into_words=True,  padding=True, truncation=True,return_tensors="pt")
    dataset = DefinitionDataset(encodings,y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

In [4]:
dir_path = '../data/deft_files/train/'
dir_path_val = '../data/deft_files/dev/'
model_name =  'roberta-base'
#'bert-base-uncased'
#"facebook/bart-large"
Model = RobertaForSequenceClassification
ModelTokenizer = RobertaTokenizer
path_to_save = "../model/roberta_cls/" 

In [5]:
x,y,tags = pp.get_data_slt(dir_path)

In [6]:
device = torch.device('cuda:2') if torch.cuda.is_available() else torch.device('cpu')

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15, random_state=42)

In [8]:
w = np.mean(y_train)
weights = [0.2,0.8]

## Tokenization

In [9]:
tokenizer = ModelTokenizer.from_pretrained(model_name,add_prefix_space=True)

In [10]:
train_encodings = tokenizer(x_train,is_split_into_words=True,  padding=True, truncation=True,return_tensors="pt")
val_encodings = tokenizer(x_val,is_split_into_words=True,  padding=True, truncation=True,return_tensors="pt")

In [11]:
train_dataset = DefinitionDataset(train_encodings,y_train)
val_dataset = DefinitionDataset(val_encodings,y_val)

In [12]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [13]:
model = Model.from_pretrained(model_name,num_labels=2).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

## Training

In [14]:
optim = AdamW(model.parameters(), lr=1e-5)
weight=torch.tensor(weights,dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=weight)
mse_criterion = nn.MSELoss()

In [15]:
EPOCHS=10
prev_val_acc = -1
temp = model.train()
for epoch in range(EPOCHS):
    loss_sum = 0
    accuracy_sum = 0
    num_batch = 0
    pbar = tqdm.tqdm(train_loader)
    for batch in pbar:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,labels=labels)
        loss = criterion(outputs.logits.view(-1,2),labels.view(-1))
        loss = loss 
        #loss = outputs[0]
        loss.backward()
        optim.step()
        loss_sum += loss.item()
        accuracy_sum += accuracy(outputs.logits,labels)
        num_batch+=1
        pbar.set_description("Epoch: %s, Train loss: %f, Train accuracy: %f"%(epoch,loss_sum/num_batch,accuracy_sum/num_batch))
    
    val_metric = val_params(model,val_loader)
    sys.stdout.write("         Val loss: %f, Val accuracy: %f, Val f1: %f"%val_metric)
    sys.stdout.flush()
    
    #Breaking criteria
    if prev_val_acc > val_metric[2]:
        break
    
    prev_val_acc = val_metric[2]
    
    #saving model checkpoint
    model.save_pretrained(path_to_save)
    tokenizer.save_pretrained(path_to_save)

Epoch: 0, Train loss: 0.360355, Train accuracy: 0.791181: 100%|██████████| 643/643 [03:43<00:00,  2.88it/s]


         Val loss: 0.305267, Val accuracy: 0.826115, Val f1: 0.742974

Epoch: 1, Train loss: 0.256077, Train accuracy: 0.872440: 100%|██████████| 643/643 [03:40<00:00,  2.91it/s]


         Val loss: 0.290623, Val accuracy: 0.845669, Val f1: 0.761340

Epoch: 2, Train loss: 0.181473, Train accuracy: 0.917217: 100%|██████████| 643/643 [03:42<00:00,  2.89it/s]


         Val loss: 0.329775, Val accuracy: 0.858096, Val f1: 0.771773

Epoch: 3, Train loss: 0.114951, Train accuracy: 0.949618: 100%|██████████| 643/643 [03:43<00:00,  2.88it/s]


         Val loss: 0.403433, Val accuracy: 0.869518, Val f1: 0.777985

Epoch: 4, Train loss: 0.077362, Train accuracy: 0.969236: 100%|██████████| 643/643 [03:42<00:00,  2.89it/s]


         Val loss: 0.482561, Val accuracy: 0.869243, Val f1: 0.776303

## Evaluation

In [16]:
def get_model_output(data_loader):
    y_true = []
    y_pred = []
    t=model.eval()
    pbar = tqdm.tqdm(data_loader)
    for data in pbar:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)
        output = model(input_ids, attention_mask=attention_mask)
        pred_labels = torch.argmax(output.logits,dim=1).detach().cpu().numpy().tolist()
        labels_numpy = labels.detach().cpu().numpy().tolist()
        y_true = y_true + labels_numpy
        y_pred = y_pred + pred_labels

    return y_true,y_pred

In [17]:
dl = get_dataloader(dir_path_val,tokenizer,batch_size=16)
y_true,y_pred = get_model_output(dl)

100%|██████████| 74/74 [00:03<00:00, 23.77it/s]


In [19]:
r = recall_score(y_true,y_pred)
p = precision_score(y_true,y_pred)
f1 = f1_score(y_true,y_pred)
a = accuracy_score(y_true,y_pred)
res = pd.DataFrame({
    'accuracy': a,
    'f1': f1,
    'recall':r,
    'Precision':p
},index=[0])
res

Unnamed: 0,accuracy,f1,recall,Precision
0,0.888041,0.789137,0.80719,0.771875
