In [1]:
import preprocess as pp
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW,BartTokenizerFast, BartForSequenceClassification
%matplotlib inline
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix, precision_score,recall_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import tqdm
import torch.nn as nn
import numpy as np
import sys
import pandas as pd
from transformers import RobertaTokenizerFast, RobertaForTokenClassification,get_cosine_schedule_with_warmup
from transformers import BertForTokenClassification,BertModel
from functools import reduce
import spacy
from torchcrf import CRF

## Functions

In [2]:
def accuracy(out_logits,labels):
    if type(out_logits) != torch.Tensor:
        pred = [val for vals in out_logits for val in vals]
        target = [val for vals in labels for val in vals]
        return np.mean(np.array(pred)==np.array(target))
    else:
        pred = out_logits.argmax(dim=2)
        return torch.mean((pred[labels!=-100] == labels[labels!=-100]).float()).item()

def val_params(model,val_loader):
    temp = model.eval()
    num_batches = 0
    loss_sum = 0
    accuracy_sum =0
    y_true = []
    y_pred = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model.forward(input_ids, attention_mask=attention_mask,labels=labels)
        #loss = criterion(outputs.logits.view(-1,2),labels.view(-1))
        loss = outputs[0]
        if len(outputs)!=1:
            pred = outputs[1].argmax(dim=2)
            y_pred = y_pred + pred[labels!=-100].view(-1).tolist()
            y_true = y_true + labels[labels!=-100].view(-1).tolist()
        else:
            labels = list(map(lambda label,mask:label[mask==1].tolist(),labels,attention_mask))
            pred = model.decode(input_ids, attention_mask=attention_mask)
            pred = [val for vals in pred for val in vals]
            labels = [val for vals in labels for val in vals]
            y_pred = y_pred + pred
            y_true = y_true + labels
            
        loss_sum += loss.item()
        num_batches +=1
    f1_micro = f1_score(np.array(y_true).reshape(-1),np.array(y_pred).reshape(-1),average='micro')
    f1_macro = f1_score(np.array(y_true).reshape(-1),np.array(y_pred).reshape(-1),average='macro')
    acc = accuracy_score(np.array(y_true).reshape(-1),np.array(y_pred).reshape(-1))
    return loss_sum/num_batches,acc,f1_micro,f1_macro

def get_ytyp(model,val_loader):
    temp = model.eval()
    y_true = []
    y_pred = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model.forward(input_ids, attention_mask=attention_mask,labels=labels)
        #loss = criterion(outputs.logits.view(-1,2),labels.view(-1))
        loss = outputs[0]
        pred = outputs[1].argmax(dim=2)
        
        y_pred = y_pred + pred[labels!=-100].view(-1).tolist()
        y_true = y_true + labels[labels!=-100].view(-1).tolist()
        
    return y_pred,y_true

def get_model_out(model,val_loader):
    temp = model.eval()
    y_true = []
    y_pred = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model.forward(input_ids, attention_mask=attention_mask,labels=labels)
        #loss = criterion(outputs.logits.view(-1,2),labels.view(-1))
        loss = outputs[0]
        pred = outputs[1].argmax(dim=2)
        
        
        y_pred = y_pred + pred[labels!=-100].view(-1).tolist()
        y_true = y_true + labels[labels!=-100].view(-1).tolist()
        loss_sum += loss.item()
        accuracy_sum += accuracy(outputs[1],labels)
        num_batches +=1
    cm = confusion_matrix(np.array(y_true).reshape(-1),np.array(y_pred).reshape(-1))
    return cm


class DefinitionERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    

def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues,
                         figsize = (10,10)):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm =np.round(cm,2)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    
    plt.rcParams["figure.figsize"] = figsize
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    
    

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

def get_dataloader(dir_path,tokenizer, batch_size=32):
    x,y,tags = pp.get_data_slt(dir_path)
    encodings = tokenizer(x,is_split_into_words=True,  padding=True, truncation=True,return_tensors="pt")
    dataset = DefinitionDataset(encodings,y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

def get_encodings(text,tags,tokenizer):
    encodings  = tokenizer(text, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True,return_tensors='pt')
    seq_len = encodings.input_ids.shape[1]
    labels = [[lab2id[tok] for tok in seq] for seq in tags]
    enc_labels = []
    input_ids = []
    out_enc = {}
    for off_map_seq,lab_seq in zip(encodings.offset_mapping,labels):
        tags = np.ones(seq_len,dtype=int) * -100
        
        
        tags[(off_map_seq[:,0] == 0) & (off_map_seq[:,1] != 0)]= lab_seq
        
        enc_labels.append(tags)
        

    encodings['labels'] = torch.tensor(enc_labels)  
    
    return encodings



def get_encodings_crf(text,tags,tokenizer):
    encodings  = tokenizer(text, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True,return_tensors='pt')
    seq_len = encodings.input_ids.shape[1]
    labels = [[lab2id[tok] for tok in seq] for seq in tags]
    enc_labels = []
    input_ids = []
    out_enc = {}
    for off_map_seq,lab_seq,in_id in zip(encodings.offset_mapping,labels,encodings.input_ids):
        inp_id = [0]*seq_len
        tags = [lab2id[' O']]*seq_len
        in_id_nopad = in_id[off_map_seq[:,0]==0][in_id[off_map_seq[:,0]==0]!=0].tolist()
        inp_id[:len(in_id_nopad)]=in_id_nopad
        tags[1:(len(lab_seq)+1)]= lab_seq
        
        input_ids.append(inp_id)
        enc_labels.append(tags)
        

    out_enc['labels'] = torch.tensor(enc_labels) 
    out_enc['input_ids'] = torch.tensor(input_ids) 
    out_enc['attention_mask']=torch.logical_and(out_enc['input_ids']!=0 , out_enc['input_ids']!=102).type(torch.ByteTensor)
    
    return out_enc


In [3]:
!nvidia-smi

Sun Feb  7 19:31:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:04:00.0 Off |                    0 |
| N/A   30C    P0    24W / 250W |      8MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   28C    P0    24W / 250W |      8MiB / 16160MiB |      0%      Default |
|       

In [4]:
!ps aux grep 3165672

USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root        1613  3.0  0.0      0     0 ?        R    Jan29 414:53 [nv_queue]
root        1977  3.8  0.0      0     0 ?        R    Jan29 520:53 [nv_queue]
gsinha1   576946 44.0  0.0  11716  3604 pts/16   Rs+  19:31   0:00 ps aux grep 3


## Setup

In [5]:
device = torch.device('cuda:2') if torch.cuda.is_available() else torch.device('cpu')

In [6]:
torch.tensor([1,2]).to(device)

tensor([1, 2], device='cuda:2')

In [7]:
dir_path = '../data/deft_files/train/'
dir_path_val = '../data/deft_files/dev/'
model_name = 'bert-large-cased'  #  "facebook/bart-large" 'roberta-base'
Model = BertForTokenClassification
ModelTokenizer = BertTokenizerFast #BertTokenizerFast
path_to_save = "../model/berta_ner/" 

## Data Processing

In [8]:
texts_train,_,tags_train = pp.get_data_slt(dir_path)
texts_val,_,tags_val = pp.get_data_slt(dir_path_val)

In [9]:
unique_tags = {}
for seq in tags_train:
    for tok in seq:
        if tok not in unique_tags.keys():
            unique_tags[tok] = 1
        else:
            unique_tags[tok] += 1
            
    

In [10]:
id2lab = sorted(list(unique_tags.keys()))
classes = id2lab.copy()
id2lab = {val:key for val,key in enumerate(id2lab)}
id2lab[-100] = 'other'
lab2id = {key:val for val,key in id2lab.items()}

In [11]:
lab2id

{' B-Alias-Term': 0,
 ' B-Alias-Term-frag': 1,
 ' B-Definition': 2,
 ' B-Definition-frag': 3,
 ' B-Qualifier': 4,
 ' B-Referential-Definition': 5,
 ' B-Referential-Term': 6,
 ' B-Secondary-Definition': 7,
 ' B-Term': 8,
 ' B-Term-frag': 9,
 ' I-Alias-Term': 10,
 ' I-Definition': 11,
 ' I-Definition-frag': 12,
 ' I-Qualifier': 13,
 ' I-Referential-Definition': 14,
 ' I-Referential-Term': 15,
 ' I-Secondary-Definition': 16,
 ' I-Term': 17,
 ' I-Term-frag': 18,
 ' O': 19,
 'other': -100}

In [12]:
weights = [1]*len(id2lab)
for lab,num in unique_tags.items():
    weights[lab2id[lab]] = 1000/num

In [13]:
weight = torch.tensor(weights).to(device)

In [14]:
weight

tensor([1.3774e+00, 3.3333e+02, 1.6496e-01, 1.1765e+01, 6.1728e+00, 3.2468e+00,
        7.1429e+00, 2.0877e+00, 1.5126e-01, 1.2500e+02, 1.1628e+00, 1.1476e-02,
        1.0471e+00, 9.5694e-01, 1.5129e+00, 8.6207e+00, 1.2155e-01, 1.0470e-01,
        3.3333e+02, 2.8978e-03, 1.0000e+00], device='cuda:2')

In [15]:
tokenizer = ModelTokenizer.from_pretrained(model_name,add_prefix_space=True)

In [16]:
train_encodings = get_encodings(texts_train,tags_train,tokenizer)
val_encodings = get_encodings(texts_val,tags_val,tokenizer)

In [17]:
i=79
df = pd.DataFrame({
    'tokens':tokenizer.convert_ids_to_tokens(val_encodings['input_ids'][i].tolist()),
    'label':list(map(lambda x: id2lab[x],val_encodings['labels'][i].tolist()))
})

df.head(50)

Unnamed: 0,tokens,label
0,[CLS],other
1,Systems,B-Term
2,biology,I-Term
3,is,O
4,the,B-Definition
5,study,I-Definition
6,of,I-Definition
7,whole,I-Definition
8,biological,I-Definition
9,systems,I-Definition


In [18]:
train_dataset = DefinitionERDataset(train_encodings)
val_dataset = DefinitionERDataset(val_encodings)

In [19]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

## Model

In [20]:
class DefinitionExtraction(nn.Module):
    def __init__(self,num_labels):
        super(DefinitionExtraction, self).__init__()
        self.bert = BertModel.from_pretrained('bert-large-cased')
        self.dense = nn.Linear(1024,1024)
        self.classification = nn.Linear(1024,num_labels)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)
        self.crf = CRF(num_labels,batch_first=True)
        self.criterion = nn.CrossEntropyLoss()
    
    def forward(self,input_ids,attention_mask=None,labels=None):
        x = self.bert(input_ids.long())
        x = self.tanh(self.dense(x.last_hidden_state))
        x = self.dropout(x)
        logits = self.classification(x)
        loss = self.crf.forward(logits,labels,attention_mask)
        return [loss]
    
    def decode(self,input_ids,attention_mask=None):
        x = self.bert(input_ids.long())
        x = self.tanh(self.dense(x.last_hidden_state))
        x = self.dropout(x)
        logits = self.classification(x)
        output = self.crf.decode(logits,attention_mask)
        return output
    
    def call(self,input_ids,attention_mask=None,labels=None):
        return self.forward(input_ids,attention_mask,labels)

In [21]:
#model = DefinitionExtraction(len(id2lab)).to(device)

In [22]:
model = Model.from_pretrained(model_name,num_labels=len(id2lab)).to(device)
#model = nn.DataParallel(model,device_ids=[0,1,3],output_device=1)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-c

## Training

In [23]:
EPOCHS=12
optim = AdamW(model.parameters(), lr=1e-5)
sched = get_cosine_schedule_with_warmup(optim,
                                        num_warmup_steps=100,
                                        num_training_steps=(len(tags_train)/BATCH_SIZE)*EPOCHS)

In [24]:
prev_val_acc = -1
temp = model.train()
for epoch in range(EPOCHS):
    loss_sum = 0
    accuracy_sum = 0
    num_batch = 0
    pbar = tqdm.tqdm(train_loader)
    for batch in pbar:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        sched.step()
        loss_sum += loss.item()
        if len(outputs)==1:
            out = model.decode(input_ids, attention_mask=attention_mask)
            outputs.append(out)
            labels = list(map(lambda label,mask:label[mask==1].tolist(),labels,attention_mask))
        accuracy_sum += accuracy(outputs[1],labels)
        num_batch+=1
        pbar.set_description("Epoch: %s, Train loss: %f, Train accuracy: %f"%(epoch,
                                                                              loss_sum/num_batch,
                                                                              accuracy_sum/num_batch))
    val_metric = val_params(model,val_loader)
    sys.stdout.write("         Val loss: %f, Val accuracy: %f, Val f1-micro: %f, Val f1-macro: %f"%val_metric)
    sys.stdout.flush()
    
    #Breaking criteria
    if prev_val_acc > val_metric[2]:
        pass
    
    prev_val_acc = val_metric[2]
    
    #saving model checkpoint
    model.save_pretrained(path_to_save+str(epoch))
    tokenizer.save_pretrained(path_to_save+str(epoch))

Epoch: 0, Train loss: 0.694759, Train accuracy: 0.797223: 100%|██████████| 756/756 [12:58<00:00,  1.03s/it]


         Val loss: 0.424543, Val accuracy: 0.865361, Val f1-micro: 0.865361, Val f1-macro: 0.223196

Epoch: 1, Train loss: 0.393899, Train accuracy: 0.873018: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.401640, Val accuracy: 0.863439, Val f1-micro: 0.863439, Val f1-macro: 0.331345

Epoch: 2, Train loss: 0.285852, Train accuracy: 0.908199: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.416888, Val accuracy: 0.866524, Val f1-micro: 0.866524, Val f1-macro: 0.355822

Epoch: 3, Train loss: 0.200731, Train accuracy: 0.933132: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.511185, Val accuracy: 0.870725, Val f1-micro: 0.870725, Val f1-macro: 0.329760

Epoch: 4, Train loss: 0.136093, Train accuracy: 0.950112: 100%|██████████| 756/756 [12:45<00:00,  1.01s/it]


         Val loss: 0.572076, Val accuracy: 0.866166, Val f1-micro: 0.866166, Val f1-macro: 0.335091

Epoch: 5, Train loss: 0.103269, Train accuracy: 0.959592: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.637583, Val accuracy: 0.860489, Val f1-micro: 0.860489, Val f1-macro: 0.328766

Epoch: 6, Train loss: 0.081992, Train accuracy: 0.963605: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.685772, Val accuracy: 0.863171, Val f1-micro: 0.863171, Val f1-macro: 0.335381

Epoch: 7, Train loss: 0.068182, Train accuracy: 0.966254: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.755975, Val accuracy: 0.865406, Val f1-micro: 0.865406, Val f1-macro: 0.335497

Epoch: 8, Train loss: 0.058626, Train accuracy: 0.968941: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.756920, Val accuracy: 0.862501, Val f1-micro: 0.862501, Val f1-macro: 0.335066

Epoch: 9, Train loss: 0.052772, Train accuracy: 0.970506: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.822273, Val accuracy: 0.864914, Val f1-micro: 0.864914, Val f1-macro: 0.317651

Epoch: 10, Train loss: 0.048222, Train accuracy: 0.972710: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.826628, Val accuracy: 0.864601, Val f1-micro: 0.864601, Val f1-macro: 0.318961

Epoch: 11, Train loss: 0.046114, Train accuracy: 0.974718: 100%|██████████| 756/756 [12:46<00:00,  1.01s/it]


         Val loss: 0.856139, Val accuracy: 0.865183, Val f1-micro: 0.865183, Val f1-macro: 0.320008

In [30]:
val_metric = val_params(model,val_loader)

In [31]:
val_metric

(251.73746057458825, 0.8533271051849335, 0.8533271051849335, 0.311064348700263)

## Evaluation

In [25]:
tokenizer = ModelTokenizer.from_pretrained(path_to_save+str(0))
model = Model.from_pretrained(path_to_save+str(0)).to(device)
temp = model.eval()

In [26]:
yt,yp = get_ytyp(model,val_loader)
yt = [id2lab[idx] for idx in yt]
yp = [id2lab[idx] for idx in yp]

In [27]:
classes = sorted(list(set(yt+yp)))

In [None]:
cm = confusion_matrix(yt,yp)
plot_confusion_matrix(cm,classes,normalize=True,figsize=(15,15))

In [89]:
import spacy
from nltk.tokenize import word_tokenize
nlp = spacy.load("en_core_web_sm")
#nltk.download('punkt')
text = "SWOT stands for Strengths, Weaknesses, Opportunities, and Threats, and so a SWOT Analysis is a technique for assessing these four aspects of your business."


In [73]:
def get_individual_predictions(i,text=None):
    if text is None:
        custom = False
    else:
        cutome=True

    split_text = texts_val[i]
    tags = tags_val[i]
    if custom:
        split_text = word_tokenize(text)
        tags = [' O']*len(split_text)
    enc = get_encodings([split_text],[tags],tokenizer)
    lab = enc.labels[0]
    with torch.no_grad():
        output = model(enc.input_ids.to(device),enc.attention_mask.to(device))
    pred_out = output[0].argmax(dim=2)[0]
    pred_out=pred_out[lab!=-100]
    lab = lab[lab!=-100]
    lab = [id2lab[idx] for idx in lab.tolist()]
    pred_out = [id2lab[idx] for idx in pred_out.tolist()]

    res = pd.DataFrame({
        'texts':split_text,
         'targ': lab,
        'pred':pred_out

    })
    return res

## Preliminary Results

In [74]:
res_df = get_individual_predictions(30)
res_df

Unnamed: 0,texts,targ,pred
0,In,O,O
1,1869,O,O
2,",",O,O
3,Stanton,O,O
4,and,O,O
5,Anthony,O,O
6,formed,O,O
7,the,B-Term,B-Term
8,National,I-Term,I-Term
9,Woman,I-Term,I-Term


In [75]:
res_df = get_individual_predictions(1)
res_df

Unnamed: 0,texts,targ,pred
0,Inductive,B-Term,B-Term
1,reasoning,I-Term,I-Term
2,is,O,O
3,a,B-Definition,B-Definition
4,form,I-Definition,I-Definition
5,of,I-Definition,I-Definition
6,logical,I-Definition,I-Definition
7,thinking,I-Definition,I-Definition
8,that,I-Definition,I-Definition
9,uses,I-Definition,I-Definition


In [84]:
res_df = get_individual_predictions(5)
res_df

Unnamed: 0,texts,targ,pred
0,In,O,O
1,United,B-Term,O
2,States,I-Term,O
3,v.,I-Term,I-Term
4,Miller,I-Term,I-Term
5,",",O,O
6,the,B-Definition,B-Definition
7,Supreme,I-Definition,O
8,Court,I-Definition,O
9,upheld,I-Definition,I-Definition


In [88]:
res_df = get_individual_predictions(9)
res_df

Unnamed: 0,texts,targ,pred
0,The,O,O
1,reexamination,O,O
2,of,O,O
3,past,O,O
4,cases,O,O
5,through,O,O
6,DNA,O,O
7,evidence,O,O
8,has,O,O
9,revealed,O,O
