In [None]:
import os
import time
import pickle
import json
import numpy as np
import pandas as pd

import torch
import torch.nn as nn


import matplotlib.pyplot as plt


from collections import defaultdict, Counter
from sklearn.metrics import confusion_matrix, f1_score
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

# Configuration

In [None]:
# Run-level settings: batch sizes, optimisation schedule, logging cadence and
# the input locations read by the cells below.
config={
    'label_batch_size': 64,      # mini-batch size used by LabeledIterator
    'unlabel_batch_size': 64,    # mini-batch size used by UnLabeledIterator
    'max_seq_len': 100,          # sentences are truncated / padded to this many tokens
    'glove_dim': 100,            # width of one GloVe vector
    'num_labels': 3,             # label ids 0, 1, 2 (id 3 is used as padding by the iterators)
    'ner_format': "BIO",
    'grad_clip_val': 1,          # max gradient norm passed to clip_grad_norm_
    'max_lr': 0.00013,           # peak learning rate of the OneCycleLR schedule
    'T': 0.6,                    # NOTE(review): not referenced in the visible cells — confirm usage
    'weight_decay': 1e-5,
    'num_iterations': 1500,      # total optimiser steps (also OneCycleLR total_steps)
    'print_every': 50,
    'eval_every':100,
    'save_every': 100,
    'glove_path': '../input/glove6b/glove.6B.100d.txt',
    'unlabeled_datapath': '../input/dataset/train_data.pkl',
    'train_folder': "../input/coleridgeinitiative-show-us-the-data/train",
    'test_folder': '../input/coleridgeinitiative-show-us-the-data/test',
    'ss_folder': '../input/coleridge-semisuperviseddata'
}

# Network hyper-parameters. Entries that have to agree with `config` are derived
# from it rather than typed a second time, so the two dicts cannot drift apart.
model_params={
    "pre_embedd_dim": config['glove_dim'],
    'word_shape_size': 7,        # get_word_shape() emits the ids 0..6
    "word_shape_embedd_dim": 20,
    "hdim": 128,
    "proj_dim": 512,
    "out_dim": config['num_labels'],
    'max_seq_len': config['max_seq_len'],
}

# Preprocessing

In [None]:
def update_annotation(annots, s, l):
    """Tag the span ``annots[s:s+l]`` in place.

    The first slot of the span becomes 'B' and every following slot becomes
    'I'. Slots that already hold 'I' are left exactly as they are.
    """
    for offset in range(l):
        slot = s + offset
        if annots[slot] != 'I':
            annots[slot] = 'B' if offset == 0 else 'I'

def get_annotated_data(data):
    """Tokenize one record and derive per-token B/I/O annotations.

    ``data`` must carry a 'sentence' string and may carry a 'labels' list of
    strings. Every position where the tokenized sentence matches a tokenized
    label is tagged through ``update_annotation``.

    Returns:
        (tokens, annots): two lists of equal length.
    """
    raw_sentence = data['sentence']
    raw_labels = data.get('labels', None) or []

    tokens = word_tokenize(raw_sentence)
    annots = ['O'] * len(tokens)
    label_tokens = [word_tokenize(raw_label) for raw_label in raw_labels]

    for start in range(len(tokens)):
        for candidate in label_tokens:
            width = len(candidate)
            if tokens[start:start + width] == candidate:
                update_annotation(annots, start, width)
    return (tokens, annots)

def read_file(filepath):
    """Return the complete text content of the file at ``filepath``."""
    with open(filepath) as handle:
        return handle.read()

def read_json_file(filepath):
    """Read ``filepath`` via ``read_file`` and return the decoded JSON value."""
    raw_text = read_file(filepath)
    return json.loads(raw_text)
def read_pickle(filepath):
    """Return the object unpickled from the binary file at ``filepath``."""
    # NOTE: unpickling executes code embedded in the file; only load trusted data.
    with open(filepath, 'rb') as handle:
        return pickle.load(handle)

In [None]:
class PreprocessData:
    """Loads the GloVe vectors and the labeled / unlabeled corpora named in a config dict.

    Args:
        config: mapping with the keys 'train_folder', 'ss_folder',
            'glove_path' and 'unlabeled_datapath'.
    """

    def __init__(self, config):
        self.train_folder=config['train_folder']
        self.ss_folder=config['ss_folder']
        self.glove_path=config['glove_path']
        # Attribute name kept as-is (including its spelling) for existing callers.
        self.unlabled_path=config['unlabeled_datapath']

    def download_glove(self):
        """Parse the GloVe text file at ``self.glove_path``.

        Each non-blank line is split on whitespace; the first field is the
        word and the remaining fields are its vector.

        Returns:
            dict mapping word -> 1-D float64 numpy array.
        """
        glove_embeddings={}
        with open(self.glove_path, encoding='utf-8') as file:
            for line in file:
                fields=line.split()
                if not fields:
                    # Tolerate blank lines instead of failing on fields[0].
                    continue
                # np.float was removed in NumPy 1.24; float64 is the dtype it aliased.
                glove_embeddings[fields[0]]=np.array(fields[1:]).astype(np.float64)
        return glove_embeddings

    def get_labeled_data(self):
        """Read the positive and negative annotation files from ``self.ss_folder``.

        Returns:
            (pos_data, neg_data) as decoded from 'annotation.txt' and
            'negative_sentences_corrected.txt'.
        """
        # Use the folder captured by __init__, not the module-level config.
        pos_data=read_json_file(os.path.join(self.ss_folder, 'annotation.txt'))
        neg_data=read_json_file(os.path.join(self.ss_folder, 'negative_sentences_corrected.txt'))
        return (pos_data, neg_data)

    def get_data(self):
        """Load everything the iterators need.

        Publications that appear in the labeled files are removed from the
        unlabeled pool (matched on 'pub_id') so the two sets do not overlap.

        Returns:
            (glove_embeddings, labeled_data, unlabeled_data) where
            ``labeled_data`` is {'pos_data': [...], 'neg_data': [...]} of
            (words, annots) pairs and ``unlabeled_data`` is a list of the
            remaining pickle values.
        """
        (pos_data, neg_data)=self.get_labeled_data()
        unlabeled_data=read_pickle(self.unlabled_path)

        for records in (pos_data, neg_data):
            for data in records:
                # pop with a default is a no-op for ids that are not present.
                unlabeled_data.pop(data.get('pub_id', None), None)

        unlabeled_data=list(unlabeled_data.values())
        pos_data=[get_annotated_data(data) for data in pos_data]
        neg_data=[get_annotated_data(data) for data in neg_data]

        glove_embeddings=self.download_glove()
        labeled_data={
            'pos_data': pos_data,
            'neg_data': neg_data
        }
        return (glove_embeddings, labeled_data, unlabeled_data)

In [None]:
%%time
# Load the GloVe table plus the labeled / unlabeled corpora from the paths in `config`.
# These three names are module-level state read by the iterators and by evaluate().
(glove_embeddings, labeled_data, unlabeled_data)=PreprocessData(config).get_data()
# NOTE(review): `lemmatizer` is not referenced in the visible cells — confirm it is still needed.
lemmatizer=WordNetLemmatizer()

# Data iterators

In [None]:
def get_word_shape(word):
    """Map a token to a small integer describing its first character.

    Returns 1 for an uppercase first character, 2 for lowercase, 3 for a
    numeric character, 4 for ',', 5 for '(' and 6 for ')'. Empty tokens and
    anything else map to 0.
    """
    if not word:
        return 0

    first = word[0]
    if first.isupper():
        return 1
    if first.islower():
        return 2
    if first.isnumeric():
        return 3
    # Remaining ids are reserved for a few punctuation marks.
    return {',': 4, '(': 5, ')': 6}.get(first, 0)

In [None]:
class LabeledIterator:
    """Mini-batch generator over the hand-labeled (words, annots) records.

    The last positive and the last negative record are held out as the
    'val' split; everything else forms the 'train' split.

    Args:
        config: mapping with 'label_batch_size', 'glove_dim', 'num_labels'
            and 'max_seq_len'.
        glove_embeddings: dict word -> vector of length ``glove_dim``.
        labeled_data: {'pos_data': [...], 'neg_data': [...]} of
            (words, annots) pairs.
    """

    def __init__(self,config,glove_embeddings, labeled_data):
        self.batch_size=config['label_batch_size']
        self.glove_dim=config['glove_dim']
        self.num_labels=config['num_labels']
        self.max_seq_len=config['max_seq_len']
        self.glove_embeddings=glove_embeddings
        self.labeled_data=labeled_data

        pos_data=labeled_data['pos_data']
        neg_data=labeled_data['neg_data']

        # Slicing yields the same split as filtering by index membership, without
        # the quadratic `i in list` scans; list() keeps these as private copies
        # so in-place shuffling never touches `labeled_data`.
        self.train_data=list(pos_data[:-1]) + list(neg_data[:-1])
        self.val_data=list(pos_data[-1:]) + list(neg_data[-1:])

        self.num_train_records=len(self.train_data)
        self.num_val_records=len(self.val_data)

    def make_shuffle_data(self, data, shuffle):
        """Shuffle ``data`` in place when ``shuffle`` is truthy."""
        if shuffle:
            np.random.shuffle(data)

    def get_data_by_mode(self, mode, shuffle):
        """Return ``(num_records, records)`` for ``mode`` in {'train', 'val'}.

        Raises:
            ValueError: for any other ``mode``.
        """
        if mode=='val':
            data=self.val_data
            num_records=self.num_val_records
        elif mode == 'train':
            data=self.train_data
            num_records=self.num_train_records
        else:
            raise ValueError("mode must be 'train' or 'val', got {!r}".format(mode))
        self.make_shuffle_data(data, shuffle)
        return num_records, data

    def convert_annotation_to_label(self, annot):
        """Map 'B' -> 1, 'I' -> 2 and anything else -> 0."""
        if annot == 'B':
            return 1
        elif annot =='I':
            return 2
        return 0

    def convert_rawdata_to_tensors(self, mbs):
        """Turn a list of (words, annots) pairs into padded tensors.

        Returns:
            X: float32 (batch, max_seq_len, glove_dim); zero rows for padding
               and for words missing from the GloVe table.
            X_embedd: long (batch, max_seq_len) word-shape ids.
            y_ents: float32 (batch, max_seq_len); 1 where the token is B or I.
            y_bios: long (batch, max_seq_len); 0/1/2 labels, 3 on padding.
        """
        mb_size=len(mbs)
        X=torch.zeros((mb_size, self.max_seq_len, self.glove_dim), dtype=torch.float32)
        X_embedd=torch.zeros( (mb_size, self.max_seq_len), dtype=torch.long)
        y_bios=torch.full((mb_size, self.max_seq_len), 3, dtype=torch.long)
        y_ents=torch.zeros((mb_size, self.max_seq_len), dtype=torch.float32)

        for i in range(mb_size):
            (words, annots)=mbs[i]
            words_len=min(len(words), self.max_seq_len)
            for j in range(words_len):
                label=self.convert_annotation_to_label(annots[j])
                word=words[j].lower()

                X_embedd[i][j]=get_word_shape(words[j])
                y_bios[i][j]=label
                if label == 1 or label == 2:
                    y_ents[i][j]=1
                if word in self.glove_embeddings:
                    X[i][j]=torch.tensor(self.glove_embeddings[word], dtype=torch.float32)

        return (X, X_embedd, y_ents, y_bios)

    def get_raw_minibatch(self, mode='val', shuffle=False):
        """Yield consecutive slices of at most ``batch_size`` raw records."""
        num_records, data=self.get_data_by_mode(mode, shuffle)
        for i in range(0, num_records, self.batch_size):
            yield data[i:i+self.batch_size]

    def get_minibatch(self, mode='val', shuffle=False):
        """Yield one pass over ``mode`` as tensor tuples (see convert_rawdata_to_tensors)."""
        for mbs in self.get_raw_minibatch(mode, shuffle):
            yield self.convert_rawdata_to_tensors(mbs)

    def get_infinite_minibatch(self, mode='val', shuffle=False):
        """Yield tensor mini-batches forever, restarting the pass when it ends.

        Stops immediately when the split is empty rather than spinning forever.
        """
        num_records, _ = self.get_data_by_mode(mode, False)
        if num_records == 0:
            return
        while True:
            yield from self.get_minibatch(mode, shuffle)

    def __iter__(self):
        """Endless stream of shuffled training mini-batches."""
        return self.get_infinite_minibatch('train', True)

In [None]:
class UnLabeledIterator:
    """Mini-batch generator over the automatically tagged (unlabeled) pool.

    Each record is expected to expose 'pos_sents' and 'neg_sents' lists whose
    items carry 'sentence_words' and 'tags'.

    Args:
        config: mapping with 'unlabel_batch_size', 'max_seq_len', 'glove_dim'.
        glove_embeddings: dict word -> vector of length ``glove_dim``.
        unlabeled_data: list of records; it is shuffled in place on every pass.
    """

    def __init__(self, config, glove_embeddings, unlabeled_data):
        self.batch_size=config['unlabel_batch_size']
        self.max_seq_len=config['max_seq_len']
        self.glove_dim=config['glove_dim']
        self.glove_embeddings=glove_embeddings
        self.unlabeled_data=unlabeled_data
        # Attribute name kept as-is (including its spelling) for existing callers.
        self.num_recods=len(unlabeled_data)

    def get_raw_minibatch(self):
        """Shuffle the pool, then yield slices of at most ``batch_size`` records."""
        np.random.shuffle(self.unlabeled_data)
        for i in range(0, self.num_recods, self.batch_size):
            yield self.unlabeled_data[i:i+self.batch_size]

    def get_sampled_sentences(self, mbs):
        """Draw one random positive and one random negative sentence per record.

        A record contributes nothing for a polarity whose list is empty. The
        pick uses a random index, so the record's own lists are not reordered.

        Returns:
            (sentences, tags): parallel lists of token lists and tag lists.
        """
        sentences=[]
        tags=[]
        for record in mbs:
            pos_sents=record['pos_sents']
            neg_sents=record['neg_sents']
            for candidates in (pos_sents, neg_sents):
                if len(candidates)>0:
                    picked=candidates[np.random.randint(len(candidates))]
                    tags.append(picked['tags'])
                    sentences.append(picked['sentence_words'])
        return (sentences, tags)

    def convert_rawdata_to_tensors(self, mbs):
        """Sample sentences from ``mbs`` and pack them into padded tensors.

        Returns:
            X: float32 (n, max_seq_len, glove_dim) GloVe inputs (zeros where unknown).
            X_embedd: long (n, max_seq_len) word-shape ids.
            y_ents: float32 (n, max_seq_len); 1 where the tag is 1 or 2.
            y_bios: long (n, max_seq_len); tag ids, 3 on unfilled positions.
            slens: long (n,) sentence lengths capped at max_seq_len.
        """
        (sentences, tags)=self.get_sampled_sentences(mbs)
        mbs_size=len(sentences)
        X=torch.zeros((mbs_size, self.max_seq_len, self.glove_dim), dtype=torch.float32)
        X_embedd=torch.zeros( (mbs_size, self.max_seq_len), dtype=torch.long)
        y_ents=torch.zeros((mbs_size, self.max_seq_len), dtype=torch.float32)
        y_bios=torch.full((mbs_size, self.max_seq_len), 3, dtype=torch.long)
        slens=torch.zeros(mbs_size, dtype=torch.long)

        for i, sentence_words in enumerate(sentences):
            n_words=len(sentence_words)
            # Bound by words AND tags so a length mismatch cannot index past either list.
            n_filled=min(n_words, len(tags[i]), self.max_seq_len)
            for j in range(n_filled):
                word=sentence_words[j].lower()
                tag=tags[i][j]

                X_embedd[i][j]=get_word_shape(sentence_words[j])
                if word in self.glove_embeddings:
                    X[i][j]=torch.tensor(self.glove_embeddings[word], dtype=torch.float32)

                if tag == 1 or tag == 2:
                    y_ents[i][j]=1.0
                y_bios[i][j]=tag

            slens[i]=min(n_words, self.max_seq_len)
        return (X, X_embedd, y_ents, y_bios, slens)

    def get_minibatch(self):
        """Yield one pass over the pool as tensor tuples."""
        for mbs in self.get_raw_minibatch():
            yield self.convert_rawdata_to_tensors(mbs)

    def get_infinite_minibatch(self):
        """Yield tensor mini-batches forever, reshuffling at the start of each pass.

        Stops immediately when the pool is empty rather than spinning forever.
        """
        if self.num_recods == 0:
            return
        while True:
            yield from self.get_minibatch()

    def __iter__(self):
        """Endless stream of mini-batches (same as get_infinite_minibatch)."""
        return self.get_infinite_minibatch()

# Model

In [None]:
class _LMHeadBase(nn.Module):
    """Shared body of the four auxiliary "LM" heads.

    Pipeline per token: Linear(2*hdim -> proj_dim) -> BatchNorm1d -> Dropout(0.3)
    -> ReLU -> Linear(proj_dim -> out_dim). Sub-module names are unchanged, so
    state_dict keys are identical to the previous stand-alone classes.

    Args:
        params: mapping with 'hdim', 'proj_dim' and 'out_dim'.
    """

    def __init__(self, params):
        super().__init__()
        self.linear=nn.Linear(2*params['hdim'], params['proj_dim'])
        self.bn=nn.BatchNorm1d(params['proj_dim'])
        self.dropout=nn.Dropout(0.3)
        self.relu=nn.ReLU()
        self.out_layer=nn.Linear(params['proj_dim'], params['out_dim'])

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, 2*hdim) to (batch, seq, out_dim)."""
        x=self.linear(x)
        # BatchNorm1d normalises dim 1, so move the feature axis there and back.
        x=x.permute(0, 2, 1)
        x=self.bn(x)
        x=self.dropout(x)
        x=self.relu(x)
        x=x.permute(0, 2, 1)
        x=self.out_layer(x)
        return x


class NERLMHead1(_LMHeadBase):
    """First auxiliary BIO head (Model.ner_lm_head1 -> 'y_lm_bios1')."""


class NERLMHead2(_LMHeadBase):
    """Second auxiliary BIO head (Model.ner_lm_head2 -> 'y_lm_bios2')."""


class ERLMHead1(_LMHeadBase):
    """First auxiliary entity head (Model.er_lm_head1 -> 'y_lm_ent1').

    NOTE(review): emits params['out_dim'] values per token while ERHead emits
    a single logit — confirm that width is intended for an entity head.
    """


class ERLMHead2(_LMHeadBase):
    """Second auxiliary entity head (Model.er_lm_head2 -> 'y_lm_ent2').

    NOTE(review): emits params['out_dim'] values per token while ERHead emits
    a single logit — confirm that width is intended for an entity head.
    """

In [None]:
class NERHead(nn.Module):
    """Per-token tagging head producing ``params['out_dim']`` logits.

    Pipeline: Linear(2*hdim -> proj_dim) -> BatchNorm1d -> Dropout(0.3) -> ReLU
    -> Linear(proj_dim -> out_dim).
    """

    def __init__(self, params):
        super().__init__()
        in_features = 2 * params['hdim']
        hidden = params['proj_dim']
        self.linear = nn.Linear(in_features, hidden)
        self.bn = nn.BatchNorm1d(hidden)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.out_layer = nn.Linear(hidden, params['out_dim'])

    def forward(self, x):
        """Map (batch, seq, 2*hdim) features to (batch, seq, out_dim) logits."""
        # BatchNorm1d normalises dim 1, hence the swap of the last two axes.
        channels_first = self.linear(x).transpose(1, 2)
        activated = self.relu(self.dropout(self.bn(channels_first)))
        return self.out_layer(activated.transpose(1, 2))
    
class ERHead(nn.Module):
    """Per-token entity head producing a single logit.

    Pipeline: Linear(2*hdim -> proj_dim) -> BatchNorm1d -> Dropout(0.3) -> ReLU
    -> Linear(proj_dim -> 1).
    """

    def __init__(self, params):
        super().__init__()
        in_features = 2 * params['hdim']
        hidden = params['proj_dim']
        self.linear = nn.Linear(in_features, hidden)
        self.bn = nn.BatchNorm1d(hidden)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.out_layer = nn.Linear(hidden, 1)

    def forward(self, x):
        """Map (batch, seq, 2*hdim) features to (batch, seq, 1) logits."""
        # BatchNorm1d normalises dim 1, hence the swap of the last two axes.
        channels_first = self.linear(x).transpose(1, 2)
        activated = self.relu(self.dropout(self.bn(channels_first)))
        return self.out_layer(activated.transpose(1, 2))

class Model(nn.Module):
    """Bidirectional GRU encoder feeding six per-token heads.

    The input at each position is the pre-computed word vector concatenated
    with a learned word-shape embedding. The encoder output is passed to the
    entity head, the BIO head and four auxiliary "LM" heads.

    Args:
        params: mapping with 'word_shape_size', 'word_shape_embedd_dim',
            'pre_embedd_dim', 'hdim' plus the keys the head classes read.
    """

    def __init__(self, params):
        super().__init__()
        self.params = params
        self.embedd_layer = nn.Embedding(
            params['word_shape_size'],
            params['word_shape_embedd_dim'],
            max_norm=1,
            padding_idx=0,
        )
        encoder_input_dim = params['pre_embedd_dim'] + params['word_shape_embedd_dim']
        self.gru = nn.GRU(
            encoder_input_dim,
            params['hdim'],
            num_layers=2,
            bidirectional=True,
            dropout=0.3,
            batch_first=True,
        )

        self.ner_head = NERHead(params)
        self.er_head = ERHead(params)

        self.ner_lm_head1 = NERLMHead1(params)
        self.ner_lm_head2 = NERLMHead2(params)

        self.er_lm_head1 = ERLMHead1(params)
        self.er_lm_head2 = ERLMHead2(params)

    def forward(self, x, x_embedd):
        """Encode a batch and run every head.

        Args:
            x: (batch, seq, pre_embedd_dim) word vectors.
            x_embedd: (batch, seq) integer word-shape ids.

        Returns:
            dict with keys 'y_ent', 'y_bios', 'y_lm_ent1', 'y_lm_ent2',
            'y_lm_bios1' and 'y_lm_bios2', each holding that head's output.
        """
        n_batch, n_steps = x.shape[0], x.shape[1]
        shape_vectors = self.embedd_layer(x_embedd)
        encoded, _ = self.gru(torch.cat([x, shape_vectors], dim=-1))

        # Split the last axis into its two hdim-wide direction halves and join
        # them again side by side (forward half first).
        per_direction = encoded.view(n_batch, n_steps, 2, self.params['hdim'])
        h = torch.cat(torch.unbind(per_direction, dim=2), dim=-1)

        # Heads run in this fixed order: entity, BIO, entity-LM 1/2, BIO-LM 1/2.
        return {
            'y_ent': self.er_head(h),
            'y_bios': self.ner_head(h),
            'y_lm_ent1': self.er_lm_head1(h),
            'y_lm_ent2': self.er_lm_head2(h),
            'y_lm_bios1': self.ner_lm_head1(h),
            'y_lm_bios2': self.ner_lm_head2(h)
        }

# Pretraining

In [None]:
def evaluate(model):
    """Score ``model``'s BIO predictions on the labeled 'train' split.

    Switches the model to eval mode, builds a fresh LabeledIterator from the
    module-level ``config``, ``glove_embeddings`` and ``labeled_data``, and
    compares argmax predictions with gold labels on every non-padding token
    (padding carries label id 3).

    Returns:
        (confusion_matrix, micro_f1, macro_f1)
    """
    model.eval()
    gold_labels = []
    predicted_labels = []
    iterator = LabeledIterator(config, glove_embeddings, labeled_data)

    for (X, X_embedd, _, y_bios) in iterator.get_minibatch(mode='train', shuffle=False):
        with torch.no_grad():
            predictions = model(X, X_embedd)['y_bios'].argmax(dim=-1)

        flat_gold = y_bios.view(-1)
        flat_pred = predictions.view(-1)

        # Drop padded positions before scoring.
        is_token = flat_gold != 3
        gold_labels.extend(flat_gold[is_token].numpy())
        predicted_labels.extend(flat_pred[is_token].numpy())

    cm = confusion_matrix(gold_labels, predicted_labels)
    micro_fscore_bio = f1_score(gold_labels, predicted_labels, average='micro')
    macro_fscore_bio = f1_score(gold_labels, predicted_labels, average='macro')

    return (cm, micro_fscore_bio, macro_fscore_bio)

In [None]:
class CustomLoss:
    """Label-smoothed negative log-likelihood losses used during pre-training.

    Every public method takes ``slens`` (per-sentence valid lengths) and
    averages separately over "positive" tokens (tagged tokens of sentences
    that contain at least one tag) and "negative" tokens (all tokens of
    tag-free sentences); the two averages are added.

    The ``lm1`` variants score the prediction at position j against the label
    at j+1; the ``lm2`` variants score it against the label at j-1.
    """

    # Smoothed target distributions over the class axis (index 0, 1, 2).
    _NEG_BIO_WEIGHTS = (0.95, 0.025, 0.025)   # tag-free sentence
    _B_WEIGHTS = (0.02, 0.9, 0.08)            # gold label 1
    _I_WEIGHTS = (0.02, 0.08, 0.9)            # gold label 2

    def __init__(self):
        pass

    @staticmethod
    def _span(seq_len, shift):
        """Return (n, target_start, pred_start) for a label shift of -1, 0 or +1.

        ``n`` is the number of aligned (prediction, target) pairs and is
        clamped at 0 so empty sentences never produce negative slice bounds.
        """
        n = max(int(seq_len) - abs(shift), 0)
        return n, max(shift, 0), max(-shift, 0)

    @staticmethod
    def _weighted_nll(log_probs, weights):
        """-sum over rows of sum_c weights[c] * log_probs[row, c]; 0 for no rows."""
        return -(log_probs * log_probs.new_tensor(weights)).sum()

    def _entity_loss(self, slens, y_ents, log_p1, log_p0, shift):
        """Shared body of the three binary entity losses."""
        # Accumulators live on the same device / dtype as the predictions.
        pos_loss = log_p1.new_zeros(())
        neg_loss = log_p1.new_zeros(())
        pos_cnt = 0
        neg_cnt = 0

        for i in range(y_ents.shape[0]):
            n, t0, p0 = self._span(slens[i], shift)
            targets = y_ents[i, t0:t0 + n]
            on = log_p1[i, p0:p0 + n]
            off = log_p0[i, p0:p0 + n]

            if targets.sum() == 0:
                neg_loss = neg_loss - (0.95 * off.sum() + 0.05 * on.sum())
                neg_cnt += n          # exactly the number of positions summed above
                continue

            hit = targets == 1
            pos_loss = pos_loss - (0.95 * on[hit].sum() + 0.05 * off[hit].sum())
            pos_cnt += int(hit.sum())

        return pos_loss / max(1, pos_cnt) + neg_loss / max(1, neg_cnt)

    def _bios_loss(self, slens, y_bios, log_probs, shift):
        """Shared body of the three BIO losses; ``log_probs`` is (batch, seq, 3)."""
        pos_loss = log_probs.new_zeros(())
        neg_loss = log_probs.new_zeros(())
        pos_cnt = 0
        neg_cnt = 0

        for i in range(y_bios.shape[0]):
            n, t0, p0 = self._span(slens[i], shift)
            targets = y_bios[i, t0:t0 + n]
            preds = log_probs[i, p0:p0 + n]

            if targets.sum() == 0:
                # Weights apply to the CLASS axis of every valid position.
                neg_loss = neg_loss + self._weighted_nll(preds, self._NEG_BIO_WEIGHTS)
                neg_cnt += n
                continue

            b_rows = preds[targets == 1]
            i_rows = preds[targets == 2]
            pos_loss = (pos_loss
                        + self._weighted_nll(b_rows, self._B_WEIGHTS)
                        + self._weighted_nll(i_rows, self._I_WEIGHTS))
            # One count per tagged token, whichever tag it carries.
            pos_cnt += b_rows.shape[0] + i_rows.shape[0]

        return pos_loss / max(1, pos_cnt) + neg_loss / max(1, neg_cnt)

    def get_entity_loss(self, slens, y_ents, logyhat_ent0, logyhat_ent1 ):
        """Entity loss with predictions aligned to the label at the same position.

        Note the argument order: log P(not entity) comes before log P(entity).
        """
        return self._entity_loss(slens, y_ents, logyhat_ent1, logyhat_ent0, 0)

    def get_lm1_entity_loss(self, slens, y_ents, logyhat_lm1_ent1, logyhat_lm1_ent0):
        """Entity loss where position j is scored against the label at j+1."""
        return self._entity_loss(slens, y_ents, logyhat_lm1_ent1, logyhat_lm1_ent0, 1)

    def get_lm2_entity_loss(self, slens, y_ents, logyhat_lm2_ent1, logyhat_lm2_ent0):
        """Entity loss where position j is scored against the label at j-1."""
        return self._entity_loss(slens, y_ents, logyhat_lm2_ent1, logyhat_lm2_ent0, -1)

    def get_bios_loss(self, slens, y_bios, logyhat_bios):
        """BIO loss with predictions aligned to the label at the same position."""
        return self._bios_loss(slens, y_bios, logyhat_bios, 0)

    def get_lm1_bios_loss(self, slens, y_bios, logyhat_lm1_bios):
        """BIO loss where position j is scored against the label at j+1."""
        return self._bios_loss(slens, y_bios, logyhat_lm1_bios, 1)

    def get_lm2_bios_loss(self, slens, y_bios, logyhat_lm2_bios):
        """BIO loss where position j is scored against the label at j-1."""
        return self._bios_loss(slens, y_bios, logyhat_lm2_bios, -1)

    def get_constraint_loss(self, slens, logyhat_bios, sm_yhat_bios,  logyhat_ent0, logyhat_ent1):
        """Penalise disagreement between the BIO head and the entity head.

        Per valid token it adds |log P_bio(class 0) - log P_ent(0)| and
        |log(1 - P_bio(class 0)) - log P_ent(1)|, averages over the sentence
        length and then over the batch.
        """
        batch_size = logyhat_bios.shape[0]
        loss = logyhat_bios.new_zeros(())
        for i in range(batch_size):
            n = int(slens[i])
            log_outside = logyhat_bios[i, :n, 0]
            # 1e-9 keeps the log finite when P_bio(class 0) is 1.
            log_inside = torch.log(1 - sm_yhat_bios[i, :n, 0] + 1e-9)
            gap = (torch.abs(log_outside - logyhat_ent0[i, :n, 0]).sum()
                   + torch.abs(log_inside - logyhat_ent1[i, :n, 0]).sum())
            loss = loss + gap / max(1, n)
        return loss / max(batch_size, 1)

In [None]:
class Pretraining:
    """Pre-trains ``model`` on the unlabeled pool with the CustomLoss objectives.

    Reads the module-level ``config``, ``glove_embeddings``, ``unlabeled_data``
    and ``labeled_data``, and calls the module-level ``evaluate``.

    Per-step loss values are kept in the ``*_`` list attributes (``loss_``,
    ``ent_loss_``, ...); the attributes without the trailing underscore are
    running sums that are reset after every progress report.

    Args:
        model: the network to optimise; every checkpoint and evaluation uses
            this instance.
    """

    # (key returned by train_ops, running-sum attribute, history attribute, report label)
    _TRACKED = (
        ('loss', 'total_loss', 'loss_', 'Total Loss'),
        ('loss_ents', 'ent_loss', 'ent_loss_', 'Entity Loss'),
        ('loss_bios', 'bios_loss', 'bios_loss_', 'Bios Loss'),
        ('loss_constraint', 'constraint_loss', 'constraint_loss_', 'Constraint Loss'),
        ('loss_ent_lm1', 'loss_lm_ent1', 'loss_lm_ent1_', 'LM-1 Entity Loss'),
        ('loss_ent_lm2', 'loss_lm_ent2', 'loss_lm_ent2_', 'LM-2 Entity Loss'),
        ('loss_bios_lm1', 'loss_bios_lm1', 'loss_bios_lm1_', 'LM-1 BIOS Loss'),
        ('loss_bios_lm2', 'loss_bios_lm2', 'loss_bios_lm2_', 'LM-2 BIOS Loss'),
    )

    def __init__(self, model):
        self.model=model
        self.iter_count=0
        for _, running_attr, history_attr, _ in self._TRACKED:
            setattr(self, running_attr, 0.0)
            setattr(self, history_attr, [])
        # Sum of total losses since the last checkpoint.
        self.save_total_loss=0.0

        self.criterion1=nn.BCEWithLogitsLoss(reduction='mean')
        self.criterion2=nn.CrossEntropyLoss(ignore_index=3, reduction='mean')

        self.logsoftmax=nn.LogSoftmax(dim=-1)
        self.softmax=nn.Softmax(dim=-1)
        self.logsigmoid=nn.LogSigmoid()
        self.sigmoid=nn.Sigmoid()

        self.customLoss=CustomLoss()
        self.optimizer=torch.optim.AdamW(model.parameters(), lr=config['max_lr'], weight_decay=config['weight_decay'])
        self.schedular=torch.optim.lr_scheduler.OneCycleLR(self.optimizer,
                                                           max_lr=config['max_lr'],
                                                           total_steps=config['num_iterations'])

        self.unlabelIterator=iter(UnLabeledIterator(config, glove_embeddings, unlabeled_data))
        self.labeledIterator=iter(LabeledIterator(config, glove_embeddings, labeled_data))

    def train_ops(self, mbs):
        """Run one optimisation step on a mini-batch.

        Args:
            mbs: (X, X_embedd, y_ents, y_bios, slens) as produced by
                UnLabeledIterator.

        Returns:
            dict of Python floats with the keys 'loss', 'loss_ents',
            'loss_bios', 'loss_constraint', 'loss_ent_lm1', 'loss_ent_lm2',
            'loss_bios_lm1' and 'loss_bios_lm2'.
        """
        self.model.train()

        (X, X_embedd, y_ents, y_bios, slens)=mbs
        yout=self.model(X, X_embedd)

        yhat_ent=yout['y_ent']
        yhat_bios=yout['y_bios']
        yhat_lm_ent1=yout['y_lm_ent1']
        yhat_lm_ent2=yout['y_lm_ent2']
        yhat_lm_bios1=yout['y_lm_bios1']
        yhat_lm_bios2=yout['y_lm_bios2']

        # log(sigmoid(z)) and log(1 - sigmoid(z)) == log(sigmoid(-z)), computed
        # with LogSigmoid so saturated logits give finite values instead of -inf.
        logyhat_ent1=self.logsigmoid(yhat_ent)
        logyhat_ent0=self.logsigmoid(-yhat_ent)

        logyhat_lm1_ent1=self.logsigmoid(yhat_lm_ent1)
        logyhat_lm1_ent0=self.logsigmoid(-yhat_lm_ent1)

        logyhat_lm2_ent1=self.logsigmoid(yhat_lm_ent2)
        logyhat_lm2_ent0=self.logsigmoid(-yhat_lm_ent2)

        logyhat_bios=self.logsoftmax(yhat_bios)
        sm_yhat_bios=self.softmax(yhat_bios)  # needed by the constraint loss

        logyhat_lm1_bios=self.logsoftmax(yhat_lm_bios1)
        logyhat_lm2_bios=self.logsoftmax(yhat_lm_bios2)

        loss_ents=self.customLoss.get_entity_loss(slens, y_ents, logyhat_ent0, logyhat_ent1)
        loss_ent_lm1=self.customLoss.get_lm1_entity_loss(slens, y_ents, logyhat_lm1_ent1, logyhat_lm1_ent0)
        loss_ent_lm2=self.customLoss.get_lm2_entity_loss(slens, y_ents, logyhat_lm2_ent1, logyhat_lm2_ent0)
        loss_bios=self.customLoss.get_bios_loss(slens, y_bios, logyhat_bios)
        loss_bios_lm1=self.customLoss.get_lm1_bios_loss(slens, y_bios, logyhat_lm1_bios)
        loss_bios_lm2=self.customLoss.get_lm2_bios_loss(slens, y_bios, logyhat_lm2_bios)
        loss_constraint=self.customLoss.get_constraint_loss(slens, logyhat_bios, sm_yhat_bios,  logyhat_ent0, logyhat_ent1)

        # Main objectives at full weight, auxiliary LM objectives at 0.7.
        loss=(loss_ents+loss_bios+loss_constraint) + 0.7*(loss_ent_lm1+loss_ent_lm2+loss_bios_lm1+loss_bios_lm2)
        loss=loss/7

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), config['grad_clip_val'])
        self.optimizer.step()
        self.schedular.step()

        losses={
            'loss': loss.item(),
            'loss_ents': loss_ents.item(),
            'loss_bios': loss_bios.item(),
            'loss_constraint': loss_constraint.item(),
            'loss_ent_lm1': loss_ent_lm1.item(),
            'loss_ent_lm2': loss_ent_lm2.item(),
            'loss_bios_lm1': loss_bios_lm1.item(),
            'loss_bios_lm2': loss_bios_lm2.item()
        }
        return losses

    def train(self):
        """Run steps until ``config['num_iterations']`` is reached.

        Every ``save_every`` steps a numbered checkpoint is written and the
        best-loss / best-F-score checkpoints are refreshed; every
        ``eval_every`` steps the evaluation metrics are printed; every
        ``print_every`` steps the averaged losses are printed.
        """
        best_loss=None
        best_fscore=None
        t1=time.time()
        self.model.train()
        while self.iter_count<config['num_iterations']:
            self.iter_count+=1
            mbs=next(self.unlabelIterator)
            losses = self.train_ops(mbs)

            for key, running_attr, history_attr, _ in self._TRACKED:
                setattr(self, running_attr, getattr(self, running_attr) + losses[key])
                getattr(self, history_attr).append(losses[key])
            self.save_total_loss+=losses['loss']

            do_save=self.iter_count%config['save_every'] == 0
            do_eval=self.iter_count%config['eval_every'] == 0

            if do_save:
                torch.save(self.model, 'model_{}.pt'.format(self.iter_count))
            if do_save or do_eval:
                # Evaluate once even when both periods fire on the same step.
                (cm, micro_fscore_bio, macro_fscore_bio)=evaluate(self.model)

            if do_save:
                window_loss=self.save_total_loss/config['save_every']
                # Start a fresh window whether or not this one was the best.
                self.save_total_loss=0.0
                if (best_loss is None) or (best_loss > window_loss):
                    torch.save(self.model, 'best_loss_model.pt')
                    best_loss = window_loss
                if (best_fscore is None) or (best_fscore < macro_fscore_bio):
                    torch.save(self.model, 'best_fscore_model.pt')
                    best_fscore=macro_fscore_bio

                print("====="*10)
                print("Saving Best Model")
                print("Best Loss:{:.4f}".format(best_loss))
                print("Best F-Score:{:.4f}".format(best_fscore))

            if do_eval:
                print("Evaluating:")
                print("======"*10)
                print("Confusion Matrix:")
                print(cm)

                print("Micro F-Score ==> {:.4f}".format(micro_fscore_bio))
                print("Macro F-Score ==> {:.4f}".format(macro_fscore_bio))
                print("======"*10)
                print()
                torch.save(self.model, 'model.pt')

            if self.iter_count%config['print_every']==0:
                t2=time.time()
                print("===="*10)
                print("Iteration:{} | Time Taken:{:.1f}".format(self.iter_count, (t2-t1)/60))
                print()
                for position, (_, running_attr, _, label) in enumerate(self._TRACKED):
                    if position == 4:
                        # Blank line between the main and the auxiliary losses.
                        print()
                    print("{}:{:.4f}".format(label, getattr(self, running_attr)/config['print_every']))
                    setattr(self, running_attr, 0.0)

                t1=time.time()
                print()

    def lr_range_test(self):
        """Record (learning rate, loss) pairs while stepping the optimiser.

        NOTE(review): the loop ends only when the scheduler's learning rate
        reaches 2e-3, but ``self.schedular`` is a OneCycleLR capped at
        ``config['max_lr']``, so that bound looks unreachable with the
        scheduler built in ``__init__`` — confirm which scheduler this test is
        meant to run with before using it.

        Returns:
            (lrs, losses): two parallel lists.
        """
        min_lr=1e-5
        max_lr=2e-3
        cnt=0
        lrs=[]
        losses=[]
        while min_lr < max_lr:
            cnt+=1
            min_lr=self.schedular.get_last_lr()[0]
            mbs=next(self.unlabelIterator)
            yout=self.train_ops(mbs)
            losses.append(yout['loss'])
            lrs.append(min_lr)
            if cnt%10==0:
                print("Iteration:{} | LR:{} | Loss:{:3f}".format(cnt, min_lr, yout['loss']))
        return (lrs, losses)

In [None]:
# Build the network from `model_params` and wrap it in the pre-training driver.
model=Model(model_params)
preTrainer=Pretraining(model)

# Runs config['num_iterations'] optimisation steps; checkpoints are written as it goes.
preTrainer.train()
# Final snapshot: torch.save on the module pickles the whole object, not just a state_dict.
torch.save(model, 'model.pt')

In [None]:
# Per-step curves of the total loss and the three main objectives, one panel each.
_, ax=plt.subplots(2, 2, figsize=(15, 7), sharex=True, sharey=True)

loss_panels = [
    ("Total Loss", preTrainer.loss_),
    ("Entity Loss", preTrainer.ent_loss_),
    ("Bios Loss", preTrainer.bios_loss_),
    ("Constraint Loss", preTrainer.constraint_loss_),
]
# ax.flat walks the grid row by row, matching the order of `loss_panels`.
for panel_ax, (panel_title, panel_values) in zip(ax.flat, loss_panels):
    panel_ax.plot(range(len(panel_values)), panel_values)
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Per-step curves of the four auxiliary LM objectives, one panel each.
_, ax=plt.subplots(2, 2, figsize=(15, 7), sharex=True, sharey=True)

lm_loss_panels = [
    ("Entity Loss - LM1", preTrainer.loss_lm_ent1_),
    ("Entity Loss - LM2", preTrainer.loss_lm_ent2_),
    ("Bios Loss - LM1", preTrainer.loss_bios_lm1_),
    ("Bios Loss - LM2", preTrainer.loss_bios_lm2_),
]
# ax.flat walks the grid row by row, matching the order of `lm_loss_panels`.
for panel_ax, (panel_title, panel_values) in zip(ax.flat, lm_loss_panels):
    panel_ax.plot(range(len(panel_values)), panel_values)
    panel_ax.set_title(panel_title)

plt.show()