In [1]:
import pandas                       as pd
import numpy                        as np
import torch
import torch.nn                     as nn
import torch.nn.functional          as F
import torch.optim                  as optim
from torchtext                      import data
import torchtext
import re
from sklearn.metrics                import roc_auc_score
from sklearn.metrics                import roc_curve, auc
import matplotlib.pyplot            as plt

# LSTM

In [2]:
torch.backends.cudnn.deterministic = True

In [3]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True, lower = True, )
LABEL = data.LabelField(dtype = torch.float,batch_first=True, lower = True)



In [24]:
path = './Data/cleaned english.csv'
df = pd.read_csv(path)
df.rename(columns = {'Unnamed: 0': 'Unnamed: 0', 'ID': 'ID', 'Text': 'Text', 'Sub-task A': 'label', 'Sub-task B': 'B', 'clean text': 'clean', 'tokenized': 'tokenized', 'normalized lexicon': 'normalized lexicon', 'monolingual': 'monolingual'}, inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Text,label,B,clean,tokenized,normalized lexicon,monolingual
0,0,C45.451,Next part,NAG,NGEN,next part,"['next', 'part']","['next', 'part']",next part
1,1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN,iii8mllllllm mdxfvb8o90lplppi0005,"['iii', '8mllllllm', 'mdxfvb', '8o90lplppi0005']","['iii', '8mllllllm', 'mdxfvb', '8o90lplppi0005']",iii 8mllllllm mdxfvb 8o90lplppi0005
2,2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN,osm vedio make vedios,"['osm', 'vedio', 'make', 'vedios']","['osm', 'osf', 'vedic', 'make', 'videos']",osm osf vedic make videos
3,3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN,what fuck this? respect shwetabh watching vide...,"['what', 'fuck', 'this', '?', 'respect', 'shwe...","['what', 'fuck', 'fuck', 'this', '?', 'respect...",what fuck fuck this ? respect respect whitish ...
4,4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN,concerned authorities bring arundathi roy type...,"['concerned', 'authorities', 'bring', 'arundat...","['concerned', 'authorities', 'bring', 'arundat...",concerned authorities bring arundathi roy roy ...


In [26]:
df = df.dropna()

for i in range(len(df)):
    x = df.monolingual.iloc[i]
    if len(x) <= 0:
        print(df.monolingual.iloc[i])
        df.monolingual.iloc[i] = None
        
df = df.dropna()

In [21]:
df['monolingual']

0                                               next part
1                     iii 8mllllllm mdxfvb 8o90lplppi0005
2                               osm osf vedic make videos
3       what fuck fuck this ? respect respect whitish ...
4       concerned authorities bring arundathi roy roy ...
5                           famous care die hateful talks
6                            best best topic law students
7       even kabir kabir singh singh unaware preeti so...
8                                         she wrong wrong
9                                              6001733614
10                                      hindi hindi movie
11                                                   wait
12                                     very totally agree
14      ushant said exactly ! movie movie comprehend d...
15                     our role model whitish kabir kabir
16      usher luck luck haryana haryana small city cal...
17      arbitrator kamatipura sonagachi area area sett...
18      yeah m

In [6]:
fields = [(None, None), (None, None), (None, None), ('label', LABEL), (None, None), (None, None), (None, None), (None, None), ('monolingual', TEXT)]
training_data=data.TabularDataset(path = path, format = 'csv', fields = fields, skip_header = True)

print(vars(training_data.examples[3]))



{'label': 'nag', 'monolingual': ['what', 'fuck', 'fuck', 'this', '?', 'respect', 'respect', 'whitish', 'watching', 'videos', 'videos', 'long', 'time', 'time', 'review', 'review', 'shit', 'shit', 'nobody', 'watch', 'watch', 'movie', 'movie', 'reality', 'blah', 'you', 'watch', 'watch', 'movies', 'inspiration', 'read', 'books', 'biographies', 'yes', 'influenced', 'shitty', 'shitty', 'fucker', 'anyone', 'takes', 'movie', 'movie', 'seriously', 'fucker', 'and', 'theatre', 'theatre', 'india', 'india', 'reality', 'shit', 'shit', 'seek', 'reality', 'logical', 'scripts', 'characters', 'potholes', 'watching', 'kabir', 'kabir', 'singh', 'singh', 'made', 'views', 'called', 'raw', 'review', 'review', 'and', 'acted', 'forgot', 'movie', 'movie', 'story', 'story', 'pronounced', 'overtype', 'mess', 'wrongly', '?', 'common', '!']}


In [7]:
train_data, valid_data = training_data.split(split_ratio=0.7)

In [8]:
vectors = torchtext.vocab.Vectors('wiki-news-300d-1M.vec', cache = './Cache/Embeddings')

In [9]:
TEXT.build_vocab(train_data,min_freq=3, vectors = vectors)  
LABEL.build_vocab(train_data)

print("Size of topic vocab:",len(TEXT.vocab))
print("Size of label vocab:",len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(11))  
print(LABEL.vocab.freqs.most_common(14))
print(TEXT.vocab.stoi)

Size of topic vocab: 2012
Size of label vocab: 3
[('movie', 816), ('kabir', 494), ('singh', 431), ('video', 375), ('review', 364), ('?', 297), ('india', 273), ('nice', 259), ('man', 256), ('watch', 237), ('sir', 232)]
[('nag', 2373), ('cag', 316), ('oag', 295)]
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x11ff90160>>, {'<unk>': 0, '<pad>': 1, 'movie': 2, 'kabir': 3, 'singh': 4, 'video': 5, 'review': 6, '?': 7, 'india': 8, 'nice': 9, 'man': 10, 'watch': 11, 'sir': 12, 'a': 13, 'love': 14, 'character': 15, 'time': 16, 'great': 17, '!': 18, 'wrong': 19, 'you': 20, 'feminism': 21, 'the': 22, '"': 23, 'indian': 24, 't': 25, 'god': 26, 'hollywood': 27, 'this': 28, 'roy': 29, '-': 30, 'movies': 31, 'arundhati': 32, 'feminist': 33, 'story': 34, 'and': 35, 'respect': 36, '=': 37, 'film': 38, '*': 39, 'bollywood': 40, 'brother': 41, 'arjun': 42, 'life': 43, 'agree': 44, 's': 45, 'good': 46, 'guy': 47, 'she': 48, 'make': 49, 'reddy': 50, 'couture': 51, 

In [10]:
labels = len(LABEL.vocab)

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

b_sz = 128

train_loader, val_loader = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = b_sz,
    sort_key = lambda x: len(x.monolingual),
    sort_within_batch=True,
    device = device)



In [12]:
class model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim = 12, n_layers = 2, bidir = True, dropout = 0.2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidir,
                           dropout=dropout,
                           batch_first=True)
        
        self.dense = nn.Linear(hidden_dim * 2, output_dim)
        self.softmax = nn.Softmax()
        
    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        dense_outputs=self.dense(hidden)
        outputs=self.softmax(dense_outputs)
        
        return outputs

In [13]:
vocab_size = len(TEXT.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = labels
dropout = 0.2

model_ = model(vocab_size, embedding_dim, num_hidden_nodes, num_output_nodes, dropout = dropout)

In [14]:
print(model_)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model_):,} trainable parameters')

pretrained_embeddings = TEXT.vocab.vectors
model_.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

model(
  (embedding): Embedding(2012, 300)
  (lstm): LSTM(300, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dense): Linear(in_features=64, out_features=3, bias=True)
  (softmax): Softmax(dim=None)
)
The model has 714,387 trainable parameters
torch.Size([2012, 300])


In [15]:
optimizer = optim.Adam(model_.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()

def accuracy(preds, y):
    counts = 0
    for i in range(preds.shape[0]):
      counts += (torch.max(preds[i], 0)[1] == y[i]).float()
      
    return counts/preds.shape[0]
    
model_ = model_.to(device)
criterion = criterion.to(device)

In [16]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()  
    
    for batch in iterator:
        
        optimizer.zero_grad()

        text, text_lengths = batch.monolingual
        label = batch.label

        text = text.to(device)
        label = label.type(torch.LongTensor).to(device)

        predictions = model(text, text_lengths).squeeze()

        try:
              loss = criterion(predictions, label)
              acc = accuracy(predictions, label)
        except:
              continue

        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()  
        epoch_acc += acc.item()

    return epoch_loss / (len(iterator)-1), epoch_acc / (len(iterator)-1)

In [17]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
        
            text, text_lengths = batch.monolingual
            label = batch.label

            text = text.to(device)
            label = label.type(torch.LongTensor).to(device)
            
            predictions = model(text, text_lengths).squeeze()
            
            try:
                  loss = criterion(predictions, label)
                  acc = accuracy(predictions, label)
            except:
                  continue
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / (len(iterator)-1), epoch_acc / (len(iterator)-1)

In [18]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    train_loss, train_acc = train(model_, train_loader, optimizer, criterion)
    
    valid_loss, valid_acc = evaluate(model_, val_loader, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc:.3f}')



RuntimeError: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0

In [None]:
for batch in train_loader:
      optimizer.zero_grad()   
      
      text, text_lengths = batch.monolingual   
      
      predictions = model_(text, text_lengths).squeeze()
      loss = criterion(predictions, batch.label.type(torch.LongTensor))
      
      loss.backward()

# Transformer Based Models

## Data Preparation

In [29]:
import torch
import torchnlp
from torchnlp.encoders import LabelEncoder
from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count

In [30]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Text,label,B,clean,tokenized,normalized lexicon,monolingual
0,0,C45.451,Next part,NAG,NGEN,next part,"['next', 'part']","['next', 'part']",next part
1,1,C47.11,Iii8mllllllm\nMdxfvb8o90lplppi0005,NAG,NGEN,iii8mllllllm mdxfvb8o90lplppi0005,"['iii', '8mllllllm', 'mdxfvb', '8o90lplppi0005']","['iii', '8mllllllm', 'mdxfvb', '8o90lplppi0005']",iii 8mllllllm mdxfvb 8o90lplppi0005
2,2,C33.79,🤣🤣😂😂🤣🤣🤣😂osm vedio ....keep it up...make more v...,NAG,NGEN,osm vedio make vedios,"['osm', 'vedio', 'make', 'vedios']","['osm', 'osf', 'vedic', 'make', 'videos']",osm osf vedic make videos
3,3,C4.1961,What the fuck was this? I respect shwetabh and...,NAG,NGEN,what fuck this? respect shwetabh watching vide...,"['what', 'fuck', 'this', '?', 'respect', 'shwe...","['what', 'fuck', 'fuck', 'this', '?', 'respect...",what fuck fuck this ? respect respect whitish ...
4,4,C10.153,Concerned authorities should bring arundathi R...,NAG,NGEN,concerned authorities bring arundathi roy type...,"['concerned', 'authorities', 'bring', 'arundat...","['concerned', 'authorities', 'bring', 'arundat...",concerned authorities bring arundathi roy roy ...


In [31]:
encoder = LabelEncoder(df['label'])

In [32]:
def func(x):
    z = np.zeros(12, dtype = int)
    z[x-1] = 1
    return tuple(z)

In [33]:
def prepare_data(df, label = 'label', text = 'monolingual', drop = ['Unnamed: 0', 'ID','Text','B','clean','tokenized','normalized lexicon'], train = False):
    df_s = df
    df_s['text'] = ''
    for sent in df[text]:
        df_s['text'] += sent
        df_s['text'] += " "
    df_s['labels'] = pd.Series(encoder.batch_encode(list(df[label]))).apply(func)
    df_s = df_s.drop(drop, 'columns')
    train_df, eval_df = train_test_split(df_s, test_size=0.2)

    if(train):
        return train_df, eval_df
    else:
        return eval_df

In [None]:
train_df, eval_df = prepare_data(df, label = 'label', text = 'monolingual', train=True)

In [None]:
eval_df = eval_df.reset_index()
train_df = train_df.reset_index()
eval_df = eval_df.drop('index', 'columns')
train_df = train_df.drop('index', 'columns')

In [None]:
train_df['text'][0]

In [None]:
train_df.shape

In [None]:
args = {
    "output_dir": "outputs/",
    "cache_dir": "cache/",
    "best_model_dir": "outputs/best_model/",

    "fp16": False,
    "fp16_opt_level": "O1",
    "max_seq_length": 128,
    "train_batch_size": 128,
    "eval_batch_size": 128,
    "gradient_accumulation_steps": 1,
    "num_train_epochs": 1,
    "weight_decay": 0,
    "learning_rate": 1e-4,
    "adam_epsilon": 1e-8,
    "warmup_ratio": 0.06,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,
    "do_lower_case": False,

    "logging_steps": 50,
    "evaluate_during_training": False,
    "evaluate_during_training_steps": 2000,
    "evaluate_during_training_verbose": False,
    "use_cached_eval_features": False,
    "save_eval_checkpoints": True,
    "no_cache": False,
    "save_model_every_epoch": True,
    "tensorboard_dir": None,

    "overwrite_output_dir": True,
    "reprocess_input_data": True,

    "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    "n_gpu": 1,
    "silent": False,
    "use_multiprocessing": True,

    "wandb_project": None,
    "wandb_kwargs": {},

    "use_early_stopping": True,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0,
    "early_stopping_metric": "eval_loss",
    "early_stopping_metric_minimize": True,

    "manual_seed": None,
    "encoding": None,
    "config": {},
}

## DistilBERT

In [None]:
from simpletransformers.classification import MultiLabelClassificationModel

model = MultiLabelClassificationModel('distilbert', 'distilbert-base-cased', num_labels=12, args = args)

In [None]:
model.train_model(train_df, show_running_loss = True)

In [None]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [None]:
print(result)

In [None]:
i = np.random.randint(eval_df.shape[0])
print(encoder.decode(torch.tensor(model_outputs[i].argmax()+1)))
print(eval_df['text'].values[i])

In [None]:
!cp -r /content/outputs/checkpoint-5226-epoch-2 ./Cache/Models/DistilBERT/

In [None]:
from simpletransformers.classification import MultiLabelClassificationModel

model = MultiLabelClassificationModel('distilbert', 'distilbert-base-uncased', num_labels=24, args = args)

In [None]:
model.train_model(train_df, show_running_loss = True)

In [None]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [None]:
print(result)

In [None]:
i = np.random.randint(eval_df.shape[0])
print(encoder.decode(torch.tensor(model_outputs[i].argmax()+1)))
print(eval_df['text'].values[i])

## RoBERTA

In [None]:
from simpletransformers.classification import MultiLabelClassificationModel

model = MultiLabelClassificationModel('roberta', 'distilroberta-base', num_labels=12, args = args)

In [None]:
model.train_model(train_df, show_running_loss = True)

In [None]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [None]:
print(result)

In [None]:
i = np.random.randint(eval_df.shape[0])
print(encoder.decode(torch.tensor(model_outputs[i].argmax()+1)))
print(eval_df['text'].values[i])

In [None]:
torch.save(model, './Cache/Models/RoBERTA/model_roberta_4.h5')
torch.save(model.state_dict(), './Cache/Models/RoBERTA/model_roberta_6.pt')

In [None]:
i = np.random.randint(eval_df.shape[0])
print(encoder.decode(torch.tensor(model_outputs[i].argmax()+1)))
print(eval_df['text'].values[i])

In [None]:
!cp -r /content/outputs/checkpoint-2613-epoch-1 ./Cache/Models/DistilBERT/

## ALBERT

In [None]:
from simpletransformers.classification import MultiLabelClassificationModel

model = MultiLabelClassificationModel('albert', 'albert-large-v1', num_labels=12, args = args)

In [None]:
model.train_model(train_df, show_running_loss = True)

In [None]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

## ROC Curves

In [None]:
# def roc_curve(y_score=model_outputs, y_test=y_test, n_classes = 12, col=2):
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

for col in range(12):
    plt.figure()
    lw = 2
    plt.plot(fpr[col], tpr[col], color='darkorange',
          lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[col])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Flair: ' + str(encoder.decode(torch.tensor(col+1))))
    plt.legend(loc="lower right")
    plt.savefig('./Cache/Figures/DistilBERT/' + str(col+1) + '.png')
plt.show()