# Install Transformers Library

In [None]:
!pip install transformers
!pip install sentencepiece



In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel
from transformers import XLMRobertaTokenizer, XLMRobertaModel
# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU so the notebook still runs on a runtime without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Dataset

In [None]:
# Get text data of all available episodes in M2H2.
# NOTE: the files are read from Google Drive, so Drive must already be mounted
# at /content/drive before this cell is run.
RAW_TEXT_DIR = "/content/drive/MyDrive/IR project/M2H2-dataset/Main-Dataset/Raw-Text"


def read_episode(episode_no):
    """Read the raw-text TSV of one episode into a DataFrame.

    The file is looked up as ``<RAW_TEXT_DIR>/Ep-<episode_no>.tsv`` and parsed
    as tab-separated text whose first row holds the column names.
    """
    return pd.read_csv(f"{RAW_TEXT_DIR}/Ep-{episode_no}.tsv", sep="\t", header=0)


# Every frame that is concatenated further down has to be defined here;
# leaving any of these reads commented out makes the concat cell fail with a
# NameError on a fresh kernel.
df22 = read_episode(22)
df25 = read_episode(25)
df26 = read_episode(26)
df27 = read_episode(27)
df30 = read_episode(30)
df31 = read_episode(31)
df32 = read_episode(32)
df33 = read_episode(33)
df36 = read_episode(36)
df37 = read_episode(37)
df38 = read_episode(38)
df39 = read_episode(39)
df40 = read_episode(40)

In [None]:
# Mount Google Drive so that the dataset files under /content/drive are
# readable. This has to run before any cell that reads from /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Stack the per-episode frames into one, drop the metadata columns that are
# not used for classification, discard rows with missing values and renumber
# the remaining rows 0..n-1.
frames = [df22, df25, df26, df27, df30, df31, df32, df33, df36, df37, df38, df39, df40]
df = (
    pd.concat(frames)
    .drop(columns=['Scenes', 'Sl. No.', 'Start_time', 'End_time', 'Speaker'])
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Replace the label strings with a binary label (humor -> 1, non-humor -> 0).
# The encoded column is assigned back explicitly: `df.Label.replace(...,
# inplace=True)` operates on an intermediate Series and is not guaranteed to
# update `df` itself under pandas' copy-on-write behaviour.
label_to_id = {'humor': 1, 'non-humor': 0}
label_ids = df['Label'].map(label_to_id)

# Any value outside the mapping becomes NaN; fail here with a clear message
# instead of letting a stray string surface later during tensor conversion.
if label_ids.isna().any():
    unknown_labels = df.loc[label_ids.isna(), 'Label'].unique().tolist()
    raise ValueError(f"Unexpected label values: {unknown_labels}")

df['Label'] = label_ids.astype('int64')
df = df.rename(columns={'Utterance': 'text', 'Label': 'label'})

In [None]:
# Preview the first rows of the cleaned frame (columns: text, label).
df.head()

Unnamed: 0,text,label
0,अरे बाबा! केशू जी।,0
1,डॉल के अलावा माँ और कौन हो सकती है?,0
2,वॉट!,1
3,या!,0
4,प्रेमा जी?,0


In [None]:
# (number of rows, number of columns) of the cleaned frame
df.shape

(6185, 2)

In [None]:
# Check the class distribution; normalize=True reports each label's share of
# the rows (fractions summing to 1) instead of raw counts.
df['label'].value_counts(normalize = True)

0    0.662247
1    0.337753
Name: label, dtype: float64

# Split the dataset into train, validation and test sets

In [None]:
# First cut: 70 % of the rows for training, 30 % held out. Stratifying on the
# label keeps the humor / non-humor ratio the same in both parts.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    stratify=df['label'],
    random_state=2018,
)

# Second cut: the held-out 30 % is halved into a validation and a test set,
# again stratified on the label.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=2018,
)

# Import XLM-RoBERTa Model and Tokenizer

In [None]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint
# ('xlm-roberta-base'), so the token ids produced by the tokenizer match the
# encoder's vocabulary.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Pretrained XLM-RoBERTa encoder; it is wrapped by the classifier defined
# further down.
modell = XLMRobertaModel.from_pretrained('xlm-roberta-base')

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Tokenization

In [None]:
# Fixed token length of every encoded utterance: the tokenizer calls below
# truncate longer inputs and pad shorter ones to this length.
max_seq_len = 50

In [None]:
def encode_texts(texts):
    """Tokenize a pandas Series of utterances into fixed-length model inputs.

    Every utterance is truncated or padded to ``max_seq_len`` tokens.
    ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``
    flag. Token type ids are not requested because the model is only fed
    ``input_ids`` and ``attention_mask``.

    Args:
        texts: pandas Series of strings.

    Returns:
        The tokenizer's encoding, holding ``input_ids`` and ``attention_mask``
        as lists of equal-length integer lists.
    """
    return tokenizer(
        texts.tolist(),
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


# tokenize and encode the training, validation and test sets
tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)

# Show a single encoded example rather than dumping the whole test set
tokens_test['input_ids'][:1]



{'input_ids': [[0, 72855, 40319, 4, 33142, 26460, 15273, 6377, 32, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 82158, 6, 164537, 2191, 48095, 6754, 125, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 126859, 460, 125, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 44580, 182063, 187550, 118564, 460, 125, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 36733, 12028, 5011, 998, 6, 129930, 838, 2191, 16439, 8906, 1253, 32, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2218, 10924, 4010, 4, 22839, 19487, 1682, 4322, 83727, 12580, 80105, 2653, 1293, 125, 2, 1, 1, 1, 1, 1, 1, 1, 

# Convert Integer Sequences to Tensors

In [None]:
def _as_tensors(encoded, labels):
    """Return (input-id tensor, attention-mask tensor, label tensor) for one split."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# one (sequence, mask, label) triple per split
train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

# Create DataLoaders

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# number of examples per mini-batch
batch_size = 32

# Training set: examples are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation set: examples are visited in their stored order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Freeze Encoder Parameters (optional, currently disabled)

In [None]:
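# Freezing is currently disabled, so every encoder parameter stays trainable
# (full fine-tuning). Uncomment the loop below to freeze all the encoder
# parameters and train only the classification layers.
#for param in modell.parameters():
#    param.requires_grad = False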

# Define Model Architecture

In [None]:
class BERT_Arch(nn.Module):
    """Two-class sentence classifier: a transformer encoder plus a small MLP head.

    The encoder's pooled sentence vector is passed through
    Linear -> ReLU -> Dropout -> Linear(2) -> LogSoftmax, so the module returns
    log-probabilities over the two classes (the matching loss is ``nn.NLLLoss``).
    """

    def __init__(self, bert):
        """Build the classifier around ``bert``.

        Args:
            bert: encoder module. It is called as
                ``bert(sent_id, attention_mask=mask, return_dict=False)`` and
                must return a pair whose second item is the pooled sentence
                vector.
        """
        super(BERT_Arch, self).__init__()

        # Use the encoder that was passed in. Previously the global `modell`
        # was bound here and the constructor argument was silently ignored.
        self.bert = bert

        # Width of the pooled vector: taken from the encoder's config when it
        # exposes one, otherwise the original hard-coded 768.
        config = getattr(bert, 'config', None)
        hidden_size = getattr(config, 'hidden_size', 768)

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)

        # dense layer 2 (output layer, one unit per class)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token-id tensor passed to the encoder as its first argument.
            mask: attention-mask tensor passed to the encoder as ``attention_mask``.

        Returns:
            Tensor with one row per input example and two columns (log-softmax
            over the classes).
        """
        # return_dict=False makes the encoder return a tuple; the second item
        # is used as the sentence representation.
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # convert the scores to log-probabilities
        x = self.softmax(x)

        return x

In [None]:
# Wrap the pretrained encoder in the classifier defined above and move the
# resulting module to the selected device in one step.
model = BERT_Arch(modell).to(device)

In [None]:
# Define the optimizer.
# torch.optim.AdamW replaces the deprecated `transformers.AdamW`.
# The encoder is not frozen, so the whole pretrained transformer is being
# fine-tuned; a rate of 1e-3 is far too high for that, and typical fine-tuning
# rates are in the 1e-5 to 5e-5 range.
# weight_decay=0.0 matches the default of the transformers optimizer that this
# replaces (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# Find Class Weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# One weight per class, computed from the training labels with sklearn's
# "balanced" heuristic; the weights are later handed to the loss function.
class_wts = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels,
)

print(class_wts)

[0.75497035 1.48050616]


In [None]:
# class weights as a float tensor on the same device as the model
weights = torch.tensor(class_wts, dtype=torch.float).to(device)

# The model ends in LogSoftmax, so the matching loss is the (class-weighted)
# negative log-likelihood.
cross_entropy = nn.NLLLoss(weight=weights)

# number of passes over the training set
epochs = 10

# Fine-Tune XLM-RoBERTa

In [None]:
# function to train the model
def train():
    """Run one training epoch over ``train_dataloader``.

    Works on the notebook-level ``model``, ``optimizer``, ``cross_entropy``
    and ``device``.

    Returns:
        ``(avg_loss, total_preds)``: the mean of the per-batch losses, and a
        NumPy array holding the model outputs of all batches stacked along
        axis 0 (number of samples, number of classes).
    """
    model.train()

    n_batches = len(train_dataloader)
    running_loss = 0
    batch_outputs = []

    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches (but not at step 0)
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        # move the three tensors of the batch to the compute device
        sent_id, mask, labels = (r.to(device) for r in batch)

        # start from clean gradients, then forward pass and loss
        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        running_loss += loss.item()

        # backward pass; the gradient norm is clipped to 1.0 to guard against
        # exploding gradients before the parameters are updated
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # keep a detached CPU copy of this batch's outputs
        batch_outputs.append(preds.detach().cpu().numpy())

    # mean loss over the batches of this epoch
    avg_loss = running_loss / n_batches

    # list of (batch, classes) arrays -> one (samples, classes) array
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Compute loss and model outputs on ``val_dataloader`` without training.

    Works on the notebook-level ``model``, ``cross_entropy`` and ``device``.
    The model is switched to eval mode (dropout off) and gradients are
    disabled for the forward passes.

    Returns:
        ``(avg_loss, total_preds)``: the mean of the per-batch losses, and a
        NumPy array holding the model outputs of all batches stacked along
        axis 0 (number of samples, number of classes).
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # list collecting the model outputs of each batch
    total_preds = []

    for step, batch in enumerate(val_dataloader):

        # Progress update every 50 batches. The earlier elapsed-time
        # computation was removed: it called `format_time`, `time` and `t0`,
        # none of which are defined in this notebook, so this branch raised a
        # NameError as soon as the loader had more than 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to the compute device
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # mean validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

# Start Model Training

In [None]:
# Best validation loss seen so far; starts at infinity so that the first epoch
# always counts as an improvement.
best_valid_loss = float('inf')

# per-epoch loss history
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass over the training data, then one over the validation data;
    # only the losses are needed here, the returned predictions are discarded
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # record both losses for this epoch
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.719
Validation Loss: 0.711

 Epoch 2 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.700
Validation Loss: 0.696

 Epoch 3 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.697
Validation Loss: 0.694

 Epoch 4 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.694
Validation Loss: 0.693

 Epoch 5 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.693
Validation Loss: 0.693

 Epoch 6 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.693
Validation Loss: 0.693

 Epoch 7 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.693
Validation Loss: 0.693

 Epoch 8 / 10
  Batch    50  of    136.
  Batch   100  of    136.

Evaluating...

Training Loss: 0.693
Validat

# Load Saved Model

In [None]:
# Load the weights of the best model (the checkpoint written during training).
# map_location places the stored tensors on the current `device`, so a
# checkpoint saved on a GPU also loads on a CPU-only runtime.
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

# Get Predictions for Test Data

In [None]:
# Get predictions for the test data.
# The model is put into eval mode explicitly (dropout off) rather than relying
# on the mode left behind by an earlier evaluate() call, and the test set is
# processed in mini-batches so that it never has to fit into memory as a
# single forward pass.
model.eval()

test_outputs = []
with torch.no_grad():
    for start in range(0, test_seq.size(0), batch_size):
        stop = start + batch_size
        batch_preds = model(test_seq[start:stop].to(device),
                            test_mask[start:stop].to(device))
        test_outputs.append(batch_preds.cpu().numpy())

# stack the per-batch outputs into one (number of samples, no. of classes) array
preds = np.concatenate(test_outputs, axis=0)

In [None]:
# Raw model outputs for the test set: one row per example, one
# log-probability per class (the model ends in LogSoftmax).
preds

array([[-0.68280363, -0.7035988 ],
       [-0.68280363, -0.7035988 ],
       [-0.68280363, -0.7035988 ],
       ...,
       [-0.68280363, -0.7035988 ],
       [-0.68280363, -0.7035988 ],
       [-0.68280363, -0.7035988 ]], dtype=float32)

In [None]:
# Model's performance: turn the per-class scores into hard class ids (index of
# the highest score in each row), then print per-class precision, recall and
# F1 against the true test labels.
preds = preds.argmax(axis=1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.66      1.00      0.80       615
           1       0.00      0.00      0.00       313

    accuracy                           0.66       928
   macro avg       0.33      0.50      0.40       928
weighted avg       0.44      0.66      0.53       928



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Confusion matrix: rows are the true labels (test_y), columns are the
# predicted class ids (preds).
pd.crosstab(test_y, preds)

col_0,0
row_0,Unnamed: 1_level_1
0,615
1,313


In [None]:
# Inspect the predicted class ids for the test set (preds holds the argmax
# result once the performance cell above has run).
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,