> In this notebook I take you through the complete process of fine-tuning a DeBERTa-v3-base model for the Spooky Author Identification task. The various components of the training setup - and how they all connect - are explained, with occasional references to the basic dimensions of the tensors involved so that the underlying operations are easier to follow. Even though this may not be the most complete work on this data, I hope you leave with a better theoretical understanding of how the model layers interact. Constructive criticism is absolutely welcome in the comments! And so are upvotes, lol.

# I. Importing Libraries and Reading Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the competition train and test tables directly from their zipped CSV files.
train_data = pd.read_csv('../input/spooky-author-identification/train.zip')
test_data = pd.read_csv('../input/spooky-author-identification/test.zip')
# Bare expression: in a notebook this renders the frame as the cell output.
train_data

In [None]:
# Hyper-parameters and runtime settings read by the cells below.
# NOTE(review): lr=2e-3 looks high for fine-tuning every transformer weight
# (the scheduler anneals towards 2e-5) - confirm this is intended.
CONFIG = dict(
    batch_size=8,                           # samples per DataLoader batch
    model_name='microsoft/deberta-v3-base',  # Hugging Face checkpoint id
    num_classes=3,                          # size of the classifier output
    lr=2e-3,                                # initial AdamW learning rate
    epochs=7,                               # passes over the training split
    n_accumulate=4,                         # batches per optimizer update
    weight_decay=1e-6,                      # AdamW weight decay
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

# II. Text Preprocessing

In [None]:
# Lookup table from an English contraction to its expanded form.
# text_clean matches whitespace-delimited tokens against these keys exactly,
# so the lookup is case-sensitive (note the separate "I'd" / "i'd" entries) and
# a contraction with punctuation attached (e.g. "don't,") is not expanded.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [None]:
def text_clean(text):
    """Normalise one raw sentence before it is handed to the tokenizer.

    Steps, in order:
      1. Expand every whitespace-delimited token that exactly matches a key
         of the module-level ``contraction_mapping``.
      2. Replace every character outside ``[a-zA-Z]`` with a space.
      3. Collapse the resulting runs of whitespace and trim both ends.

    Step 3 used to run first, so the spaces introduced by step 2 (one per
    punctuation mark or digit) survived into the output as doubled and
    trailing spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string: ASCII letters only, words separated by exactly
        one space, no leading or trailing whitespace.
    """
    # str.split() with no argument already ignores leading, trailing and
    # repeated whitespace, so no separate pre-collapse pass is needed.
    expanded = ' '.join(contraction_mapping.get(word, word) for word in text.split())
    letters_only = re.sub(r'[^a-zA-Z]', ' ', expanded)  # Remove non-alphabetical characters
    # Squeeze the gaps left behind by the removed characters.
    return ' '.join(letters_only.split())

# Run the same cleaning routine over the 'text' column of both splits, in place.
for _frame in (train_data, test_data):
    _frame['text'] = _frame['text'].apply(text_clean)

In [None]:
#Distribution of Train Target Labels
# Bar chart of the 'author' column, used here to eyeball class balance.
sns.countplot(x = train_data['author'])

# III. Preparing Data for Model
* Train-Validation Splitting
* Creating Custom Dataset Class
* Wrapping the train, validation and test datasets in DataLoaders, which iterate over them in batches.

In [None]:
# Turn author names into integer class ids, then keep 80% of the rows for
# training and hold the remaining 20% out for validation.
# NOTE(review): the split is unseeded, so it differs from run to run.
encoder = LabelEncoder()
train_data['author'] = encoder.fit_transform(train_data['author'])
X_train, X_val, y_train, y_val = train_test_split(
    train_data['text'],
    train_data['author'],
    train_size=0.8,
)
# Text and label series of each split share one index, so they align row by row.
train_df = pd.DataFrame({'text': X_train, 'author': y_train})
val_df = pd.DataFrame({'text': X_val, 'author': y_val})

In [None]:
#Load the pretrained backbone and its matching tokenizer. The checkpoint is
#CONFIG['model_name'] (DeBERTa-v3-base); the "bert" naming below is generic.
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
# Module-level instance that BERT_Classifier later reuses as its encoder.
bert_model = AutoModel.from_pretrained(CONFIG['model_name'])

In [None]:
#Custom Dataset class, subclass of the Dataset Module.
#Necessarily contains the below 3 methods: __init__, __len__, __getitem__
class CustomDataset(Dataset):
    """Map-style dataset that tokenises one sentence per item.

    Args:
        text: Indexable collection of raw sentences.
        tokenizer: Object exposing ``encode_plus`` that accepts the keyword
            arguments used in ``__getitem__`` (a Hugging Face tokenizer here).
        label: Indexable collection of integer class ids aligned with
            ``text``. Required when ``train`` is True, ignored otherwise.
        train: When True every item also carries a ``'label'`` tensor.
        max_length: Fixed token length each sequence is padded or truncated
            to. Defaults to 512, the value that used to be hard-coded.

    Raises:
        ValueError: If ``train`` is True but no ``label`` was supplied. This
            used to surface only on the first ``__getitem__`` call.
    """

    def __init__(self, text, tokenizer, label=None, train=False, max_length=512):
        if train and label is None:
            raise ValueError("label must be provided when train=True")
        self.text = text
        self.label = label
        self.tokenizer = tokenizer
        self.train = train
        self.max_length = max_length

    def __len__(self):
        #Number of examples
        return len(self.text)

    def __getitem__(self, idx):
        """Return a dict with the raw text, token ids, attention mask and, in
        train mode, the label of the sample at ``idx``."""
        sample_text = str(self.text[idx])
        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',  # pytorch tensor format
        )
        # return_tensors='pt' adds a leading batch axis of size 1; flatten()
        # drops it so the DataLoader can stack items into (batch, max_length).
        item = {
            'text': sample_text,
            'input_ids': encoded['input_ids'].flatten(),
            'attn_mask': encoded['attention_mask'].flatten(),
        }
        if self.train:
            item['label'] = torch.tensor(self.label[idx], dtype=torch.int64)
        return item
            

In [None]:
#Instantiate custom datasets by passing constructor arguments
# Validation rows also carry labels, hence train=True for data_val as well.
data_train = CustomDataset(train_df['text'].to_numpy(), tokenizer, train_df['author'].to_numpy(), train=True)
data_val = CustomDataset(val_df['text'].to_numpy(), tokenizer, val_df['author'].to_numpy(), train=True)
data_test = CustomDataset(test_data['text'].to_numpy(), tokenizer)

#Create iterable dataloaders, which create batches of given size returned as the return type of __getitem__ (dict here)
# Only the training loader shuffles, so each epoch (and each gradient-
# accumulation group) sees a different batch composition. Validation and
# test loaders keep their order so test predictions stay aligned with
# test_data['id'].
train_dataloader = DataLoader(data_train, batch_size=CONFIG['batch_size'], shuffle=True, pin_memory=True)
val_dataloader = DataLoader(data_val, batch_size=CONFIG['batch_size'], pin_memory=True)
test_dataloader = DataLoader(data_test, batch_size=CONFIG['batch_size'], pin_memory=True)
#How a batch looks:
next(iter(train_dataloader))

# IV. Model Definition and Training

In [None]:
#Custom Pooling Layer to be applied to the model output.last_hidden_state
#Last hidden state returns literally the last hidden state vector for every token for every sequence in the batch.
#Hence Dimension of Last Hidden state = (batch_size, seq_length, hidden_state_dim)

class MeanPooling(nn.Module):
    """Average the token vectors of each sequence, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attn_mask):
        """Return one mean vector per sequence.

        Args:
            last_hidden_state: Tensor indexed as (batch, token, feature).
            attn_mask: 0/1 tensor indexed as (batch, token); positions with 0
                do not contribute to the mean.

        Returns:
            Tensor indexed as (batch, feature).
        """
        # Broadcast the per-token 0/1 flag across the feature axis as floats.
        mask = attn_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Masked-out tokens are multiplied by zero before summing over tokens.
        summed = (last_hidden_state * mask).sum(1)
        # Number of kept tokens per feature; the lower bound avoids a 0/0 when
        # a sequence is fully masked.
        counts = mask.sum(1).clamp(min=1e-9)
        return summed / counts

In [None]:
#Create custom NN, always inheriting from nn.Module
class BERT_Classifier(nn.Module):
    """Transformer encoder -> masked mean pooling -> dropout -> linear head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        # Reuses the module-level `bert_model` object itself, not a copy.
        self.bert_layer = bert_model
        self.pooler = MeanPooling()
        self.dropout = nn.Dropout(p=0.2)
        hidden_dim = self.bert_layer.config.hidden_size
        # Projects the pooled sentence vector onto one logit per class.
        self.linear = nn.Linear(hidden_dim, self.num_classes)

    def forward(self, input_ids, attn_mask):
        """Return raw, un-normalised logits, one row per input sequence."""
        token_states = self.bert_layer(
            input_ids=input_ids,
            attention_mask=attn_mask,
        ).last_hidden_state
        sentence_vec = self.pooler(token_states, attn_mask)
        # Softmax is intentionally left to the caller / the loss function.
        return self.linear(self.dropout(sentence_vec))

In [None]:
#Instantiate model, transfer to gpu device if available
# CONFIG['device'] is 'cuda' when torch can see a GPU and 'cpu' otherwise.
model = BERT_Classifier(CONFIG['num_classes']).to(CONFIG['device'])

In [None]:
#Trial feed forward on one sample batch.
# Fetch the batch once: calling next(iter(...)) separately for ids and mask
# tokenises two batches and does not guarantee both come from the same samples.
sample_batch = next(iter(train_dataloader))
input_ids = sample_batch['input_ids'].to(CONFIG['device'])
attn_mask = sample_batch['attn_mask'].to(CONFIG['device'])
# This is only a shape/sanity check, so skip building an autograd graph.
with torch.no_grad():
    logits = model(input_ids, attn_mask)
    probs = nn.Softmax(dim=1)(logits)
# Highest class probability and its index for every sample in the batch.
torch.max(probs, dim=1)

In [None]:
#Training Essentials
epochs = CONFIG['epochs']
optimizer = AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])
# train_loop calls scheduler.step() once per optimizer update, i.e. once per
# CONFIG['n_accumulate'] batches. Size the cosine to the whole run so the
# learning rate actually anneals from CONFIG['lr'] down to eta_min; a fixed
# T_max larger than the run leaves the schedule only partly traversed.
# Ceil division keeps T_max from being shorter than the run, so the cosine
# never wraps past its minimum.
steps_per_epoch = -(-len(train_dataloader) // CONFIG['n_accumulate'])
total_steps = max(1, epochs * steps_per_epoch)
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=2e-5)
loss = nn.CrossEntropyLoss().to(CONFIG['device']) #Categorical CrossEntropy

In [None]:
#Train Loop, iterates over all batches
def train_loop(model, dataloader, loss, optimizer, scheduler):
    """Train ``model`` for one pass over ``dataloader`` with gradient accumulation.

    Gradients from ``CONFIG['n_accumulate']`` consecutive batches are summed
    before each optimizer/scheduler step, mimicking a larger batch size.

    Args:
        model: Module called as ``model(input_ids, attn_mask)``, returning logits.
        dataloader: Yields dicts with 'input_ids', 'attn_mask' and 'label'.
        loss: Criterion called as ``loss(logits, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler, stepped once per optimizer update.

    Returns:
        ``(mean_batch_loss, accuracy)``. The loss is the mean of the
        un-scaled per-batch losses; accuracy is a 0-dim double tensor on
        ``CONFIG['device']``.
    """
    model = model.train() #Setting to train mode activates Dropouts, Batch Norm, etc
    device = CONFIG['device']
    n_accumulate = CONFIG['n_accumulate']
    num_batches = len(dataloader)
    batch_losses = []
    pred_correct = torch.tensor(0, device=device)
    n_seen = 0
    # Start from clean gradients so nothing left over from earlier backward
    # passes is folded into the first update.
    optimizer.zero_grad(set_to_none=True)
    for batch_num, batch in enumerate(dataloader):
        #Each batch as a Dict from __getitem__, with dict values having batch-size num of elements
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attn_mask = batch['attn_mask'].to(device, non_blocking=True)
        ground_truths = batch['label'].to(device, non_blocking=True)
        #Set mixed floating point precision for allocating float16 and float32 operations - only for forward prop.
        # NOTE(review): autocast is used without a GradScaler - confirm that
        # fp16 gradient underflow is not a concern for this model.
        with torch.cuda.amp.autocast():
            logits = model(input_ids, attn_mask)
            batch_loss = loss(logits, ground_truths) #Compute Loss

        # Log the un-scaled loss so it is comparable with the validation
        # loss; only the value sent to backward() is divided.
        batch_losses.append(batch_loss.item())
        # Softmax is monotonic, so the arg-max of the logits is the predicted class.
        _, labels = torch.max(logits, dim=1)
        pred_correct += torch.sum(labels == ground_truths) #Compute num of correct predictions for accuracy
        n_seen += ground_truths.size(0)

        #Backprop Steps - Essential.
        (batch_loss / n_accumulate).backward() #Performs backprop
        #Update parameters only after n_accumulate number of batches, to mimic a larger batch_size.
        # Also update on the final batch so a trailing partial group is
        # applied instead of leaking into the next epoch.
        is_last_batch = (batch_num + 1) == num_batches
        if (batch_num + 1) % n_accumulate == 0 or is_last_batch:
            optimizer.step() #Updates parameters
            optimizer.zero_grad(set_to_none=True) #Resets gradients to zero for next batch
            scheduler.step()

    # Divide by the samples actually seen rather than by a global frame.
    return np.mean(batch_losses), pred_correct.double() / n_seen #Return train loss, train acc

In [None]:
#Validation Loop, almost same - No backprop
def validation_loop(model, dataloader, loss):
    """Evaluate ``model`` on ``dataloader`` without updating any parameter.

    Args:
        model: Module called as ``model(input_ids, attn_mask)``, returning logits.
        dataloader: Yields dicts with 'input_ids', 'attn_mask' and 'label'.
        loss: Criterion called as ``loss(logits, labels)``.

    Returns:
        ``(mean_batch_loss, accuracy)``; accuracy is a 0-dim double tensor
        on ``CONFIG['device']``.
    """
    model = model.eval() #Deactivates dropouts, batch norm etc
    device = CONFIG['device']
    batch_losses = []
    pred_correct = torch.tensor(0, device=device)
    n_seen = 0
    with torch.no_grad(): #Disable Autograd - No parameter optimization during validation
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attn_mask = batch['attn_mask'].to(device, non_blocking=True)
            ground_truths = batch['label'].to(device, non_blocking=True)
            logits = model(input_ids, attn_mask)
            batch_losses.append(loss(logits, ground_truths).item())
            # Softmax is monotonic, so the arg-max of the logits is the predicted class.
            _, labels = torch.max(logits, dim=1)
            pred_correct += torch.sum(labels == ground_truths)
            n_seen += ground_truths.size(0)

    # Divide by the samples actually iterated, so the function is correct
    # for any dataloader and not only the one built from val_df.
    return np.mean(batch_losses), pred_correct.double() / n_seen #Returns Val Loss, Val Acc

In [None]:
#To extract predictions on test data
def get_predictions(model, dataloader):
    """Run inference over ``dataloader`` and collect per-sample outputs.

    Args:
        model: Module called as ``model(input_ids, attn_mask)``, returning logits.
        dataloader: Yields dicts with 'input_ids' and 'attn_mask'.

    Returns:
        ``(pred_labels, pred_probs)``: two Python lists with one entry per
        sample - the predicted class index tensor and the class-probability
        tensor. Entries stay on ``CONFIG['device']``.
    """
    model.eval()
    device = CONFIG['device']
    all_labels, all_probs = [], []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device, non_blocking=True)
            mask = batch['attn_mask'].to(device, non_blocking=True)
            batch_probs = nn.Softmax(dim=1)(model(ids, mask))
            # Index 1 of torch.max(..., dim=1) holds the arg-max positions.
            batch_labels = torch.max(batch_probs, dim=1)[1]
            # extend() walks the first axis, adding one tensor per sample.
            all_probs.extend(batch_probs)
            all_labels.extend(batch_labels)

    return all_labels, all_probs

In [None]:
#Running the train, validation loops over all epochs. All batches over each epoch.
history = defaultdict(list)
for epoch in range(epochs):
    print('Epoch ', epoch+1, ' of', epochs, ' :')
    train_loss, train_acc = train_loop(model, train_dataloader, loss, optimizer, scheduler)
    # The loops return accuracy as a 0-dim tensor that may live on the GPU.
    # Convert to plain floats so the history can be plotted (matplotlib
    # cannot convert CUDA tensors) and prints as a number.
    train_loss, train_acc = float(train_loss), float(train_acc)
    print('Training Loss: ', train_loss, '    Training Acc: ', train_acc)
    val_loss, val_acc = validation_loop(model, val_dataloader, loss)
    val_loss, val_acc = float(val_loss), float(val_acc)
    print('Val Loss: ', val_loss, '    Val Acc: ', val_acc)
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

In [None]:
# Figure 1: training vs. validation loss, one point per epoch.
plt.figure(1)
plt.plot(history['train_loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()
# Figure 2: training vs. validation accuracy, one point per epoch.
plt.figure(2)
plt.plot(history['train_acc'], label='Train Accuracy')
plt.plot(history['val_acc'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # was 'Loss', copied over from the first figure
plt.legend()
plt.grid()
plt.show()

# V. Making Predictions and Generating the Submission

In [None]:
# Predict on the test set; get_predictions returns two lists holding one tensor per sample.
y_pred, y_prob = get_predictions(model, test_dataloader)

In [None]:
# Stack the per-sample probability tensors into one tensor, copy it to host memory and convert it to a NumPy array.
y_prob = torch.stack(y_prob).cpu().numpy()

In [None]:
# Assemble the submission table: one row per test id, one probability column
# per author.
# NOTE(review): assumes the label encoder numbered the classes EAP=0, HPL=1,
# MWS=2 - verify against encoder.classes_.
df = pd.DataFrame({
    'id': test_data['id'],
    'EAP': y_prob[:, 0],
    'HPL': y_prob[:, 1],
    'MWS': y_prob[:, 2],
})
df.to_csv('Submission.csv', index=False)
# Bare expression: in a notebook this renders the frame as the cell output.
df