In [42]:
import joblib
import os
import torch
import torch.nn as nn
import pandas as pd
import ast
from torchtext.vocab import vocab
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from functools import partial
from types import SimpleNamespace as sn
import torch.nn.functional as F

from torchinfo import summary
import random
import numpy as np

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from pathlib import Path
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from datetime import datetime
from torchmetrics import HammingDistance

# For plotting and visualization
import matplotlib.pyplot as plt

In [43]:
# !pip install torchinfo

In [44]:
# Determine the storage location based on the execution environment
# If running on Google Colab, use Google Drive as storage
if 'google.colab' in str(get_ipython()):
    from google.colab import drive  # Import Google Drive mounting utility
    drive.mount('/content/drive')  # Mount Google Drive
    os.chdir('/content//drive//My Drive//Colab_Notebooks//NLP')

    # REPLACE WITH YOUR FOLDER
    project_folder = Path('content//drive//My Drive/Colab_Notebooks/NLP')

# If running locally, specify a different path
else:
    # Set base folder path for storing files on local machine
    # REPLACE WITH YOUR FOLDER
    # FILL THIS ONLY IF YOU ARE RUNNING ON A LOCAL MACHINE

    project_folder = Path('')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
data = joblib.load('df_multilabel_hw_cleaned.joblib')

In [46]:
data.head()

Unnamed: 0,cleaned_text,Tags,Tag_Number
0,asp query stre dropdown webpage follow control...,c# asp.net,"[0, 9]"
1,run javascript code server java code want run ...,java javascript,"[1, 3]"
2,linq sql throw exception row find change hi li...,c# asp.net,"[0, 9]"
3,run python script php server run nginx web ser...,php python,"[2, 7]"
4,advice write function m try write function res...,javascript jquery,"[3, 5]"


In [47]:
data.Tags.value_counts(normalize = True)

javascript jquery                   0.421469
c# asp.net                          0.168385
java android                        0.145951
php javascript                      0.045649
php jquery                          0.031606
                                      ...   
c# php javascript asp.net jquery    0.000021
javascript asp.net android          0.000021
php javascript iphone               0.000021
java php iphone c++ android         0.000021
javascript php                      0.000021
Name: Tags, Length: 176, dtype: float64

In [48]:
tuple(ast.literal_eval(data.Tag_Number[0]))

(0, 9)

In [49]:
data['Tag_Number'] = data.Tag_Number.apply(lambda x: tuple(ast.literal_eval(x)))

In [50]:
data.head()

Unnamed: 0,cleaned_text,Tags,Tag_Number
0,asp query stre dropdown webpage follow control...,c# asp.net,"(0, 9)"
1,run javascript code server java code want run ...,java javascript,"(1, 3)"
2,linq sql throw exception row find change hi li...,c# asp.net,"(0, 9)"
3,run python script php server run nginx web ser...,php python,"(2, 7)"
4,advice write function m try write function res...,javascript jquery,"(3, 5)"


#### One Hot encoding of target variable

In [51]:
mb = MultiLabelBinarizer()
mb_fit = mb.fit_transform(data.Tag_Number)

In [52]:
mb.classes_

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [53]:
mb_fit

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [54]:
data['Target_encoded'] = mb_fit.tolist()

In [55]:
data.head()

Unnamed: 0,cleaned_text,Tags,Tag_Number,Target_encoded
0,asp query stre dropdown webpage follow control...,c# asp.net,"(0, 9)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,run javascript code server java code want run ...,java javascript,"(1, 3)","[0, 1, 0, 1, 0, 0, 0, 0, 0, 0]"
2,linq sql throw exception row find change hi li...,c# asp.net,"(0, 9)","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,run python script php server run nginx web ser...,php python,"(2, 7)","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0]"
4,advice write function m try write function res...,javascript jquery,"(3, 5)","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0]"


## Creating custom torch Dataset

In [56]:
X = data.cleaned_text
y = data.Target_encoded

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state=1)

In [58]:
X_train.head()

4787     getscript file reduce load size sense use $ .g...
10968    php javascript calendar start date picker sear...
45369    url address bar php   possible duplicate entir...
20060    pass multivariable new page gridview select gr...
39369    jquery plugin function method follow code esse...
Name: cleaned_text, dtype: object

In [59]:
class CustomDataset(Dataset):
    def __init__(self, X, y):

        self.X = X
        self.y = y

    def __len__(self):

        return(len(self.X))

    def __getitem__(self, idx):

        text = self.X.iloc[idx]
        labels = self.y.iloc[idx]
        sample = (labels, text)

        return sample

In [60]:
train_dataset = CustomDataset(X_train, y_train)
valid_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

In [61]:
for i, (y, x) in enumerate(train_dataset):
    if i == 10:
        print(i, y, x)

10 [0, 0, 0, 1, 0, 1, 0, 0, 0, 0] jquery click link event trigger jquery click link d mylink


## Create Vocab

In [62]:
def get_vocab(dataset, min_freq):
    """
    Generate a vocabulary from a dataset.

    Args:
        dataset (list of tuple): List of tuples where each tuple contains a label and a text.
        min_freq (int): The minimum frequency for a token to be included in the vocabulary.

    Returns:
        torchtext.vocab.Vocab: Vocabulary object.
    """
    # Initialize a counter object to hold token frequencies
    counter = Counter()

    # Update the counter with tokens from each text in the dataset
    for (l_, text) in dataset:
        counter.update(str(text).split())

    # Create a vocabulary using the counter object
    # Tokens that appear fewer times than `min_freq` are excluded
    my_vocab = vocab(counter, min_freq=min_freq)

    # Insert a '<unk>' token at index 0 to represent unknown words
    my_vocab.insert_token('<unk>', 0)

    # Set the default index to 0
    # This ensures that any unknown word will be mapped to '<unk>'
    my_vocab.set_default_index(0)

    return my_vocab

In [63]:
train_vocab = get_vocab(train_dataset, 2)

In [64]:
len(train_vocab)

89451

In [65]:
# Get the index where the frequency of the word is saved
#train_vocab.get_stoi()

# <Font color = 'pickle'>**Create Collate function**

In [66]:
# Creating a function that will be used to get the indices of words from vocab
def text_pipeline(x, vocab):
    """Converts text to a list of indices using a vocabulary dictionary"""
    return [vocab[token] for token in str(x).split()]

In [67]:
def collate_batch(batch, my_vocab):
    """
    Collates a batch of samples into tensors of labels, texts, and offsets.

    Parameters:
        batch (list): A list of tuples, each containing a label and a text.

    Returns:
        tuple: A tuple containing three tensors:
               - Labels tensor
               - Concatenated texts tensor
               - Offsets tensor indicating the start positions of each text in the concatenated tensor
    """
    # Unpack the batch into separate lists for labels and texts
    labels, texts = zip(*batch)

    # Convert the list of labels into a tensor of dtype int32
    labels = torch.tensor(labels, dtype=torch.long)

    # Convert the list of texts into a list of lists; each inner list contains the vocabulary indices for a text
    list_of_list_of_indices = [text_pipeline(text, my_vocab) for text in texts]

    # Compute the offsets for each text in the concatenated tensor
    offsets = [0] + [len(i) for i in list_of_list_of_indices]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

    # Concatenate all text indices into a single tensor
    texts = torch.cat([torch.tensor(i, dtype=torch.int64) for i in list_of_list_of_indices])

    return (texts, offsets), labels

In [68]:
batch_size = 2
check_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           collate_fn=partial(collate_batch, my_vocab = train_vocab),
                                           )

In [69]:
torch.manual_seed(22)
for (text, offset), label in check_loader:
    print(text, offset, label)
    break

tensor([  527,   578,  2624,   283, 33825,   578,     2,   568, 52012,    97,
        54497, 52012, 54498, 54499, 54500,   578, 54501,    78,  7083, 52012,
        54502,  2112, 52012, 54503, 54504,  2112, 52012, 54505,  2112, 52012,
        54506, 54507, 54508,    73,    74,   504, 54509,    76, 54510, 54511,
         2112, 52012, 54506, 54507, 54512,  2112, 54513, 54514,  2112, 52012,
        54515,  2112,   538, 54516,    97,  1960, 37911,   210, 54517,   538,
        54516,    97,   351,  9144, 54518,    97,   964, 19086, 54519,    97,
        54520, 54516, 19089,   527, 54519,   989, 23269, 54521,    97, 54522,
        54521, 54523, 54521,   989,   347,    74,   504,    73,    74,   504,
        54509,    76, 54510,   504,    78, 54524,   504,    19,  5362,  1505,
           55,  1505,    55, 44397,    58,   630,   123,  1505,    55,  2416,
         1505,    55,     4,    16,    44, 44398,   837,   747,  1505,    55,
            4]) tensor([ 0, 99]) tensor([[1, 0, 0, 0, 0, 0, 0, 0

## Model

In [70]:
class CustomBlock(nn.Module):
    def __init__(self, input_dim, output_dim, drop_prob):

        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.ReLU(),
            nn.Dropout(p=drop_prob),
            nn.BatchNorm1d(num_features=output_dim)
        )
    def forward(self, x):
      return self.layers(x)

In [71]:
class CustomMLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim1, hidden_dim2, drop_prob1, drop_prob2, num_outputs):
        super().__init__()

        self.embedding_bag = nn.EmbeddingBag(vocab_size, embedding_dim)
        self.layers = nn.Sequential(
            CustomBlock(embedding_dim , hidden_dim1, drop_prob1),
            CustomBlock(hidden_dim1, hidden_dim2, drop_prob2),
            nn.Linear(hidden_dim2, num_outputs)
        )

    def forward(self, input_tuple):
        data, offsets = input_tuple
        embed_out = self.embedding_bag(data, offsets)
        out = self.layers(embed_out)

        return out

In [72]:
# Define the device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Define the sequential model
model= CustomMLP(vocab_size= len(train_vocab), embedding_dim = 300, hidden_dim1 = 200, hidden_dim2 = 100, drop_prob1=0.5, drop_prob2=0.5, num_outputs=10)

# Move the model to the device
model = model.to(device)

# Generate some dummy input data and offsets, and move them to the device
data = torch.tensor([1, 2, 4, 5, 4], dtype = torch.int32).to(device)
offsets = torch.tensor([0, 2, 4], dtype = torch.int32).to(device)

# Generate summary
summary(model, input_data=[(data, offsets)], device=device, depth =4)


Layer (type:depth-idx)                   Output Shape              Param #
CustomMLP                                [3, 10]                   --
├─EmbeddingBag: 1-1                      [3, 300]                  26,835,300
├─Sequential: 1-2                        [3, 10]                   --
│    └─CustomBlock: 2-1                  [3, 200]                  --
│    │    └─Sequential: 3-1              [3, 200]                  --
│    │    │    └─Linear: 4-1             [3, 200]                  60,200
│    │    │    └─ReLU: 4-2               [3, 200]                  --
│    │    │    └─Dropout: 4-3            [3, 200]                  --
│    │    │    └─BatchNorm1d: 4-4        [3, 200]                  400
│    └─CustomBlock: 2-2                  [3, 100]                  --
│    │    └─Sequential: 3-2              [3, 100]                  --
│    │    │    └─Linear: 4-5             [3, 100]                  20,100
│    │    │    └─ReLU: 4-6               [3, 100]                  -

In [73]:
# def hamming_metric(prediction, original):

#     ham = (prediction != original).sum().item()

#     return ham

In [74]:
# optimizer = torch.optim.SGD(model.parameters(), lr)

In [75]:
def step(inputs, targets, model, device, loss_function=None, optimizer=None):

    # Move the model and data to the device
    model = model.to(device)
    inputs = tuple(input_tensor.to(device) for input_tensor in inputs)

    targets = targets.to(device, dtype=torch.float32)

    # Step 1: Forward pass to get the model's predictions
    outputs = model(inputs)

    # Step 2: Compute the loss using the provided loss function
    if loss_function:
        loss = loss_function(outputs, targets)

    # Step 2: Calculate the number of correctly classified samples
    predicted = (outputs > 0.5).float()
    #correct = (predicted == targets).sum().item()

    # Step 3 and 4: Perform backward pass and update model parameters if an optimizer is provided
    if optimizer:
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=10.0)
        optimizer.step()


    # Return relevant metrics
    if loss_function:
        return loss, outputs, predicted
    else:
        return outputs, predicted

In [76]:
def train_epoch(train_loader, model, device, loss_function, optimizer):

    train_hamming_distance = HammingDistance(task = "multilabel", num_labels = 10).to(device)

    # Set the model to training mode
    model.train()

    # Initialize variables to track running training loss and correct predictions
    running_train_loss = 0.0

    # Iterate over all batches in the training data
    for inputs, targets in train_loader:
        # Perform a forward and backward pass, updating model parameters
        loss, _, predict = step(inputs, targets, model, device, loss_function, optimizer)
        # ham_dist = train_hamming_distance(predict, targets)

        # Update running loss and correct predictions counter
        running_train_loss += loss.item()
        train_hamming_distance.update(predict, targets)

    epoch_hamming_distance = train_hamming_distance.compute()

    # Compute average loss and accuracy for the entire training set
    train_loss = running_train_loss / len(train_loader)
    print(f'Train Loss: {train_loss:.4f} | Train Hamming Distance: {epoch_hamming_distance:.4f}')

    train_hamming_distance.reset()

    return train_loss, epoch_hamming_distance

In [77]:
def val_epoch(valid_loader, model, device, loss_function):

    """
    Validates the model for one epoch using the provided data loader.

    Parameters:
    - valid_loader (torch.utils.data.DataLoader): DataLoader object for the validation set.
    - model (torch.nn.Module): The neural network model to be validated.
    - device (torch.device): The computing device (CPU or GPU).
    - loss_function (torch.nn.Module): The loss function to evaluate the model.

    Returns:
    - val_loss (float): Average validation loss for the epoch.
    - val_acc (float): Validation accuracy for the epoch.
    """
    train_hamming_distance = HammingDistance(task = "multilabel", num_labels = 10).to(device)
    # Set the model to evaluation mode

    model.eval()

    # Initialize variables to track running validation loss and correct predictions
    running_val_loss = 0.0

    for inputs, targets in valid_loader:
        # Perform a forward and backward pass, updating model parameters
        loss, _, predict = step(inputs, targets, model, device, loss_function, optimizer = None)
        # train_hamming_distance = train_hamming_distance(predict, targets)

        # Update running loss and correct predictions counter
        running_val_loss += loss.item()
        train_hamming_distance.update(predict, targets)

    epoch_hamming_distance = train_hamming_distance.compute()

    # Compute average loss and accuracy for the entire training set
    train_loss = running_val_loss / len(valid_loader)
    print(f'Valid Loss: {train_loss:.4f} | Valid Hamming Distance: {epoch_hamming_distance:.4f}')

    train_hamming_distance.reset()

    return train_loss, epoch_hamming_distance

In [78]:
def train(train_loader, valid_loader, model, optimizer, loss_function, epochs, device):

    # Initialize lists to store metrics for each epoch
    train_loss_history = []
    valid_loss_history = []
    train_ham_history = []
    valid_ham_history = []

    # Loop over the number of specified epochs
    for epoch in range(epochs):
        # Train model on training data and capture metrics
        train_loss, train_ham = train_epoch(train_loader, model, device, loss_function, optimizer)

        # Validate model on validation data and capture metrics
        valid_loss, valid_ham = val_epoch(valid_loader, model, device, loss_function)

        # Store metrics for this epoch
        train_loss_history.append(train_loss)
        train_ham_history.append(train_ham)
        valid_loss_history.append(valid_loss)
        valid_ham_history.append(valid_ham)

        # Output epoch-level summary
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f} | Train Hamming distance: {train_ham*100:.2f}%")
        print(f"Valid Loss: {valid_loss:.4f} | Valid Hamming distance: {valid_ham*100:.2f}%")
        print()

    return train_loss_history, train_ham_history, valid_loss_history, valid_ham_history

### Hyperparameters

In [79]:
hyperparameters = sn(
    # model Parameters
    EMBED_DIM=300,
    VOCAB_SIZE=len(train_vocab),
    OUTPUT_DIM=10,
    HIDDEN_DIM1=200,
    HIDDEN_DIM2=100,
    DROP_PROB1=0.5,
    DROP_PROB2=0.5,
    NUM_OUTPUTS=10,

    # training
    EPOCHS=5,
    BATCH_SIZE=128,
    LEARNING_RATE=0.001,
    WEIGHT_DECAY=0.1,
)

## Training Configuration

In [80]:
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

collate_fn = partial(collate_batch, my_vocab = train_vocab)

# Data Loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=hyperparameters.BATCH_SIZE, shuffle=True,
                                           collate_fn=collate_fn, num_workers=4)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=hyperparameters.BATCH_SIZE, shuffle=False,
                                           collate_fn=collate_fn,  num_workers=4)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=hyperparameters.BATCH_SIZE,  shuffle=False,
                                          collate_fn=collate_fn,  num_workers=4)


loss_function = nn.BCEWithLogitsLoss()

model_nlp = CustomMLP(vocab_size=hyperparameters.VOCAB_SIZE,
                       embedding_dim=hyperparameters.EMBED_DIM,
                       hidden_dim1=hyperparameters.HIDDEN_DIM1,
                       hidden_dim2=hyperparameters.HIDDEN_DIM2,
                       drop_prob1=hyperparameters.DROP_PROB1,
                       drop_prob2=hyperparameters.DROP_PROB2,
                       num_outputs=hyperparameters.NUM_OUTPUTS)

def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)


# apply initialization recursively  to all modules
model_nlp.apply(init_weights)

optimizer = torch.optim.SGD(model.parameters(), lr = hyperparameters.LEARNING_RATE)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')



### Sanity Check

In [81]:
for input_ , targets  in train_loader:

    # move inputs and outputs to GPUs
    model_nlp = model_nlp.to(device)

    input_ = tuple(input_tensor.to(device) for input_tensor in input_)
    targets = targets.to(device, dtype = torch.float32)

    model_nlp.eval()
    # Forward pass
    output = model_nlp(input_)
    loss = loss_function(output, targets)
    print(f'Actual loss: {loss}')
    break

print(f'Expected Theoretical loss: {np.log(2)}')

Actual loss: 0.711052656173706
Expected Theoretical loss: 0.6931471805599453


### Calling train function

In [82]:
train_losses, train_ham, valid_losses, valid_ham = train(train_loader, valid_loader, model_nlp, optimizer, loss_function, hyperparameters.EPOCHS, device)

Train Loss: 0.9100 | Train Hamming Distance: 0.4209
Valid Loss: 0.7644 | Valid Hamming Distance: 0.3534
Epoch 1/5
Train Loss: 0.9100 | Train Hamming distance: 42.09%
Valid Loss: 0.7644 | Valid Hamming distance: 35.34%

Train Loss: 0.9097 | Train Hamming Distance: 0.4206
Valid Loss: 0.7695 | Valid Hamming Distance: 0.3567
Epoch 2/5
Train Loss: 0.9097 | Train Hamming distance: 42.06%
Valid Loss: 0.7695 | Valid Hamming distance: 35.67%

Train Loss: 0.9086 | Train Hamming Distance: 0.4220
Valid Loss: 0.7626 | Valid Hamming Distance: 0.3484
Epoch 3/5
Train Loss: 0.9086 | Train Hamming distance: 42.20%
Valid Loss: 0.7626 | Valid Hamming distance: 34.84%

Train Loss: 0.9073 | Train Hamming Distance: 0.4195
Valid Loss: 0.7614 | Valid Hamming Distance: 0.3493
Epoch 4/5
Train Loss: 0.9073 | Train Hamming distance: 41.95%
Valid Loss: 0.7614 | Valid Hamming distance: 34.93%

Train Loss: 0.9079 | Train Hamming Distance: 0.4205
Valid Loss: 0.7657 | Valid Hamming Distance: 0.3538
Epoch 5/5
Train Loss