In [None]:
# Fine Tuning DistilBERT for Movie Genre Prediction (MultiLabel Text Classification)
# Data: - We are using IMDB Movies Analysis (kaggle.com)
# Language Model Used: - DistilBERT is a smaller transformer model as compared to BERT. It is created by process of distillation applied to Bert.
# The loss metrics (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html) 
# and Hamming Score are used for direct comparison of expected vs predicted.

In [45]:
# Importing python libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [46]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [47]:
# Definition of a method to calculate Hamming score 
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [48]:
# Importing and Pre-Processing the domain data

# Data Collection - Our data is in a CSV file named 'movies_initial.csv'
data = pd.read_csv('movies_initial.csv')

# Feature Selection - Selecting columns genre and fullplot
data = data[['genre','fullplot']]

# Data Cleaning - Drop null values from genre and fullplot
data = data.dropna(subset=['genre', 'fullplot'])

# Creating a copy of the original DataFrame for reference
data_s=data

# Randomize the data before splitting it into training and testing sets
data = data_s.sample(frac=1.0, random_state=42)

# Removing empty rows with no labels/genres
data = data[data['genre'].notnull()]

# Convert into lowercase
data['genre']=data['genre'].str.lower()

# Removing the Unicode non-breaking space character ('\xa0') from the 'genre' column
data['genre'] = data['genre'].str.replace('\xa0', '')

print(data)

                        genre  \
33876          drama, romance   
20454                  comedy   
25431           action, drama   
3451                  western   
36880                thriller   
...                       ...   
6731   comedy, crime, romance   
12344  comedy, drama, musical   
42045        horror, thriller   
938    comedy, crime, romance   
17136                   drama   

                                                fullplot  
33876  Ronnie's (Miley Cyrus) and her younger brother...  
20454  A middle-aged man's conservative life is distu...  
25431  In Los Angeles, an ex-con takes the undergroun...  
3451   After Confederate officer Blayde Hollister's h...  
36880  An engaged couple's backpacking trip in the Ca...  
...                                                  ...  
6731   The hero of the film is an insurance agent who...  
12344  A developer tries to bulldoze a community recr...  
42045  A young man breaks out of rehab to follow a my...  
938    High c

In [49]:
# Applying a lambda function to split genres separated by ', ' and converting them into lists
data['genre'] = data['genre'].apply(lambda x: x.split(', '))

# Creating a MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Transforming the 'genre' column into a binary matrix
genre_matrix = pd.DataFrame(mlb.fit_transform(data['genre']), columns=mlb.classes_, index=data.index)

# Obtaining the number of unique genres
len(mlb.classes_)

# Concatenate the binary matrix with the original dataframe
result_df = pd.concat([data, genre_matrix], axis=1)

# Saving the processed dataframe to a CSV file
result_df.to_csv('trainmovie.csv', index=False)

In [50]:
# Creating a new column 'all_values' containing lists of genre labels for each row
result_df['all_values'] = result_df.iloc[:, 3:].apply(lambda row: row.tolist(), axis=1)

# Creating a new column 'all_values_length' containing the length of 'all_values' lists
#result_df['all_values_length'] = result_df['all_values'].apply(len)

# Displaying specific columns from the DataFrame
# print(result_df[['fullplot', 'all_values', 'all_values_length']])
print(result_df[['fullplot', 'all_values']])

                                                fullplot  \
33876  Ronnie's (Miley Cyrus) and her younger brother...   
20454  A middle-aged man's conservative life is distu...   
25431  In Los Angeles, an ex-con takes the undergroun...   
3451   After Confederate officer Blayde Hollister's h...   
36880  An engaged couple's backpacking trip in the Ca...   
...                                                  ...   
6731   The hero of the film is an insurance agent who...   
12344  A developer tries to bulldoze a community recr...   
42045  A young man breaks out of rehab to follow a my...   
938    High class European thief Gaston Monescu meets...   
17136  Larsen, an aspiring poet in 20's Oslo, leaves ...   

                                              all_values  all_values_length  
33876  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...                 26  
20454  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...                 26  
25431  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

In [51]:
# Data Collection - Our df is saved in a CSV file named 'trainmovie.csv'
data = pd.read_csv('trainmovie.csv')

In [52]:
# Dropping the 'genre' column from the original DataFrame
data.drop(['genre'], inplace=True, axis=1)

# Creating a new DataFrame 'new_df' with 'text' and 'labels' columns
new_df = pd.DataFrame()
new_df['text'] = data['fullplot']
new_df['labels'] = data.iloc[:, 1:].values.tolist()

In [53]:
# Display first 5 rows in dataframe
new_df.head()

Unnamed: 0,text,labels
0,Ronnie's (Miley Cyrus) and her younger brother...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,A middle-aged man's conservative life is distu...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"In Los Angeles, an ex-con takes the undergroun...","[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,After Confederate officer Blayde Hollister's h...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,An engaged couple's backpacking trip in the Ca...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Dataloader is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of dataloaded to the memory and then passed to the neural network needs to be controlled.
This control is achieved using the parameters such as batch_size and max_len.
Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [54]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 3e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [55]:
# This class is defined to accept the tokenizer, dataframe and max_length as input 
# and generate tokenized output and tags that is used by the BERT model for training.
# We are using the DistilBERT tokenizer to tokenize the data in the `text` column of the dataframe.
# The tokenizer uses the `encode_plus` method to perform tokenization 
# and generate the necessary outputs, namely: `ids`, `attention_mask`, `token_type_ids` 
# `targets` is the list of categories labled as `0` or `1` in the dataframe.
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len
        print(self.targets)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [56]:
# Creating the dataset and dataloader for the neural network

# Training Dataset is used to fine tune the model: 80% of the original data
train_size = 0.8

# The MultiLabelDataset class is used to create 2 datasets, for training and for validation.
# Validation Dataset is used to evaluate the performance of the model. The model has not seen this data during training.

train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (41244, 2)
TRAIN Dataset: (32995, 2)
TEST Dataset: (8249, 2)
0        [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
1        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
2        [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...
3        [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
4        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                               ...                        
32990    [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...
32991    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
32992    [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
32993    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
32994    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
Name: labels, Length: 32995, dtype: object
0       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
4

In [57]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [58]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, len(mlb.classes_))

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [59]:
# Calculate the loss and the accuracy of models prediction.
# The loss function used will be a combination of Binary Cross Entropy which is implemented as BCELogits Loss in PyTorch.
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [60]:
# Optimizer is used to update the weights of the neural network to improve its performance.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [61]:
# Definition of a training function that trains the model on the training dataset created above, specified number of times (EPOCH), 
# An epoch defines how many times the complete data will be passed through the network.
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        # print("Output Size:", outputs.size())
        # print("Target Size:", targets.size())

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [62]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.6937980651855469


4125it [25:21,  2.71it/s]
0it [00:00, ?it/s]

Epoch: 1, Loss:  0.1270284503698349


4125it [25:23,  2.71it/s]
0it [00:00, ?it/s]

Epoch: 2, Loss:  0.11802014708518982


4125it [25:23,  2.71it/s]


In [63]:
# During the validation stage we pass the unseen data(Testing Dataset) to the model. 
# This step determines how good the model performs on the unseen data.
# This unseen data is the 20% of `train.csv` which was seperated during the Dataset creation stage.
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [64]:
# The final output is compared to the actual value. This comparison is then used to calcuate the accuracy of the model.
outputs, targets = validation(testing_loader)
final_outputs = np.array(outputs) >=0.5

1032it [02:25,  7.11it/s]


In [65]:
# Get a measure of our models performance metrics - Hamming Score and Hamming Loss
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

outputs_np=np.array(targets)
targets_np=np.array(final_outputs)

threshold = 0.5
outputs_thresholded = (outputs_np > threshold).astype(int)

from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate metrics
precision = precision_score(targets_np, outputs_thresholded, average='samples')
recall = recall_score(targets_np, outputs_thresholded, average='samples')
f1 = f1_score(targets_np, outputs_thresholded, average='samples')

# Print the metrics as needed
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Hamming Score = 0.5363276356730108
Hamming Loss = 0.0526842759840699
Precision: 0.6194690265486725
Recall: 0.7288459207176627
F1 Score: 0.6346847236894515
