In [181]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertModel

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
from torch.optim.lr_scheduler import StepLR

#### Develop a simple model using only the description

In [182]:
# CSV file handed to read_data below; the path is relative to the
# notebook's working directory.
DATA_PATH = "./data/netflix_movies_single_genre.csv"

In [183]:
def read_data(data_path: str, features: list):
    """Load a CSV file, keep the requested columns and discard incomplete rows.

    Args:
        data_path: Path of the CSV file to read.
        features: Column names to keep, in the order they should appear.

    Returns:
        A DataFrame holding only `features`, without any row that has a
        missing value in one of those columns. The original row index of
        the surviving rows is kept as-is.
    """
    raw = pd.read_csv(data_path)

    # Column selection first, so that missing values in columns we do not
    # use cannot cause a row to be dropped.
    selected = raw.loc[:, features]

    return selected.dropna()

In [184]:
# Keep only the model input (description) and the target (genre);
# rows missing either value are dropped by read_data.
df = read_data(DATA_PATH, features = ["description", "genre"])

In [185]:
# Label encode the genre

def label_encode_column(df: pd.DataFrame, column_name: str):
  """
  Replace the values of one column with integer class codes.

  Note that the column is overwritten on the DataFrame that is passed in;
  no copy is made, so the caller's frame is modified.

  Args:
      df: The pandas DataFrame containing the column to encode.
      column_name: The name of the column to encode.

  Returns:
      A tuple `(df, encoder)`: the same DataFrame object with the column
      encoded, and the fitted LabelEncoder (needed to map codes back to
      the original values).
  """

  # Learn the mapping from the column's distinct values to integer codes
  encoder = LabelEncoder().fit(df[column_name])

  # Overwrite the column with its codes
  codes = encoder.transform(df[column_name])
  df[column_name] = codes

  return df, encoder

In [186]:
# label_encode_column writes the integer codes into `df` itself and returns
# that same object, so `df_enc` and `df` refer to one (encoded) DataFrame.
# `le` is kept to translate codes back into genre names later.
df_enc, le = label_encode_column(df=df, column_name = "genre")

In [187]:
def split_data(df: pd.DataFrame, target: str, test_size: float =0.2, random_state=None):
  """
  Split a DataFrame into train/test features and targets.

  Args:
      df: DataFrame containing the features and target
      target: String with the name of the target column
      test_size: Float between 0.0 and 1.0 representing the proportion of data
                 allocated to the test set (default is 0.2).
      random_state: Integer seed for random number generation (for reproducibility).

  Returns:
      A tuple `(X_train, X_test, y_train, y_test)`. The X parts are DataFrames
      with every column except `target`; the y parts are Series holding the
      `target` column. All four have a fresh 0..n-1 index.

  """

  features = df.drop(columns=target)
  labels = df[target]

  parts = train_test_split(features, labels, test_size=test_size, random_state=random_state)

  # Discard the shuffled original row labels so that positions and index agree
  X_train, X_test, y_train, y_test = [part.reset_index(drop=True) for part in parts]

  return X_train, X_test, y_train, y_test


In [188]:
# Split the label-encoded frame explicitly instead of relying on `df`
# having been modified in place by label_encode_column.
X_train, X_test, y_train, y_test = split_data(df_enc, "genre", test_size=0.2, random_state=42)

In [189]:
# Pretrained DistilBERT tokenizer and encoder (downloaded or read from the
# local cache by `from_pretrained`). `tokenizer` is read as a global by
# TokenDataLoader; `bert_model` is handed to SimpleGenreClassifier as its encoder.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [190]:
class TokenDataLoader(Dataset):
    """Dataset of tokenised descriptions paired with their integer labels.

    Despite the name this is a `torch.utils.data.Dataset`, not a DataLoader.
    All descriptions are tokenised once, in the constructor, using the
    notebook-level `tokenizer`.

    Args:
        X: DataFrame with a "description" column of strings.
        y: Integer class labels aligned row-by-row with `X` (Series, array
           or list).

    Each item is a tuple `(input_ids, attention_mask, label)`.
    """

    def __init__(self, X, y):
        self.desc = X.loc[:, "description"]

        # Keep the labels as a LongTensor so __getitem__ indexes by position.
        # Indexing a pandas Series with an integer is a *label* lookup, which
        # is only correct when the index happens to be 0..n-1. The explicit
        # long dtype is also what nn.CrossEntropyLoss expects for targets.
        self.y = torch.as_tensor(np.asarray(y), dtype=torch.long)

        #### Embedding encoder ####

        # Tokenise every description in one call: padding=True pads to the
        # longest sequence in this call, truncation=True cuts sequences at
        # the tokenizer's maximum length.
        encoded_input = tokenizer(self.desc.tolist(), return_tensors="pt", padding=True, truncation=True)

        # Get input IDs and attention mask
        self.input_ids = encoded_input["input_ids"]
        self.attention_mask = encoded_input["attention_mask"]

    def __len__(self):
        """Number of examples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return `(input_ids, attention_mask, label)` for position `idx`."""
        return self.input_ids[idx], self.attention_mask[idx], self.y[idx]

In [191]:
def get_default_device():
    """Pick an accelerator if one is available, else CPU.

    Preference order: Apple MPS, then CUDA, then CPU.

    Returns:
        A `torch.device` that tensors can actually be moved to on this machine.
    """
    # torch.backends.mps does not exist on older PyTorch builds, hence getattr.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

In [192]:
def to_device(data, device):
    """Recursively move a tensor, or a list/tuple of tensors, to `device`.

    Lists and tuples (at any nesting depth) come back as plain lists;
    anything else is moved with its own `.to(device, non_blocking=True)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

In [193]:
class DeviceDataLoader():
    """Thin wrapper that moves every batch of a dataloader to a device."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        """Number of batches in the wrapped dataloader."""
        return len(self.dl)

    def __iter__(self):
        """Iterate over the wrapped dataloader, yielding each batch on `self.device`."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [194]:
# Device chosen for computation; the bare name on the last line displays it.
# NOTE(review): none of the cells visible here move the model or the batches
# to this device (DeviceDataLoader is never instantiated) - confirm whether
# training is meant to run on it.
device = get_default_device()
device

device(type='mps')

In [195]:
# Build the datasets. TokenDataLoader tokenises in its constructor, so these
# two lines tokenise every description of the train and test splits up front.
train_ds = TokenDataLoader(X_train, y_train)
test_ds = TokenDataLoader(X_test, y_test)

In [196]:
batch_size = 32
# Shuffle the training examples every epoch.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Validation metrics are aggregated over the whole split and do not depend
# on example order, so keep a fixed order.
valid_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [197]:
class SimpleGenreClassifier(nn.Module):
    """Linear genre classifier on top of a transformer encoder.

    The hidden state of the first token of each sequence is passed through
    dropout and a single linear layer to produce one logit per genre.

    Args:
        embed_model: Encoder module. It is called as
            `embed_model(input_ids=..., attention_mask=...)` and element 0 of
            its output must be the per-token hidden states
            `(batch, seq_len, hidden)`.
        num_labels: Number of output classes.
    """

    def __init__(self, embed_model, num_labels):
        super().__init__()
        self.model = embed_model
        self.dropout = nn.Dropout(0.2)

        # Take the hidden width from the encoder's config when it exposes one,
        # falling back to 768 (the width of distilbert-base-uncased).
        config = getattr(embed_model, "config", None)
        self.hidden_size = getattr(config, "hidden_size", None) or 768
        self.classifier = nn.Linear(self.hidden_size, num_labels)

    def forward(self, X, mask):
        """Return logits of shape `(batch, num_labels)`.

        Args:
            X: Token ids, forwarded to the encoder as `input_ids`.
            mask: Attention mask, forwarded to the encoder as `attention_mask`.
        """
        outputs = self.model(input_ids=X, attention_mask=mask)

        # Select the first-token representation *before* dropout so dropout
        # only touches the values that are actually used.
        first_token = outputs[0][:, 0, :]
        first_token = self.dropout(first_token)

        logits = self.classifier(first_token)

        return logits


In [198]:
# Number of distinct genre codes, i.e. the width of the classifier's output layer.
NUM_LABELS = df_enc["genre"].nunique()

In [199]:
# The classifier stores `bert_model` itself (no copy is made), so fine-tuning
# `model` also updates the weights held by `bert_model`.
model = SimpleGenreClassifier(embed_model=bert_model, num_labels=NUM_LABELS)

In [200]:
model_path = "./simple_genre_classifier.pt"
# Resume from the checkpoint written by train_loop (the file must already exist).
# map_location="cpu" makes a checkpoint that was saved from an accelerator
# loadable on any machine; load_state_dict then copies the values into the
# model's own parameters wherever they live.
model.load_state_dict(torch.load(model_path, map_location="cpu"))

<All keys matched successfully>

In [201]:
def train_model(model, optim, train_dl):
    """Train `model` for one pass over `train_dl`.

    Args:
        model: Module called as `model(X, mask)` and returning logits with the
            class dimension last.
        optim: Optimizer over the model's parameters. (Inside this function
            the name shadows the `torch.optim` module.)
        train_dl: Iterable yielding `(X, mask, y)` batches, `y` holding
            integer class labels.

    Returns:
        The average cross-entropy loss per example over the pass.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    total = 0
    sum_loss = 0
    for X, mask, y in train_dl:
        batch = y.shape[0]
        logits = model(X, mask)
        # Read the number of classes from the logits rather than from a
        # notebook-level constant, so the function is self-contained.
        loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the average.
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [202]:
def val_loss(model, valid_dl):
    """Evaluate `model` on `valid_dl`, print and return loss and accuracy.

    Args:
        model: Module called as `model(X, mask)` and returning logits of shape
            `(batch, num_classes)`.
        valid_dl: Iterable yielding `(X, mask, y)` batches, `y` holding
            integer class labels.

    Returns:
        A tuple `(average loss per example, accuracy)`.
    """
    criterion = nn.CrossEntropyLoss()
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for X, mask, y in valid_dl:
            current_batch_size = y.shape[0]
            logits = model(X, mask)
            # Class count comes from the logits, not from a notebook global.
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            sum_loss += current_batch_size*(loss.item())
            total += current_batch_size
            pred = logits.argmax(dim=1)
            correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

In [203]:
# Adam over every parameter of the model (encoder and classifier head),
# with a very small learning rate of 1e-6.
optimizer = optim.Adam(model.parameters(), lr=0.000001)
# StepLR multiplies the learning rate by `gamma` every `step_size` calls to
# scheduler.step(). train_loop calls it once per epoch, so the first decay
# (x0.99) only happens after 25 epochs.
scheduler = StepLR(optimizer, step_size=25, gamma=0.99)

def train_loop(model, epochs):
    """Train `model` for `epochs` epochs, validating and checkpointing after each.

    Uses the notebook-level `optimizer`, `scheduler`, `train_dl` and
    `valid_dl`. After every epoch the training loss is printed, validation
    metrics are printed by `val_loss`, the scheduler is stepped and the
    model's state dict is written to "simple_genre_classifier.pt"
    (overwriting the previous checkpoint).
    """
    for _ in range(epochs):
        epoch_loss = train_model(model, optimizer, train_dl)
        print("training loss: ", epoch_loss)

        val_loss(model, valid_dl)

        scheduler.step()
        torch.save(model.state_dict(), "simple_genre_classifier.pt")

In [180]:
# Fine-tune for 10 epochs; the checkpoint file is rewritten after each one.
# NOTE(review): this cell's execution count (180) is lower than that of the
# cells above it, so the recorded output may come from an earlier session -
# re-run top to bottom to confirm.
train_loop(model, 10)

training loss:  2.8267989183379703
valid loss 2.606 and accuracy 0.245
training loss:  2.531837686947885
valid loss 2.430 and accuracy 0.296
training loss:  2.4068875849040734
valid loss 2.332 and accuracy 0.304
training loss:  2.2909683512981376
valid loss 2.226 and accuracy 0.322
training loss:  2.1777342743527104
valid loss 2.121 and accuracy 0.360
training loss:  2.0720416711688454
valid loss 2.024 and accuracy 0.390
training loss:  1.9740789142859436
valid loss 1.942 and accuracy 0.413
training loss:  1.8843965080782616
valid loss 1.868 and accuracy 0.424
training loss:  1.8087816787013546
valid loss 1.809 and accuracy 0.434
training loss:  1.741842565239507
valid loss 1.754 and accuracy 0.453


In [204]:
# Continue fine-tuning for 5 more epochs.
# NOTE(review): the recorded output below shows only two epochs - presumably
# the run was interrupted; confirm before relying on these numbers.
train_loop(model, 5)

training loss:  1.6914872025948495
valid loss 1.720 and accuracy 0.458
training loss:  1.6493290386397945
valid loss 1.691 and accuracy 0.468


### Evaluate the model on the test set

Note: this is the same held-out split that `valid_dl` uses for validation during training.

In [152]:
# Collect the predicted and true label of every example in the test split.
y_pred = []
y_true = []

batch_size = 32
# Order does not matter for the aggregated metrics, so do not shuffle.
valid_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

# Switch off dropout once, and skip gradient tracking during inference.
model.eval()
with torch.no_grad():
    for X, mask, y in valid_dl:
        logits = model(X, mask)
        pred = torch.max(logits, 1)[1]

        # reshape(-1) keeps a 1-D shape even when the final batch holds a
        # single example (squeeze() would produce a 0-d tensor that cannot be
        # turned into a list); .cpu() makes this work on any device.
        y_pred.extend(pred.reshape(-1).cpu().tolist())
        y_true.extend(y.reshape(-1).cpu().tolist())



In [153]:
def get_class_accuracy(y_pred, y_true, le):
    """
    Summarise prediction quality per genre.

    Args:
        y_pred: Sequence of predicted class codes
        y_true: Sequence of true class codes, aligned with `y_pred`
        le: Fitted label encoder; its `inverse_transform` maps codes back to genre names

    Returns:
        Pandas DataFrame indexed by (true) genre with the columns
        "accuracy", "correct", "number_movies" and "incorrect", sorted so
        that the genre with the most misclassified movies comes first.

    """

    # One row per example: prediction, truth, and a 0/1 hit flag
    outcomes = pd.DataFrame([y_pred, y_true]).T
    outcomes.columns = ["y_pred", "y_true"]
    hits = outcomes["y_pred"] == outcomes["y_true"]
    outcomes["correct"] = hits.astype(int)
    outcomes["genre"] = le.inverse_transform(list(outcomes["y_true"]))

    # Per true genre: share of hits, number of hits, number of examples
    per_genre = outcomes.groupby("genre")["correct"].agg(["mean", "sum", "count"])
    per_genre["incorrect"] = per_genre["count"] - per_genre["sum"]
    per_genre = per_genre.sort_values(by="incorrect", ascending=False)

    return per_genre.rename(
        columns={"mean": "accuracy", "sum": "correct", "count": "number_movies"}
    )

In [154]:
# Per-genre accuracy table for the predictions collected above; genres with
# the most misclassified movies are listed first.
get_class_accuracy(y_pred, y_true, le)

Unnamed: 0_level_0,accuracy,correct,number_movies,incorrect
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
drama,0.477912,119,249,130
comedy,0.560185,121,216,95
action,0.271429,19,70,51
romance,0.537736,57,106,49
thriller,0.240741,13,54,41
family,0.305085,18,59,41
documentation,0.446429,25,56,31
crime,0.686047,59,86,27
scifi,0.297297,11,37,26
animation,0.424242,14,33,19
