In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertModel

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Categorical feature columns: they are label-encoded and later fed to the model's embedding layers.
EMBEDDED_COL_NAMES = ["age_certification"]
# Number of continuous inputs; passed as `n_cont` when the classifier is constructed.
NUM_NUMERIC_COLS = 2

#### Develop model with multiple features

In [4]:
# Relative path of the CSV file that read_data loads.
DATA_PATH = "./data/netflix_movies_single_genre.csv"

In [5]:
def read_data(data_path: str, features: list):
    """Load a CSV and return only the requested columns, without rows lacking a description.

    Args:
        data_path: Path of the CSV file to read.
        features: Column names to keep, in the order they should appear.
            Must contain "description", because rows are filtered on it.

    Returns:
        A DataFrame restricted to `features` in which every row has a
        non-missing "description". Missing values in other columns are kept.
    """
    raw = pd.read_csv(data_path)

    # Restrict to the requested columns, then discard rows with no description text.
    selected = raw.loc[:, features]
    return selected.dropna(subset=["description"])

In [6]:
# Load the text column, the tabular features and the "genre" target used below.
df = read_data(DATA_PATH, features = ["description", 
                                      "runtime", 
                                      "imdb_score", 
                                      "age_certification", 
                                      "genre"
                                      ]
                                    )

In [7]:
def fill_missing(df: pd.DataFrame, col: str, fill_value):
    """Return a copy of `df` in which missing values of one column are replaced.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Args:
        df: Source DataFrame.
        col: Name of the column whose missing values are filled.
        fill_value: Replacement value (e.g. a string for categorical columns
            or a number for numeric ones).

    Returns:
        A new DataFrame identical to `df` except that `col` has no missing values.
    """
    # Work on a copy: assigning into `df` directly would silently modify the caller's frame.
    filled = df.copy()
    filled[col] = filled[col].fillna(fill_value)

    return filled

In [8]:
# Missing certifications become their own "unknown" category; missing IMDb scores are filled with 0.
df = fill_missing(df, "age_certification", "unknown")
df = fill_missing(df, "imdb_score", 0)


In [9]:
def label_encode_columns(df: pd.DataFrame, cols: list):
    """
    Replace the given columns with integer label codes.

    Args:
        df: The pandas DataFrame containing the columns to encode.
        cols: Names of the columns to encode.

    Returns:
        A tuple ``(df_enc, le)``. ``df_enc`` is a new DataFrame holding the
        untouched columns followed by the encoded ones. ``le`` is the single
        LabelEncoder that was used; it is refit on every column in turn, so on
        return it holds the classes of the last column processed.
    """
    encoder = LabelEncoder()

    # One shared encoder is fit and applied column by column.
    encoded = df[cols].apply(encoder.fit_transform)

    # Untouched columns first, encoded columns appended at the end.
    untouched = df.drop(columns=cols)
    combined = pd.concat([untouched, encoded], axis=1)

    return combined, encoder

In [10]:
# "genre" is listed last; `le` is used later to turn genre ids back into names,
# so it is presumably expected to hold the genre classes on return (verify against label_encode_columns).
df_enc, le = label_encode_columns(df=df, cols = EMBEDDED_COL_NAMES + ["genre"])


In [11]:
# Flag the label-encoded feature columns as categorical so they can be found by dtype below.
for col in EMBEDDED_COL_NAMES:
    df_enc[col] = df_enc[col].astype('category')

categorical_cols = list(df_enc.select_dtypes(include='category').columns)
df_categorical = df_enc.loc[:, categorical_cols]

# Number of distinct categories per categorical column.
category_counts = {name: len(values.cat.categories) for name, values in df_categorical.items()}

# Only columns with more than two categories get an embedding.
# NOTE(review): the model looks embeddings up by column position in X_cat, so a column
# dropped by this filter would shift the remaining ones - confirm every column passes.
embedded_cols = {name: count for name, count in category_counts.items() if count > 2}

# (number of categories, embedding width) pairs; width is half the cardinality, capped at 50.
embedding_sizes = [(count, min(50, (count + 1) // 2)) for count in embedded_cols.values()]

In [12]:
def split_data(df: pd.DataFrame, target: str, test_size: float =0.2, random_state=None):
    """
    Split a DataFrame into train and test features and targets.

    Args:
        df: DataFrame containing the features and the target column.
        target: Name of the target column.
        test_size: Float between 0.0 and 1.0 giving the proportion of rows
                   placed in the test set (default is 0.2).
        random_state: Integer seed for the shuffle (for reproducibility).

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The X parts are DataFrames
        without the target column and the y parts are the matching target
        Series. All four have their index reset to 0..n-1.
    """
    features = df.drop(columns=target)
    labels = df[target]

    parts = train_test_split(features, labels, test_size=test_size, random_state=random_state)

    # Give every part a fresh positional index so rows can be addressed 0..n-1.
    X_train, X_test, y_train, y_test = (part.reset_index(drop=True) for part in parts)

    return X_train, X_test, y_train, y_test


In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = split_data(df_enc, "genre", test_size=0.2, random_state=42)

In [14]:
# Load the pretrained 'distilbert-base-uncased' tokenizer and encoder.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [15]:
class TokenDataLoader(Dataset):
    """Dataset yielding tokenised descriptions together with tabular features.

    Despite its name this is a ``Dataset`` (it is wrapped by a ``DataLoader``
    elsewhere). All descriptions are tokenised once, at construction time.

    Args:
        X: DataFrame with a "description" column, the columns listed in
            `embedded_col_names`, and any remaining columns, which are all
            treated as numeric features.
        y: Labels aligned positionally with the rows of `X`.
        embedded_col_names: Names of the categorical (already integer-coded) columns.
        text_tokenizer: Optional callable used to tokenise the descriptions.
            When omitted, the module-level `tokenizer` is used.
    """

    def __init__(self, X, y, embedded_col_names, text_tokenizer=None):
        X = X.copy()
        embedded_col_names = list(embedded_col_names)
        self.desc = X.loc[:, "description"]
        # Categorical columns, as integer codes for the embedding layers.
        self.X2 = X.loc[:, embedded_col_names].values.astype(np.int64)
        # Everything that is neither categorical nor the description is numeric.
        self.X3 = X.drop(columns=embedded_col_names + ["description"]).values.astype(np.float32)
        self.y = y

        #### Embedding encoder ####

        # Fall back to the notebook-level tokenizer when none is injected.
        tokenize = tokenizer if text_tokenizer is None else text_tokenizer

        # Tokenise every description; padding=True pads to the longest sequence passed in.
        encoded_input = tokenize(self.desc.tolist(), return_tensors="pt", padding=True, truncation=True)

        # Get input IDs and attention mask
        self.X1 = encoded_input["input_ids"]
        self.attention_mask = encoded_input["attention_mask"]

    def __len__(self):
        """Number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, categorical, numeric, label) for row `idx`."""
        # Look the label up by position: plain `y[idx]` on a pandas Series is label-based
        # and breaks (or returns the wrong row) unless the index happens to be 0..n-1.
        label = self.y.iloc[idx] if hasattr(self.y, "iloc") else self.y[idx]
        return self.X1[idx], self.attention_mask[idx], self.X2[idx], self.X3[idx], label

In [16]:
def get_default_device():
    """Pick an accelerator if one is available (CUDA, then Apple MPS), else CPU.

    Returns:
        A ``torch.device`` that is usable on the current machine.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')

    # Older torch builds have no `torch.backends.mps`, hence the getattr guard.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')

    return torch.device('cpu')

In [17]:
def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, to `device`.

    Lists and tuples are walked recursively and always come back as lists.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

In [18]:
class DeviceDataLoader():
    """Thin wrapper around a dataloader that places every batch on a device."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __iter__(self):
        """Iterate over the wrapped loader, moving each batch to the device first."""
        yield from (to_device(batch, self.device) for batch in self.dl)

    def __len__(self):
        """Number of batches in the wrapped loader."""
        return len(self.dl)

In [19]:
# Select the compute device and display it.
# NOTE(review): nothing visible in this notebook moves the model or the batches to `device` - confirm.
device = get_default_device()
device

device(type='mps')

In [20]:
# Build the datasets; each constructor tokenises all of its descriptions up front.
train_ds = TokenDataLoader(X_train, y_train, EMBEDDED_COL_NAMES)
test_ds = TokenDataLoader(X_test, y_test, EMBEDDED_COL_NAMES)

In [21]:
# Mini-batch loaders over the train and held-out sets.
# NOTE(review): the validation loader is shuffled too; the aggregated loss/accuracy
# computed from it do not depend on batch order.
batch_size = 32
train_dl = DataLoader(train_ds, batch_size=batch_size,shuffle=True)
valid_dl = DataLoader(test_ds, batch_size=batch_size,shuffle=True)

In [22]:
class SimpleGenreClassifier(nn.Module):
    """Genre classifier combining a text encoder with categorical and numeric features.

    The first-token hidden state of the text encoder is projected to 128
    dimensions, concatenated with the categorical embeddings and the
    batch-normalised numeric inputs, and passed through a two-layer MLP head.

    Args:
        embed_model: Text encoder called with ``input_ids`` and ``attention_mask``;
            element ``[0]`` of its output is indexed as (batch, tokens, 768).
        num_labels: Number of output classes.
        embedding_sizes: Iterable of ``(num_categories, embedding_dim)`` pairs,
            one per categorical column, in column order.
        n_cont: Number of continuous input features.
    """

    def __init__(self, embed_model, num_labels, embedding_sizes, n_cont):
        super().__init__()

        # Text branch: encoder plus a projection of its 768-wide output.
        self.model = embed_model
        self.bert_lin = nn.Linear(768, 128)

        # Categorical branch: one embedding table per categorical column.
        self.embeddings = nn.ModuleList(
            [nn.Embedding(n_categories, dim) for n_categories, dim in embedding_sizes]
        )
        self.n_emb = sum(table.embedding_dim for table in self.embeddings)
        self.n_cont = n_cont

        # Normalisation and regularisation layers.
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.dropout = nn.Dropout(0.2)

        # MLP head over [categorical embeddings | text projection | numeric features].
        self.lin1 = nn.Linear(self.n_emb + self.n_cont + 128, 200)
        self.lin2 = nn.Linear(200, 70)
        self.classifier = nn.Linear(70, num_labels)

    def forward(self, X, mask, X_cat, X_cont):
        """Return the class logits for a batch.

        Args:
            X: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder.
            X_cat: Integer category codes; column ``i`` feeds embedding table ``i``.
            X_cont: Continuous features with ``n_cont`` columns.
        """
        # Categorical embeddings
        cat_vectors = [table(X_cat[:, idx]) for idx, table in enumerate(self.embeddings)]
        cat_features = self.dropout(torch.cat(cat_vectors, 1))

        # Numeric features
        num_features = self.bn1(X_cont)

        # Text features: dropout on the token states, then keep the first token only.
        encoder_out = self.model(input_ids=X, attention_mask=mask)
        token_states = self.dropout(encoder_out[0])
        first_token = token_states[:, 0, :].view(-1, 768)
        text_features = self.bert_lin(first_token)

        # MLP head
        hidden = torch.cat([cat_features, text_features, num_features], 1)
        hidden = self.bn2(self.dropout(F.relu(self.lin1(hidden))))
        hidden = self.bn3(self.dropout(F.relu(self.lin2(hidden))))

        return self.classifier(hidden)


In [23]:
# Number of distinct genres, i.e. the number of output classes.
NUM_LABELS = df_enc["genre"].nunique()

In [24]:
# Assemble the classifier around the pretrained encoder loaded above.
model = SimpleGenreClassifier(embed_model=bert_model, num_labels=NUM_LABELS, embedding_sizes=embedding_sizes, n_cont=NUM_NUMERIC_COLS)

In [25]:
# Resume from a previously saved checkpoint when one exists. The file is only written
# by train_loop, so on a fresh run it is absent and training starts from the current weights.
model_path = "./simple_genre_classifier_v2.pt"
try:
    # map_location makes loading independent of the device the checkpoint was saved from.
    state_dict = torch.load(model_path, map_location="cpu")
except FileNotFoundError:
    print(f"No checkpoint found at {model_path}; starting from the current weights.")
else:
    model.load_state_dict(state_dict)

<All keys matched successfully>

In [26]:
def train_model(model, optim, train_dl):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        model: Module called as ``model(X, mask, X_cat, X_cont)`` and returning class logits.
        optim: Optimizer over the model's parameters (the name shadows the
            imported ``torch.optim`` module inside this function).
        train_dl: Iterable of ``(X, mask, X_cat, X_cont, y)`` batches.

    Returns:
        Cross-entropy loss averaged over all samples seen in the epoch.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    total = 0
    sum_loss = 0
    for X, mask, X_cat, X_cont, y in train_dl:
        batch = y.shape[0]
        logits = model(X, mask, X_cat, X_cont)
        # Take the class count from the logits themselves instead of a notebook-level
        # constant, so the function works for any model it is handed.
        loss = criterion(logits.view(-1, logits.shape[-1]), y.view(-1))
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        # Weight by batch size so a smaller final batch does not skew the mean.
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [27]:
def val_loss(model, valid_dl):
    """Evaluate the model and return its mean loss and accuracy.

    Args:
        model: Module called as ``model(X, mask, X_cat, X_cont)`` and returning class logits.
        valid_dl: Iterable of ``(X, mask, X_cat, X_cont, y)`` batches.

    Returns:
        ``(loss, accuracy)``, both averaged over all samples. The same two
        numbers are also printed.
    """
    criterion = nn.CrossEntropyLoss()
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    # No gradients are needed for evaluation; skipping them avoids building the autograd graph.
    with torch.no_grad():
        for X, mask, X_cat, X_cont, y in valid_dl:
            current_batch_size = y.shape[0]
            logits = model(X, mask, X_cat, X_cont)
            # Class count comes from the logits rather than a notebook-level constant.
            loss = criterion(logits.view(-1, logits.shape[-1]), y.view(-1))
            sum_loss += current_batch_size*(loss.item())
            total += current_batch_size
            pred = torch.max(logits, 1)[1]
            correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

In [28]:
# Adam over all model parameters with a small learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.00001)
# With step_size=1 the learning rate is multiplied by 0.99 on every scheduler.step().
scheduler = StepLR(optimizer, step_size=1, gamma=0.99)

def train_loop(model, epochs):
    """Train `model` for `epochs` epochs, validating and checkpointing after each one.

    Relies on the notebook-level `optimizer`, `scheduler`, `train_dl` and
    `valid_dl`. After every epoch the training loss is printed, validation
    metrics are reported, the scheduler is stepped and the weights are saved.
    """
    for _ in range(epochs):
        epoch_loss = train_model(model, optimizer, train_dl)
        print("training loss: ", epoch_loss)

        val_loss(model, valid_dl)
        scheduler.step()

        # Overwrite the checkpoint so the latest epoch's weights are always on disk.
        torch.save(model.state_dict(), "simple_genre_classifier_v2.pt")

In [32]:
# Train for 15 epochs; each epoch prints the training loss and the validation loss/accuracy.
train_loop(model, 15)

training loss:  2.9797281286708213
valid loss 2.765 and accuracy 0.182
training loss:  2.6888233798597923
valid loss 2.503 and accuracy 0.324
training loss:  2.475044599453883
valid loss 2.368 and accuracy 0.358
training loss:  2.309942432753355
valid loss 2.271 and accuracy 0.399
training loss:  2.173746128808256
valid loss 2.154 and accuracy 0.441
training loss:  2.052492932877326
valid loss 2.092 and accuracy 0.453
training loss:  1.9315529030499574
valid loss 2.059 and accuracy 0.469
training loss:  1.8056657689665427
valid loss 1.999 and accuracy 0.477
training loss:  1.6858866223001976
valid loss 1.982 and accuracy 0.481
training loss:  1.565968732107882
valid loss 1.963 and accuracy 0.496
training loss:  1.475085916816157
valid loss 1.965 and accuracy 0.489
training loss:  1.3714839946027446
valid loss 2.034 and accuracy 0.491
training loss:  1.276153812152704
valid loss 1.929 and accuracy 0.508
training loss:  1.1935841088476478
valid loss 1.942 and accuracy 0.521
training loss

In [37]:
# Continue training the same model for up to 5 more epochs.
train_loop(model, 5)

training loss:  1.066253229821017
valid loss 1.971 and accuracy 0.507
training loss:  0.9890745927718272
valid loss 1.996 and accuracy 0.495


### Evaluate Model on test set

In [32]:
# Collect the predicted and true class ids for every sample in the held-out set.
y_pred = []
y_true = []

batch_size = 32
valid_dl = DataLoader(test_ds, batch_size=batch_size,shuffle=True)

# Switch to inference mode once, and skip gradient tracking while predicting.
model.eval()
with torch.no_grad():
    for X, mask, X_cat, X_cont, y in valid_dl:
        logits = model(X, mask, X_cat, X_cont)
        pred = torch.max(logits, 1)[1]

        # reshape(-1) keeps the results 1-D even when the final batch holds a single
        # sample; squeeze() would turn that batch into a 0-d array that cannot be iterated.
        y_pred.extend(pred.reshape(-1).cpu().numpy())
        y_true.extend(y.reshape(-1).cpu().numpy())



In [33]:
def get_class_accuracy(y_pred, y_true, le):
    """
    Summarise prediction quality per genre.

    Args:
        y_pred: Sequence of predicted class ids
        y_true: Sequence of true class ids, same length as y_pred
        le: Fitted label encoder whose inverse_transform maps class ids to genre names

    Returns:
        Pandas DataFrame indexed by genre with the columns "accuracy",
        "correct", "number_movies" and "incorrect", sorted so that the genres
        with the most misclassified movies come first.

    """

    results = pd.DataFrame([y_pred, y_true]).T
    results.columns = ["y_pred", "y_true"]
    results["correct"] = np.where(results["y_pred"] == results["y_true"], 1, 0)

    # Group by the true genre name rather than its numeric id.
    results["genre"] = le.inverse_transform(list(results["y_true"]))

    per_genre = results.groupby("genre")["correct"].agg(
        accuracy="mean", correct="sum", number_movies="count"
    )
    per_genre["incorrect"] = per_genre["number_movies"] - per_genre["correct"]

    return per_genre.sort_values(by="incorrect", ascending=False)

In [34]:
# Per-genre accuracy on the held-out set, worst genres (most misclassified movies) first.
get_class_accuracy(y_pred, y_true, le)

Unnamed: 0_level_0,accuracy,correct,number_movies,incorrect
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
drama,0.437751,109,249,140
comedy,0.625,135,216,81
action,0.271429,19,70,51
family,0.152542,9,59,50
romance,0.566038,60,106,46
thriller,0.351852,19,54,35
crime,0.697674,60,86,26
documentation,0.535714,30,56,26
scifi,0.459459,17,37,20
fantasy,0.0,0,17,17
