# Import libraries

In [2]:
import torch
from torch import  optim
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product
from IPython.display import display, clear_output
from torch.utils.data import Dataset, DataLoader, SequentialSampler, BatchSampler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Set all random seeds

In [3]:
import os
import random

def seed_everything(seed=42):
  """Seed every random number generator this notebook relies on.

  Args:
    seed: Integer applied to Python's `random`, NumPy's global RNG and
      PyTorch (CPU and every CUDA device). It is also exported as the
      `PYTHONHASHSEED` environment variable.
  """
  random.seed(seed)
  # Hash randomization is fixed at interpreter start-up, so this only affects
  # Python processes launched after this point (e.g. worker subprocesses).
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Seed all GPUs, not just the current one; silently ignored without CUDA.
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  # cuDNN autotuning may select different kernels from run to run.
  torch.backends.cudnn.benchmark = False

seed_everything()

# Load data

In [4]:
# Read the four prepared CSV files (stored without a file extension) from final_data/.
# Paths are relative to the notebook's working directory.
train = pd.read_csv("final_data/train")  # rows used for fitting
test = pd.read_csv("final_data/test")  # rows used for validation during training
agg_history_norm = pd.read_csv("final_data/agg_history_norm")  # one row per user (indexed by userId in the next cell)
new_df = pd.read_csv("final_data/all_data")  # presumably train + test combined — TODO confirm against the preprocessing notebook

In [5]:
# Promote the saved row-label column to the index so transactions can be matched by label across frames.
# NOTE: this cell is not idempotent — set_index drops the column, so running it twice raises KeyError.
train = train.set_index("Unnamed: 0")
test = test.set_index("Unnamed: 0")
agg_history_norm = agg_history_norm.set_index("userId")  # looked up per user, not per transaction
new_df = new_df.set_index("Unnamed: 0")

In [6]:
new_df

Unnamed: 0_level_0,movieId,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,transaction_month_April,transaction_month_August,transaction_month_December,transaction_month_February,transaction_month_January,transaction_month_July,transaction_month_June,transaction_month_March,transaction_month_May,transaction_month_November,transaction_month_October,transaction_month_September,movie_date_bin_1900s,movie_date_bin_1910s,movie_date_bin_1920s,movie_date_bin_1930s,movie_date_bin_1940s,movie_date_bin_1950s,movie_date_bin_1960s,movie_date_bin_1970s,movie_date_bin_1980s,movie_date_bin_1990s,movie_date_bin_2000s,movie_date_bin_2010s,transaction_from_movie_year,prev_movieId,train,rating
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
15993,590,429,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,6.0,,1,5.0
5936,222,429,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,590.0,1,4.0
12093,434,429,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3.0,222.0,1,4.0
16167,592,429,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,7.0,434.0,1,5.0
6119,225,429,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2.0,592.0,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100779,187031,514,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0.0,187593.0,1,2.5
100800,187595,514,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0.0,187031.0,1,3.0
66124,5247,514,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,41.0,187595.0,1,2.5
66116,5246,514,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,38.0,5247.0,1,1.5


In [7]:
train.shape, test.shape, agg_history_norm.shape

((99947, 50), (889, 50), (610, 9724))

# Create data loader

In [8]:
class MovieLenDataset(Dataset):
  """Serves per-user batches of factorization-machine features from pandas frames.

  One item holds the features (and ratings) of *every* transaction belonging to
  the requested user ids, so the number of rows differs from item to item.

  Args:
    user_ids: Array of user ids. This class only uses its length (`__len__`).
    data: Transaction frame with `userId` and `rating` columns; the columns at
      positions 2..46 are used as per-transaction features.
    agg_hist: Frame indexed by user id; joined onto each transaction.
    active_matrix: Frame whose index contains the row labels of `data`.
    previous_movie: Frame whose index contains the row labels of `data`.
    recommendation: When True, `__getitem__` returns the features only.
  """

  def __init__(self, user_ids, data, agg_hist, active_matrix, previous_movie,recommendation=False):
    self.user_ids = user_ids
    self.data = data
    self.agg_hist = agg_hist
    self.active_matrix = active_matrix
    self.previous_movie = previous_movie
    self.recommendation = recommendation

  def __len__(self):
    """Number of user ids the dataset was built with."""
    return self.user_ids.shape[0]

  def __getitem__(self, idx):
    """Build the feature matrix for all transactions of the given users.

    Args:
      idx: A user id, or a list/array of user ids. These are matched against
        the `userId` column, i.e. they are NOT positional indices.

    Returns:
      `(features, targets)` where `features` is a 2-D tensor with one row per
      transaction and `targets` is the matching `rating` Series; only
      `features` when the dataset was created with `recommendation=True`.

    Raises:
      KeyError: If a selected transaction label is missing from
        `active_matrix` or `previous_movie`.
    """
    wanted_users = np.atleast_1d(idx)  # also accept a single scalar id
    batch_data = self.data[self.data["userId"].isin(wanted_users)]
    row_labels = batch_data.index

    # Positions 2..46 are the per-transaction feature columns (everything between
    # the id columns and `prev_movieId` in the frames loaded by this notebook).
    cat_cols = batch_data.iloc[:, 2:47]
    # Attach each user's aggregated history to every one of their transactions.
    # NOTE: the merged frame keeps the raw `userId` column as its first feature.
    agg_history = batch_data[['userId']].merge(self.agg_hist, left_on='userId', right_index=True)
    # Select by label so the rows come back in batch_data's order. A boolean
    # `isin` mask would return them in the other frame's own order, silently
    # pairing features with the wrong transaction.
    active_groups = self.active_matrix.loc[row_labels]
    previous_movie = self.previous_movie.loc[row_labels]

    features = torch.from_numpy(
      np.hstack((active_groups.values, agg_history.values, cat_cols.values, previous_movie.values))
    )

    if self.recommendation:
      return features
    return features, batch_data['rating']


In [9]:
# One-hot encode the ids over the full transaction frame so that the train and
# test datasets share exactly the same dummy columns.
active_columns = pd.get_dummies(
  new_df[['userId', 'movieId']].astype(str)
)
previous_movie = pd.get_dummies(
  new_df[['prev_movieId']].astype(str)
)

# Both datasets look their rows up in the shared encodings by transaction label.
dataset_train = MovieLenDataset(
  user_ids=train["userId"].unique(),
  data=train,
  agg_hist=agg_history_norm,
  active_matrix=active_columns,
  previous_movie=previous_movie,
)
dataset_test = MovieLenDataset(
  user_ids=test["userId"].unique(),
  data=test,
  agg_hist=agg_history_norm,
  active_matrix=active_columns,
  previous_movie=previous_movie,
)

In [10]:
print(previous_movie.shape[0])

100836


In [11]:
previous_movie.isna().sum().sum()

0

In [12]:
# The dataset's __getitem__ matches its argument against the `userId` column, so
# the sampler has to emit real user ids. SequentialSampler(dataset) would emit
# positions 0..len-1 instead: id 0 matches nobody, the highest ids are never
# drawn, and a batch of absent ids yields an empty (NaN-loss) batch.
# BatchSampler groups the sorted ids into lists of up to 10; batch_size=None
# turns off automatic collation so each list is passed to __getitem__ as-is.
dataloader_train = DataLoader(dataset_train,
                              sampler=BatchSampler(sorted(dataset_train.user_ids.tolist()), batch_size=10, drop_last=False),
                              batch_size=None)

dataloader_test = DataLoader(dataset_test,
                              sampler=BatchSampler(sorted(dataset_test.user_ids.tolist()), batch_size=10, drop_last=False),
                              batch_size=None)

# Create model 

In [13]:
class FactorizationMachine(torch.nn.Module):
  """Second-order factorization machine.

  For an input row x the output is

      linear(x) + 0.5 * sum_f [ (sum_i x_i V_if)^2 - sum_i x_i^2 V_if^2 ]

  which equals the sum over all feature pairs i < j of <V_i, V_j> * x_i * x_j
  added to the linear term.

  Args:
    n: Number of input features.
    k: Size of each feature's latent vector.
    bias: Whether the linear part carries a bias term.
  """

  def __init__(self, n, k, bias=False):
    super().__init__()
    self.n, self.k = n, k
    # Linear weights are created before V, so the RNG is consumed in that order.
    self.linear = torch.nn.Linear(in_features=self.n, out_features=1, bias=bias)
    # Latent factor matrix of shape (n, k), drawn from a standard normal.
    self.V = torch.nn.Parameter(torch.randn(n, k))

  def forward(self, x_batch):
    """Score a 2-D batch of shape (rows, n); returns a (rows, 1) tensor."""
    inputs = x_batch.float()

    # Square of the summed factors: row-wise sum of (X V)^2
    square_of_sum = inputs.mm(self.V).pow(2).sum(dim=1, keepdim=True)

    # Sum of the squared factors: row-wise sum of (X^2)(V^2)
    sum_of_squares = inputs.pow(2).mm(self.V.pow(2)).sum(dim=1, keepdim=True)

    pairwise = 0.5 * (square_of_sum - sum_of_squares)
    first_order = self.linear(inputs)
    return first_order + pairwise
    

In [14]:
x, y = dataset_train.__getitem__([90])

In [15]:
x.shape

torch.Size([53, 29808])

In [16]:
model = FactorizationMachine(n=x.shape[1], k=20)

In [17]:
model

FactorizationMachine(
  (linear): Linear(in_features=29808, out_features=1, bias=False)
)

# Training

In [18]:
def model_step(model, x, y=None, optimizer=None, train=True):
  """Run one forward pass on a batch and, when training, one optimizer update.

  Args:
    model: Module that maps `x` to one prediction per row.
    x: Feature batch, passed to `model` unchanged.
    y: Targets, one per row of `x`. May be a pandas Series, a NumPy array, a
      sequence of numbers or a tensor. Required despite the `None` default.
    optimizer: Optimizer holding the model's parameters. Required when
      `train` is True, ignored otherwise.
    train: True to back-propagate and update the weights; False to only
      evaluate (model in eval mode, gradients disabled).

  Returns:
    The batch's mean squared error as a detached 0-dim tensor.

  Raises:
    ValueError: If `y` is missing, or if `train` is True without an optimizer.
  """
  if y is None:
    raise ValueError("model_step needs targets `y` to compute the loss")
  if train and optimizer is None:
    raise ValueError("model_step needs an `optimizer` when train=True")

  if train: # Training phase: train mode and fresh gradients
    model.train()
    optimizer.zero_grad()
  else: # Evaluation phase
    model.eval()

  # Normalise the targets to a flat float32 tensor whatever container they came in
  if not torch.is_tensor(y):
    y = torch.as_tensor(np.asarray(y))
  target = y.float().reshape(-1)

  with torch.set_grad_enabled(train): # Track gradients only while training
    pred = model(x)
    pred = pred.reshape(pred.shape[0]) # (rows, 1) -> (rows,)

    criterion = torch.nn.MSELoss()
    loss = criterion(pred, target)

    if train:
      loss.backward()
      optimizer.step()

  # Detach so callers that store or accumulate the loss do not keep the graph alive
  return loss.detach()

In [19]:
def _run_epoch(model, loader, optimizer=None):
  """Pass once over `loader`: train when an optimizer is given, otherwise evaluate.

  Returns:
    `(rmse, n_batches)` — the root mean squared error over every row seen
    (NaN when no rows were seen) and the number of non-empty batches processed.
  """
  is_training = optimizer is not None
  squared_error, n_rows, n_batches = 0.0, 0, 0
  for x, y in loader:
    batch_rows = len(y)
    if batch_rows == 0: # The MSE of an empty batch is NaN; skip instead of poisoning the weights
      continue
    loss = model_step(model, x, y, optimizer, train=is_training)
    # Weight each batch MSE by its row count: batches hold different numbers of rows
    squared_error += float(loss.detach()) * batch_rows
    n_rows += batch_rows
    n_batches += 1
  rmse = float(np.sqrt(squared_error / n_rows)) if n_rows else float('nan')
  return rmse, n_batches


def train_loop(model, train_loader, eval_loader, lr, w_decay, epochs):
  """Train `model` with Adam and display a per-epoch RMSE table.

  After every epoch the notebook output is cleared and replaced by a table of
  epoch number, cumulative optimizer steps, training RMSE and validation RMSE.
  The training RMSE is aggregated from the batch losses observed while the
  weights were being updated during that epoch.

  Args:
    model: Module to optimise in place.
    train_loader: Iterable of `(x, y)` batches used for weight updates.
    eval_loader: Iterable of `(x, y)` batches used for validation only.
    lr: Adam learning rate.
    w_decay: Adam weight decay.
    epochs: Number of passes over `train_loader`.
  """
  # Defining our optimizer
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=w_decay)
  epochs_l, steps, t_losses, v_losses = [], [], [], []
  step = 0

  epochs_tqdm = tqdm(range(epochs), desc='Training in Progress', leave=True)
  for epoch in epochs_tqdm:
    train_rmse, n_updates = _run_epoch(model, train_loader, optimizer)
    step += n_updates
    val_rmse, _ = _run_epoch(model, eval_loader)

    epochs_l.append(epoch+1)
    steps.append(step)
    t_losses.append(train_rmse)
    v_losses.append(val_rmse)
    clear_output(wait=True)
    display(pd.DataFrame({'Epoch': epochs_l, 'Step': steps, 'Training Loss': t_losses, 'Validation Loss': v_losses}))

In [None]:
train_loop(model, dataloader_train, dataloader_test, lr=0.004, w_decay=0.0003, epochs=500)

In [None]:
x_item, y_item = dataset_train.__getitem__([50])

In [None]:
model.eval()  # switch the module to evaluation mode before predicting
with torch.no_grad():
  print(f'Predicted rating for User of interest: {model(x_item)[0][0]}') # Prediction for the first transaction row of the batch fetched in the previous cell
  print(f'Actual Rating: {y_item.values[0]}')

In [None]:
# torch.save does not create missing parent directories, so make sure it exists first
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), 'models/torch_model.pt')