In [139]:
import numpy as np
import pandas as pd

from src import utils, models, metrics

In [140]:
# Read the movies, users and ratings frames (unpacked in that order) from the
# pickle directory; presumably these were written by the preceding EDA step — TODO confirm.
df_movies, df_users, df_ratings = utils.read_pickles("../../data/ml-1m-after_eda/")

### 1 Data Preparation

Create dummies for all categorical variables:

In [141]:
# Keep only the user identifier plus the attributes that get one-hot encoded below.
user_columns = ["UserID", "Gender", "Age", "Occupation", "State"]
df_users = df_users.loc[:, user_columns]
df_users.head(3)

Unnamed: 0,UserID,Gender,Age,Occupation,State
0,1,F,1,10,MI
1,2,M,56,16,LA
2,3,M,25,15,MN


In [142]:
# One-hot encode every user attribute. Age and Occupation are numeric codes but
# are listed here too, so each distinct code gets its own indicator column.
categorical_features = ['Gender', 'Age', 'Occupation', 'State']
# dtype=int produces 0/1 integer indicators.
df_users = pd.get_dummies(df_users, columns=categorical_features, dtype=int)
df_users.head(3)

Unnamed: 0,UserID,Gender_F,Gender_M,Age_1,Age_18,Age_25,Age_35,Age_45,Age_50,Age_56,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
# Remove the two text columns; the per-genre indicator columns stay in the frame.
df_movies = df_movies.drop(columns=["Title", "Genres"])
df_movies.head(3)

Unnamed: 0,MovieID,Year,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [144]:
def map_year_to_decade(year):
    """Map a release year to the decade label used for one-hot encoding.

    Args:
        year: Any value accepted by ``int()`` (int, float, numeric string).

    Returns:
        The first year of the decade as a string (e.g. ``"1990"`` for 1995)
        when the year lies in the inclusive range 1900-2000. ``"Other"`` for
        years outside that range and for values that cannot be converted to
        an integer (``None``, NaN, infinity, non-numeric text).
    """
    try:
        year = int(year)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall into the catch-all bucket; anything
        # else (KeyboardInterrupt, genuine bugs) must still propagate.
        return "Other"

    if 1900 <= year <= 2000:
        return f"{(year // 10) * 10}"
    return "Other"
    
# Bucket each release year into a decade label, then one-hot encode that label.
df_movies['Decade'] = df_movies['Year'].apply(map_year_to_decade)
df_movies = pd.get_dummies(df_movies, columns=['Decade'])
# Drop the raw year and cast every remaining column (ID, genre flags, decade
# indicators) to int.
df_movies = df_movies.drop(["Year"], axis=1).astype(int)
df_movies.head()

Unnamed: 0,MovieID,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,Decade_1910,Decade_1920,Decade_1930,Decade_1940,Decade_1950,Decade_1960,Decade_1970,Decade_1980,Decade_1990,Decade_2000
0,1,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [145]:
# Keep the columns used downstream and order the ratings chronologically, so the
# expanding means computed in the next cell only look at earlier rows.
df_ratings = df_ratings[["UserID", "MovieID", "Rating", "Datetime"]]
# NOTE(review): several ratings share the same timestamp; the default sort
# algorithm is not stable, so the relative order of such ties is arbitrary —
# confirm this is acceptable for the "before the current rating" features.
df_ratings = df_ratings.sort_values("Datetime")
df_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
1000138,6040,858,4,2000-04-25 23:05:32
1000153,6040,2384,4,2000-04-25 23:05:54
999873,6040,593,5,2000-04-25 23:05:54
1000007,6040,1961,4,2000-04-25 23:06:17
1000192,6040,2019,5,2000-04-25 23:06:17


In [146]:
# compute averages before the current rating
def compute_rolling_averages(group):
    """Return, for each position, the mean of all values that precede it.

    The first position has no predecessors and therefore comes back as NaN.
    """
    running_mean = group.expanding().mean()
    # Shift down by one so a row never sees its own value.
    previous_mean = running_mean.shift()
    return previous_mean

# Running mean of each user's earlier ratings and of each movie's earlier
# ratings, in the current row order. The first rating in every group has no
# history and is left as NaN.
df_ratings['AvgUserRating'] = df_ratings.groupby('UserID')['Rating'].transform(compute_rolling_averages)
df_ratings['AvgMovieRating'] = df_ratings.groupby('MovieID')['Rating'].transform(compute_rolling_averages)

df_ratings.head(5)

Unnamed: 0,UserID,MovieID,Rating,Datetime,AvgUserRating,AvgMovieRating
1000138,6040,858,4,2000-04-25 23:05:32,,
1000153,6040,2384,4,2000-04-25 23:05:54,4.0,
999873,6040,593,5,2000-04-25 23:05:54,4.0,
1000007,6040,1961,4,2000-04-25 23:06:17,4.333333,
1000192,6040,2019,5,2000-04-25 23:06:17,4.25,


In [147]:
# The timestamp was used for ordering above and is not used again; remove it.
df_ratings = df_ratings.drop(columns="Datetime")
df_ratings.head(3)

Unnamed: 0,UserID,MovieID,Rating,AvgUserRating,AvgMovieRating
1000138,6040,858,4,,
1000153,6040,2384,4,4.0,
999873,6040,593,5,4.0,


Merge and save:

In [148]:
# Attach the encoded user and movie features to every rating row. merge() defaults
# to an inner join, so a rating without a matching user or movie would be dropped.
df_all = df_ratings.merge(df_users, on="UserID").merge(df_movies, on="MovieID")

In [149]:
# Quick look at the merged feature table.
df_all.head(3)

Unnamed: 0,UserID,MovieID,Rating,AvgUserRating,AvgMovieRating,Gender_F,Gender_M,Age_1,Age_18,Age_25,...,Decade_1910,Decade_1920,Decade_1930,Decade_1940,Decade_1950,Decade_1960,Decade_1970,Decade_1980,Decade_1990,Decade_2000
0,6040,858,4,,,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,6040,2384,4,4.0,,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,6040,593,5,4.0,,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [150]:
# Persist the merged table; the modelling section below reads this file back.
df_all.to_pickle("../../data/ml-1m-deep-learning/all.pickle")

### 2 Deep Learning model

In [171]:
# Reload the table saved at the end of the data-preparation section.
df = pd.read_pickle("../../data/ml-1m-deep-learning/all.pickle")

In [173]:
# "UserID" and "MovieID" are deliberately kept on `df` at this point: the per-user
# train/test split below is run on `df`, and both ID columns are dropped from
# `train` and `test` afterwards. Dropping them here as well makes that later drop
# raise KeyError on a fresh Restart & Run All (the split presumably needs
# "UserID" too — verify against utils.TrainTestSplitter).

In [154]:
# Split data: 10 reviews per user go to the test set, the rest to train.
# NOTE(review): which 10 reviews are picked (latest, random, ...) is decided inside
# utils.TrainTestSplitter and is not visible here — confirm, and seed it if random.
train, test = utils.TrainTestSplitter.split_by_users(df, n_reviews_in_test=10)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (939809, 115)
Test shape: (60400, 115)


In [174]:
# Imputation values come from the training split only, so no test-set
# statistics are used when filling the test split.
fill_values = {
    "AvgUserRating": train["AvgUserRating"].mean(),
    "AvgMovieRating": train["AvgMovieRating"].mean(),
}
mean_users_rating = fill_values["AvgUserRating"]
mean_movies_rating = fill_values["AvgMovieRating"]

# Replace the NaNs in both splits (train first, then test) with the train means.
for frame in (train, test):
    for column, fill_value in fill_values.items():
        frame[column] = frame[column].fillna(fill_value)

In [175]:
# The identifiers are not model inputs; remove them from both splits.
id_columns = ["UserID", "MovieID"]
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)

In [181]:
import torch
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    """Map-style dataset over a feature table that has a ``Rating`` column.

    Every column except ``Rating`` is a model input; ``Rating`` is the
    regression target. Both are converted to float32 tensors once at
    construction, so fetching a sample is a plain index operation instead of
    building two new tensors per item.

    Args:
        dataframe: pandas DataFrame with a numeric ``Rating`` column and
            numeric (or boolean) feature columns.
    """

    def __init__(self, dataframe):
        # to_numpy(dtype=...) yields one homogeneous float32 array even when
        # the columns mix int, float and bool dtypes.
        feature_array = dataframe.drop('Rating', axis=1).to_numpy(dtype="float32")
        label_array = dataframe['Rating'].to_numpy(dtype="float32")
        self.features = torch.tensor(feature_array, dtype=torch.float32)
        self.labels = torch.tensor(label_array, dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows of the source frame)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 tensors."""
        return self.features[idx], self.labels[idx]

# Wrap the training split and serve it in reshuffled mini-batches of 32 rows.
dataset = MovieDataset(train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [233]:
import torch
import torch.nn as nn

class MovieRatingNN(nn.Module):
    """Feed-forward rating regressor: num_features -> 128 -> 64 -> 1.

    Both hidden layers use ReLU. The single output is passed through a
    sigmoid and rescaled so that predictions lie between 1 and 5.
    """

    def __init__(self, num_features):
        super().__init__()
        self.layer1 = nn.Linear(num_features, 128)
        self.layer2 = nn.Linear(128, 64)
        # One output unit: the predicted rating.
        self.output_reg = nn.Linear(64, 1)

    def forward(self, x):
        """Return predictions with one trailing output column per input row."""
        hidden = torch.relu(self.layer1(x))
        hidden = torch.relu(self.layer2(hidden))
        raw_score = self.output_reg(hidden)
        # sigmoid gives (0, 1); scale by 4 and offset by 1 to reach the 1..5 range.
        return torch.sigmoid(raw_score) * 4 + 1

In [237]:
# Seed torch's global RNG before building the model so that weight initialisation
# (and the DataLoader shuffling that draws from the same RNG during training) is
# repeatable on a fresh Restart & Run All.
torch.manual_seed(42)
# The input width is read from the first sample's feature vector.
model = MovieRatingNN(num_features=len(dataset[0][0]))

In [238]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Mean-squared-error objective on the predicted rating, optimised with Adam.
criterion_reg = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    # Sum of per-batch losses for this epoch; reset at the start of every epoch.
    running_loss = 0.0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{num_epochs}")
    for i, (features, labels) in progress_bar:
        optimizer.zero_grad()
        outputs = model(features)
        # Flatten the model output so it has the same 1-D shape as the labels.
        loss = criterion_reg(outputs.view(-1), labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        # Show the mean batch loss so far in this epoch on the progress bar.
        progress_bar.set_postfix(loss=running_loss/(i+1))
    
    # Mean batch loss over the whole epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader):.4f}")

Epoch 1/10: 100%|██████████| 29370/29370 [01:10<00:00, 418.57it/s, loss=0.877]


Epoch 1/10, Loss: 0.8768


Epoch 2/10: 100%|██████████| 29370/29370 [01:09<00:00, 422.73it/s, loss=0.854]


Epoch 2/10, Loss: 0.8541


Epoch 3/10: 100%|██████████| 29370/29370 [01:25<00:00, 345.02it/s, loss=0.846]


Epoch 3/10, Loss: 0.8460


Epoch 4/10: 100%|██████████| 29370/29370 [01:10<00:00, 419.51it/s, loss=0.841]


Epoch 4/10, Loss: 0.8406


Epoch 5/10: 100%|██████████| 29370/29370 [01:06<00:00, 440.57it/s, loss=0.836]


Epoch 5/10, Loss: 0.8363


Epoch 6/10: 100%|██████████| 29370/29370 [01:10<00:00, 413.71it/s, loss=0.833]


Epoch 6/10, Loss: 0.8329


Epoch 7/10: 100%|██████████| 29370/29370 [01:11<00:00, 411.00it/s, loss=0.83] 


Epoch 7/10, Loss: 0.8302


Epoch 8/10: 100%|██████████| 29370/29370 [01:04<00:00, 453.34it/s, loss=0.828]


Epoch 8/10, Loss: 0.8275


Epoch 9/10: 100%|██████████| 29370/29370 [01:04<00:00, 453.12it/s, loss=0.825]


Epoch 9/10, Loss: 0.8253


Epoch 10/10: 100%|██████████| 29370/29370 [01:02<00:00, 466.58it/s, loss=0.823]


Epoch 10/10, Loss: 0.8233


In [249]:
# Score the whole test split in a single forward pass.
test_tensor = torch.tensor(test.drop(["Rating"], axis=1).values, dtype=torch.float32)
# Inference only: switch to eval mode and disable autograd so no graph is built
# for the full test set. The module is called directly (not .forward) so that
# any registered hooks run.
model.eval()
with torch.no_grad():
    # Same values and same shape (one row per test row, one output column) as before.
    predicted_scores = model(test_tensor).numpy()

In [254]:
# Ground-truth ratings taken from the same `test` frame the predictions were made from.
true_scores = test["Rating"].values

In [258]:
# evaluate: only the regression/classification summary from metrics.ml_metrics is reported.
print(f"ML merics: {metrics.ml_metrics(true_scores, predicted_scores)}")
# NOTE(review): the top-k predictive and ranking metrics below are switched off —
# re-enable them or delete the lines once that is decided.
# print(f"Predictive merics: {metrics.predictive_metrics(test, predicted_scores, k=5, threshold=4)}")
# print(f"Rank merics: {metrics.rank_metrics(test, predicted_scores, k=5, threshold=4)}")

ML merics: {'mae': 0.768, 'rmse': 0.962, 'precision': 0.853, 'recall': 0.339, 'f1': 0.486, 'roc_auc': 0.628}
