In [1]:
import numpy as np
import pandas as pd

from src import utils, models, metrics

### 1 Data Preparation

In [2]:
# Load the movies, users and ratings frames from the post-EDA pickle directory.
raw_frames = utils.read_pickles(
    "../../data/ml-1m-after_eda/"
)
df_movies, df_users, df_ratings = raw_frames

Create dummies for all categorical variables:

In [3]:
# Keep the user identifier plus the four attributes that are encoded next.
user_columns = ["UserID", "Gender", "Age", "Occupation", "State"]
df_users = df_users[user_columns]
df_users.head(n=3)

Unnamed: 0,UserID,Gender,Age,Occupation,State
0,1,F,1,10,MI
1,2,M,56,16,LA
2,3,M,25,15,MN


In [4]:
# One-hot encode every categorical user attribute as 0/1 integer columns.
categorical_features = ["Gender", "Age", "Occupation", "State"]
df_users = pd.get_dummies(
    data=df_users,
    columns=categorical_features,
    dtype=int,
)
df_users.head(n=3)

Unnamed: 0,UserID,Gender_F,Gender_M,Age_1,Age_18,Age_25,Age_35,Age_45,Age_50,Age_56,...,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY
0,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Remove the Title and Genres columns; all other movie columns are kept.
df_movies = df_movies.drop(columns=["Title", "Genres"])
df_movies.head(n=3)

Unnamed: 0,MovieID,Year,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [6]:
def map_year_to_decade(year):
    """Map a release year to its decade label.

    Args:
        year: Anything ``int()`` accepts (e.g. ``1995`` or ``"1995"``).

    Returns:
        The decade as a string (``"1990"`` for 1995) when the year lies in
        the inclusive range 1900-2000, otherwise ``"Other"``. Values that
        cannot be converted to an integer also map to ``"Other"``.
    """
    try:
        year = int(year)
    # Only conversion failures mean "unknown year"; a bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return "Other"

    if 1900 <= year <= 2000:
        return f"{(year // 10) * 10}"
    return "Other"
    
# Derive the decade label, one-hot encode it, then drop the raw year and
# cast every remaining column to int.
df_movies = df_movies.assign(Decade=df_movies["Year"].apply(map_year_to_decade))
df_movies = pd.get_dummies(df_movies, columns=["Decade"])
df_movies = df_movies.drop(columns=["Year"]).astype(int)
df_movies.head()

Unnamed: 0,MovieID,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,Decade_1910,Decade_1920,Decade_1930,Decade_1940,Decade_1950,Decade_1960,Decade_1970,Decade_1980,Decade_1990,Decade_2000
0,1,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Keep the four rating columns and order the rows by their Datetime value.
rating_columns = ["UserID", "MovieID", "Rating", "Datetime"]
df_ratings = df_ratings[rating_columns].sort_values(by="Datetime")
df_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
1000138,6040,858,4,2000-04-25 23:05:32
1000153,6040,2384,4,2000-04-25 23:05:54
999873,6040,593,5,2000-04-25 23:05:54
1000007,6040,1961,4,2000-04-25 23:06:17
1000192,6040,2019,5,2000-04-25 23:06:17


In [8]:
# compute averages before the current rating
def compute_rolling_averages(group):
    """Return, for each position, the mean of the values that precede it.

    The expanding mean includes the current value, so the result is shifted
    by one position; the first entry has no history and is therefore NaN.
    """
    cumulative_mean = group.expanding().mean()
    mean_before_current = cumulative_mean.shift()
    return mean_before_current

# For every rating, attach the user's and the movie's average rating over
# the rows that precede it in the current row order.
for key_column, feature_name in (
    ("UserID", "AvgUserRating"),
    ("MovieID", "AvgMovieRating"),
):
    ratings_by_key = df_ratings.groupby(key_column)["Rating"]
    df_ratings[feature_name] = ratings_by_key.transform(compute_rolling_averages)

df_ratings.head(5)

Unnamed: 0,UserID,MovieID,Rating,Datetime,AvgUserRating,AvgMovieRating
1000138,6040,858,4,2000-04-25 23:05:32,,
1000153,6040,2384,4,2000-04-25 23:05:54,4.0,
999873,6040,593,5,2000-04-25 23:05:54,4.0,
1000007,6040,1961,4,2000-04-25 23:06:17,4.333333,
1000192,6040,2019,5,2000-04-25 23:06:17,4.25,


In [9]:
# Remove the Datetime column from the ratings frame.
df_ratings = df_ratings.drop(columns=["Datetime"])
df_ratings.head(n=3)

Unnamed: 0,UserID,MovieID,Rating,AvgUserRating,AvgMovieRating
1000138,6040,858,4,,
1000153,6040,2384,4,4.0,
999873,6040,593,5,4.0,


Merge and save:

In [10]:
# Attach the user features, then the movie features, to every rating row.
ratings_with_users = df_ratings.merge(df_users, how="inner", on="UserID")
df_all = ratings_with_users.merge(df_movies, how="inner", on="MovieID")

In [11]:
# Preview the first rows of the merged frame.
df_all.head(n=3)

Unnamed: 0,UserID,MovieID,Rating,AvgUserRating,AvgMovieRating,Gender_F,Gender_M,Age_1,Age_18,Age_25,...,Decade_1910,Decade_1920,Decade_1930,Decade_1940,Decade_1950,Decade_1960,Decade_1970,Decade_1980,Decade_1990,Decade_2000
0,6040,858,4,,,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,6040,2384,4,4.0,,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,6040,593,5,4.0,,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# df_all.to_pickle("../../data/ml-1m-deep-learning/all.pickle")

### 1* Same, but using utils:

In [3]:
# Reload the post-EDA frames and build the feature table with the utils helper.
raw_frames = utils.read_pickles(
    "../../data/ml-1m-after_eda/"
)
df_movies, df_users, df_ratings = raw_frames
df = utils.dl_data_pipeline(df_movies, df_users, df_ratings)
df.head(n=3)

Unnamed: 0,UserID,MovieID,Rating,AvgUserRating,AvgMovieRating,Gender_F,Gender_M,Age_1,Age_18,Age_25,...,Decade_1910,Decade_1920,Decade_1930,Decade_1940,Decade_1950,Decade_1960,Decade_1970,Decade_1980,Decade_1990,Decade_2000
0,6040,858,4,,,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,6040,2384,4,4.0,,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,6040,593,5,4.0,,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


### 2 Deep Learning model

In [4]:
# Keep a full copy that still carries the ID columns, then remove the raw
# identifiers from the frame that is fed to the model.
id_columns = ["UserID", "MovieID"]
df_cache = df.copy()
df = df.drop(columns=id_columns)

In [5]:
# Split the model frame and the ID-carrying copy with identical arguments.
# NOTE(review): the evaluation cell pairs predictions made on `test` with
# `test_cache`, so both calls must yield the same rows in the same order —
# confirm that split_by_percent is deterministic.
split_ratio = 0.8
train, test = utils.TrainTestSplitter.split_by_percent(
    df, split_ratio, sort_by_datetime=True
)
train_cache, test_cache = utils.TrainTestSplitter.split_by_percent(
    df_cache, split_ratio, sort_by_datetime=True
)  # for ranking metrics

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (800167, 113)
Test shape: (200042, 113)


In [8]:
# Fill values are computed on the training split only and reused for the
# test split, so no test statistics are used for imputation.
mean_users_rating = train["AvgUserRating"].mean()
mean_movies_rating = train["AvgMovieRating"].mean()

train_fill_values = {
    "AvgUserRating": mean_users_rating,
    "AvgMovieRating": mean_movies_rating,
}

# Same order as before: both train columns first, then both test columns.
for split_frame in (train, test):
    for column_name, fill_value in train_fill_values.items():
        split_frame[column_name] = split_frame[column_name].fillna(fill_value)

In [15]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm import tqdm

class MovieDataset(Dataset):
    """Dataset exposing the rows of a ratings frame as (features, rating) pairs.

    Every column except ``Rating`` is treated as a feature.

    Args:
        dataframe: Frame with a ``Rating`` column; all other columns must be
            convertible to float32.
    """

    def __init__(self, dataframe):
        # Convert once up front instead of building a new tensor per sample
        # in __getitem__. Requesting float32 explicitly also keeps frames
        # that mix bool and float columns usable (their ``.values`` would be
        # an object array, which torch.tensor rejects).
        feature_frame = dataframe.drop(columns=["Rating"])
        self.features = torch.tensor(feature_frame.to_numpy(dtype="float32"))
        self.labels = torch.tensor(dataframe["Rating"].to_numpy(dtype="float32"))

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the float32 feature vector and rating stored at ``idx``."""
        return self.features[idx], self.labels[idx]

class MovieRatingNN(nn.Module):
    """Two-hidden-layer MLP that regresses a rating inside a bounded range.

    Args:
        num_features: Size of the input feature vector.
        min_rating: Smallest rating the model can output (default 1.0).
        max_rating: Largest rating the model can output (default 5.0).

    Raises:
        ValueError: If ``max_rating`` is not greater than ``min_rating``.
    """

    def __init__(self, num_features, min_rating=1.0, max_rating=5.0):
        super().__init__()
        if max_rating <= min_rating:
            raise ValueError("max_rating must be greater than min_rating")
        # Plain Python floats (not buffers/parameters) so the state_dict keys
        # stay exactly layer1/layer2/output_reg and saved checkpoints load.
        self.min_rating = float(min_rating)
        self.max_rating = float(max_rating)
        self.layer1 = nn.Linear(num_features, 128)
        self.layer2 = nn.Linear(128, 64)
        self.output_reg = nn.Linear(64, 1)

    def forward(self, x):
        """Return predictions of shape (..., 1) within the rating range."""
        # Functional ReLU avoids constructing a new nn.ReLU module per call.
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.output_reg(x)
        # Sigmoid squashes to (0, 1); rescale to [min_rating, max_rating].
        rating_span = self.max_rating - self.min_rating
        return torch.sigmoid(x) * rating_span + self.min_rating

# Fix the seed so weight initialisation and batch shuffling are repeatable
# on a fresh Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

dataset = MovieDataset(train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# The input width is read from the first sample's feature vector.
model = MovieRatingNN(num_features=len(dataset[0][0]))

criterion_reg = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
# Make the training mode explicit in case the cell is re-run after evaluation.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{num_epochs}")
    for i, (features, labels) in progress_bar:
        optimizer.zero_grad()
        outputs = model(features)
        # The model emits (batch, 1); flatten to match the 1-D label batch.
        loss = criterion_reg(outputs.view(-1), labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Running mean of the batch losses seen so far in this epoch.
        progress_bar.set_postfix(loss=running_loss/(i+1))

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader):.4f}")

Epoch 1/10: 100%|██████████| 25006/25006 [00:36<00:00, 692.88it/s, loss=0.89] 


Epoch 1/10, Loss: 0.8901


Epoch 2/10: 100%|██████████| 25006/25006 [00:39<00:00, 637.96it/s, loss=0.867]


Epoch 2/10, Loss: 0.8674


Epoch 3/10: 100%|██████████| 25006/25006 [00:40<00:00, 611.78it/s, loss=0.858]


Epoch 3/10, Loss: 0.8575


Epoch 4/10: 100%|██████████| 25006/25006 [00:42<00:00, 584.14it/s, loss=0.851]


Epoch 4/10, Loss: 0.8515


Epoch 5/10: 100%|██████████| 25006/25006 [00:42<00:00, 590.92it/s, loss=0.847]


Epoch 5/10, Loss: 0.8469


Epoch 6/10: 100%|██████████| 25006/25006 [00:36<00:00, 689.83it/s, loss=0.843]


Epoch 6/10, Loss: 0.8432


Epoch 7/10: 100%|██████████| 25006/25006 [00:45<00:00, 545.70it/s, loss=0.84] 


Epoch 7/10, Loss: 0.8398


Epoch 8/10: 100%|██████████| 25006/25006 [00:40<00:00, 616.19it/s, loss=0.837]


Epoch 8/10, Loss: 0.8370


Epoch 9/10: 100%|██████████| 25006/25006 [00:42<00:00, 583.84it/s, loss=0.834]


Epoch 9/10, Loss: 0.8344


Epoch 10/10: 100%|██████████| 25006/25006 [00:40<00:00, 624.70it/s, loss=0.832]


Epoch 10/10, Loss: 0.8319


In [16]:
# Persist only the learned weights (state_dict), not the whole module object.
checkpoint_path = "../../artifacts/simple_nn.pth"
torch.save(model.state_dict(), checkpoint_path)

Let's check performance:

In [17]:
# Inference only: switch to eval mode and disable autograd so no graph is
# built for the whole test set in this single forward pass.
test_tensor = torch.tensor(test.drop(["Rating"], axis=1).values, dtype=torch.float32)
model.eval()
with torch.no_grad():
    # Call the module itself rather than .forward() so registered hooks run.
    # The result keeps the model's (n_rows, 1) output shape.
    predicted_scores = model(test_tensor).numpy()

In [25]:
# Evaluate: regression/classification metrics on the raw ratings, then
# top-k predictive and ranking metrics computed from the ID-carrying test copy.
ml_dict = metrics.ml_metrics(true_scores=test["Rating"].values,
                             predicted_scores=predicted_scores)
pred_dict = metrics.predictive_metrics(test=test_cache, predicted_scores=predicted_scores,
                                       k=5, threshold=4)
# NOTE(review): the recorded run reports mean_reciprocal_rank = 1.243, but a
# mean of reciprocal ranks cannot exceed 1 — verify metrics.rank_metrics.
rank_dict = metrics.rank_metrics(test=test_cache, predicted_scores=predicted_scores,
                                 k=5, threshold=4)

print(f"ML metrics: {ml_dict}")
print(f"Predictive metrics: {pred_dict}")
print(f"Rank metrics: {rank_dict}")

ML merics: {'mae': 0.728, 'rmse': 0.931, 'precision': 0.801, 'recall': 0.513, 'f1': 0.625, 'roc_auc': 0.675}
Predictive merics: {'k': 5, 'threshold': 4, 'precision_at_k': 0.813, 'recall_at_k': 0.185, 'avrg_prec_at_k': 0.84, 'n_users_with_k': 1657}
Rank merics: {'mean_reciprocal_rank': 1.243, 'hit_rate': 0.983}


Compared to the baseline, the neural network gives better results on both metrics:
- RMSE: 0.931 vs 0.985 (lower is better)
- ROC-AUC: 0.675 vs 0.617 (higher is better)