In [None]:
from typing import Tuple

import numpy as np
import sklearn
import sklearn.model_selection
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm
import pandas as pd

from recommend.utils import PROJ_ROOT

In [None]:
# Load the ratings table from the project's data directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
ratings = pd.read_pickle(f"{PROJ_ROOT}/data/ratings.pkl")

In [None]:
# Dimensions of the ratings table as loaded.
ratings.shape

In [None]:
def filter_ratings(ratings: pd.DataFrame, min_user_ratings: int, min_movie_ratings: int) -> pd.DataFrame:
    """Drop incomplete rows, then rarely-rating users, then rarely-rated movies.

    The filter is a single pass applied in that order: movies are counted
    only after sparse users have been removed, and users are not re-checked
    afterwards, so a kept user can end up with fewer than
    ``min_user_ratings`` rows in the result.

    Args:
        ratings: Frame with at least ``username`` and ``movie_id`` columns.
        min_user_ratings: Minimum number of rows a user needs to be kept.
        min_movie_ratings: Minimum number of rows a movie needs to be kept.

    Returns:
        The surviving rows of ``ratings`` (original index labels preserved).
    """

    def keep_frequent(frame: pd.DataFrame, column: str, minimum: int) -> pd.DataFrame:
        # Keep only the rows whose `column` value occurs at least `minimum` times.
        counts = frame.groupby(column).size()
        frequent = set(counts[counts >= minimum].index)
        return frame.loc[frame[column].isin(frequent)]

    complete = ratings.dropna()
    from_active_users = keep_frequent(complete, "username", min_user_ratings)
    return keep_frequent(from_active_users, "movie_id", min_movie_ratings)

In [None]:
# Keep only users and movies with at least 10 ratings each.
ratings = filter_ratings(ratings, min_user_ratings=10, min_movie_ratings=10)

In [None]:
# Dimensions of the ratings table after filtering.
ratings.shape

In [None]:
# Hold out 10% of the ratings for testing, then 10% of what remains for validation.
# Both splits use the same shuffled, seeded configuration.
split_options = dict(test_size=0.1, shuffle=True, random_state=0)

ratings_train, ratings_test = sklearn.model_selection.train_test_split(ratings, **split_options)
ratings_train, ratings_valid = sklearn.model_selection.train_test_split(ratings_train, **split_options)

In [None]:
# Two-way lookup between movie ids and consecutive integer indices
# (idx2movie: index -> id, movie2idx: id -> index).
idx2movie = pd.Series(ratings["movie_id"].unique())
movie2idx = pd.Series(data=idx2movie.index.values, index=idx2movie.values)
movie2idx.head()

In [None]:
# Two-way lookup between usernames and consecutive integer indices
# (idx2user: index -> username, user2idx: username -> index).
idx2user = pd.Series(ratings["username"].unique())
user2idx = pd.Series(data=idx2user.index.values, index=idx2user.values)
user2idx.head()


In [None]:
# Latent factor matrices: one row of `num_features` values per movie / per user,
# drawn from a standard normal and clipped to [-2, 2].
num_features = 100
random_gen = np.random.default_rng(seed=42)

movies_shape = (len(movie2idx), num_features)
users_shape = (len(user2idx), num_features)


def _draw_factors(shape) -> torch.Tensor:
    # Sample a float32 matrix of the given shape from the shared generator.
    sample = random_gen.standard_normal(shape, dtype=np.float32)
    return torch.from_numpy(sample.clip(-2, 2))


# The draw order (movies first, then users) determines the values, so keep it.
movies = _draw_factors(movies_shape)
users = _draw_factors(users_shape)


In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA instead of failing on the first `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move both factor matrices to the chosen device and enable gradient tracking
# so the optimiser can update them.
movies = movies.to(device).requires_grad_()
users = users.to(device).requires_grad_()

In [None]:
# Adam updates the two factor matrices directly; they are the only trainable parameters.
trainable_params = [movies, users]
optim = torch.optim.Adam(trainable_params, lr=1e-4)

In [None]:
# Preview the first rows of the training split.
ratings_train.head()

In [None]:
class RatingDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(movie index, user index, rating)`` triples.

    Ratings are read from the ``stars`` column and divided by 5. Movie ids and
    usernames are translated to integer indices through the given lookup
    series.

    All lookups are resolved once, vectorised, in ``__init__``; fetching a
    sample is then plain array indexing instead of building a pandas row and
    doing two label lookups per sample. As a consequence, an id missing from
    a lookup series raises ``KeyError`` at construction time.

    Args:
        df_ratings: Frame with ``movie_id``, ``username`` and ``stars`` columns.
        movie2idx: Series mapping movie id (index) to integer index (value).
        user2idx: Series mapping username (index) to integer index (value).
    """

    def __init__(self, df_ratings: pd.DataFrame, movie2idx: pd.Series, user2idx: pd.Series):
        self.df_ratings = df_ratings
        self.movie2idx = movie2idx
        self.user2idx = user2idx

        # Label-based lookups for every row at once (KeyError on unknown ids).
        self._movie_indices = movie2idx.loc[df_ratings["movie_id"].to_numpy()].to_numpy()
        self._user_indices = user2idx.loc[df_ratings["username"].to_numpy()].to_numpy()
        # Divide in float64 first, then narrow, matching the per-row computation.
        self._ratings = (df_ratings["stars"].to_numpy(dtype=np.float64) / 5.0).astype(np.float32)

    def __len__(self) -> int:
        """Return the number of ratings."""
        return len(self._ratings)

    def __getitem__(self, idx: int) -> Tuple[int, int, float]:
        """Return ``(movie index, user index, stars / 5)`` for the rating at position ``idx``."""
        return self._movie_indices[idx], self._user_indices[idx], self._ratings[idx]

In [None]:
# One dataset per split, all sharing the same id -> index lookups.
ds_train, ds_valid, ds_test = (
    RatingDataset(split, movie2idx, user2idx)
    for split in (ratings_train, ratings_valid, ratings_test)
)

# Fixed (seeded) sample of the training set, drawn without replacement and as
# large as the validation set.
train_mini_indices = np.random.default_rng(0).choice(len(ds_train), size=len(ds_valid), replace=False)
ds_train_mini = torch.utils.data.Subset(ds_train, train_mini_indices)

In [None]:
# Number of samples per dataset: train, validation, test, training subsample.
len(ds_train), len(ds_valid), len(ds_test), len(ds_train_mini)

In [None]:
# Sanity check: fetch the first training sample (movie index, user index, scaled rating).
ds_train[0]

In [None]:
batch_size = 64


def collate_move(device):
    """Build a ``collate_fn`` that batches samples and moves the batch to ``device``.

    The returned function applies PyTorch's default collation to its
    arguments, expects the result to be a ``(movie_idx, user_idx, rating)``
    triple of tensors, and returns that triple with each tensor on ``device``.
    """

    def collate(*params):
        batch = torch.utils.data.default_collate(*params)
        movie_idx, user_idx, rating = (tensor.to(device) for tensor in batch)
        return movie_idx, user_idx, rating

    return collate


# The training loader reshuffles every epoch so the optimiser does not replay
# the same mini-batches in the same order each time; the seeded generator keeps
# the shuffling reproducible across notebook runs.
loader_train = torch.utils.data.DataLoader(
    ds_train,
    batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
    collate_fn=collate_move(device),
)

# Evaluation loaders keep a fixed order.
loader_valid = torch.utils.data.DataLoader(ds_valid, batch_size, collate_fn=collate_move(device))
loader_test = torch.utils.data.DataLoader(ds_test, batch_size, collate_fn=collate_move(device))

# Training subsample used to report a training loss comparable to the validation loss.
loader_train_mini = torch.utils.data.DataLoader(ds_train_mini, batch_size, collate_fn=collate_move(device))

In [None]:
# Number of batches per loader: train, validation, test, training subsample.
len(loader_train), len(loader_valid), len(loader_test), len(loader_train_mini)

In [None]:
def evaluate(movies, users, loader, desc, leave) -> float:
    """Return the mean squared error of the factor model over ``loader``.

    Predictions are ``sigmoid(<movie factors, user factors>)``. The error is
    averaged over individual ratings rather than over batches, so a shorter
    final batch is not over-weighted.

    Args:
        movies: Movie factor matrix, indexed by the batch's movie indices.
        users: User factor matrix, indexed by the batch's user indices.
        loader: Iterable of ``(movie_idx, user_idx, rating)`` tensor batches.
        desc: Label shown on the progress bar.
        leave: Whether the progress bar stays visible once finished.

    Returns:
        Mean squared error over all ratings yielded by ``loader``.

    Raises:
        ValueError: If ``loader`` yields no ratings.
    """
    squared_error_sum = 0.0
    num_ratings = 0
    with torch.no_grad():
        for movie_idx, user_idx, rating in tqdm(loader, desc=desc, leave=leave, position=1):
            pred = torch.sigmoid((movies[movie_idx] * users[user_idx]).sum(-1))
            # Sum (not mean) per batch so the final division weights every rating equally.
            squared_error_sum += F.mse_loss(pred, rating, reduction="sum").item()
            num_ratings += rating.numel()
    if num_ratings == 0:
        raise ValueError("cannot evaluate on an empty loader")
    return squared_error_sum / num_ratings


In [None]:

# TODO: wrap this loop in a Trainer(movies, users, loader_train, loader_valid) class.

patience = 5        # size of the window of recent validation losses used for early stopping
eval_every = 12500  # optimisation steps between two evaluations
step = 0
losses_train = []
losses_valid = []
# CPU snapshots of the factor matrices for the most recent `patience` evaluations;
# past_movies[i] / past_users[i] correspond to losses_valid[-patience:][i].
past_movies = []
past_users = []
stop_training = False
for epoch in range(25):
    for movie_idx, user_idx, rating in tqdm(loader_train, desc=f"epoch {epoch}", position=0):
        step += 1
        m = movies[movie_idx]
        u = users[user_idx]
        pred = torch.sigmoid((m * u).sum(-1))
        loss = F.mse_loss(pred, rating)
        loss.backward()
        optim.step()
        optim.zero_grad()
        if step % eval_every != 0:
            continue

        losses_train.append(evaluate(movies, users, loader_train_mini, "train evaluation", leave=False))
        losses_valid.append(evaluate(movies, users, loader_valid, "validation evaluation", leave=False))
        print(f"train loss: {losses_train[-1]:.4f}, validation loss: {losses_valid[-1]:.4f}")

        past_movies.append(movies.clone().detach().cpu())
        past_users.append(users.clone().detach().cpu())
        # Keep exactly `patience` snapshots so the oldest one still matches the
        # oldest loss in the early-stopping window.
        if len(past_movies) > patience:
            past_movies.pop(0)
            past_users.pop(0)

        # Early stopping: the oldest loss in the window is still the lowest, i.e.
        # none of the later evaluations improved on it. past_movies[0] and
        # past_users[0] then hold the snapshot taken at that lowest loss.
        if len(losses_valid) >= patience and np.argmin(losses_valid[-patience:]) == 0:
            stop_training = True
            break
    # `break` above only leaves the batch loop; stop the epoch loop as well.
    if stop_training:
        break
