In [1]:
# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import numpy as np
import pandas as pd
import torch
from tempfile import TemporaryDirectory

from recommenders.utils.constants import (
    DEFAULT_USER_COL as USER, 
    DEFAULT_ITEM_COL as ITEM, 
    DEFAULT_RATING_COL as RATING, 
    DEFAULT_TIMESTAMP_COL as TIMESTAMP, 
    DEFAULT_PREDICTION_COL as PREDICTION
) 
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.fastai.fastai_utils import cartesian_product, scoreV2
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var
from recommenders.utils.notebook_utils import store_metadata

# Report the interpreter and library versions, plus GPU availability, one per line.
print(
    "System version: {}".format(sys.version),
    "Pandas version: {}".format(pd.__version__),
    "Torch version: {}".format(torch.__version__),
    "CUDA Available: {}".format(torch.cuda.is_available()),
    "CuDNN Enabled: {}".format(torch.backends.cudnn.enabled),
    sep="\n",
)

System version: 3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]
Pandas version: 2.2.3
Torch version: 2.6.0+cu124
CUDA Available: False
CuDNN Enabled: True


In [2]:
from recommenders.models.fastai.model import EmbeddingDotBias
from recommenders.models.fastai.colab_data_loader import CollabDataLoadersV3

In [3]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters
# NOTE(review): the training loop further down sets its own `n_epochs = 7`, so
# EPOCHS is not what controls training in the cells visible here -- confirm
# which value is intended.
N_FACTORS = 40
EPOCHS = 5

In [4]:
# Load the MovieLens ratings using the standard column names.
ratings_df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=[USER, ITEM, RATING, TIMESTAMP],
)

# Keep user and item IDs as strings so they are not confused with embedding ids.
ratings_df = ratings_df.astype({USER: 'str', ITEM: 'str'})

ratings_df.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 16.2kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [5]:
# Split the ratings into a train/validation portion and a test portion (ratio=0.75).
# NOTE(review): no seed is passed, so the splitter's default presumably applies -- confirm.
train_valid_df, test_df = python_stratified_split(
    ratings_df, ratio=0.75, min_rating=1, filter_by="item", col_user=USER, col_item=ITEM
)

In [6]:
# Display the train/validation portion of the split (pandas abbreviates long frames in the output).
train_valid_df

Unnamed: 0,userID,itemID,rating,timestamp
10047,94,1,4.0,885870323
44185,620,1,5.0,889987954
82784,779,1,4.0,875501555
83281,399,1,4.0,882340657
69124,864,1,5.0,877214125
...,...,...,...,...
77891,429,999,2.0,882387163
31448,393,999,4.0,889730187
7847,125,999,4.0,892838288
42623,476,999,2.0,883365385


In [7]:
# Remove "cold" users from the test set: keep only test rows whose user also
# appears in the train/validation data. The column is addressed through the
# USER constant, as in the other cells, rather than a hard-coded attribute name.
test_df = test_df[test_df[USER].isin(train_valid_df[USER])]

## Training

In [8]:
# Seed NumPy and PyTorch (CPU and every CUDA device) with one shared value.
SEED = 101
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [9]:
# Build the training/validation loaders from the ratings frame, timing the step.
# Only a small fraction (valid_pct=0.001) is held out for validation.
with Timer() as preprocess_time:
    data = CollabDataLoadersV3.from_df(
        train_valid_df,
        user_name=USER,
        item_name=ITEM,
        rating_name=RATING,
        valid_pct=0.001,
    )

In [10]:
# Print a few example rows from one batch as a quick sanity check of the loaders.
data.show_batch()

Showing a sample batch:
Showing 5 examples from a batch:
  userID itemID  rating
0    505    161     3.0
1    500   1010     4.0
2    172    430     3.0
3    880    380     3.0
4    158      4     4.0


In [11]:
#classes = data.classes['itemID']

In [12]:
#classes = data.classes

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import MSELoss

In [14]:
# Build the embedding model from the loader's class vocabularies.
# The size and the column names come from the shared constants (N_FACTORS, USER,
# ITEM) instead of being repeated as literals, so they cannot drift apart.
# `n_factors` is kept as a name in case other cells refer to it.
n_factors = N_FACTORS
model = EmbeddingDotBias.from_classes(
    n_factors=n_factors,
    classes=data.classes,
    user=USER,
    item=ITEM,
    # presumably the range predictions are squashed into; ratings shown above are 1.0-5.0 -- TODO confirm
    y_range=[0,5.5]
)


In [15]:
# Prefer the GPU when one is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Mean-squared error between predicted and observed ratings.
loss_fn = MSELoss()

# AdamW over all model parameters.
optimizer = optim.AdamW(
    model.parameters(), lr=1e-3, betas=(0.9, 0.99), eps=1e-5, weight_decay=0.01
)

In [16]:
# 5. Training loop
def train_epoch(model, train_dl, optimizer, loss_fn, device):
    """Run one optimisation pass over ``train_dl`` and return the mean batch loss.

    Args:
        model: Module called as ``model(users_items)``; put into training mode.
        train_dl: Sized iterable yielding ``(users_items, ratings)`` tensor pairs.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of the per-batch losses divided by ``len(train_dl)``.
    """
    model.train()
    batch_losses = []

    for inputs, targets in train_dl:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass; both tensors are flattened so their shapes line up.
        outputs = model(inputs)
        batch_loss = loss_fn(outputs.view(-1), targets.view(-1))

        # Backward pass and parameter update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_dl)

def validate(model, valid_dl, loss_fn, device):
    """Compute the mean per-batch loss of ``model`` over ``valid_dl`` without training.

    Args:
        model: Module called as ``model(users_items)``; put into eval mode.
        valid_dl: Sized iterable yielding ``(users_items, ratings)`` tensor pairs.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of the per-batch losses divided by ``len(valid_dl)``, or ``None``
        when ``valid_dl`` contains no batches.
    """
    model.eval()

    # Handle the "nothing to validate" case explicitly. Catching
    # ZeroDivisionError around the whole loop would also hide a genuine
    # ZeroDivisionError raised inside the model or the loss function.
    n_batches = len(valid_dl)
    if n_batches == 0:
        return None

    total_loss = 0
    with torch.no_grad():
        for users_items, ratings in valid_dl:
            users_items = users_items.to(device)
            ratings = ratings.to(device)

            predictions = model(users_items)
            # Flatten both sides so their shapes line up for the loss.
            loss = loss_fn(predictions.view(-1), ratings.view(-1))
            total_loss += loss.item()

    return total_loss / n_batches

# 6. Train the model
# NOTE(review): the epoch count is set locally here (7) and does not use the
# EPOCHS constant defined with the other model parameters (5) -- confirm which
# one is intended.
n_epochs = 7
for epoch in range(n_epochs):
    train_loss = train_epoch(model, data.train, optimizer, loss_fn, device)
    valid_loss = validate(model, data.valid, loss_fn, device)
    # One report per epoch: header line, then training and validation loss.
    print(
        f'Epoch {epoch+1}/{n_epochs}:',
        f'Train Loss: {train_loss}',
        f'Valid Loss: {valid_loss}',
        sep='\n',
    )

Epoch 1/7:
Train Loss: 1.3408423032064893
Valid Loss: 0.8520179986953735
Epoch 2/7:
Train Loss: 0.9016596601939039
Valid Loss: 0.7767820954322815
Epoch 3/7:
Train Loss: 0.8278594535772305
Valid Loss: 0.7384382486343384
Epoch 4/7:
Train Loss: 0.7706746463838697
Valid Loss: 0.7467287182807922
Epoch 5/7:
Train Loss: 0.7190423823460784
Valid Loss: 0.7333086133003235
Epoch 6/7:
Train Loss: 0.6650302821306239
Valid Loss: 0.7224665284156799
Epoch 7/7:
Train Loss: 0.6063857537792812
Valid Loss: 0.7398031949996948


In [17]:
def predict_rating(model, user_id, item_id):
    """Predict the rating of one (user, item) pair with a trained model.

    Args:
        model: Trained model exposing ``classes``, ``_get_idx`` and a forward
            call that accepts a tensor of stacked user/item indices.
        user_id: User ID as it appears in ``model.classes[USER]``.
        item_id: Item ID as it appears in ``model.classes[ITEM]``.

    Returns:
        The predicted rating as a Python float, or ``None`` if anything in the
        lookup or forward pass raised (the error message is printed).
    """
    model.eval()
    with torch.no_grad():
        try:
            print(f"Trying to predict for user {user_id} and item {item_id}")
            print(f"Available users: {model.classes[USER][:5]}...")  # Print first 5 users
            print(f"Available items: {model.classes[ITEM][:5]}...")  # Print first 5 items

            # Convert user and item IDs to indices
            user_idx = model._get_idx([user_id], is_item=False)
            item_idx = model._get_idx([item_id], is_item=True)

            # Use the device the model's parameters live on instead of relying
            # on a notebook-global `device` variable being defined and in sync.
            model_device = next(model.parameters()).device

            # Create input tensor
            x = torch.stack([user_idx, item_idx], dim=1).to(model_device)

            # Get prediction
            pred = model(x)
            return pred.item()
        except Exception as e:
            # Best-effort helper: report the failure and signal it with None.
            print(f"Error in prediction: {str(e)}")
            return None

In [18]:
# All item / user IDs known to the model. Index 0 is skipped: the class lists
# printed further down start with the '#na#' placeholder rather than a real ID.
total_items = model.classes[ITEM][1:]
total_users = model.classes[USER][1:]

In [22]:
# Collect the distinct user IDs that appear in the test set; the following cell
# keeps only those that are also known to the trained model.
test_users = test_df[USER].unique()

In [23]:
# Keep only test users the model has an entry for (np.intersect1d returns the sorted, unique common values).
test_users = np.intersect1d(test_users, total_users)

In [24]:
# Look up the model's user-side weights for users '1' and '10' (presumably their embedding vectors).
# NOTE(review): `user_embeddings` is not used by any later cell visible here -- confirm this cell is still needed.
user_embeddings = model.weight(['1', '10'], is_item=False)

In [25]:
# Example prediction for a single (user, item) pair
user_id, item_id = "1", "10"
predicted_rating = predict_rating(model, user_id, item_id)
print(f'\nPredicted rating for user {user_id} and item {item_id}: {predicted_rating}')

Trying to predict for user 1 and item 10
Available users: ['#na#', '1', '10', '100', '101']...
Available items: ['#na#', '1', '10', '100', '1000']...

Predicted rating for user 1 and item 10: 4.050769329071045


In [26]:
# Build the candidate (user, item) pairs: the cartesian product of the test users and the known items.
users_items = cartesian_product(np.array(test_users),np.array(total_items))

In [27]:
# Wrap the candidate pairs in a DataFrame with the standard user/item column names.
users_items = pd.DataFrame(users_items, columns=[USER,ITEM])

In [28]:
# Anti-join: keep only the candidate (user, item) pairs that do NOT occur in the
# training data. The merge indicator marks rows that found no partner as "left_only".
training_removed = pd.merge(
    users_items,
    train_valid_df[[USER, ITEM]].astype(str),
    on=[USER, ITEM],
    how='left',
    indicator=True,
).loc[lambda merged: merged['_merge'] == 'left_only', [USER, ITEM]]

In [29]:
# Rebinds total_users / total_items (set earlier from model.classes with the first
# entry dropped) to the full class lists held by the data loaders.
# NOTE(review): relies on data.classes yielding the user entry before the item
# entry, and the new values are not used by any later cell visible here -- confirm.
total_users, total_items = data.classes.values()

In [30]:
# Score the candidate (user, item) pairs that were not seen in training, timing the pass.
# NOTE(review): no top-k cut-off is passed here; presumably every candidate is
# scored and the ranking metrics apply k=TOP_K themselves -- confirm.
with Timer() as test_time:
    top_k_scores = scoreV2(
        model,
        data,
        test_df=training_removed,
        user_col=USER,
        item_col=ITEM,
        prediction_col=PREDICTION,
    )

In [31]:
# Inspect the scored candidates (pandas abbreviates long frames in the output).
top_k_scores

Unnamed: 0,userID,itemID,prediction
760,1,169,4.997737
1026,1,408,4.976365
926,1,318,4.887697
1343,1,694,4.844806
1109,1,483,4.833915
...,...,...,...
1585433,99,375,1.772301
1585502,99,437,1.758064
1585504,99,439,1.729182
1584542,99,1087,1.655640


In [32]:
# Mean average precision over the recommendations.
# `map` is the recommenders metric imported at the top, not the Python builtin.
eval_map = map(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [33]:
# NDCG at K over the recommendations.
eval_ndcg = ndcg_at_k(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [34]:
# Precision at K over the recommendations.
eval_precision = precision_at_k(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [35]:
# Recall at K over the recommendations.
eval_recall = recall_at_k(
    test_df,
    top_k_scores,
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)

In [38]:
# Summarise the ranking metrics, one per line.
print("\n".join([
    "Model:\t\t" + model.__class__.__name__,
    "Top K:\t\t%d" % TOP_K,
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

Model:		EmbeddingDotBias
Top K:		10
MAP:		0.021975
NDCG:		0.136488
Precision@K:	0.123754
Recall@K:	0.051028


In [37]:
# Summarise the ranking metrics, one per line.
# NOTE(review): this prints the same summary as the cell before it, and the two
# cells carry out-of-order execution counts (38, then 37) -- one of them can
# probably be removed.
print("\n".join([
    "Model:\t\t" + model.__class__.__name__,
    "Top K:\t\t%d" % TOP_K,
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

Model:		EmbeddingDotBias
Top K:		10
MAP:		0.021975
NDCG:		0.136488
Precision@K:	0.123754
Recall@K:	0.051028


In [58]:
# Predict ratings for the held-out (user, item) pairs; a copy is passed so that
# test_df itself is left untouched for the metric calls.
scores = scoreV2(
    model,
    data,
    test_df=test_df.copy(),
    user_col=USER,
    item_col=ITEM,
    prediction_col=PREDICTION,
)

In [59]:
# Column mapping shared by all four rating-accuracy metrics.
_rating_metric_cols = dict(
    col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION
)

eval_r2 = rsquared(test_df, scores, **_rating_metric_cols)
eval_rmse = rmse(test_df, scores, **_rating_metric_cols)
eval_mae = mae(test_df, scores, **_rating_metric_cols)
eval_exp_var = exp_var(test_df, scores, **_rating_metric_cols)

# Summarise the rating metrics, one per line.
print("\n".join([
    "Model:\t\t\t" + model.__class__.__name__,
    "RMSE:\t\t\t%f" % eval_rmse,
    "MAE:\t\t\t%f" % eval_mae,
    "Explained variance:\t%f" % eval_exp_var,
    "R squared:\t\t%f" % eval_r2,
]))

Model:			EmbeddingDotBias
RMSE:			0.910242
MAE:			0.714374
Explained variance:	0.347097
R squared:		0.347089
