In [1]:
# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import numpy as np
import pandas as pd
import torch
import fastai
from tempfile import TemporaryDirectory

from fastai.collab import collab_learner, CollabDataLoaders, load_learner

from recommenders.utils.constants import (
    DEFAULT_USER_COL as USER, 
    DEFAULT_ITEM_COL as ITEM, 
    DEFAULT_RATING_COL as RATING, 
    DEFAULT_TIMESTAMP_COL as TIMESTAMP, 
    DEFAULT_PREDICTION_COL as PREDICTION
) 
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.fastai.fastai_utils import cartesian_product, score
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Fast AI version: {}".format(fastai.__version__))
print("Torch version: {}".format(torch.__version__))
print("CUDA Available: {}".format(torch.cuda.is_available()))
print("CuDNN Enabled: {}".format(torch.backends.cudnn.enabled))

System version: 3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]
Pandas version: 2.2.3
Fast AI version: 2.8.1
Torch version: 2.6.0+cu124
CUDA Available: False
CuDNN Enabled: True


In [2]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters
N_FACTORS = 40
EPOCHS = 5

In [3]:
ratings_df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=[USER,ITEM,RATING,TIMESTAMP]
)

# make sure the IDs are loaded as strings to better prevent confusion with embedding ids
ratings_df[USER] = ratings_df[USER].astype('str')
ratings_df[ITEM] = ratings_df[ITEM].astype('str')

ratings_df.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 12.5kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [4]:
# Split the dataset
train_valid_df, test_df = python_stratified_split(
    ratings_df, 
    ratio=0.75, 
    min_rating=1, 
    filter_by="item", 
    col_user=USER, 
    col_item=ITEM
)

In [5]:
# Remove "cold" users from test set  
test_df = test_df[test_df.userID.isin(train_valid_df.userID)]

## Training

In [6]:
# fix random seeds to make sure our runs are reproducible
np.random.seed(101)
torch.manual_seed(101)
torch.cuda.manual_seed_all(101)

In [7]:
with Timer() as preprocess_time:
    data = CollabDataLoaders.from_df(train_valid_df, 
                                     user_name=USER, 
                                     item_name=ITEM, 
                                     rating_name=RATING, 
                                     valid_pct=0)

In [8]:
data.show_batch()

Unnamed: 0,userID,itemID,rating
0,447,183,5.0
1,445,886,3.0
2,121,427,4.0
3,478,145,1.0
4,398,476,3.0
5,457,717,3.0
6,435,366,2.0
7,752,322,1.0
8,588,95,4.0
9,554,216,3.0


In [9]:
from fastai.tabular.model import get_emb_sz, tabular_config
from fastai.losses import MSELossFlat
from fastcore.basics import ifnone

In [10]:
import torch
import torch.nn as nn
from fastai.layers import Embedding
from fastai.torch_core import Module, to_detach
from fastai.basics import ifnone
from fastai.torch_core import tensor

class EmbeddingDotBias(Module):
    "Base dot model for collaborative filtering."
    def __init__(self, n_factors, n_users, n_items, y_range=None):
        self.y_range = y_range
        (self.u_weight, self.i_weight, self.u_bias, self.i_bias) = [Embedding(*o) for o in [
            (n_users, n_factors), (n_items, n_factors), (n_users,1), (n_items,1)
        ]]

    def forward(self, x):
        users,items = x[:,0],x[:,1]
        dot = self.u_weight(users)* self.i_weight(items)
        res = dot.sum(1) + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()
        if self.y_range is None: return res
        return torch.sigmoid(res) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]

    @classmethod
    def from_classes(cls, n_factors, classes, user=None, item=None, y_range=None):
        "Build a model with `n_factors` by inferring `n_users` and  `n_items` from `classes`"
        if user is None: user = list(classes.keys())[0]
        if item is None: item = list(classes.keys())[1]
        res = cls(n_factors, len(classes[user]), len(classes[item]), y_range=y_range)
        res.classes,res.user,res.item = classes,user,item
        return res

    def _get_idx(self, arr, is_item=True):
        "Fetch item or user (based on `is_item`) for all in `arr`"
        assert hasattr(self, 'classes'), "Build your model with `EmbeddingDotBias.from_classes` to use this functionality."
        classes = self.classes[self.item] if is_item else self.classes[self.user]
        c2i = {v:k for k,v in enumerate(classes)}
        try: return tensor([c2i[o] for o in arr])
        except KeyError as e:
            message = f"You're trying to access {'an item' if is_item else 'a user'} that isn't in the training data. If it was in your original data, it may have been split such that it's only in the validation set now."
            raise modify_exception(e, message, replace=True)

    def bias(self, arr, is_item=True):
        "Bias for item or user (based on `is_item`) for all in `arr`"
        idx = self._get_idx(arr, is_item)
        layer = (self.i_bias if is_item else self.u_bias).eval().cpu()
        return to_detach(layer(idx).squeeze(),gather=False)

    def weight(self, arr, is_item=True):
        "Weight for item or user (based on `is_item`) for all in `arr`"
        idx = self._get_idx(arr, is_item)
        layer = (self.i_weight if is_item else self.u_weight).eval().cpu()
        return to_detach(layer(idx),gather=False)

In [88]:
y_range=[0,5.5]

In [89]:
# 3. Create model
n_factors = 40
model = EmbeddingDotBias.from_classes(
    n_factors=n_factors,
    classes=data.classes,
    user='userID',
    item='itemID',
    y_range=y_range
)

In [90]:
import torch
import torch.nn as nn
import torch.optim as optim
from fastai.losses import MSELossFlat

In [93]:
# 4. Setup training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), 
    lr=1e-3,
    betas=(0.9, 0.99),
    eps=1e-5,
    weight_decay=0.01
)
loss_fn = MSELossFlat()

In [94]:
device

device(type='cpu')

In [51]:

# 5. Training loop
def train_epoch(model, train_dl, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in train_dl:
        # Get batch data
        users_items, ratings = batch
        users_items = users_items.to(device)
        ratings = ratings.to(device)
        
        # Forward pass
        predictions = model(users_items)
        loss = loss_fn(predictions, ratings)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(train_dl)

def validate(model, valid_dl, loss_fn, device):
    model.eval()
    total_loss = 0
    try:
        with torch.no_grad():
            for batch in valid_dl:
                users_items, ratings = batch
                users_items = users_items.to(device)
                ratings = ratings.to(device)
                
                predictions = model(users_items)
                loss = loss_fn(predictions, ratings)
                total_loss += loss.item()
        
        return total_loss / len(valid_dl)
    except ZeroDivisionError:
        return None

# 6. Train the model
n_epochs = 7
for epoch in range(n_epochs):
    train_loss = train_epoch(model, data.train, optimizer, loss_fn, device)
    valid_loss = validate(model, data.valid, loss_fn, device)
    print(f'Epoch {epoch+1}/{n_epochs}:')
    print(f'Train Loss: {train_loss}')
    print(f'Valid Loss: {valid_loss}')

# 7. Make prediction

Epoch 1/7:
Train Loss: 1.3133113299315293
Valid Loss: None
Epoch 2/7:
Train Loss: 0.8673861778179126
Valid Loss: None
Epoch 3/7:
Train Loss: 0.7958553408752529
Valid Loss: None
Epoch 4/7:
Train Loss: 0.7252955946041455
Valid Loss: None
Epoch 5/7:
Train Loss: 0.6661141418149447
Valid Loss: None
Epoch 6/7:
Train Loss: 0.6091437903026266
Valid Loss: None
Epoch 7/7:
Train Loss: 0.5478217573550374
Valid Loss: None


In [52]:
def predict_rating(model, user_id, item_id):
    model.eval()
    with torch.no_grad():
        try:
            print(f"Trying to predict for user {user_id} and item {item_id}")
            print(f"Available users: {model.classes[USER][:5]}...")  # Print first 5 users
            print(f"Available items: {model.classes[ITEM][:5]}...")  # Print first 5 items
            
            # Convert user and item IDs to indices
            user_idx = model._get_idx([user_id], is_item=False)
            item_idx = model._get_idx([item_id], is_item=True)
            
            # Create input tensor
            x = torch.stack([user_idx, item_idx], dim=1).to(device)
            
            # Get prediction
            pred = model(x)
            return pred.item()
        except Exception as e:
            print(f"Error in prediction: {str(e)}")
            return None

In [53]:
model.classes[USER][:5]

(#5) ['#na#','1','10','100','101']

In [54]:
model.classes[ITEM][:5]

(#5) ['#na#','1','10','100','1000']

In [55]:
total_items = model.classes[ITEM][1:]
total_users = model.classes[USER][1:]

In [56]:
total_users

(#943) ['1','10','100','101','102','103','104','105','106','107','108','109','11','110','111','112','113','114','115','116'...]

In [57]:
total_items

(#1682) ['1','10','100','1000','1001','1002','1003','1004','1005','1006','1007','1008','1009','101','1010','1011','1012','1013','1014','1015'...]

In [58]:
# Get all users from the test set and remove any users that were know in the training set 
test_users = test_df[USER].unique()

In [59]:
test_users

array(['57', '141', '184', '15', '486', '738', '843', '18', '434', '761',
       '692', '777', '913', '38', '472', '295', '311', '174', '938',
       '458', '246', '560', '487', '545', '389', '715', '537', '521',
       '930', '177', '865', '365', '468', '882', '263', '79', '83', '664',
       '663', '505', '540', '411', '933', '674', '6', '350', '327', '895',
       '742', '465', '710', '293', '460', '387', '390', '454', '339',
       '84', '463', '577', '941', '42', '265', '77', '868', '234', '344',
       '471', '20', '81', '137', '550', '503', '621', '330', '676', '280',
       '374', '854', '419', '916', '648', '745', '210', '727', '716',
       '200', '554', '549', '896', '934', '885', '13', '262', '459',
       '150', '689', '160', '438', '66', '735', '654', '253', '893',
       '649', '562', '838', '887', '5', '41', '817', '45', '618', '268',
       '279', '181', '790', '416', '506', '514', '565', '423', '906',
       '378', '500', '312', '862', '518', '49', '733', '104', '373'

In [60]:
test_users = np.intersect1d(test_users, total_users)

In [61]:
test_users

array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115',
       '116', '117', '118', '119', '12', '120', '121', '122', '123',
       '124', '125', '126', '127', '128', '129', '13', '130', '131',
       '132', '133', '134', '135', '136', '137', '138', '139', '14',
       '140', '141', '142', '143', '144', '145', '146', '147', '148',
       '149', '15', '150', '151', '152', '153', '154', '155', '156',
       '157', '158', '159', '16', '160', '161', '162', '163', '164',
       '165', '166', '167', '168', '169', '17', '170', '171', '172',
       '173', '174', '175', '176', '177', '178', '179', '18', '180',
       '181', '182', '183', '184', '185', '186', '187', '188', '189',
       '19', '190', '191', '192', '193', '194', '195', '196', '197',
       '198', '199', '2', '20', '200', '201', '202', '203', '204', '205',
       '206', '207', '208', '209', '21', '210', '211', '212', '213',
       '214', '215', '

In [62]:

user_embeddings = model.weight(['1', '10'], is_item=False)

In [63]:
user_embeddings

tensor([[-0.1048,  0.3249, -0.2006,  0.2101,  0.2772, -0.5145,  0.2459,  0.2221,
          0.4579, -0.2994, -0.0732,  0.2747,  0.2688,  0.0817,  0.2250, -0.0390,
          0.3556,  0.1781, -0.3167, -0.1528,  0.1455, -0.3536, -0.1514,  0.0931,
         -0.1596, -0.3471,  0.3498, -0.1267, -0.3370, -0.1636, -0.2451,  0.1978,
         -0.3226,  0.1124,  0.4129, -0.0341,  0.1606, -0.3904, -0.1992,  0.1793],
        [-0.0316, -0.0526, -0.1815,  0.1249,  0.2481, -0.0657,  0.1219,  0.2211,
         -0.2178, -0.1097, -0.2226,  0.1309,  0.1486,  0.0750,  0.2260, -0.1479,
          0.1439,  0.2649, -0.1586, -0.0115,  0.1220, -0.0798,  0.0195,  0.1279,
         -0.0204, -0.1194,  0.2746, -0.0832,  0.0102, -0.3449, -0.2135,  0.1516,
          0.1070,  0.2587,  0.0801,  0.1401, -0.0299,  0.3534, -0.1786,  0.2045]])

In [64]:

# Example prediction
user_id = "1"
item_id = "10"
predicted_rating = predict_rating(model, user_id, item_id)
print(f'\nPredicted rating for user {user_id} and item {item_id}: {predicted_rating}')

Trying to predict for user 1 and item 10
Available users: ['#na#', '1', '10', '100', '101']...
Available items: ['#na#', '1', '10', '100', '1000']...

Predicted rating for user 1 and item 10: 4.051078796386719


In [65]:
np.array(test_users)

array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115',
       '116', '117', '118', '119', '12', '120', '121', '122', '123',
       '124', '125', '126', '127', '128', '129', '13', '130', '131',
       '132', '133', '134', '135', '136', '137', '138', '139', '14',
       '140', '141', '142', '143', '144', '145', '146', '147', '148',
       '149', '15', '150', '151', '152', '153', '154', '155', '156',
       '157', '158', '159', '16', '160', '161', '162', '163', '164',
       '165', '166', '167', '168', '169', '17', '170', '171', '172',
       '173', '174', '175', '176', '177', '178', '179', '18', '180',
       '181', '182', '183', '184', '185', '186', '187', '188', '189',
       '19', '190', '191', '192', '193', '194', '195', '196', '197',
       '198', '199', '2', '20', '200', '201', '202', '203', '204', '205',
       '206', '207', '208', '209', '21', '210', '211', '212', '213',
       '214', '215', '

In [66]:
np.array(total_items)

array(['1', '10', '100', ..., '997', '998', '999'],
      shape=(1682,), dtype='<U4')

In [67]:
users_items = cartesian_product(np.array(test_users),np.array(total_items))

In [68]:
users_items

array([['1', '1'],
       ['1', '10'],
       ['1', '100'],
       ...,
       ['99', '997'],
       ['99', '998'],
       ['99', '999']], shape=(1586126, 2), dtype=object)

In [69]:
users_items = pd.DataFrame(users_items, columns=[USER,ITEM])

In [70]:
users_items

Unnamed: 0,userID,itemID
0,1,1
1,1,10
2,1,100
3,1,1000
4,1,1001
...,...,...
1586121,99,995
1586122,99,996
1586123,99,997
1586124,99,998


In [71]:
training_removed = pd.merge(users_items, train_valid_df.astype(str), on=[USER, ITEM], how='left')
training_removed = training_removed[training_removed[RATING].isna()][[USER, ITEM]]

In [72]:
training_removed

Unnamed: 0,userID,itemID
3,1,1000
4,1,1001
5,1,1002
6,1,1003
7,1,1004
...,...,...
1586121,99,995
1586122,99,996
1586123,99,997
1586124,99,998


In [73]:
from recommenders.utils import constants as cc

In [74]:
total_users, total_items = data.classes.values()

In [75]:
total_users

['#na#', '1', '10', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '14', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '15', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '16', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '17', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '18', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '19', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '2', '20', '200', '201', '202', '203', '204', '205', '206', '207', '208', '209', '21', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '22', '220', '221', '222', '223', '224', '225', '226', '227', '228', 

In [76]:
def score(
    learner,
    data,
    test_df,
    user_col=cc.DEFAULT_USER_COL,
    item_col=cc.DEFAULT_ITEM_COL,
    prediction_col=cc.DEFAULT_PREDICTION_COL,
    top_k=None,
):
    """Score all users+items provided and reduce to top_k items per user if top_k>0

    Args:
        learner (object): Model.
        test_df (pandas.DataFrame): Test dataframe.
        user_col (str): User column name.
        item_col (str): Item column name.
        prediction_col (str): Prediction column name.
        top_k (int): Number of top items to recommend.

    Returns:
        pandas.DataFrame: Result of recommendation
    """
    # replace values not known to the model with NaN
    total_users, total_items = data.classes.values()
    test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan
    test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan

    # map ids to embedding ids
    u = learner._get_idx(test_df[user_col], is_item=False)
    m = learner._get_idx(test_df[item_col], is_item=True)

    # score the pytorch model
    x = torch.column_stack((u, m))

    if torch.cuda.is_available():
        x = x.to("cuda")
        learner = learner.to("cuda")

    pred = learner.forward(x).detach().cpu().numpy()
    scores = pd.DataFrame(
        {user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred}
    )
    scores = scores.sort_values([user_col, prediction_col], ascending=[True, False])

    if top_k is not None:
        top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True)
    else:
        top_scores = scores

    return top_scores

In [77]:
with Timer() as test_time:
    top_k_scores = score(model, 
                         data, # Agregado pero podria quitarlo 
                         test_df=training_removed,
                         user_col=USER, 
                         item_col=ITEM, 
                         prediction_col=PREDICTION)

In [78]:
top_k_scores

Unnamed: 0,userID,itemID,prediction
1026,1,408,5.119030
760,1,169,5.117696
1109,1,483,5.072352
500,1,1449,4.981776
1596,1,921,4.971748
...,...,...,...
1585858,99,758,1.940612
1586100,99,976,1.886269
1585524,99,457,1.866316
1585408,99,352,1.776408


In [79]:
eval_map = map(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
               col_rating=RATING, col_prediction=PREDICTION, 
               relevancy_method="top_k", k=TOP_K)

In [80]:
eval_ndcg = ndcg_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
                      col_rating=RATING, col_prediction=PREDICTION, 
                      relevancy_method="top_k", k=TOP_K)

In [81]:
eval_precision = precision_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
                                col_rating=RATING, col_prediction=PREDICTION, 
                                relevancy_method="top_k", k=TOP_K)

In [82]:
eval_recall = recall_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
                          col_rating=RATING, col_prediction=PREDICTION, 
                          relevancy_method="top_k", k=TOP_K)

In [83]:
print("Model:\t\t" + model.__class__.__name__,
      "Top K:\t\t%d" % TOP_K,
      "MAP:\t\t%f" % eval_map,
      "NDCG:\t\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

Model:		EmbeddingDotBias
Top K:		10
MAP:		0.018972
NDCG:		0.122750
Precision@K:	0.111983
Recall@K:	0.044390


In [101]:
#### Delete from Here 