In [1]:
# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import numpy as np
import pandas as pd
import torch
#import fastai
from tempfile import TemporaryDirectory

#from fastai.collab import collab_learner, CollabDataLoaders, load_learner
#from fastai.collab import collab_learner, load_learner

from recommenders.utils.constants import (
    DEFAULT_USER_COL as USER, 
    DEFAULT_ITEM_COL as ITEM, 
    DEFAULT_RATING_COL as RATING, 
    DEFAULT_TIMESTAMP_COL as TIMESTAMP, 
    DEFAULT_PREDICTION_COL as PREDICTION
) 
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.fastai.fastai_utils import cartesian_product, score
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
#print("Fast AI version: {}".format(fastai.__version__))
print("Torch version: {}".format(torch.__version__))
print("CUDA Available: {}".format(torch.cuda.is_available()))
print("CuDNN Enabled: {}".format(torch.backends.cudnn.enabled))

System version: 3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]
Pandas version: 2.2.3
Torch version: 2.6.0+cu124
CUDA Available: False
CuDNN Enabled: True


In [2]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters
N_FACTORS = 40
EPOCHS = 5

In [3]:
ratings_df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=[USER,ITEM,RATING,TIMESTAMP]
)

# make sure the IDs are loaded as strings to better prevent confusion with embedding ids
ratings_df[USER] = ratings_df[USER].astype('str')
ratings_df[ITEM] = ratings_df[ITEM].astype('str')

ratings_df.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 16.0kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [4]:
# Split the dataset
train_valid_df, test_df = python_stratified_split(
    ratings_df, 
    ratio=0.75, 
    min_rating=1, 
    filter_by="item", 
    col_user=USER, 
    col_item=ITEM
)

In [5]:
train_valid_df

Unnamed: 0,userID,itemID,rating,timestamp
10047,94,1,4.0,885870323
44185,620,1,5.0,889987954
82784,779,1,4.0,875501555
83281,399,1,4.0,882340657
69124,864,1,5.0,877214125
...,...,...,...,...
77891,429,999,2.0,882387163
31448,393,999,4.0,889730187
7847,125,999,4.0,892838288
42623,476,999,2.0,883365385


In [6]:
# Remove "cold" users (users with no interactions in the training data) from the test set.
# The column is addressed through the shared USER constant, like everywhere else in
# this notebook, instead of the hard-coded `.userID` attribute.
test_df = test_df[test_df[USER].isin(train_valid_df[USER])]

## Training

In [7]:
# Fix random seeds to make sure our runs are reproducible.
# NOTE: CollabDataLoadersV3.from_df (defined below) re-seeds torch, `random` and
# numpy with its own `seed` argument (default 42), so once it has been called the
# global RNG state no longer derives from the value set here.
np.random.seed(101)  # NumPy global RNG
torch.manual_seed(101)  # PyTorch default generator
torch.cuda.manual_seed_all(101)  # generators of every CUDA device

In [8]:
############# DRAFT ##############

In [9]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import random

class CollabDatasetV3(Dataset):
    """Map-style dataset yielding ``([user_idx, item_idx], [rating])`` samples.

    Args:
        users: Array-like of integer user indices.
        items: Array-like of integer item indices.
        ratings: Array-like of ratings.

    The inputs are first coerced through NumPy (int64 for the indices, float32
    for the ratings) and then stored as ``long`` / ``float`` tensors.
    """

    def __init__(self, users, items, ratings):
        # (attribute name, raw values, NumPy dtype, torch dtype)
        columns = (
            ("users", users, np.int64, torch.long),
            ("items", items, np.int64, torch.long),
            ("ratings", ratings, np.float32, torch.float),
        )
        for attr, values, np_dtype, torch_dtype in columns:
            # NumPy first: normalizes the container and the element type
            as_array = np.array(values, dtype=np_dtype)
            setattr(self, attr, torch.tensor(as_array, dtype=torch_dtype))

    def __len__(self):
        # One sample per rating
        return len(self.ratings)

    def __getitem__(self, idx):
        # Input: user and item index stacked along a new leading dimension
        pair = torch.stack([self.users[idx], self.items[idx]], dim=0)
        # Target: the rating with an extra leading dimension of size 1
        target = self.ratings[idx][None]
        return pair, target


class CollabDataLoadersV3:
    """Train/validation ``DataLoader`` pair for collaborative filtering.

    Instances created through :meth:`from_df` additionally carry:

    * ``classes``: dict mapping the user and item column names to the sorted
      list of raw ids (as strings). The position in the list is the embedding
      index and position 0 is the ``'#na#'`` placeholder.
    * ``user`` / ``item``: the user and item column names.
    * ``n_users`` / ``n_items``: vocabulary sizes, ``'#na#'`` included.
    * ``user2idx`` / ``item2idx``: raw id -> embedding index dictionaries.
    """

    def __init__(self, train_dl, valid_dl=None):
        """Initialize the dataloaders.

        Args:
            train_dl (DataLoader): Training dataloader
            valid_dl (DataLoader, optional): Validation dataloader
        """
        self.train = train_dl
        self.valid = valid_dl

    @classmethod
    def from_df(cls, ratings, valid_pct=0.2, user_name=None, item_name=None,
                rating_name=None, seed=42, batch_size=64, **kwargs):
        """Create DataLoaders from a pandas DataFrame for collaborative filtering.

        Args:
            ratings (pandas.DataFrame): One row per (user, item, rating) interaction.
                The frame itself is never modified.
            valid_pct (float): Fraction of rows held out for validation, in ``[0, 1)``.
                ``int(len(ratings) * valid_pct)`` rows are held out; when that is 0
                no validation loader is created.
            user_name (str, optional): User column; defaults to the first column.
            item_name (str, optional): Item column; defaults to the second column.
            rating_name (str, optional): Rating column; defaults to the third column.
            seed (int): Seed applied to the global torch, ``random`` and numpy RNGs.
            batch_size (int): Training batch size; validation uses twice this value.
            **kwargs: Forwarded to both ``DataLoader`` constructors.

        Returns:
            CollabDataLoadersV3: Loaders plus the id vocabularies (see class docstring).

        Raises:
            ValueError: On empty input, missing columns, no rows left after dropping
                NaNs, or a ``valid_pct`` outside ``[0, 1)`` / leaving no training rows.
        """
        if ratings is None or len(ratings) == 0:
            raise ValueError("Input DataFrame is empty")

        # A negative fraction used to slip through and silently produce a tiny
        # training set with no validation set; reject it up front.
        if not 0 <= valid_pct < 1:
            raise ValueError(f"valid_pct must be in the range [0, 1), got {valid_pct}")

        # NOTE: these calls seed the *global* generators, so they also affect
        # everything that runs after this method (e.g. model initialization).
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

        # Fall back to positional columns when names are not given
        user_name = user_name or ratings.columns[0]
        item_name = item_name or ratings.columns[1]
        rating_name = rating_name or ratings.columns[2]

        missing = [col for col in (user_name, item_name, rating_name) if col not in ratings.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        # Drop any rows with NaN values (returns a new frame)
        ratings = ratings.dropna(subset=[user_name, item_name, rating_name])
        if len(ratings) == 0:
            raise ValueError("No valid data after dropping NaN values")

        # Ids are handled as strings so that the vocabulary has one consistent,
        # sortable type regardless of the input dtype.
        user_ids = ratings[user_name].astype(str)
        item_ids = ratings[item_name].astype(str)

        # Plain string sort with the '#na#' placeholder at index 0
        sorted_users = ['#na#'] + sorted(user_ids.unique().tolist())
        sorted_items = ['#na#'] + sorted(item_ids.unique().tolist())

        user2idx = {u: i for i, u in enumerate(sorted_users)}
        item2idx = {i: idx for idx, i in enumerate(sorted_items)}

        # Encode into standalone arrays instead of writing back into the frame:
        # no object-dtype columns holding ints and no SettingWithCopy hazards.
        # Every id is in the vocabulary by construction; fillna is a safety net
        # that sends anything unmapped to the '#na#' index.
        user_codes = user_ids.map(user2idx).fillna(user2idx['#na#']).to_numpy(dtype=np.int64)
        item_codes = item_ids.map(item2idx).fillna(item2idx['#na#']).to_numpy(dtype=np.int64)
        rating_values = ratings[rating_name].astype(np.float32).to_numpy()

        # Split into train and validation
        n = len(ratings)
        n_valid = int(n * valid_pct)
        if n_valid >= n:
            raise ValueError(f"Validation percentage {valid_pct} is too high. {n} total items, {n_valid} requested for validation leaves {n - n_valid} for training.")

        indices = list(range(n))
        random.shuffle(indices)
        train_idx = indices[n_valid:]
        valid_idx = indices[:n_valid]

        train_ds = CollabDatasetV3(
            user_codes[train_idx], item_codes[train_idx], rating_values[train_idx]
        )
        # The batch size is capped at the dataset size
        train_dl = DataLoader(
            train_ds,
            batch_size=min(batch_size, len(train_ds)),
            shuffle=True,
            **kwargs
        )

        valid_dl = None
        if n_valid > 0:
            valid_ds = CollabDatasetV3(
                user_codes[valid_idx], item_codes[valid_idx], rating_values[valid_idx]
            )
            valid_dl = DataLoader(
                valid_ds,
                batch_size=min(batch_size * 2, len(valid_ds)),
                shuffle=False,
                **kwargs
            )

        # Create instance and store metadata
        dl = cls(train_dl, valid_dl)
        dl.classes = {
            user_name: sorted_users,
            item_name: sorted_items
        }
        dl.user = user_name
        dl.item = item_name
        # Sizes include the '#na#' entry
        dl.n_users = len(sorted_users)
        dl.n_items = len(sorted_items)
        dl.user2idx = user2idx
        dl.item2idx = item2idx

        return dl

    def show_batch(self, n=5):
        """Print up to `n` decoded (user, item, rating) rows from one training batch.

        Requires the ``classes``, ``user`` and ``item`` attributes that
        :meth:`from_df` sets on the instance.
        """
        print("Showing a sample batch:")
        batch = next(iter(self.train), None)
        if batch is None:
            # Empty loader: nothing to show
            return

        # user_item_batch has one (user, item) index pair per row;
        # ratings_batch has one rating per row.
        user_item_batch, ratings_batch = batch
        users = user_item_batch[:n, 0].numpy()
        items = user_item_batch[:n, 1].numpy()
        # reshape(-1) always yields a 1-D array, even for a single row
        ratings = ratings_batch[:n].numpy().reshape(-1)

        user_labels = self.classes[self.user]
        item_labels = self.classes[self.item]
        df = pd.DataFrame({
            self.user: [user_labels[u] for u in users],
            self.item: [item_labels[i] for i in items],
            'rating': ratings
        })

        # Report the number of rows actually shown (a batch may hold fewer than n)
        print(f"Showing {len(df)} examples from a batch:")
        print(df)



In [10]:
# Build the train/validation DataLoaders and time the preprocessing.
# from_df holds out int(valid_pct * n_rows) rows for validation, so valid_pct=0.001
# keeps almost all of train_valid_df for training.
with Timer() as preprocess_time:
    data = CollabDataLoadersV3.from_df(train_valid_df, 
                                     user_name=USER, 
                                     item_name=ITEM, 
                                     rating_name=RATING, 
                                     valid_pct=0.001) 

In [11]:
data.show_batch()

Showing a sample batch:
Showing 5 examples from a batch:
  userID itemID  rating
0    505    161     3.0
1    500   1010     4.0
2    172    430     3.0
3    880    380     3.0
4    158      4     4.0


In [12]:
data.classes['userID']

['#na#',
 '1',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '11',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '12',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '13',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '139',
 '14',
 '140',
 '141',
 '142',
 '143',
 '144',
 '145',
 '146',
 '147',
 '148',
 '149',
 '15',
 '150',
 '151',
 '152',
 '153',
 '154',
 '155',
 '156',
 '157',
 '158',
 '159',
 '16',
 '160',
 '161',
 '162',
 '163',
 '164',
 '165',
 '166',
 '167',
 '168',
 '169',
 '17',
 '170',
 '171',
 '172',
 '173',
 '174',
 '175',
 '176',
 '177',
 '178',
 '179',
 '18',
 '180',
 '181',
 '182',
 '183',
 '184',
 '185',
 '186',
 '187',
 '188',
 '189',
 '19',
 '190',
 '191',
 '192',
 '193',
 '194',
 '195',
 '196',
 '197',
 '198',
 '199',
 '2',
 '20',
 '200',
 '201',
 '202',
 '203',
 '204',
 '205',
 '206',
 '207',
 '208',
 '209',
 '21',
 '210',
 '211',

In [16]:
len(data.classes.get("itemID"))

1683

In [11]:
######################################

In [None]:
####### DEBUG ############

In [18]:
data.show_batch()

Showing a sample batch:
Showing 5 examples from a batch:
  userID itemID  rating
0    757    405     4.0
1    158    562     4.0
2    144     68     2.0
3    547    269     3.0
4    919    813     4.0


In [19]:
######################

In [20]:
data.show_batch()

Showing a sample batch:
Showing 5 examples from a batch:
  userID itemID  rating
0    921    181     5.0
1    862    651     5.0
2    798    110     4.0
3    442    168     4.0
4    650    635     3.0


In [21]:
#from fastai.tabular.model import get_emb_sz, tabular_config
#from fastai.losses import MSELossFlat
from torch.nn import MSELoss

In [22]:
def trunc_normal_(x, mean=0., std=1.):
    """Fill `x` in place with an approximate truncated normal and return it.

    Standard-normal samples are folded with ``fmod(2)``, then scaled by `std`
    and shifted by `mean`.
    """
    # Approach taken from
    # https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    x.normal_()   # draw standard-normal samples
    x.fmod_(2)    # keep the remainder of division by 2
    x.mul_(std)   # scale
    x.add_(mean)  # shift
    return x

In [23]:
import torch
import torch.nn as nn
#from fastai.layers import Embedding
from torch.nn import Embedding
#from fastai.torch_core import Module, to_detach
from torch.nn import Module
#from fastai.torch_core import tensor
import torch.nn.init as init

class EmbeddingDotBias(Module):
    """Base dot model for collaborative filtering.

    The prediction for a (user, item) pair is the dot product of their factor
    embeddings plus a per-user and a per-item bias. When `y_range` is given,
    the raw score is passed through a sigmoid and rescaled to that range.
    """
    def __init__(self, n_factors, n_users, n_items, y_range=None):
        """
        Args:
            n_factors (int): Size of the user and item factor embeddings.
            n_users (int): Number of rows in the user embedding tables.
            n_items (int): Number of rows in the item embedding tables.
            y_range (sequence of 2 numbers, optional): ``(low, high)`` output range.
        """
        super().__init__()
        self.y_range = y_range
        # Creation order is kept (u_weight, i_weight, u_bias, i_bias) so that a
        # fixed seed keeps producing the same initial weights.
        self.u_weight = Embedding(n_users, n_factors)
        self.i_weight = Embedding(n_items, n_factors)
        self.u_bias = Embedding(n_users, 1)
        self.i_bias = Embedding(n_items, 1)

        # Initialize with truncated normal
        for emb in (self.u_weight, self.i_weight, self.u_bias, self.i_bias):
            init.trunc_normal_(emb.weight, std=0.01)

    def forward(self, x):
        """Score a batch; column 0 of `x` holds user indices, column 1 item indices.

        Returns:
            torch.Tensor: One prediction per row of `x`.
        """
        users, items = x[:, 0], x[:, 1]
        dot = (self.u_weight(users) * self.i_weight(items)).sum(1)
        # squeeze(-1) removes only the trailing bias dimension of size 1
        res = dot + self.u_bias(users).squeeze(-1) + self.i_bias(items).squeeze(-1)
        if self.y_range is None:
            return res
        low, high = self.y_range[0], self.y_range[1]
        return torch.sigmoid(res) * (high - low) + low

    @classmethod
    def from_classes(cls, n_factors, classes, user=None, item=None, y_range=None):
        "Build a model with `n_factors` by inferring `n_users` and  `n_items` from `classes`"
        # Default to the first / second key of `classes`
        if user is None: user = list(classes.keys())[0]
        if item is None: item = list(classes.keys())[1]
        res = cls(n_factors, len(classes[user]), len(classes[item]), y_range=y_range)
        res.classes, res.user, res.item = classes, user, item
        return res

    def _get_idx(self, arr, is_item=True):
        """Fetch item or user (based on `is_item`) embedding indices for all in `arr`.

        Raises:
            KeyError: If an element of `arr` is not part of the model's classes.
        """
        assert hasattr(self, 'classes'), "Build your model with `EmbeddingDotBias.from_classes` to use this functionality."
        classes = self.classes[self.item] if is_item else self.classes[self.user]
        c2i = {v: k for k, v in enumerate(classes)}
        try:
            # Explicit dtype so that an empty `arr` still yields an index tensor
            return torch.tensor([c2i[o] for o in arr], dtype=torch.long)
        except KeyError as e:
            message = f"You're trying to access {'an item' if is_item else 'a user'} that isn't in the training data. If it was in your original data, it may have been split such that it's only in the validation set now."
            # Re-raise as a KeyError carrying the explanatory message. The original
            # code called a helper (`modify_exception`) that is not defined here.
            raise KeyError(message) from e

    def _lookup(self, layer, idx):
        "Evaluate embedding `layer` at `idx` without tracking gradients; result is on the CPU."
        # The indices are moved to the layer's device. The layer itself is neither
        # moved nor switched to eval mode, so the model stays on a single device.
        with torch.no_grad():
            return layer(idx.to(layer.weight.device)).cpu()

    def bias(self, arr, is_item=True):
        "Bias for item or user (based on `is_item`) for all in `arr`"
        idx = self._get_idx(arr, is_item)
        layer = self.i_bias if is_item else self.u_bias
        return self._lookup(layer, idx).squeeze()

    def weight(self, arr, is_item=True):
        "Weight for item or user (based on `is_item`) for all in `arr`"
        idx = self._get_idx(arr, is_item)
        layer = self.i_weight if is_item else self.u_weight
        return self._lookup(layer, idx)

In [24]:
classes = data.classes['itemID']

In [25]:
data.classes

{'userID': ['#na#',
  '1',
  '10',
  '100',
  '101',
  '102',
  '103',
  '104',
  '105',
  '106',
  '107',
  '108',
  '109',
  '11',
  '110',
  '111',
  '112',
  '113',
  '114',
  '115',
  '116',
  '117',
  '118',
  '119',
  '12',
  '120',
  '121',
  '122',
  '123',
  '124',
  '125',
  '126',
  '127',
  '128',
  '129',
  '13',
  '130',
  '131',
  '132',
  '133',
  '134',
  '135',
  '136',
  '137',
  '138',
  '139',
  '14',
  '140',
  '141',
  '142',
  '143',
  '144',
  '145',
  '146',
  '147',
  '148',
  '149',
  '15',
  '150',
  '151',
  '152',
  '153',
  '154',
  '155',
  '156',
  '157',
  '158',
  '159',
  '16',
  '160',
  '161',
  '162',
  '163',
  '164',
  '165',
  '166',
  '167',
  '168',
  '169',
  '17',
  '170',
  '171',
  '172',
  '173',
  '174',
  '175',
  '176',
  '177',
  '178',
  '179',
  '18',
  '180',
  '181',
  '182',
  '183',
  '184',
  '185',
  '186',
  '187',
  '188',
  '189',
  '19',
  '190',
  '191',
  '192',
  '193',
  '194',
  '195',
  '196',
  '197',
  '198',
  

In [26]:
y_range=[0,5.5]

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
#from fastai.losses import MSELossFlat
from torch.nn import MSELoss 

In [28]:
classes = data.classes

In [29]:
# 3. Create model
# The embedding size comes from the N_FACTORS setting and the column names from the
# shared USER / ITEM constants (the same ones used to build `data.classes`), so the
# model cannot drift from the notebook configuration.
n_factors = N_FACTORS
model = EmbeddingDotBias.from_classes(
    n_factors=n_factors,
    classes=data.classes,
    user=USER,
    item=ITEM,
    y_range=y_range
)


In [30]:
model.item

'itemID'

In [31]:

# 4. Setup training
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# AdamW over all model parameters.
# NOTE(review): these hyper-parameters presumably mirror the optimizer defaults of
# the fastai learner this notebook replaces - TODO confirm.
optimizer = optim.AdamW(model.parameters(), 
    lr=1e-3,
    betas=(0.9, 0.99),
    eps=1e-5,
    weight_decay=0.01
)

# Plain mean-squared-error loss; the training loop flattens predictions and
# targets before calling it.
#loss_fn = MSELossFlat()
loss_fn = MSELoss()
#loss_fn = MSELossFlat2()

In [32]:
device

device(type='cpu')

In [33]:

# 5. Training loop
def train_epoch(model, train_dl, optimizer, loss_fn, device):
    """Run one optimization pass over `train_dl`.

    Args:
        model: Module called on the input batch.
        train_dl: Iterable of ``(inputs, targets)`` batches that supports ``len``.
        optimizer: Optimizer stepping the model parameters.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        device: Device the batches are moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for users_items, ratings in train_dl:
        inputs = users_items.to(device)
        targets = ratings.to(device)

        # Predictions and targets are flattened so their shapes always agree
        loss = loss_fn(model(inputs).view(-1), targets.view(-1))

        # Standard step: clear old gradients, backpropagate, update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(train_dl)

def validate(model, valid_dl, loss_fn, device):
    """Compute the mean per-batch loss of `model` over `valid_dl` without gradients.

    Args:
        model: Module called on the input batch; it is switched to eval mode.
        valid_dl: Iterable of ``(inputs, targets)`` batches that supports ``len``,
            or None when no validation data is available.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        device: Device the batches are moved to.

    Returns:
        float or None: Sum of the per-batch losses divided by the number of
        batches, or None when there is no validation loader or it has no batches.
    """
    model.eval()

    # No validation data: report None instead of failing on iteration over None
    # or on a division by zero.
    if valid_dl is None or len(valid_dl) == 0:
        return None

    total_loss = 0
    with torch.no_grad():
        for users_items, ratings in valid_dl:
            users_items = users_items.to(device)
            ratings = ratings.to(device)

            predictions = model(users_items)
            # Flatten both sides so their shapes always agree
            loss = loss_fn(predictions.view(-1), ratings.view(-1))
            total_loss += loss.item()

    return total_loss / len(valid_dl)

# 6. Train the model
# NOTE(review): the epoch count is hard-coded to 7 here although the configuration
# cell defines EPOCHS = 5 - confirm which value is intended.
n_epochs = 7
for epoch in range(n_epochs):
    # One optimization pass over the training loader, then a loss estimate on the
    # validation loader; both results are printed per epoch.
    train_loss = train_epoch(model, data.train, optimizer, loss_fn, device)
    valid_loss = validate(model, data.valid, loss_fn, device)
    print(f'Epoch {epoch+1}/{n_epochs}:')
    print(f'Train Loss: {train_loss}')
    print(f'Valid Loss: {valid_loss}')

# 7. Make prediction

Epoch 1/7:
Train Loss: 1.3487142637219445
Valid Loss: 0.8316531181335449
Epoch 2/7:
Train Loss: 0.8984370516174482
Valid Loss: 0.7407376170158386
Epoch 3/7:
Train Loss: 0.8195157784701614
Valid Loss: 0.7237364649772644
Epoch 4/7:
Train Loss: 0.7633327331406672
Valid Loss: 0.746097207069397
Epoch 5/7:
Train Loss: 0.7117874205061601
Valid Loss: 0.7170356512069702
Epoch 6/7:
Train Loss: 0.6573844272579756
Valid Loss: 0.7437258958816528
Epoch 7/7:
Train Loss: 0.598891092343542
Valid Loss: 0.7339410781860352


In [34]:
def predict_rating(model, user_id, item_id):
    """Predict the rating of a single (user, item) pair.

    Everything needed is read from `model` itself (its `classes`, `user` and
    `item` attributes and the device of its parameters), so the function does
    not depend on notebook-level globals.

    Args:
        model: Model exposing `classes`, `user`, `item`, `_get_idx` and `parameters`
            (e.g. one built with `EmbeddingDotBias.from_classes`). It is switched
            to eval mode.
        user_id: Raw user id, as stored in ``model.classes[model.user]``.
        item_id: Raw item id, as stored in ``model.classes[model.item]``.

    Returns:
        float or None: The predicted rating, or None when the prediction failed
        (for instance because an id is unknown); the error is printed in that case.
    """
    model.eval()
    with torch.no_grad():
        try:
            print(f"Trying to predict for user {user_id} and item {item_id}")
            print(f"Available users: {model.classes[model.user][:5]}...")  # Print first 5 users
            print(f"Available items: {model.classes[model.item][:5]}...")  # Print first 5 items

            # Convert user and item IDs to indices
            user_idx = model._get_idx([user_id], is_item=False)
            item_idx = model._get_idx([item_id], is_item=True)

            # Run on whatever device the model lives on (CPU if it has no parameters)
            first_param = next(model.parameters(), None)
            target_device = first_param.device if first_param is not None else torch.device("cpu")

            # Input with one row: [user index, item index]
            x = torch.stack([user_idx, item_idx], dim=1).to(target_device)

            pred = model(x)
            return pred.item()
        except Exception as e:
            # Deliberate best effort: report the problem and signal failure with None
            print(f"Error in prediction: {str(e)}")
            return None

In [35]:
model.classes[USER][:5]

['#na#', '1', '10', '100', '101']

In [36]:
model.classes[ITEM][:5]

['#na#', '1', '10', '100', '1000']

In [37]:
total_items = model.classes[ITEM][1:]
total_users = model.classes[USER][1:]

In [38]:
len(total_users)

943

In [39]:
total_items

['1',
 '10',
 '100',
 '1000',
 '1001',
 '1002',
 '1003',
 '1004',
 '1005',
 '1006',
 '1007',
 '1008',
 '1009',
 '101',
 '1010',
 '1011',
 '1012',
 '1013',
 '1014',
 '1015',
 '1016',
 '1017',
 '1018',
 '1019',
 '102',
 '1020',
 '1021',
 '1022',
 '1023',
 '1024',
 '1025',
 '1026',
 '1027',
 '1028',
 '1029',
 '103',
 '1030',
 '1031',
 '1032',
 '1033',
 '1034',
 '1035',
 '1036',
 '1037',
 '1038',
 '1039',
 '104',
 '1040',
 '1041',
 '1042',
 '1043',
 '1044',
 '1045',
 '1046',
 '1047',
 '1048',
 '1049',
 '105',
 '1050',
 '1051',
 '1052',
 '1053',
 '1054',
 '1055',
 '1056',
 '1057',
 '1058',
 '1059',
 '106',
 '1060',
 '1061',
 '1062',
 '1063',
 '1064',
 '1065',
 '1066',
 '1067',
 '1068',
 '1069',
 '107',
 '1070',
 '1071',
 '1072',
 '1073',
 '1074',
 '1075',
 '1076',
 '1077',
 '1078',
 '1079',
 '108',
 '1080',
 '1081',
 '1082',
 '1083',
 '1084',
 '1085',
 '1086',
 '1087',
 '1088',
 '1089',
 '109',
 '1090',
 '1091',
 '1092',
 '1093',
 '1094',
 '1095',
 '1096',
 '1097',
 '1098',
 '1099',
 '11',


In [40]:
total_items

['1',
 '10',
 '100',
 '1000',
 '1001',
 '1002',
 '1003',
 '1004',
 '1005',
 '1006',
 '1007',
 '1008',
 '1009',
 '101',
 '1010',
 '1011',
 '1012',
 '1013',
 '1014',
 '1015',
 '1016',
 '1017',
 '1018',
 '1019',
 '102',
 '1020',
 '1021',
 '1022',
 '1023',
 '1024',
 '1025',
 '1026',
 '1027',
 '1028',
 '1029',
 '103',
 '1030',
 '1031',
 '1032',
 '1033',
 '1034',
 '1035',
 '1036',
 '1037',
 '1038',
 '1039',
 '104',
 '1040',
 '1041',
 '1042',
 '1043',
 '1044',
 '1045',
 '1046',
 '1047',
 '1048',
 '1049',
 '105',
 '1050',
 '1051',
 '1052',
 '1053',
 '1054',
 '1055',
 '1056',
 '1057',
 '1058',
 '1059',
 '106',
 '1060',
 '1061',
 '1062',
 '1063',
 '1064',
 '1065',
 '1066',
 '1067',
 '1068',
 '1069',
 '107',
 '1070',
 '1071',
 '1072',
 '1073',
 '1074',
 '1075',
 '1076',
 '1077',
 '1078',
 '1079',
 '108',
 '1080',
 '1081',
 '1082',
 '1083',
 '1084',
 '1085',
 '1086',
 '1087',
 '1088',
 '1089',
 '109',
 '1090',
 '1091',
 '1092',
 '1093',
 '1094',
 '1095',
 '1096',
 '1097',
 '1098',
 '1099',
 '11',


In [41]:
# Get all users that appear in the test set (restricted to users known from training in the next step)
test_users = test_df[USER].unique()

In [42]:
test_users

array(['57', '141', '184', '15', '486', '738', '843', '18', '434', '761',
       '692', '777', '913', '38', '472', '295', '311', '174', '938',
       '458', '246', '560', '487', '545', '389', '715', '537', '521',
       '930', '177', '865', '365', '468', '882', '263', '79', '83', '664',
       '663', '505', '540', '411', '933', '674', '6', '350', '327', '895',
       '742', '465', '710', '293', '460', '387', '390', '454', '339',
       '84', '463', '577', '941', '42', '265', '77', '868', '234', '344',
       '471', '20', '81', '137', '550', '503', '621', '330', '676', '280',
       '374', '854', '419', '916', '648', '745', '210', '727', '716',
       '200', '554', '549', '896', '934', '885', '13', '262', '459',
       '150', '689', '160', '438', '66', '735', '654', '253', '893',
       '649', '562', '838', '887', '5', '41', '817', '45', '618', '268',
       '279', '181', '790', '416', '506', '514', '565', '423', '906',
       '378', '500', '312', '862', '518', '49', '733', '104', '373'

In [43]:
test_users = np.intersect1d(test_users, total_users)

In [44]:
test_users

array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115',
       '116', '117', '118', '119', '12', '120', '121', '122', '123',
       '124', '125', '126', '127', '128', '129', '13', '130', '131',
       '132', '133', '134', '135', '136', '137', '138', '139', '14',
       '140', '141', '142', '143', '144', '145', '146', '147', '148',
       '149', '15', '150', '151', '152', '153', '154', '155', '156',
       '157', '158', '159', '16', '160', '161', '162', '163', '164',
       '165', '166', '167', '168', '169', '17', '170', '171', '172',
       '173', '174', '175', '176', '177', '178', '179', '18', '180',
       '181', '182', '183', '184', '185', '186', '187', '188', '189',
       '19', '190', '191', '192', '193', '194', '195', '196', '197',
       '198', '199', '2', '20', '200', '201', '202', '203', '204', '205',
       '206', '207', '208', '209', '21', '210', '211', '212', '213',
       '214', '215', '

In [45]:
test_users

array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115',
       '116', '117', '118', '119', '12', '120', '121', '122', '123',
       '124', '125', '126', '127', '128', '129', '13', '130', '131',
       '132', '133', '134', '135', '136', '137', '138', '139', '14',
       '140', '141', '142', '143', '144', '145', '146', '147', '148',
       '149', '15', '150', '151', '152', '153', '154', '155', '156',
       '157', '158', '159', '16', '160', '161', '162', '163', '164',
       '165', '166', '167', '168', '169', '17', '170', '171', '172',
       '173', '174', '175', '176', '177', '178', '179', '18', '180',
       '181', '182', '183', '184', '185', '186', '187', '188', '189',
       '19', '190', '191', '192', '193', '194', '195', '196', '197',
       '198', '199', '2', '20', '200', '201', '202', '203', '204', '205',
       '206', '207', '208', '209', '21', '210', '211', '212', '213',
       '214', '215', '

In [46]:

user_embeddings = model.weight(['1', '10'], is_item=False)

In [47]:
user_embeddings

tensor([[-0.1083,  0.0905, -0.2598, -0.2011, -0.2167,  0.3186, -0.4005, -0.2391,
         -0.5461, -0.2792, -0.2281, -0.3078, -0.0830,  0.4561, -0.1771, -0.1391,
         -0.2941,  0.2496, -0.0533,  0.3777,  0.0588,  0.2409, -0.0387, -0.1984,
          0.0218,  0.3842, -0.0915,  0.0910, -0.1608,  0.0711,  0.2534, -0.2032,
          0.1467,  0.2070, -0.3984,  0.2221,  0.0216, -0.1842,  0.1961,  0.3593],
        [-0.2927,  0.1166, -0.1005, -0.0473, -0.1500,  0.0769,  0.0102, -0.1799,
         -0.0429, -0.2102, -0.1506, -0.1559, -0.2458,  0.1731,  0.1617, -0.0912,
          0.0615,  0.0348,  0.2621,  0.1188,  0.1640, -0.0959, -0.0403, -0.2300,
         -0.1632, -0.1429,  0.0424,  0.1144,  0.1771,  0.1764,  0.1228, -0.0860,
         -0.2225,  0.0685, -0.1919,  0.1486, -0.0976, -0.1591, -0.0258,  0.1433]])

In [48]:
# Example prediction
user_id = "1"
item_id = "10"
predicted_rating = predict_rating(model, user_id, item_id)
print(f'\nPredicted rating for user {user_id} and item {item_id}: {predicted_rating}')

Trying to predict for user 1 and item 10
Available users: ['#na#', '1', '10', '100', '101']...
Available items: ['#na#', '1', '10', '100', '1000']...

Predicted rating for user 1 and item 10: 4.2375335693359375


In [49]:
np.array(test_users)

array(['1', '10', '100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '11', '110', '111', '112', '113', '114', '115',
       '116', '117', '118', '119', '12', '120', '121', '122', '123',
       '124', '125', '126', '127', '128', '129', '13', '130', '131',
       '132', '133', '134', '135', '136', '137', '138', '139', '14',
       '140', '141', '142', '143', '144', '145', '146', '147', '148',
       '149', '15', '150', '151', '152', '153', '154', '155', '156',
       '157', '158', '159', '16', '160', '161', '162', '163', '164',
       '165', '166', '167', '168', '169', '17', '170', '171', '172',
       '173', '174', '175', '176', '177', '178', '179', '18', '180',
       '181', '182', '183', '184', '185', '186', '187', '188', '189',
       '19', '190', '191', '192', '193', '194', '195', '196', '197',
       '198', '199', '2', '20', '200', '201', '202', '203', '204', '205',
       '206', '207', '208', '209', '21', '210', '211', '212', '213',
       '214', '215', '

In [50]:
np.array(total_items)

array(['1', '10', '100', ..., '997', '998', '999'],
      shape=(1682,), dtype='<U4')

In [51]:
users_items = cartesian_product(np.array(test_users),np.array(total_items))

In [52]:
users_items

array([['1', '1'],
       ['1', '10'],
       ['1', '100'],
       ...,
       ['99', '997'],
       ['99', '998'],
       ['99', '999']], shape=(1586126, 2), dtype=object)

In [53]:
users_items = pd.DataFrame(users_items, columns=[USER,ITEM])

In [54]:
users_items

Unnamed: 0,userID,itemID
0,1,1
1,1,10
2,1,100
3,1,1000
4,1,1001
...,...,...
1586121,99,995
1586122,99,996
1586123,99,997
1586124,99,998


In [55]:
# Anti-join: keep only the candidate (user, item) pairs that do not occur in the
# training data. The ids are compared as strings, like the candidate pairs.
seen_pairs = train_valid_df[[USER, ITEM]].astype(str)
merged_pairs = users_items.merge(seen_pairs, on=[USER, ITEM], how='left', indicator=True)
# 'left_only' marks candidate pairs without a matching training row
training_removed = merged_pairs.loc[merged_pairs['_merge'] == 'left_only', [USER, ITEM]]

In [56]:
training_removed

Unnamed: 0,userID,itemID
3,1,1000
4,1,1001
5,1,1002
6,1,1003
7,1,1004
...,...,...
1586121,99,995
1586122,99,996
1586123,99,997
1586124,99,998


In [57]:
from recommenders.utils import constants as cc

In [58]:
total_users, total_items = data.classes.values()

In [59]:
total_users

['#na#',
 '1',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '11',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '12',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '13',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '139',
 '14',
 '140',
 '141',
 '142',
 '143',
 '144',
 '145',
 '146',
 '147',
 '148',
 '149',
 '15',
 '150',
 '151',
 '152',
 '153',
 '154',
 '155',
 '156',
 '157',
 '158',
 '159',
 '16',
 '160',
 '161',
 '162',
 '163',
 '164',
 '165',
 '166',
 '167',
 '168',
 '169',
 '17',
 '170',
 '171',
 '172',
 '173',
 '174',
 '175',
 '176',
 '177',
 '178',
 '179',
 '18',
 '180',
 '181',
 '182',
 '183',
 '184',
 '185',
 '186',
 '187',
 '188',
 '189',
 '19',
 '190',
 '191',
 '192',
 '193',
 '194',
 '195',
 '196',
 '197',
 '198',
 '199',
 '2',
 '20',
 '200',
 '201',
 '202',
 '203',
 '204',
 '205',
 '206',
 '207',
 '208',
 '209',
 '21',
 '210',
 '211',

In [60]:
def score(
    learner,
    data,
    test_df,
    user_col=cc.DEFAULT_USER_COL,
    item_col=cc.DEFAULT_ITEM_COL,
    prediction_col=cc.DEFAULT_PREDICTION_COL,
    top_k=None,
):
    """Score all users+items provided and optionally reduce to top_k items per user.

    The input dataframe is not modified: ids unknown to the model are replaced
    with NaN on an internal copy only.

    Args:
        learner (object): Model. Must provide ``_get_idx(values, is_item=...)`` and
            ``forward(x)``; it is moved to CUDA when CUDA is available.
        data (object): Object whose ``classes.values()`` unpacks into the known
            user ids and the known item ids, in that order.
        test_df (pandas.DataFrame): Test dataframe holding the user/item pairs to score.
        user_col (str): User column name.
        item_col (str): Item column name.
        prediction_col (str): Prediction column name.
        top_k (int): Number of top items to keep per user. If None, every scored
            row is returned.

    Returns:
        pandas.DataFrame: Result of recommendation, sorted by user (ascending) and
        prediction (descending), with columns user_col, item_col and prediction_col.
    """
    total_users, total_items = data.classes.values()

    # Work on a private copy of the id columns so the caller's dataframe is untouched
    ids = test_df[[user_col, item_col]].copy()

    # replace values not known to the model with NaN
    ids.loc[~ids[user_col].isin(total_users), user_col] = np.nan
    ids.loc[~ids[item_col].isin(total_items), item_col] = np.nan

    # map ids to embedding ids
    u = learner._get_idx(ids[user_col], is_item=False)
    m = learner._get_idx(ids[item_col], is_item=True)

    # score the pytorch model: one (user_idx, item_idx) row per pair
    x = torch.column_stack((u, m))

    if torch.cuda.is_available():
        x = x.to("cuda")
        learner = learner.to("cuda")

    # Inference only: skip building the autograd graph for the forward pass
    with torch.no_grad():
        pred = learner.forward(x).detach().cpu().numpy()

    scores = pd.DataFrame(
        {user_col: ids[user_col], item_col: ids[item_col], prediction_col: pred}
    )
    scores = scores.sort_values([user_col, prediction_col], ascending=[True, False])

    if top_k is not None:
        # rows are already sorted best-first within each user
        top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True)
    else:
        top_scores = scores

    return top_scores

In [61]:
# Score every unseen (user, item) candidate and time it. A copy is passed, as in
# the rating-metrics scoring cell, so `training_removed` is never altered by scoring.
with Timer() as test_time:
    top_k_scores = score(model,
                         data,  # added for the local score(); could be removed
                         test_df=training_removed.copy(),
                         user_col=USER,
                         item_col=ITEM,
                         prediction_col=PREDICTION)

In [62]:
top_k_scores

Unnamed: 0,userID,itemID,prediction
760,1,169,5.118164
1026,1,408,5.043024
926,1,318,4.909151
1109,1,483,4.870055
1343,1,694,4.862435
...,...,...,...
1585643,99,564,1.869777
1585337,99,289,1.868608
1584542,99,1087,1.852386
1585506,99,440,1.810928


In [63]:
# MAP over each user's TOP_K highest-scored items, compared with the held-out test ratings
eval_map = map(
    test_df,
    top_k_scores,
    k=TOP_K,
    relevancy_method="top_k",
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
)

In [64]:
# NDCG over each user's TOP_K highest-scored items, compared with the held-out test ratings
eval_ndcg = ndcg_at_k(
    test_df,
    top_k_scores,
    k=TOP_K,
    relevancy_method="top_k",
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
)

In [65]:
# Precision@K over each user's TOP_K highest-scored items, compared with the held-out test ratings
eval_precision = precision_at_k(
    test_df,
    top_k_scores,
    k=TOP_K,
    relevancy_method="top_k",
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
)

In [66]:
# Recall@K over each user's TOP_K highest-scored items, compared with the held-out test ratings
eval_recall = recall_at_k(
    test_df,
    top_k_scores,
    k=TOP_K,
    relevancy_method="top_k",
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
)

In [67]:
# Ranking-metric report, one "label<TAB>value" line per metric
print("\n".join((
    "Model:\t\t" + type(model).__name__,
    "Top K:\t\t%d" % TOP_K,
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
)))

Model:		EmbeddingDotBias
Top K:		10
MAP:		0.021065
NDCG:		0.135539
Precision@K:	0.124708
Recall@K:	0.049484


In [68]:
# Predict a rating for every (user, item) pair of the test set.
# No top_k is given, so all scored rows are kept; a copy of test_df is passed in.
scores = score(
    model,
    data,
    prediction_col=PREDICTION,
    item_col=ITEM,
    user_col=USER,
    test_df=test_df.copy(),
)

In [70]:
# Rating-prediction metrics: predicted scores vs. the true test ratings
eval_rmse = rmse(
    test_df, scores,
    col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION,
)
eval_mae = mae(
    test_df, scores,
    col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION,
)
eval_exp_var = exp_var(
    test_df, scores,
    col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION,
)
eval_r2 = rsquared(
    test_df, scores,
    col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION,
)

# Rating-metric report, one "label<TAB>value" line per metric
print("\n".join((
    "Model:\t\t\t" + type(model).__name__,
    "RMSE:\t\t\t%f" % eval_rmse,
    "MAE:\t\t\t%f" % eval_mae,
    "Explained variance:\t%f" % eval_exp_var,
    "R squared:\t\t%f" % eval_r2,
)))

Model:			EmbeddingDotBias
RMSE:			0.912432
MAE:			0.716623
Explained variance:	0.343944
R squared:		0.343943
