In [1]:
import torch
from fontTools.subset import subset

if torch.cuda.is_available():
    device = torch.device('cuda')  # Use the GPU if available
else:
    device = torch.device('cpu')  # Fall back to CPU
print(device)


cuda


In [2]:
import pandas as pd
df = pd.read_csv("../../Data/raw/ratings.csv")
df.head()
df = df.reset_index(drop=True)
df

Unnamed: 0,item_id,user_id,rating
0,5,997206,3.0
1,10,997206,4.0
2,13,997206,4.0
3,17,997206,5.0
4,21,997206,4.0
...,...,...,...
28490111,104211,187144,1.5
28490112,104243,187144,3.5
28490113,104374,187144,3.5
28490114,104841,187144,3.0


In [19]:
import torch
from torch.utils.data import Dataset

class MovieLensDataset(Dataset):
    """
    The movie lens dataset class, this class prepares the dataset for training and validation.
    """
    def __init__(self, dataframe):
        """
        init the dataset object with users, movies and rating data.
        :param users: the users id
        :param movies: the movies id
        :param ratings: the rating data by users on movie 
        """
        self.dataframe = dataframe
        self.user_id = dataframe['user_id']
        self.movie_id = dataframe['item_id']
        self.ratings = dataframe['rating']
    def __len__(self):
        """
        returns the total number of smapels in the dataset
        :return: len of dataset
        """
        return len(self.dataframe)
    def __getitem__(self, index):
        """
        Fetches a sample from the dataset.
        """
        row = self.dataframe.iloc[index]
        if index >= len(self.dataframe):
            raise IndexError(f"Index {index} out of range for dataset of size {len(self.dataframe)}")

        return {
            'user_id': torch.tensor(row['user_id'], dtype=torch.long),
            'item_id': torch.tensor(row['item_id'], dtype=torch.long),
            'rating': torch.tensor(row['rating'], dtype=torch.float)
        }
        
        

In [11]:
import torch.nn as nn
class RecommendationSystemModel(nn.Module):
    def __init__(
            self,
            num_users,
            num_movies,
            embedding_size=256,
            hidden_size=256,
            dropout=0.2,
    ):
        super(RecommendationSystemModel, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_users = num_users
        self.num_movies = num_movies
        
        self.user_embedding = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_size)
        self.movie_embedding = nn.Embedding(num_embeddings=self.num_movies, embedding_dim=self.embedding_size)
        
        self.fc1 =nn.Linear(2*self.embedding_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()
        
    def forward(self, users, movies):
        user_embedding = self.user_embedding(users)
        movie_embedding = self.movie_embedding(movies)
        
        combined = torch.cat((user_embedding, movie_embedding), 1)
        x = self.relu(self.fc1(combined))
        x = self.dropout(x)
        x =self.relu( self.fc2(x))
        x = self.fc3(x)
        output = x.squeeze()
        return output
       

In [12]:
from sklearn.preprocessing import LabelEncoder

le_user = LabelEncoder()
le_movie = LabelEncoder()
df.user_id = le_user.fit_transform(df['user_id'].values)
df.item_id = le_movie.fit_transform(df['item_id'].values)

In [13]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42,stratify=df.rating.values)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

print(df_train.shape,"\n", df_train)

(22792092, 3) 
           item_id  user_id  rating
0             313   110092     2.0
1           10713   154036     2.5
2             719   116187     3.0
3            1069   115011     3.0
4           46163   215847     1.5
...           ...      ...     ...
22792087      345     7197     4.0
22792088     3939   200478     4.5
22792089     2322    15224     3.5
22792090    10267   229139     4.0
22792091     6277   142352     4.0

[22792092 rows x 3 columns]


In [20]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32
# 
# train_dataset = MovieLensDataset(df_train)
# val_dataset = MovieLensDataset(df_val)
# 
# train_loader = DataLoader(df_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)
# val_loader = DataLoader(df_val, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)
# df_train = df_train.reset_index(drop=True)
# df_val = df_val.reset_index(drop=True)

# Rename columns if necessary
df_train = df_train.rename(columns={
    'user_id': 'user_id',
    'item_id': 'item_id',
    'rating': 'rating'
})
df_val = df_val.rename(columns={
    'user_id': 'user_id',
    'item_id': 'item_id',
    'rating': 'rating'
})

# Initialize dataset and DataLoader
train_dataset = MovieLensDataset(df_train)
val_dataset = MovieLensDataset(df_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

# Debug DataLoader
for i, train_data in enumerate(train_loader):
    print(train_data)
    break


{'user_id': tensor([202160,   1598, 124286, 113701, 110417, 112373, 152932,  77929, 242401,
         99132,  60555,  37861, 203750, 140879, 154862, 182585,  16429,  70192,
        138093, 219306, 174366, 127113, 191849, 136339,  34763, 138761, 186884,
         63391,  19674,  79363,  14492,  45004]), 'item_id': tensor([ 2381, 19936, 13031, 10670,  1197,  9623,  4403,  2893,  1240,  8154,
          366,   602,   843,  2619,  2043, 11099,   476,   109,  6885,  3212,
         5924, 22655,  1764, 12241,  7805, 22136,  3344,   353,   602, 26113,
         2619, 12862]), 'rating': tensor([2.0000, 3.0000, 2.0000, 1.5000, 3.0000, 4.5000, 3.0000, 3.5000, 5.0000,
        3.5000, 2.0000, 3.0000, 3.5000, 4.0000, 4.0000, 3.5000, 4.0000, 4.0000,
        3.5000, 4.0000, 3.0000, 4.0000, 5.0000, 4.0000, 4.0000, 3.0000, 3.0000,
        3.0000, 3.0000, 4.0000, 4.5000, 3.0000])}


In [21]:
#todo delete after checking if the codes works or not and how much is RMSE values

import sys
from torch.utils.data import Subset
import random

SUB_SIZE = int(len(train_dataset) * 0.2)

indices = list(range(len(train_dataset)))
random.shuffle(indices)
sub_indices = indices[:SUB_SIZE]

subset_train_dataset = Subset(train_loader.dataset, sub_indices)
subset_train_loader = torch.utils.data.DataLoader(subset_train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

recommendation_model = RecommendationSystemModel(
    num_users=len(le_user.classes_),
    num_movies=len(le_movie.classes_),
    embedding_size=256,
    hidden_size=256,
    dropout=0.1,
).to(device)

optimizer = torch.optim.Adam(
    recommendation_model.parameters(),
    lr=1e-3,
)
loss_fn = nn.MSELoss()

EPOCHS = 2

def log_progress(epoch, step, total_loss, log_progress_step, data_size, losses):
    avg_loss = total_loss / log_progress_step
    sys.stderr.write(
        f"\r{epoch+1:02d}/{EPOCHS:02d}  | step: {step}/{data_size} | avg_loss: {avg_loss:.4f}"
    )
    sys.stderr.flush()
    losses.append(avg_loss)
total_loss =0
log_progress_step = 100
losses = []
train_dataset_size = len(df_train)
print(f"traning on {train_dataset_size} samples")

recommendation_model.train()
for e in range(EPOCHS):
    step_count = 0
    for i, train_data in enumerate(subset_train_loader):
        outputs = recommendation_model(train_data['user_id'].to(device), train_data['item_id'].to(device))
        outputs = outputs.squeeze()
        ratings = (
            train_data["rating"].to(torch.float32).to(device)
        )
        loss = loss_fn(outputs,ratings)
        total_loss+=loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step_count+=len(train_data['user_id'])
        
        if(
            step_count%log_progress_step==0 or i == len(train_loader)-1
        ):
            log_progress(
                e, step_count, total_loss,log_progress_step,train_dataset_size, losses
            )
            total_loss = 0

traning on 22792092 samples


02/02  | step: 4558400/22792092 | avg_loss: 0.2033

In [23]:
from sklearn.metrics import mean_squared_error
import numpy as np

def calculate_rmse(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    all_predictions = []
    all_actuals = []
    with torch.no_grad():  # No need to compute gradients for evaluation
        for batch in data_loader:
            user_ids = batch['user_id'].to(device)
            item_ids = batch['item_id'].to(device)
            ratings = batch['rating'].to(torch.float32).to(device)
            
            # Get predictions
            predictions = model(user_ids, item_ids).squeeze()
            all_predictions.extend(predictions.cpu().numpy())
            all_actuals.extend(ratings.cpu().numpy())
    
    # Compute RMSE
    mse = mean_squared_error(all_actuals, all_predictions)
    rmse = np.sqrt(mse)
    return rmse

# After training, calculate RMSE on the validation dataset
validation_rmse = calculate_rmse(recommendation_model, val_loader)
print(f"Validation RMSE: {validation_rmse:.4f}")


Validation RMSE: 0.8973


In [10]:
import sys
recommendation_model = RecommendationSystemModel(
    num_users=len(le_user.classes_),
    num_movies=len(le_movie.classes_),
    embedding_size=128,
    hidden_size=256,
    dropout=0.1,
).to(device)

optimizer = torch.optim.Adam(
    recommendation_model.parameters(),
    lr=1e-3,
)
loss_fn = nn.MSELoss()

EPOCHS = 2

def log_progress(epoch, step, total_loss, log_progress_step, data_size, losses):
    avg_loss = total_loss / log_progress_step
    sys.stderr.write(
        f"\r{epoch+1:02d}/{EPOCHS:02d}  | step: {step}/{data_size} | avg_loss: {avg_loss:.4f}"
    )
    sys.stderr.flush()
    losses.append(avg_loss)
total_loss =0
log_progress_step = 100
losses = []
train_dataset_size = len(df_train)
print(f"traning on {train_dataset_size} samples")

recommendation_model.train()
for e in range(EPOCHS):
    step_count = 0
    for i, train_data in enumerate(train_loader):
        outputs = recommendation_model(train_data['user_id'].to(device), train_data['item_id'].to(device))
        outputs = outputs.squeeze()
        ratings = (
            train_data["rating"].to(torch.float32).to(device)
        )
        loss = loss_fn(outputs,ratings)
        total_loss+=loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step_count+=len(train_data['user_id'])
        
        if(
            step_count%log_progress_step==0 or i == len(train_loader)-1
        ):
            log_progress(
                e, step_count, total_loss,log_progress_step,train_dataset_size, losses
            )
            total_loss = 0

traning on 22792092 samples


02/02  | step: 22792092/22792092 | avg_loss: 0.0201

In [22]:

import torch

# Save the model state_dict with just the filename
torch.save(recommendation_model.state_dict(), "recommendation_model.pth")

print("Model saved as 'recommendation_model.pth' in the current directory.")


Model saved as 'recommendation_model.pth' in the current directory.


In [12]:
from sklearn.metrics import mean_squared_error

y_pred = []
y_true = []

recommendation_model.eval()

with torch.no_grad():
    for i, valid_data in enumerate(val_loader):
        output = recommendation_model(
            valid_data["user_id"].to(device), valid_data["item_id"].to(device)
        )
        ratings = valid_data["rating"].to(device)
        y_pred.extend(output.cpu().numpy())
        y_true.extend(ratings.cpu().numpy())

# Calculate RMSE
rms = mean_squared_error(y_true, y_pred, squared=False)
print(f"RMSE: {rms:.4f}")

RMSE: 0.8677


In [None]:
from collections import defaultdict

def calculate_precision_recall(user_ratings, k, threshold):
    user_ratings.sort(key=lambda x: x[0], reverse=True)
    n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
    n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
    n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold) for est, true_r in user_ratings[:k])

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    return precision, recall

user_ratings_comparison = defaultdict(list)

with torch.no_grad():
    for valid_data in val_loader:
        users = valid_data["user_id"].to(device)
        movies = valid_data["item_id"].to(device)
        ratings = valid_data["rating"].to(device)
        output = recommendation_model(users, movies)

        for user, pred, true in zip(users, output, ratings):
            user_ratings_comparison[user.item()].append((pred[0].item(), true.item()))

user_precisions = dict()
user_based_recalls = dict()

k = 50
threshold = 3

for user_id, user_ratings in user_ratings_comparison.items():
    precision, recall = calculate_precision_recall(user_ratings, k, threshold)
    user_precisions[user_id] = precision
    user_based_recalls[user_id] = recall


average_precision = sum(prec for prec in user_precisions.values()) / len(user_precisions)
average_recall = sum(rec for rec in user_based_recalls.values()) / len(user_based_recalls)

print(f"precision @ {k}: {average_precision:.4f}")
print(f"recall @ {k}: {average_recall:.4f}")
