In [52]:
import pandas as pd
df = pd.read_json("../../Dateset/raw/ratings.json", lines=True)
df.head()
df = df.reset_index(drop=True)


In [59]:
import torch
from torch.utils.data import Dataset

class MovieLensDataset(Dataset):
    """
    The movie lens dataset class, this class prepares the dataset for training and validation.
    """
    def __init__(self, dataframe):
        """
        init the dataset object with users, movies and rating data.
        :param users: the users id
        :param movies: the movies id
        :param ratings: the rating data by users on movie 
        """
        self.dataframe = dataframe
    def __len__(self):
        """
        returns the total number of smapels in the dataset
        :return: len of dataset
        """
        return len(self.dataframe)
    def __getitem__(self, index):
        """
        Fetches a sample from the dataset.
        """
        row = self.dataframe.iloc[index]
        if index >= len(self.dataframe):
            raise IndexError(f"Index {index} out of range for dataset of size {len(self.dataframe)}")

        return {
            'user_id': torch.tensor(row['user_id'], dtype=torch.long),
            'item_id': torch.tensor(row['item_id'], dtype=torch.long),
            'rating': torch.tensor(row['rating'], dtype=torch.float)
        }
        
        

In [60]:
import torch.nn as nn
class RecommendationSystemModel(nn.Module):
    def __init__(
            self,
            num_users,
            num_movies,
            embedding_size=256,
            hidden_size=256,
            dropout=0.2,
    ):
        super(RecommendationSystemModel, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_users = num_users
        self.num_movies = num_movies
        
        self.user_embedding = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_size)
        self.movie_embedding = nn.Embedding(num_embeddings=self.num_movies, embedding_dim=self.embedding_size)
        
        self.fc1 =nn.Linear(2*self.embedding_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, 1)
        
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()
        
    def forward(self, users, movies):
        user_embedding = self.user_embedding(users)
        movie_embedding = self.movie_embedding(movies)
        
        combined = torch.cat((user_embedding, movie_embedding), 1)
        x = self.relu(self.fc1(combined))
        x = self.dropout(x)
        output = self.fc2(x)
        return output
       

In [61]:
from sklearn.preprocessing import LabelEncoder

le_user = LabelEncoder()
le_movie = LabelEncoder()
df.user_id = le_user.fit_transform(df['user_id'].values)
df.item_id = le_movie.fit_transform(df['item_id'].values)

In [62]:
from sklearn.model_selection import train_test_split
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42,stratify=df.rating.values)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

print(df_train.shape,"\n", df_train)

(22792092, 3) 
           item_id  user_id  rating
0             313   110092     2.0
1           10713   154036     2.5
2             719   116187     3.0
3            1069   115011     3.0
4           46163   215847     1.5
...           ...      ...     ...
22792087      345     7197     4.0
22792088     3939   200478     4.5
22792089     2322    15224     3.5
22792090    10267   229139     4.0
22792091     6277   142352     4.0

[22792092 rows x 3 columns]


In [63]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32
# 
# train_dataset = MovieLensDataset(df_train)
# val_dataset = MovieLensDataset(df_val)
# 
# train_loader = DataLoader(df_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)
# val_loader = DataLoader(df_val, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

# Rename columns if necessary
df_train = df_train.rename(columns={
    'user_id': 'user_id',
    'item_id': 'item_id',
    'rating': 'rating'
})
df_val = df_val.rename(columns={
    'user_id': 'user_id',
    'item_id': 'item_id',
    'rating': 'rating'
})

# Initialize dataset and DataLoader
train_dataset = MovieLensDataset(df_train)
val_dataset = MovieLensDataset(df_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

# Debug DataLoader
for i, train_data in enumerate(train_loader):
    print(train_data)
    break


{'user_id': tensor([155629,  55460,  77677, 149120,  23016,  70673,  70737,  33933,  19275,
         85593, 204305,  59232,  46208, 146515, 163904,  11374,  86568,  67646,
         83555, 191952, 190291,   8702,  90644, 229724, 230021, 116708,  41637,
        215599,    671, 218041,  30914, 102355]), 'item_id': tensor([22751,   640, 12504,  4372,  5874,  2652,   257,  3074,  6241,  6443,
          315,  1198,   428,  5031,  7979,  1244,  1575,   297,  2312,  1232,
         8270,  2429,  3010,  1007, 16404,  7267,  2030, 30726, 18163,  1223,
         2639,   843]), 'rating': tensor([4.5000, 4.0000, 4.5000, 4.0000, 3.5000, 3.0000, 5.0000, 5.0000, 3.5000,
        0.5000, 5.0000, 5.0000, 0.5000, 3.0000, 3.0000, 2.0000, 2.0000, 3.0000,
        3.0000, 4.5000, 4.0000, 2.0000, 2.0000, 4.0000, 2.5000, 4.5000, 4.0000,
        5.0000, 4.0000, 4.5000, 3.0000, 4.0000])}


In [64]:
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

mps


In [66]:
import sys
recommendation_model = RecommendationSystemModel(
    num_users=len(le_user.classes_),
    num_movies=len(le_movie.classes_),
    embedding_size=128,
    hidden_size=256,
    dropout=0.1,
).to(device)

optimizer = torch.optim.Adam(
    recommendation_model.parameters(),
    lr=1e-3,
)
loss_fn = nn.MSELoss()

EPOCHS = 2

def log_progress(epoch, step, total_loss, log_progress_step, data_size, losses):
    avg_loss = total_loss / log_progress_step
    sys.stderr.write(
        f"\r{epoch+1:02d}/{EPOCHS:02d}  | step: {step}/{data_size} | avg_loss: {avg_loss:.4f}"
    )
    sys.stderr.flush()
    losses.append(avg_loss)
total_loss =0
log_progress_step = 100
losses = []
train_dataset_size = len(df_train)
print(f"traning on {train_dataset_size} samples")

recommendation_model.train()
for e in range(EPOCHS):
    step_count = 0
    for i, train_data in enumerate(train_loader):
        outputs = recommendation_model(train_data['user_id'].to(device), train_data['item_id'].to(device))
        outputs = outputs.squeeze()
        ratings = (
            train_data["rating"].to(torch.float32).to(device)
        )
        loss = loss_fn(outputs,ratings)
        total_loss+=loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step_count+=len(train_data['user_id'])
        
        if(
            step_count%log_progress_step==0 or i == len(train_loader)-1
        ):
            log_progress(
                e, step_count, total_loss,log_progress_step,train_dataset_size, losses
            )
            total_loss = 0

traning on 22792092 samples


01/02  | step: 7711200/22792092 | avg_loss: 0.2111

KeyboardInterrupt: 

KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/anaconda3/envs/Movie_rec/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 2602010

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/anaconda3/envs/Movie_rec/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/Movie_rec/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/Movie_rec/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/opt/anaconda3/envs/Movie_rec/lib/python3.11/site-packages/pandas/core/frame.py", line 4102, in __getitem__
    indexer = self.columns.get_loc(key)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/Movie_rec/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 2602010
