In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# Seed every random number generator the notebook relies on: NumPy drives the
# user subsampling and negative sampling, while PyTorch initialises the model
# weights. Seeding only NumPy leaves the weight initialisation different on
# every run.
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:

# Load the ratings file and keep only the first 10,000 rows so the rest of the
# notebook runs quickly. Malformed CSV lines are skipped by pandas.
try:
    ratings = pd.read_csv('rating.csv', parse_dates=['timestamp'], on_bad_lines='skip')
    ratings = ratings.head(10000)
    print("Dataset loaded successfully!")
    print(ratings.head())
    print(f"Total rows loaded:{len(ratings)}")
except FileNotFoundError:
    print("Error:'rating.csv' not found. please ensure it is in the same directory as this notebook.")
    # Re-raise so execution stops here: every later cell needs `ratings`, and
    # swallowing the error would either fail later with a confusing NameError
    # or silently reuse a stale `ratings` left over in the kernel.
    raise
except Exception as e:
    print(f"An unexpected error occured during file reading:{e}")
    # Same reasoning as above: do not continue without a freshly loaded frame.
    raise

Dataset loaded successfully!
   userId  movieId  rating           timestamp
0       1        2     3.5 2005-04-02 23:53:47
1       1       29     3.5 2005-04-02 23:31:16
2       1       32     3.5 2005-04-02 23:33:39
3       1       47     3.5 2005-04-02 23:32:07
4       1       50     3.5 2005-04-02 23:29:40
Total rows loaded:10000


In [None]:
# Keep a random 30% of the users (sampled without replacement) to shrink the dataset.
unique_userIds = ratings['userId'].unique()
rand_userIds = np.random.choice(unique_userIds, size=int(len(unique_userIds)*0.3), replace=False)

# .copy() makes the filtered frame independent of the full one, so the column
# assignment done on `ratings` in a later cell does not operate on a slice
# (which triggers pandas' SettingWithCopyWarning).
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

There are 3116 rows of data from 27 users


In [None]:
# Display five randomly chosen rows of the filtered ratings as a quick sanity check.
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
2725,24,1682,4.0,2001-07-04 07:04:07
5905,54,2699,4.0,2000-11-21 20:37:46
5793,54,2161,4.0,2000-11-21 19:56:29
494,5,631,3.0,1996-12-25 15:17:36
5786,54,2134,3.0,2000-11-28 20:09:51


In [None]:
# Leave-one-out split: for every user the most recent rating (by timestamp)
# becomes the test example and all earlier ratings form the training set.
# rank(method='first', ascending=False) gives the newest rating of each user
# rank 1 and breaks timestamp ties by order of appearance.
# assign() returns a new frame, so the column is never written onto a slice.
ratings = ratings.assign(
    rank_latest=ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
)

is_latest = ratings['rank_latest'] == 1

# Keep only the columns still needed; explicit copies make both splits safe to
# modify in later cells without touching `ratings`.
split_columns = ['userId', 'movieId', 'rating']
train_ratings = ratings.loc[~is_latest, split_columns].copy()
test_ratings = ratings.loc[is_latest, split_columns].copy()

In [None]:
# Convert explicit ratings to implicit feedback: every rating in the training
# split is overwritten with 1, so each observed (user, movie) row counts as a
# positive interaction.
train_ratings.loc[:, 'rating'] = 1

# Display five random training rows to confirm the overwrite.
train_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
7252,61,44397,1.0
2906,24,3616,1.0
9333,88,3686,1.0
9407,89,2716,1.0
8024,71,4022,1.0


In [None]:
# Get a list of all movie IDs
all_movieIds = ratings['movieId'].unique()

print("Starting Fast Negative Sampling...")

# Placeholders that will hold the final training data
users, items, labels = [], [], []

# Every observed (user, movie) pair of the training split. It serves both as
# the source of positive examples and as the rejection filter for negatives.
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))
num_positive_samples = len(user_item_set)

# 1. Positive samples: one row with label 1 per observed pair.
for (u, i) in tqdm(user_item_set, desc="Processing Positive Samples"):
    users.append(u)
    items.append(i)
    labels.append(1) # Positive label

# 2. Negative samples: num_negatives per positive pair, drawn in vectorised
# batches. Candidates that collide with a real interaction are redrawn, so the
# final count matches total_negative_needed. (The previous one-shot version
# de-duplicated the random pairs and discarded collisions without refilling,
# which produced fewer negatives than requested.)
num_negatives = 1
total_negative_needed = num_negatives * num_positive_samples
all_movieIds_array = np.array(all_movieIds)

# Users still waiting for a negative item; each positive pair contributes its
# user num_negatives times.
pending_users = [u for (u, _movie) in user_item_set for _repeat in range(num_negatives)]
final_negative_samples = []

# Upper bound on redraw rounds so a user who has interacted with every movie
# cannot make the loop spin forever.
max_sampling_rounds = 100

for _round in range(max_sampling_rounds):
    if not pending_users:
        break
    candidate_movie_ids = np.random.choice(all_movieIds_array, size=len(pending_users))
    rejected_users = []
    for u, movie_id in zip(pending_users, candidate_movie_ids.tolist()):
        if (u, movie_id) in user_item_set:
            # Collided with a positive pair: redraw for this user next round.
            rejected_users.append(u)
        else:
            final_negative_samples.append((u, movie_id))
    pending_users = rejected_users

if pending_users:
    print(f"Warning: could not draw negatives for {len(pending_users)} samples "
          f"after {max_sampling_rounds} rounds.")

# 3. Append the valid negative samples to the final lists
for u, i in tqdm(final_negative_samples, desc="Appending Negative Samples"):
    users.append(u)
    items.append(i)
    labels.append(0) # Negative label

print(f"Total positive samples: {num_positive_samples}")
print(f"Total negative samples appended: {len(final_negative_samples)}")
print(f"Total samples for training: {len(users)}")

Starting Fast Negative Sampling...


Processing Positive Samples: 100%|██████████| 3089/3089 [00:00<00:00, 515751.96it/s]
Appending Negative Samples: 100%|██████████| 2259/2259 [00:00<00:00, 1900688.61it/s]

Total positive samples: 3089
Total negative samples appended: 2259
Total samples for training: 5348





In [None]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training

    Every distinct (userId, movieId) pair in ``ratings`` becomes one positive
    example (label 1) followed by ``num_negatives`` negative examples (label 0)
    for the same user, drawn at random from ``all_movieIds``.

    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings; the
            'userId' and 'movieId' columns are read
        all_movieIds (list): List containing all movieIds
        num_negatives (int, optional): Number of negative examples generated
            per positive example. Defaults to 4.

    Raises:
        ValueError: If ``num_negatives`` is negative, or if negatives are
            requested for a user who has interacted with every movie in
            ``all_movieIds`` (no negative could ever be drawn).
    """

    def __init__(self, ratings, all_movieIds, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds, num_negatives)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds, num_negatives=4):
        """Build the (users, items, labels) tensors.

        Returns:
            tuple: ``(users, items, labels)`` where users and items are
            ``torch.long`` tensors and labels is a ``torch.float32`` tensor,
            all of the same length.
        """
        if num_negatives < 0:
            raise ValueError("num_negatives must be non-negative")

        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        if num_negatives > 0:
            # Fail fast instead of looping forever: rejection sampling below can
            # only terminate if each user has at least one unseen candidate.
            candidate_items = set(all_movieIds)
            items_by_user = {}
            for u, i in user_item_set:
                items_by_user.setdefault(u, set()).add(i)
            for u, seen_items in items_by_user.items():
                if candidate_items <= seen_items:
                    raise ValueError(
                        "user {} has interacted with every movie in all_movieIds; "
                        "cannot sample a negative item".format(u)
                    )

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                # Rejection sampling: redraw until the movie is one the user
                # has not interacted with.
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        users_tensor = torch.tensor(users, dtype=torch.long)
        items_tensor = torch.tensor(items, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.float32)

        return users_tensor, items_tensor, labels_tensor

In [None]:
# Neural Collaborative Filtering model implemented as a PyTorch Lightning module
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF).

    A user embedding and an item embedding (8 dimensions each) are
    concatenated and passed through two ReLU-activated dense layers, then a
    sigmoid output unit that scores the (user, item) pair.

        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        # Lookup tables indexed directly by the integer user / item IDs.
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        # MLP over the concatenated (8 + 8 = 16)-dimensional embedding.
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        # Kept so that train_dataloader() can build the training dataset.
        self.ratings = ratings
        self.all_movieIds = all_movieIds

    def forward(self, user_input, item_input):
        """Return the sigmoid score for each (user, item) pair."""
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)

        # Join both embeddings along the feature axis.
        hidden = torch.cat([user_vec, item_vec], dim=-1)

        # Two dense layers with ReLU activations.
        hidden = torch.relu(self.fc1(hidden))
        hidden = torch.relu(self.fc2(hidden))

        # Squash the single output unit into (0, 1).
        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Compute the binary cross-entropy loss for one batch."""
        user_input, item_input, labels = batch
        scores = self(user_input, item_input)
        # Labels are reshaped to a column so they line up with the model output.
        targets = labels.view(-1, 1).float()
        return nn.functional.binary_cross_entropy(scores, targets)

    def configure_optimizers(self):
        """Use Adam with its default hyperparameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build the training DataLoader (batches of 512, loaded in the main process)."""
        train_dataset = MovieLensTrainDataset(self.ratings, self.all_movieIds)
        return DataLoader(train_dataset, batch_size=512, num_workers=0)

In [None]:
# Size each embedding table as (largest raw ID + 1) so the raw userId / movieId
# values can be used directly as embedding indices.
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1

# Pool of movie IDs the model hands to its training dataset for negative sampling.
all_movieIds = ratings['movieId'].unique()

# The model receives the training split and builds its own DataLoader from it.
model = NCF(num_users, num_items, train_ratings, all_movieIds)

In [None]:
# Train for two epochs on a single device. accelerator="auto" lets Lightning
# pick the hardware (GPU or CPU), and logger=True keeps the default logger.
trainer = pl.Trainer(max_epochs=2, accelerator="auto", devices=1, logger=True)

# Start training; the batches come from the model's own train_dataloader().
trainer.fit(model)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name           | Type      | Params | Mode  | FLOPs
-------------------------------------------------------------
0 | user_embedding | Embedding | 720    | train | 0    
1 | item_embedding | Embedding | 940 K  | train | 0    
2 | fc1            | Linear    | 1.1 K  | train | 0    
3 | fc2            | Linear    | 2.1 K  | train | 0    
4 | output         | Linear    | 33     | train | 0    
-------------------------------------------------------------
944 K     Trainable params
0         Non-trainable params
944 K     Total params
3.779     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode
0         Total Flops
c:\Users\geeta\.conda\envs\pytorch_env\li

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [None]:
#evaluating the model
# Hit Ratio @ 10: for every test (user, movie) pair, rank the held-out movie
# against up to 99 movies the user never interacted with, and count a hit when
# the held-out movie lands in the top 10 scores.
NUM_EVAL_NEGATIVES = 99
TOP_K = 10

# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

# Inference mode for evaluation.
model.eval()

hits = []
for (u, i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = list(set(all_movieIds) - set(interacted_items))

    # Sample WITHOUT replacement so the negatives are distinct; the default
    # (replace=True) can repeat a movie and shrink the effective candidate
    # list. Cap the sample size in case fewer unseen movies exist.
    num_sampled = min(NUM_EVAL_NEGATIVES, len(not_interacted_items))
    selected_not_interacted = list(
        np.random.choice(not_interacted_items, num_sampled, replace=False)
    )
    test_items = selected_not_interacted + [i]

    # One user entry per candidate item (the length follows test_items rather
    # than a hard-coded 100).
    user_input_tensor = torch.tensor([u] * len(test_items), dtype=torch.long)
    item_input_tensor = torch.tensor(test_items, dtype=torch.long)
    with torch.no_grad():
        # view(-1) always yields a 1-D score vector, even for a single candidate.
        predictions_tensor = model(user_input_tensor, item_input_tensor).view(-1)
    top_k_values, top_k_indices = torch.topk(
        predictions_tensor, k=min(TOP_K, len(test_items)), largest=True
    )
    top10_items = [test_items[idx] for idx in top_k_indices.tolist()]

    hits.append(1 if i in top10_items else 0)

# Avoid averaging an empty list when there are no test pairs.
hit_ratio = np.average(hits) if hits else float('nan')
print("The Hit Ratio @ 10 is {:.2f}".format(hit_ratio))

  0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 27/27 [00:00<00:00, 71.41it/s]

The Hit Ratio @ 10 is 0.04



