In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras


In [2]:
movies=pd.read_csv("movies.csv")
ratings=pd.read_csv("ratings.csv")
tags=pd.read_csv("tags.csv")
links=pd.read_csv("links.csv")


In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


Filter Data for Reliability 

In [None]:
#users wiith 20 ratings
user_ratings_counts=ratings.groupby("userId").size()
valid_users=user_ratings_counts[user_ratings_counts>=20].index
ratings=ratings[ratings["userId"].isin(valid_users)]

In [None]:
#movies with 20 ratings
movie_rating_counts=ratings.groupby("movieId").size()
valid_movies=movie_rating_counts[movie_rating_counts>=20].index
ratings=ratings[ratings["movieId"].isin(valid_movies)]

Build Ratings Matrix (Y) and Indicator Matrix (R)

In [None]:
# Get unique movie and user ids from the filtered data

unique_movie_ids=ratings["movieId"].unique()
unique_user_ids=ratings["userId"].unique()

num_movies=len(unique_movie_ids)
num_users=len(unique_user_ids)

In [None]:
# Create mappings from movieId and userId to matrix indices

movie_id_to_index={movie_id:i for i,movie_id in enumerate(unique_movie_ids)}

user_id_to_index={user_id:i for i,user_id in enumerate(unique_user_ids)}

In [15]:
# Initialize matrices: Y holds ratings, R indicates if a rating exists (1) or not (0)



from scipy.sparse import coo_matrix

# Assuming ratings_df is your DataFrame with columns: userId, movieId, rating
row_indices = ratings['movieId'].map(movie_id_to_index)
col_indices = ratings['userId'].map(user_id_to_index)
data = ratings['rating']



In [17]:
# Build a sparse ratings matrix (COO format)
Y_sparse = coo_matrix((data, (row_indices, col_indices)), shape=(num_movies, num_users))
R_sparse = coo_matrix((np.ones_like(data), (row_indices, col_indices)), shape=(num_movies, num_users))


In [18]:
print("Sparse ratings matrix shape:", Y_sparse.shape)
print("Number of non-zero entries in ratings:", Y_sparse.nnz)
print("Sparse indicator matrix shape:", R_sparse.shape)
print("Number of non-zero entries in indicator matrix:", R_sparse.nnz)

Sparse ratings matrix shape: (23350, 200948)
Number of non-zero entries in ratings: 31725920
Sparse indicator matrix shape: (23350, 200948)
Number of non-zero entries in indicator matrix: 31725920


In [20]:
Y_csr = Y_sparse.tocsr()
R_csr = R_sparse.tocsr()

In [23]:
# from numpy import dtype


# def normalizeRatings_fromSparse(Y_csr,R_csr):
#     m=Y_csr.shape[0]
#     Ynorm=np.zeros((m,num_users), dtype=np.float32)
#     Ymean = np.zeros((m, 1), dtype=np.float32)
#     for i in range(m):
#         # Get the dense row for movie i (only for indices with ratings)
#         row_start = Y_csr.indptr[i]
#         row_end   = Y_csr.indptr[i+1]
#         if row_end > row_start:
#             ratings = Y_csr.data[row_start:row_end]
#             indices = Y_csr.indices[row_start:row_end]
#             mean_rating = np.mean(ratings)
#             Ymean[i] = mean_rating
#             # Place normalized ratings in the corresponding columns
#             Ynorm[i, indices] = ratings - mean_rating
#     return Ynorm, Ymean
# Ynorm, Ymean = normalizeRatings_fromSparse(Y_csr, R_csr)


In [24]:
def compute_Ymean(Y_csr):
    m = Y_csr.shape[0]
    Ymean = np.zeros((m, 1), dtype=np.float32)
    for i in range(m):
        row = Y_csr.getrow(i)
        if row.nnz > 0:
            Ymean[i, 0] = np.mean(row.data)
        else:
            Ymean[i, 0] = 0.0
    return Ymean

Ymean = compute_Ymean(Y_csr)

In [25]:
def movie_row_generator():
    """
    For each movie (row), yield:
      - movie index (int)
      - normalized ratings row (dense vector of shape (num_users,)), computed on the fly
      - indicator row (dense vector of shape (num_users,))
    """
    for i in range(num_movies):
        # Get the sparse row for movie i
        row = Y_csr.getrow(i)
        # Build a dense normalized row on the fly:
        # Allocate a dense vector of zeros for the full user dimension.
        norm_row = np.zeros(num_users, dtype=np.float32)
        if row.nnz > 0:
            # Subtract the movie's mean from its nonzero ratings.
            norm_row[row.indices] = row.data.astype(np.float32) - Ymean[i, 0]
        # Get the corresponding indicator row from R
        r_dense = R_csr.getrow(i).toarray().flatten().astype(np.float32)
        yield i, norm_row, r_dense

In [26]:
dataset = tf.data.Dataset.from_generator(
    movie_row_generator,
    output_types=(tf.int32, tf.float32, tf.float32),
    output_shapes=((), (num_users,), (num_users,))
)

Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead


In [27]:
batch_size = 100
dataset = dataset.batch(batch_size)

In [36]:
num_features = 100  # latent feature dimension

tf.random.set_seed(1234)

# X: movie features matrix, shape (num_movies, num_features)
X = tf.Variable(tf.random.normal((num_movies, num_features),stddev=0.01, dtype=tf.float32), name='X')
# W: user features matrix, shape (num_users, num_features)
W = tf.Variable(tf.random.normal((num_users, num_features),stddev=0.01, dtype=tf.float32), name='W')
# b: user bias vector, shape (1, num_users)
b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float32), name='b')
b_movies = tf.Variable(tf.zeros((num_movies, 1), dtype=tf.float32), name='b_movies')


In [37]:
def mini_batch_cost(X_batch, W, b, Y_batch, R_batch, lambda_):
    # X_batch: (batch_size, num_features)
    # Y_batch, R_batch: (batch_size, num_users)
    predictions = tf.matmul(X_batch, W, transpose_b=True) + b  # (batch_size, num_users)
    error = (predictions - Y_batch) * R_batch
    cost = 0.5 * tf.reduce_sum(tf.square(error))
    cost += (lambda_ / 2) * (tf.reduce_sum(tf.square(X_batch)) + tf.reduce_sum(tf.square(W))+tf.reduce_sum(tf.square(b_movies_batch)))
    return cost

In [35]:
lambda_ = 0.01
learning_rate = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,clipnorm=1.0)
num_epochs = 5

for epoch in range(num_epochs):
    epoch_cost = 0.0
    batch_count=0
    for batch in dataset:
        batch_count+=1
        indices, Y_batch, R_batch = batch  # indices: (batch_size,), Y_batch, R_batch: (batch_size, num_users)
        # Ensure that X is explicitly watched.
        with tf.GradientTape() as tape:
            tape.watch(X)  # Ensure X is watched.
            X_batch = tf.gather(X, indices)
            cost_val = mini_batch_cost(X_batch, W, b, Y_batch, R_batch, lambda_)
        # Compute gradients for X, W, and b in one go:
        grad_X_full, grad_W, grad_b = tape.gradient(cost_val, [X, W, b])

        
        # Check if grad_X_full is still None (it shouldn't be now)
        if grad_X_full is None:
            raise ValueError("grad_X_full is None; check that your cost function depends on X.")
        
        # Extract the gradient for the mini-batch slice.
        grad_X_batch = tf.gather(grad_X_full, indices)
        
        # Update global parameters W and b using the optimizer.
        optimizer.apply_gradients(zip([grad_W, grad_b], [W, b]))
        
        # Manually update the corresponding slice of X.
        X_batch_updated = tf.gather(X, indices) - learning_rate * grad_X_batch
        X.assign(tf.tensor_scatter_nd_update(X, tf.expand_dims(indices, 1), X_batch_updated))
        
        
        epoch_cost += cost_val

        if batch_count % 10 == 0:
            print(f"  Batch {batch_count}, Current cost: {cost_val.numpy():.2f}")
            if tf.math.is_nan(cost_val):
                print("NaN detected! Stopping training.")
                break
    print(f"Epoch {epoch+1}/{num_epochs} cost: {epoch_cost.numpy():.2f}")

    if tf.math.is_nan(epoch_cost):
        print("Training stopped due to NaN values")
        break

  Batch 10, Current cost: 862025.19
  Batch 20, Current cost: 335439.75
  Batch 30, Current cost: 192001.64
  Batch 40, Current cost: 275550.91
  Batch 50, Current cost: 106135.52
  Batch 60, Current cost: 89548.34
  Batch 70, Current cost: 30082.48
  Batch 80, Current cost: 37282.74
  Batch 90, Current cost: 38134.46
  Batch 100, Current cost: 28241.84
  Batch 110, Current cost: 18897.18
  Batch 120, Current cost: 21920.80
  Batch 130, Current cost: 11817.30
  Batch 140, Current cost: 6582.86
  Batch 150, Current cost: 9804.51
  Batch 160, Current cost: 9217.42
  Batch 170, Current cost: 9038.06
  Batch 180, Current cost: 6815.80
  Batch 190, Current cost: 4943.78
  Batch 200, Current cost: 4771.76
  Batch 210, Current cost: 4272.40
  Batch 220, Current cost: 3324.58
  Batch 230, Current cost: 2854.41
Epoch 1/5 cost: 30324322.00
  Batch 10, Current cost: 857115.06
  Batch 20, Current cost: 333234.12
  Batch 30, Current cost: 190416.80
  Batch 40, Current cost: 273418.66
  Batch 50, Cu