In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [84]:
movieData = pd.read_csv("data/ratings.csv")
movieData.drop("timestamp", axis = 1, inplace=True)
moviesMapping = pd.read_csv("data/movies.csv")

In [85]:
movieData.head(10)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
5,1,70,3.0
6,1,101,5.0
7,1,110,4.0
8,1,151,5.0
9,1,157,5.0


In [86]:
moviesMapping.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [87]:
# Checking how many unique movie ID are present actually and in the dataframe we have
unqInDf = set(movieData["movieId"].unique())
unqIndata = set(moviesMapping["movieId"].unique())

print(movieData["movieId"].nunique())
print(moviesMapping["movieId"].nunique())
print(movieData["userId"].nunique())


9724
9742
610


In [88]:
# Here I am just checking which of the movies are present in movies.csv but are not present in the rating.csv

notPresent = []
for movie in unqIndata:
    if movie not in unqInDf:
        notPresent.append(movie)
        
print(len(notPresent))
print(notPresent)
        

18
[np.int64(1076), np.int64(34482), np.int64(2939), np.int64(3338), np.int64(3456), np.int64(4194), np.int64(5721), np.int64(6668), np.int64(6849), np.int64(7020), np.int64(7792), np.int64(8765), np.int64(85565), np.int64(25855), np.int64(26085), np.int64(30892), np.int64(32160), np.int64(32371)]


In [89]:
# Creating new indexs from userid and moviesId to create the matrix for training 

unique_users = movieData["userId"].unique()
unique_movies = movieData["movieId"].unique()

# Step 2: Build mapping dicts (old ID -> new index)
user2idx = {old: new for new, old in enumerate(unique_users)}
movie2idx = {old: new for new, old in enumerate(unique_movies)}

# Step 3: Apply mapping
movieData["userId"] = movieData["userId"].map(user2idx)
movieData["movieId"] = movieData["movieId"].map(movie2idx)

# Now users are in range [0, n_users-1] and movies in [0, n_movies-1]
print(movieData.head())
print("Number of users:", len(user2idx))
print("Number of movies:", len(movie2idx))

   userId  movieId  rating
0       0        0     4.0
1       0        1     4.0
2       0        2     4.0
3       0        3     5.0
4       0        4     5.0
Number of users: 610
Number of movies: 9724


In [90]:
# Creating the matrix

# Step 1: Get dimensions
n_users = movieData["userId"].nunique()
n_movies = movieData["movieId"].nunique()

# Step 2: Initialize empty matrix
R = np.zeros((n_users, n_movies))

# Step 3: Fill the matrix with ratings
for row in movieData.itertuples(index=False):
    R[row.userId, row.movieId] = row.rating

print("Shape of R:", R.shape)
print("Example row (user 0 ratings):", R[0][:20])

Shape of R: (610, 9724)
Example row (user 0 ratings): [4. 4. 4. 5. 5. 3. 5. 4. 5. 5. 5. 5. 3. 5. 4. 5. 3. 3. 5. 4.]


In [None]:
# Count total entries in R
total_entries = R.size
nonzero_entries = np.count_nonzero(R)
zero_entries = total_entries - nonzero_entries

print("Matrix shape:", R.shape)
print("Total cells in R:", total_entries)
print("Rated cells (nonzero):", nonzero_entries)
print("Unrated cells (zero):", zero_entries)
print("Sparsity of R: {:.2f}%".format(100 * nonzero_entries / total_entries))



Matrix shape: (610, 9724)
Total cells in R: 5931640
Rated cells (nonzero): 100836
Unrated cells (zero): 5830804
Sparsity of R: 1.70%


In [92]:
def train_test_split(R, test_ratio=0.2, seed=42):
    np.random.seed(seed)
    train = R.copy()
    test = np.zeros_like(R)

    users, movies = R.nonzero()
    n_ratings = len(users)

    test_size = int(n_ratings * test_ratio)
    test_indices = np.random.choice(n_ratings, size=test_size, replace=False)

    for idx in test_indices:
        u, m = users[idx], movies[idx]
        test[u, m] = R[u, m]
        train[u, m] = 0

    return train, test


In [93]:
train, test = train_test_split(R, test_ratio=0.2)


print("Original ratings:", np.count_nonzero(R))
print("Train ratings:", np.count_nonzero(train))
print("Test ratings:", np.count_nonzero(test))


Original ratings: 100836
Train ratings: 80669
Test ratings: 20167


In [94]:
import numpy as np

class MF_bias:
    def __init__(self, R, K=20, alpha=0.01, beta=0.02, epochs=20):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.epochs = epochs

        # Latent factors
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.mu = np.mean(R[np.where(R > 0)])  # global mean rating

        # Precompute observed ratings (faster than looping over full matrix)
        self.samples = [(u, i, R[u, i]) for u, i in zip(*R.nonzero())]

    def train(self):
        self.training_process = []

        for epoch in range(self.epochs):
            np.random.shuffle(self.samples)  # SGD shuffle

            for u, i, r in self.samples:
                # prediction with bias
                pred = self.mu + self.b_u[u] + self.b_i[i] + np.dot(self.P[u, :], self.Q[i, :].T)
                e_ui = r - pred

                # update biases
                self.b_u[u] += self.alpha * (e_ui - self.beta * self.b_u[u])
                self.b_i[i] += self.alpha * (e_ui - self.beta * self.b_i[i])

                # update latent factors
                self.P[u, :] += self.alpha * (e_ui * self.Q[i, :] - self.beta * self.P[u, :])
                self.Q[i, :] += self.alpha * (e_ui * self.P[u, :] - self.beta * self.Q[i, :])

            rmse = self.rmse()
            self.training_process.append((epoch, rmse))
            if epoch % 5 == 0:
                print(f"Iteration {epoch+1}/{self.epochs} ; Train RMSE = {rmse:.4f}")

    def rmse(self):
        predicted = self.full_matrix()
        xs, ys = self.R.nonzero()
        error = 0
        for x, y in zip(xs, ys):
            error += (self.R[x, y] - predicted[x, y]) ** 2
        return np.sqrt(error / len(xs))

    def full_matrix(self):
        return self.mu + self.b_u[:, np.newaxis] + self.b_i[np.newaxis:, ] + np.dot(self.P, self.Q.T)


In [95]:
mf_b = MF_bias(train, K=30, alpha=0.01, beta=0.04, epochs=51)
mf_b.train()

# Evaluate on test
test_rmse = mf_b.rmse()  # careful: this still checks train!
pred = mf_b.full_matrix()

# Proper test evaluation:
def evaluate(pred, test):
    xs, ys = test.nonzero()
    error = 0
    for x, y in zip(xs, ys):
        # clip to rating range [1,5]
        p = min(5, max(1, pred[x, y]))
        error += (test[x, y] - p) ** 2
    return np.sqrt(error / len(xs))

print("Test RMSE:", evaluate(pred, test))


Iteration 1/51 ; Train RMSE = 0.9003
Iteration 6/51 ; Train RMSE = 0.8397
Iteration 11/51 ; Train RMSE = 0.8165
Iteration 16/51 ; Train RMSE = 0.7864
Iteration 21/51 ; Train RMSE = 0.7312
Iteration 26/51 ; Train RMSE = 0.6621
Iteration 31/51 ; Train RMSE = 0.5977
Iteration 36/51 ; Train RMSE = 0.5434
Iteration 41/51 ; Train RMSE = 0.4998
Iteration 46/51 ; Train RMSE = 0.4657
Iteration 51/51 ; Train RMSE = 0.4387
Test RMSE: 0.8661199283251985


# Just testing the recommendations

In [96]:
# Reverse maps
idx2user = {v: k for k, v in user2idx.items()}
idx2movie = {v: k for k, v in movie2idx.items()}

In [97]:
def recommend_movies(mf_model, user_idx, movieData, moviesMapping, N=5):
    """
    Get top-N movie recommendations for a user.
    """
    # Predictions for this user
    predictions = mf_model.full_matrix()[user_idx]

    # Movies this user already rated
    rated_movies = set(movieData[movieData.userId == user_idx]["movieId"].tolist())

    # Exclude rated movies
    unrated_indices = [i for i in range(len(predictions)) if i not in rated_movies]

    # Sort unrated movies by predicted rating
    top_indices = sorted(unrated_indices, key=lambda x: predictions[x], reverse=True)[:N]

    recommendations = []
    for idx in top_indices:
        orig_movieId = idx2movie[idx]

        # Make sure movie exists in mapping
        row = moviesMapping.loc[moviesMapping.movieId == orig_movieId]
        if not row.empty:
            title = row.iloc[0]["title"]
        else:
            title = f"Unknown Movie (ID {orig_movieId})"

        pred_rating = predictions[idx]
        recommendations.append((title, round(float(pred_rating), 2)))

    return recommendations


In [98]:
user_idx = 100  # reindexed user

top_recs = recommend_movies(mf_b, user_idx, movieData, moviesMapping, N=5)

print(f"Top recommendations for user {user_idx} (orig ID: {idx2user[user_idx]}):")
for title, score in top_recs:
    print(f"{title} (predicted rating: {score})")


Top recommendations for user 100 (orig ID: 101):
Pulp Fiction (1994) (predicted rating: 4.93)
Trainspotting (1996) (predicted rating: 4.81)
To Kill a Mockingbird (1962) (predicted rating: 4.78)
Rosencrantz and Guildenstern Are Dead (1990) (predicted rating: 4.73)
Donnie Darko (2001) (predicted rating: 4.69)
