In [None]:
import numpy as np
import pandas as pd
import polars as pl

import torch
import torch.nn as nn
import torch.nn.functional as F

# Loading Data

In [None]:
users_df_raw = pl.read_csv("users.dat", has_header=False)

users_df = users_df_raw.select(pl.col("column_1").str.split_exact("::",4).alias("columns")).unnest("columns")

users_df = users_df.rename({"field_0": "user_id",
                            "field_1": "gender",
                            "field_2": "age",
                            "field_3": "job",
                            "field_4": "zipcode"})

users_df = users_df.with_columns(pl.col("user_id").cast(pl.Int32),
                                 pl.col("age").cast(pl.Int32),
                                 pl.col("job").cast(pl.Int32))

print(users_df.shape)
display(users_df.head())

(6040, 5)


user_id,gender,age,job,zipcode
i32,str,i32,i32,str
1,"""F""",1,10,"""48067"""
2,"""M""",56,16,"""70072"""
3,"""M""",25,15,"""55117"""
4,"""M""",45,7,"""02460"""
5,"""M""",25,20,"""55455"""


In [None]:
movies_df_raw = pl.read_csv("movies.dat",encoding="utf8-lossy",truncate_ragged_lines=True,has_header=False)

movies_df = movies_df_raw.select(pl.col("column_1").str.split_exact("::",2).alias("columns")).unnest("columns")

movies_df = movies_df.rename({"field_0": "movie_id",
                              "field_1": "title",
                              "field_2": "genres"})

movies_df = movies_df.with_columns(pl.col("movie_id").cast(pl.Int32))

print(movies_df.shape)
display(movies_df.head())

(3883, 3)


movie_id,title,genres
i32,str,str
1,"""Toy Story (1995)""","""Animation|Children's|Comedy"""
2,"""Jumanji (1995)""","""Adventure|Children's|Fantasy"""
3,"""Grumpier Old Men (1995)""","""Comedy|Romance"""
4,"""Waiting to Exhale (1995)""","""Comedy|Drama"""
5,"""Father of the Bride Part II (1…","""Comedy"""


In [None]:
ratings_df_raw = pl.read_csv("ratings.dat", has_header=False)

ratings_df = ratings_df_raw.select(pl.col("column_1").str.split_exact("::",3).alias("columns")).unnest("columns")

ratings_df = ratings_df.rename({"field_0": "user_id",
                                "field_1": "movie_id",
                                "field_2": "rating",
                                "field_3": "timestamp"})

ratings_df = ratings_df.with_columns(pl.col("user_id").cast(pl.Int32),
                                     pl.col("movie_id").cast(pl.Int32),
                                     pl.col("rating").cast(pl.Int32))

print(ratings_df.shape)
display(ratings_df.head())

(1000209, 4)


user_id,movie_id,rating,timestamp
i32,i32,i32,str
1,1193,5,"""978300760"""
1,661,3,"""978302109"""
1,914,3,"""978301968"""
1,3408,4,"""978300275"""
1,2355,5,"""978824291"""


# Checks

In [None]:
all_unique_users=users_df.select('user_id').unique().to_series(0).to_list()
users_rated_movies=ratings_df.select('user_id').unique().to_series(0).to_list()
users_not_rated_movies=list(set(all_unique_users)-set(users_rated_movies))

print(len(users_rated_movies))
print(len(users_not_rated_movies))

6040
0


In [None]:
all_unique_movies=movies_df.select('movie_id').unique().to_series(0).to_list()
rated_movies=ratings_df.select('movie_id').unique().to_series(0).to_list()
unrated_movies=list(set(all_unique_users)-set(users_rated_movies))

print(len(rated_movies))
print(len(unrated_movies))

3706
0


In [None]:
data=ratings_df.to_pandas()
print(len(data))

1000209


In [None]:
# Split train and test data
np.random.seed(42)
mask = np.random.rand(len(data)) < 0.8
train = data[mask].copy()
test = data[~mask].copy()

print(len(train))
print(len(test))

799910
200299


In [None]:
def proc_col(col, train_col=None):
    """Encodes a pandas column with continuous ids.
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}

    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [None]:
def encode_data(df, train=None):
    """ Encodes ratings data with continous user and movie ids.
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["user_id", "movie_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [None]:
# Encoding the train and test data
df_train = encode_data(train)
df_test = encode_data(test, train)

In [None]:
users_train = torch.LongTensor(df_train["user_id"].values)
movies_train = torch.LongTensor(df_train["movie_id"].values)
ratings_train = torch.FloatTensor(df_train["rating"].values)

In [None]:
users_test = torch.LongTensor(df_test["user_id"].values)
movies_test = torch.LongTensor(df_test["movie_id"].values)
ratings_test = torch.FloatTensor(df_test["rating"].values)

In [None]:
num_users = len(df_train.user_id.unique())
num_items = len(df_train.movie_id.unique())

print(num_users)
print(num_items)

6040
3682


# Matrix Factorization Model with Vector Bias

In [None]:
# Matrix Factorization Model Class Including Bias Vectors for Users and Items (Movies)
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=64):

        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)

        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)

        # Initializing Embedding Weights
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)

        # Initializing Biases
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)

    # Forward Pass
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)

        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()

        return (U*V).sum(1) +  b_u  + b_v # Adding user and item bias to dot products

# Training MF Model with Bias

In [None]:
def train_epochs_MF_bias(model, users, items, ratings, epochs=20, lr=0.01, wd=0.0, reg_weight=0.25, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd) # Initializing Adam Optimizer
    model.train()
    if unsqueeze:
            ratings = ratings.unsqueeze(1)

    for i in range(epochs):


        y_hat = model(users, items)
        mse = F.mse_loss(y_hat, ratings)

        # Adding Regularization Loss to MSE
        reg = model.user_emb(users).pow(2).sum() + model.item_emb(items).pow(2).sum()

        reg += model.user_bias(users).pow(2).sum() + model.item_bias(items).pow(2).sum()

        # Final loss
        loss = mse + reg_weight * reg

        # Using Gradient Descent for Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("Train Loss: %.3f" % loss.item())

        test_loss_MF_bias(model, users_test, movies_test, ratings_test, unsqueeze)

In [None]:
def test_loss_MF_bias(model, users, items, ratings, unsqueeze=False):
    model.eval()

    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)

    print("Test Loss: %.3f " % loss.item()) # MSE Loss

In [None]:
vec_bias_model = MF_bias(num_users, num_items, emb_size=64)

In [None]:
train_epochs_MF_bias(vec_bias_model, users_train, movies_train, ratings_train, epochs=50, lr=0.001)

Train Loss: 21359.203
Test Loss: 13.802 
Train Loss: 20100.889
Test Loss: 13.823 
Train Loss: 18894.787
Test Loss: 13.842 
Train Loss: 17740.795
Test Loss: 13.861 
Train Loss: 16638.176
Test Loss: 13.878 
Train Loss: 15586.008
Test Loss: 13.895 
Train Loss: 14583.400
Test Loss: 13.910 
Train Loss: 13629.372
Test Loss: 13.924 
Train Loss: 12722.795
Test Loss: 13.937 
Train Loss: 11862.418
Test Loss: 13.949 
Train Loss: 11046.966
Test Loss: 13.960 
Train Loss: 10275.142
Test Loss: 13.971 
Train Loss: 9545.615
Test Loss: 13.980 
Train Loss: 8856.986
Test Loss: 13.989 
Train Loss: 8207.823
Test Loss: 13.997 
Train Loss: 7596.688
Test Loss: 14.004 
Train Loss: 7022.145
Test Loss: 14.011 
Train Loss: 6482.758
Test Loss: 14.017 
Train Loss: 5977.076
Test Loss: 14.022 
Train Loss: 5503.654
Test Loss: 14.027 
Train Loss: 5061.057
Test Loss: 14.031 
Train Loss: 4647.870
Test Loss: 14.035 
Train Loss: 4262.696
Test Loss: 14.038 
Train Loss: 3904.154
Test Loss: 14.042 
Train Loss: 3570.889
Test Lo

# Matrix Factorization Model with Confidence Bias



*   **The dataset has explicit feedback in the form of movie ratings, to model it as an implicit feedback problem, I've assumed a positive rating value to be an indicator for preference of the user towards a movie, the rating value can also be thought of as the total interactions (clicks, views, etc) of the user with the movie**



In [None]:
# Matrix Factorization Model Class Including Confidence Bias
class ConfidenceMF(torch.nn.Module):
    def __init__(self, n_users, n_items, emb_size=64, alpha=40.0, reg=0.01, use_bias=True):
        super().__init__()
        self.alpha = alpha # Scaling factor for confidence
        self.reg = reg
        self.use_bias = use_bias

        self.user_emb = torch.nn.Embedding(n_users, emb_size)
        self.item_emb = torch.nn.Embedding(n_items, emb_size)

        if use_bias:
            self.user_bias = torch.nn.Embedding(n_users, 1)
            self.item_bias = torch.nn.Embedding(n_items, 1)
            self.global_bias = torch.nn.Parameter(torch.randn(1) * 0.01)

        # Initializing Embedding Weights
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)

        # Initializing Biases
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)

    def forward(self, users, items, ratings):

        p_ui = (ratings > 0).float() # Preference of the user, assumed to be 1 if the user has rated the movie
        c_ui = 1.0 + self.alpha * ratings # Confidence bias term

        dot = (self.user_emb(users) * self.item_emb(items)).sum(dim=1)

        if self.use_bias:
            pred = self.global_bias + self.user_bias(users).squeeze() + self.item_bias(items).squeeze() + dot # Adding global, user and item level biases
        else:
            pred = dot

        return pred, p_ui, c_ui

# Training MF Model with Confidence Bias

In [None]:
def train_epochs_Conf_MF(model, users, items, ratings, epochs=20, lr=0.01, wd=0.0, reg_weight=0.25, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    if unsqueeze:
            ratings = ratings.unsqueeze(1)

    for i in range(epochs):


        y_hat, pref, conf = model(users, items, ratings)

        diff = pref - y_hat
        loss = (conf * diff**2).mean()

        # Adding Regularization Loss to MSE

        reg = model.user_emb(users).pow(2).sum() + model.item_emb(items).pow(2).sum()

        reg += model.user_bias(users).pow(2).sum() + model.item_bias(items).pow(2).sum()

        # Final loss
        loss += reg_weight * reg

        # Using Gradient Descent for Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("Train Loss: %.3f" % loss.item())

        test_loss_Conf_MF(model, users_test, movies_test, ratings_test, unsqueeze)

In [None]:
def test_loss_Conf_MF(model, users, items, ratings, unsqueeze=False):
    model.eval()

    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    y_hat, pref, conf = model(users, items, ratings)
    loss = F.mse_loss(y_hat, ratings)

    print("Test Loss: %.3f " % loss.item()) # MSE Loss

In [None]:
conf_bias_model = ConfidenceMF(num_users, num_items, emb_size=64)

In [None]:
train_epochs_Conf_MF(conf_bias_model, users_train, movies_train, ratings_train, epochs=50, lr=0.001)

Train Loss: 21408.242
Test Loss: 13.900 
Train Loss: 20154.254
Test Loss: 13.912 
Train Loss: 18952.400
Test Loss: 13.924 
Train Loss: 17802.584
Test Loss: 13.934 
Train Loss: 16704.018
Test Loss: 13.943 
Train Loss: 15655.738
Test Loss: 13.951 
Train Loss: 14656.843
Test Loss: 13.958 
Train Loss: 13706.349
Test Loss: 13.964 
Train Loss: 12803.121
Test Loss: 13.969 
Train Loss: 11945.914
Test Loss: 13.973 
Train Loss: 11133.444
Test Loss: 13.976 
Train Loss: 10364.430
Test Loss: 13.978 
Train Loss: 9637.541
Test Loss: 13.980 
Train Loss: 8951.396
Test Loss: 13.981 
Train Loss: 8304.571
Test Loss: 13.981 
Train Loss: 7695.632
Test Loss: 13.981 
Train Loss: 7123.152
Test Loss: 13.980 
Train Loss: 6585.701
Test Loss: 13.978 
Train Loss: 6081.831
Test Loss: 13.977 
Train Loss: 5610.099
Test Loss: 13.974 
Train Loss: 5169.070
Test Loss: 13.972 
Train Loss: 4757.332
Test Loss: 13.969 
Train Loss: 4373.486
Test Loss: 13.965 
Train Loss: 4016.157
Test Loss: 13.962 
Train Loss: 3683.990
Test Lo

# Evaluation with Precision@K
# *The number of relevant items out of top-K recommendations*


# Here I have assumed if the user has rated a movie,
# then the movie is relevant to the user (does not account for ranking)

In [None]:
def predict_for_user(model, user_id, known_items, all_item_ids):
    user_tensor = torch.LongTensor([user_id])
    item_tensor = torch.LongTensor(all_item_ids)
    user_vec = model.user_emb(user_tensor).repeat(len(all_item_ids), 1)
    item_vecs = model.item_emb(item_tensor)
    scores = (user_vec * item_vecs).sum(dim=1)
    scores[known_items] = -1e10  # Excluding already seen items
    return scores

In [None]:
def get_top_k_recs(scores, k=10):
    return torch.topk(scores, k=k).indices.numpy()

In [None]:
def precision_at_k(model, df_train, df_test, k=10):
    user_hits = []
    user_ids = df_test['user_id'].unique()
    all_items = np.arange(model.item_emb.num_embeddings)

    for user in user_ids:
        test_items = df_test[df_test['user_id'] == user]['movie_id'].values
        train_items = df_train[df_train['user_id'] == user]['movie_id'].values

        if len(test_items) == 0:
            continue

        scores = predict_for_user(model, user, known_items=train_items, all_item_ids=all_items)
        top_k = get_top_k_recs(scores, k)

        hits = np.isin(top_k, test_items).sum()
        precision = hits / k
        user_hits.append(precision)

    avg_precision = np.mean(user_hits)
    print(f"Precision@{k}: {avg_precision:.4f}")

In [None]:
precision_at_k(vec_bias_model, df_train, df_test, k=10)

Precision@10: 0.0098


In [None]:
precision_at_k(conf_bias_model, df_train, df_test, k=10)

Precision@10: 0.0103


# The Confidence Bias MF Model is marginally better than the Vector Bias Model under the set configurations based on Precision@K

# Evaluation with MAP@K - Accounts for both relevance and ranking

In [None]:
def map_at_k(model, df_train, df_test, k=10):
    user_aps = []
    user_ids = df_test['user_id'].unique()
    all_items = np.arange(model.item_emb.num_embeddings)

    for user in user_ids:
        test_items = df_test[df_test['user_id'] == user]['movie_id'].values
        train_items = df_train[df_train['user_id'] == user]['movie_id'].values

        if len(test_items) == 0:
            continue

        scores = predict_for_user(model, user, known_items=train_items, all_item_ids=all_items)
        top_k = get_top_k_recs(scores, k)

        # Computing Average Precision@K
        hits = 0
        sum_precisions = 0.0
        for i, item in enumerate(top_k):
            if item in test_items:
                hits += 1
                precision_at_i = hits / (i + 1)
                sum_precisions += precision_at_i

        if hits > 0:
            ap = sum_precisions / min(len(test_items), k)  # Normalize by relevant items
        else:
            ap = 0.0

        user_aps.append(ap)

    mean_ap = np.mean(user_aps)
    print(f"MAP@{k}: {mean_ap:.4f}")

In [None]:
map_at_k(vec_bias_model, df_train, df_test, k=10)

MAP@10: 0.0033


In [None]:
map_at_k(conf_bias_model, df_train, df_test, k=10)

MAP@10: 0.0031


# Here the Vector Bias MF Model is marginally better than the Confidence Bias MF Model under the set configurations based on MAP@K