In [1]:
# Import libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
import time
import os
import matplotlib.pyplot as plt

In [2]:
# Load data
# The project root is resolved three directory levels above the current working directory.
project_root = os.path.abspath(os.path.join(*[".."] * 3))
file_path = os.path.join(project_root, "data", "processed", "ratings_snap.csv")
ratings = pd.read_csv(file_path)

In [3]:
# Preview the first rows to sanity-check the loaded columns
ratings.head()

Unnamed: 0,timestamp,user_id,event_type,movie_id,rating
0,2025-09-26 22:17:54,41208,rate,harry+potter+and+the+goblet+of+fire+2005,4
1,2025-09-26 22:17:55,131506,rate,new+york+doll+2005,4
2,2025-09-26 22:17:55,75157,rate,snowpiercer+2013,4
3,2025-09-26 22:17:56,33667,rate,kwaidan+1964,3
4,2025-09-26 22:17:59,117442,rate,viva+cuba+2005,4


In [4]:
# Summarise column dtypes and non-null counts
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13004 entries, 0 to 13003
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   timestamp   13004 non-null  object
 1   user_id     13004 non-null  int64 
 2   event_type  13004 non-null  object
 3   movie_id    13004 non-null  object
 4   rating      13004 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 508.1+ KB


In [5]:
# Preprocess
# Convert user_id and movie_id to categorical dtype so that `.cat.codes` yields
# dense integer codes usable as matrix row/column indices.
ratings['user_id'] = ratings['user_id'].astype('category')
ratings['movie_id'] = ratings['movie_id'].astype('category')

# Position in each list equals the categorical code of that id.
user_ids = list(ratings['user_id'].cat.categories)
movie_ids = list(ratings['movie_id'].cat.categories)

# Lookup tables consumed by the evaluation cells. They must follow the categorical
# code order because the user-item matrix is indexed by `.cat.codes`.
user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
movie_index_to_id = dict(enumerate(movie_ids))

In [6]:
# Train/Test Split
# Random 80/20 split over rating rows (seeded for reproducibility).
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

train_users = set(train_data['user_id'])
test_users = set(test_data['user_id'])

# Test users seen during training vs. test users with no training rows at all.
known_test_users = test_users.intersection(train_users)
cold_start_users = test_users.difference(train_users)

In [7]:
# SVD Model Training
# Attach the matrix indices with assign(), which returns a new frame, instead of
# writing columns into the frame returned by train_test_split (that pattern can
# trigger pandas' SettingWithCopyWarning).
train_data = train_data.assign(
    user_code=train_data['user_id'].cat.codes,
    movie_code=train_data['movie_id'].cat.codes,
)

num_users = len(user_ids)
num_movies = len(movie_ids)

# csr_matrix sums duplicate (row, col) entries, so a user who rated the same movie
# more than once would end up with an out-of-scale value. Collapse repeats to their
# mean rating before building the matrix.
train_cells = (
    train_data
    .groupby(['user_code', 'movie_code'], as_index=False)['rating']
    .mean()
)

# Build user-item sparse matrix (float64 because svds rejects integer matrices)
user_item_matrix = csr_matrix(
    (
        train_cells['rating'].to_numpy(dtype=np.float64),
        (train_cells['user_code'].to_numpy(), train_cells['movie_code'].to_numpy()),
    ),
    shape=(num_users, num_movies),
)

In [8]:
# Helper functions
def get_top_n_scores(user_index, pred_matrix, n=20, exclude=None):
    """Return the indices and scores of the ``n`` highest-scored items for one user.

    Parameters
    ----------
    user_index : int
        Row of ``pred_matrix`` to rank.
    pred_matrix : 2-D array-like
        Score matrix indexed as ``pred_matrix[user_index]`` to obtain one row of
        per-item scores.
    n : int, default 20
        Maximum number of items to return.
    exclude : iterable of int, optional
        Item indices to leave out of the ranking (for example items the user has
        already rated). ``None`` keeps every item.

    Returns
    -------
    (top_indices, top_scores) : tuple of numpy.ndarray
        Item indices ordered by descending score and their matching scores.
        Fewer than ``n`` entries are returned when not enough items remain.
    """
    scores = np.asarray(pred_matrix[user_index])
    # Stable sort on the negated scores: descending order, ties resolved by the
    # lowest item index (deterministic), and NaN scores ranked last.
    order = np.argsort(-scores, kind="stable")
    if exclude is not None:
        order = order[~np.isin(order, np.asarray(list(exclude)))]
    top_indices = order[:n]
    return top_indices, scores[top_indices]

def compute_metrics(rec_items, true_items):
    """Compute hit, precision, recall and F1 for one recommendation list.

    Both inputs are reduced to sets of distinct items before counting, so
    duplicates in either input cannot skew the denominators and any iterable
    (list, array, Series) is accepted.

    Parameters
    ----------
    rec_items : iterable
        Recommended item identifiers.
    true_items : iterable
        Relevant (held-out) item identifiers.

    Returns
    -------
    dict
        Keys ``"hit"`` (1 if at least one recommended item is relevant, else 0),
        ``"precision"``, ``"recall"`` and ``"f1"``. A ratio whose denominator
        would be zero is reported as 0.
    """
    rec_set = set(rec_items)
    true_set = set(true_items)
    num_hits = len(rec_set & true_set)

    hit = int(num_hits > 0)
    precision = num_hits / len(rec_set) if rec_set else 0.0
    recall = num_hits / len(true_set) if true_set else 0.0
    pr_sum = precision + recall
    f1 = (2 * precision * recall) / pr_sum if pr_sum > 0 else 0.0
    return {"hit": hit, "precision": precision, "recall": recall, "f1": f1}

In [29]:
# svds only accepts floating-point matrices, so cast before factorising.
rating_matrix = user_item_matrix.astype(np.float64)
# Number of latent factors; svds requires k to be strictly smaller than both
# matrix dimensions, so clamp the requested 30 for small datasets.
n_factors = min(30, min(rating_matrix.shape) - 1)

# perf_counter is a monotonic, high-resolution clock intended for timing code.
start_train = time.perf_counter()
u, s, vt = svds(rating_matrix, k=n_factors)
train_time = time.perf_counter() - start_train
print(f"Training time: {train_time:.4f} seconds")

# Rebuild the dense low-rank score matrix: U · diag(s) · Vt
s_diag = np.diag(s)
pred_ratings = u @ s_diag @ vt

Training time: 0.1732 seconds


In [31]:
# Evaluate known test users inference time and metrics
# Group every user's held-out movies in a single pass over test_data rather than
# re-filtering the whole frame once per user. It is done before the timer starts
# so the reported time covers only scoring and metric computation.
test_items_by_user = {}
for test_uid, test_mid in zip(test_data['user_id'], test_data['movie_id']):
    test_items_by_user.setdefault(test_uid, []).append(test_mid)

# user_id_to_index / movie_index_to_id are expected to follow the categorical code
# order that indexes the rows/columns of the matrix behind pred_ratings.
# NOTE(review): the ranking is not filtered for movies the user already rated in
# training, so those can occupy slots in the top 20.
results = []
start_inf = time.perf_counter()
for uid in known_test_users:
    uidx = user_id_to_index[uid]
    true_items = test_items_by_user[uid]
    rec_indices, rec_scores = get_top_n_scores(uidx, pred_ratings, n=20)
    rec_items = [movie_index_to_id[i] for i in rec_indices]
    metrics = compute_metrics(rec_items, true_items)
    metrics["user"] = uid
    results.append(metrics)
inference_time = time.perf_counter() - start_inf
print(f"Inference time for known test users: {inference_time:.4f} seconds")

metrics_df = pd.DataFrame(results)
print("--- SVD Evaluation on Known Test Users ---")
print(f"Hit Rate@20: {metrics_df['hit'].mean():.4f}")
print(f"Precision@20: {metrics_df['precision'].mean():.4f}")
print(f"Recall@20: {metrics_df['recall'].mean():.4f}")
print(f"F1@20: {metrics_df['f1'].mean():.4f}")


Inference time for known test users: 0.0339 seconds
--- SVD Evaluation on Known Test Users ---
Hit Rate@20: 0.0278
Precision@20: 0.0014
Recall@20: 0.0278
F1@20: 0.0026


In [32]:
# Cold start: popular movies fallback and evaluation
# Users with no training rows all receive the same list: the 20 most-rated
# movies in the training split.
popular_movies = list(train_data['movie_id'].value_counts().index[:20])
rec_items = popular_movies  # identical for every cold-start user

# Collect each cold-start user's held-out movies in one pass over test_data
# rather than re-filtering the whole frame once per user.
cold_items_by_user = {}
for test_uid, test_mid in zip(test_data['user_id'], test_data['movie_id']):
    if test_uid in cold_start_users:
        cold_items_by_user.setdefault(test_uid, []).append(test_mid)

cold_results = []
for uid in cold_start_users:
    true_items = cold_items_by_user[uid]
    metrics = compute_metrics(rec_items, true_items)
    metrics["user"] = uid
    cold_results.append(metrics)

cold_df = pd.DataFrame(cold_results)
print("--- Cold Start Evaluation ---")
print(f"Hit Rate@20: {cold_df['hit'].mean():.4f}")
print(f"Precision@20: {cold_df['precision'].mean():.4f}")
print(f"Recall@20: {cold_df['recall'].mean():.4f}")
print(f"F1@20: {cold_df['f1'].mean():.4f}")

--- Cold Start Evaluation ---
Hit Rate@20: 0.0832
Precision@20: 0.0042
Recall@20: 0.0832
F1@20: 0.0079


In [43]:
# Sample recommendations output
SAMPLE_SEED = 42  # fixed so the same example users are shown on every run

print("\n=== Sample Recommendations (Known User) ===")
# Sample from the 'user' column directly. Selecting a whole row first (.iloc[0])
# upcasts the integer id to float because the row also holds float metrics,
# which is why the id used to print as e.g. "26738.0".
sample_uid = metrics_df['user'].sample(1, random_state=SAMPLE_SEED).iloc[0]
uidx = user_id_to_index[sample_uid]
rec_indices, rec_scores = get_top_n_scores(uidx, pred_ratings, n=20)
rec_items = [movie_index_to_id[i] for i in rec_indices]
print(f"User {sample_uid}:")
for i, (mid, score) in enumerate(zip(rec_items, rec_scores), start=1):
    print(f"  {i}. {mid} (score: {score:.3f})")

print("\n=== Sample Recommendations (Cold Start User) ===")
sample_cold_uid = cold_df['user'].sample(1, random_state=SAMPLE_SEED).iloc[0]
print(f"User {sample_cold_uid}:")
# Cold-start users all get the same popularity-based list.
for i, mid in enumerate(popular_movies, start=1):
    print(f"  {i}. {mid}")


=== Sample Recommendations (Known User) ===
User 26738.0:
  1. pulp+fiction+1994 (score: 4.000)
  2. star+wars+1977 (score: 0.000)
  3. blade+runner+1982 (score: 0.000)
  4. nausica+of+the+valley+of+the+wind+1984 (score: 0.000)
  5. the+lord+of+the+rings+the+fellowship+of+the+ring+2001 (score: 0.000)
  6. the+lord+of+the+rings+the+two+towers+2002 (score: 0.000)
  7. whiplash+2014 (score: 0.000)
  8. the+incredibles+2004 (score: 0.000)
  9. se7en+1995 (score: 0.000)
  10. coraline+2009 (score: 0.000)
  11. the+lion+king+1994 (score: 0.000)
  12. back+to+the+future+1985 (score: 0.000)
  13. interstellar+2014 (score: 0.000)
  14. the+godfather+1972 (score: 0.000)
  15. spirited+away+2001 (score: 0.000)
  16. ponyo+2008 (score: 0.000)
  17. fight+club+1999 (score: 0.000)
  18. forrest+gump+1994 (score: 0.000)
  19. the+matrix+1999 (score: 0.000)
  20. the+shining+1980 (score: 0.000)

=== Sample Recommendations (Cold Start User) ===
User 57749.0:
  1. the+shawshank+redemption+1994
  2. int

In [34]:
# Overall summary
# Pool known and cold-start users: total hits divided by total evaluated users.
num_known, num_cold = len(metrics_df), len(cold_df)
total_hits = metrics_df['hit'].sum() + cold_df['hit'].sum()
overall_hit_rate = total_hits / (num_known + num_cold)
print(f"\nOverall Hit Rate@20: {overall_hit_rate:.4f}")


Overall Hit Rate@20: 0.0824
