In [15]:
import implicit
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd

In [16]:
ratings = pd.read_parquet('../data/ratings_clean.parquet')
print(ratings.info())
ratings = ratings.drop(columns=['age_days', 'day_of_week'])

print(ratings.info())
print('Number of unique movieIds:', ratings['movieId'].nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30594215 entries, 0 to 30594214
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   day_of_week  int64  
 4   age_days     float32
dtypes: float32(1), float64(1), int64(3)
memory usage: 1.0 GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30594215 entries, 0 to 30594214
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 700.2 MB
None
Number of unique movieIds: 84133


# Prepare LOOCV

In [17]:
holdout_df = pd.read_parquet('../data/ratings_LOOCV.parquet')
holdout_df = holdout_df.rename(columns={'holdout_movieId': 'movieId'})
holdout_df['to_remove'] = True
print(holdout_df.info())

print(len(ratings))
merged = ratings.merge(holdout_df, on=['userId', 'movieId'], how='left')
ratings_loocv = merged[merged['to_remove'].isna()].drop(columns='to_remove')
print(ratings_loocv.info())

<class 'pandas.core.frame.DataFrame'>
Index: 157023 entries, 59 to 30594056
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   userId     157023 non-null  int64
 1   movieId    157023 non-null  int64
 2   to_remove  157023 non-null  bool 
dtypes: bool(1), int64(2)
memory usage: 3.7 MB
None
30594215
<class 'pandas.core.frame.DataFrame'>
Index: 30437192 entries, 0 to 30594214
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 928.9 MB
None


In [18]:
# print(f'userIds ratings: {ratings['userId'].nunique()}, ratings_loocv: {ratings_loocv['userId'].nunique()}')
# assert ratings['userId'].nunique() == ratings_loocv['userId'].nunique()
# print(f'Movieids ratings: {ratings['movieId'].nunique()}, ratings_loocv: {ratings_loocv['movieId'].nunique()}')
# assert ratings['movieId'].nunique() == ratings_loocv['movieId'].nunique()
# Ignore the 9 movies that are missing, remove them from the matrix used for CF

In [19]:
movies = pd.read_csv('../data/Movies_final_ML.csv')
print(movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               87585 non-null  int64  
 1   imdbId                87585 non-null  object 
 2   vote_average          86493 non-null  float64
 3   vote_count            86493 non-null  float64
 4   status                86493 non-null  object 
 5   release_date          87455 non-null  object 
 6   revenue               86493 non-null  float64
 7   runtime               86493 non-null  float64
 8   adult                 86493 non-null  object 
 9   backdrop_path         70442 non-null  object 
 10  budget                86493 non-null  float64
 11  homepage              17774 non-null  object 
 12  original_language     86493 non-null  object 
 13  original_title        86493 non-null  object 
 14  overview              85667 non-null  object 
 15  popularity         

In [20]:
user_map = {user_id: idx for idx, user_id in enumerate(ratings_loocv['userId'].unique())}
movie_map = {movie_id: idx for idx, movie_id in enumerate(ratings_loocv['movieId'].unique())}

reverse_user_map = {v: k for k, v in user_map.items()}
reverse_movie_map = {v: k for k, v in movie_map.items()}

holdout_filtered = holdout_df[holdout_df['movieId'].isin(ratings_loocv['movieId'].unique())]
print(f"Holdout before filter: {len(holdout_df)}")
print(f"Holdout after filter: {len(holdout_filtered)}")

user_idx = ratings_loocv['userId'].map(user_map)
movie_idx = ratings_loocv['movieId'].map(movie_map)

Holdout before filter: 157023
Holdout after filter: 157014


In [21]:
sparse_matrix = csr_matrix((ratings_loocv['rating'].values, (user_idx, movie_idx)), shape=(len(user_map), len(movie_map)))
print(sparse_matrix.shape)

(157023, 84124)


In [22]:
# Print values from the sparse matrix and from the ratings DataFrame to compare if they are correct
triplets = zip(*sparse_matrix.nonzero(), sparse_matrix.data)

for r, c, v in list(triplets)[:5]:
    print(f"Row {r}, Col {c} → {v}")

print()
print(ratings[ratings['movieId'] == reverse_movie_map[0]]['rating'].head(5))

Row 0, Col 0 → 4.0
Row 0, Col 1 → 3.0
Row 0, Col 2 → 5.0
Row 0, Col 3 → 5.0
Row 0, Col 4 → 5.0

0       4.0
289     5.0
1284    4.5
4304    4.0
5346    4.0
Name: rating, dtype: float64


In [23]:
movies_in_ratings = movies[movies['movieId'].isin(ratings_loocv['movieId'].unique())]

idx_to_title = {
    movie_map[movieId]: title
    for movieId, title in zip(movies_in_ratings['movieId'], movies_in_ratings['title'])
}

print(f"Title mapping created for {len(idx_to_title)} movies")

Title mapping created for 84124 movies


In [24]:
als_model = implicit.als.AlternatingLeastSquares(factors=32, regularization=0.01, iterations=50)
als_model.fit(sparse_matrix)

  0%|          | 0/50 [00:00<?, ?it/s]

In [25]:
zero_cols = (sparse_matrix.sum(axis=0) == 0).sum()
print(f"Zero columns in sparse matrix: {zero_cols}")  # Should be 0

missing_movies = set(holdout_filtered['movieId']) - set(ratings_loocv['movieId'])
print(f"Holdout movies missing from training: {len(missing_movies)}")

rec_ids, _ = als_model.recommend(userid=0, user_items=sparse_matrix[0], N=10)
for rec_id in rec_ids:
    assert rec_id < len(movie_map), f"Invalid recommendation index: {rec_id}"

Zero columns in sparse matrix: 0
Holdout movies missing from training: 0


In [26]:
recommended_item_ids, scores = als_model.recommend(
    userid=0,
    user_items=sparse_matrix[0],
    N=5,
    filter_already_liked_items=True
)

print("We recommend:")
for item_idx, score in zip(recommended_item_ids, scores):
    print(f"- {idx_to_title[item_idx]} (Score: {score:.3f})")

We recommend:
- The Godfather (Score: 1.021)
- One Flew Over the Cuckoo's Nest (Score: 1.010)
- American Beauty (Score: 0.927)
- Good Will Hunting (Score: 0.903)
- The Shawshank Redemption (Score: 0.900)


# Evaluation

In [27]:
# np.random.seed(213)
# sample_users = np.random.choice(
#     holdout_filtered['userId'].unique(),
#     size=10000,
#     replace=False
# )

# holdout_filtered = holdout_filtered[holdout_filtered['userId'].isin(sample_users)]
print(f"Sample holdout size: {len(holdout_filtered)}")

Sample holdout size: 157014


In [28]:
from tqdm import tqdm

ks = [5, 10, 20, 50, 100]

holdout_map = holdout_filtered.set_index('userId')['movieId'].to_dict()

print(f"Evaluating {len(holdout_map)} unique users")

for K in tqdm(ks):
    recalls, mrrs, ndcgs = [], [], []

    for userId, true_holdout_item in tqdm(holdout_map.items()):
        idx = user_map[userId]

        recommended_item_ids, scores = als_model.recommend(
            userid=idx,
            user_items=sparse_matrix[idx],
            N=K,
            filter_already_liked_items=True
        )

        true_set = {true_holdout_item}
        recommended_movie_ids = [reverse_movie_map[r] for r in recommended_item_ids]

        # Recall@K
        recalls.append(int(any(r in true_set for r in recommended_movie_ids)))

        # MRR@K
        rr = 0.0
        for rank, movie_id in enumerate(recommended_movie_ids, 1):
            if movie_id in true_set:
                rr = 1.0 / rank
                break
        mrrs.append(rr)

        # nDCG@K
        relevance_scores = [1.0 if movie_id in true_set else 0.0 for movie_id in recommended_movie_ids]
        dcg = sum(rel / np.log2(rank + 1) for rank, rel in enumerate(relevance_scores, 1))
        idcg = 1.0 # For leave-one-out, the ideal DCG is always 1.0 if the item is found
        ndcg = dcg / idcg
        ndcgs.append(ndcg)

    print(f'Recall@{K}: {float(np.mean(recalls)):.4f}, MRR@{K}: {float(np.mean(mrrs)):.4f}, nDCG@{K}: {float(np.mean(ndcgs)):.4f}')

Evaluating 157014 unique users


100%|██████████| 157014/157014 [01:23<00:00, 1876.30it/s]
 20%|██        | 1/5 [01:23<05:35, 83.76s/it]

Recall@5: 0.2960, MRR@5: 0.1939, nDCG@5: 0.2192


100%|██████████| 157014/157014 [01:31<00:00, 1707.80it/s]
 40%|████      | 2/5 [02:55<04:25, 88.60s/it]

Recall@10: 0.3841, MRR@10: 0.2056, nDCG@10: 0.2476


100%|██████████| 157014/157014 [01:36<00:00, 1627.69it/s]
 60%|██████    | 3/5 [04:32<03:04, 92.21s/it]

Recall@20: 0.4794, MRR@20: 0.2122, nDCG@20: 0.2717


100%|██████████| 157014/157014 [02:07<00:00, 1228.50it/s]
 80%|████████  | 4/5 [06:40<01:46, 106.28s/it]

Recall@50: 0.6126, MRR@50: 0.2164, nDCG@50: 0.2982


100%|██████████| 157014/157014 [02:44<00:00, 956.12it/s] 
100%|██████████| 5/5 [09:24<00:00, 112.88s/it]

Recall@100: 0.7114, MRR@100: 0.2178, nDCG@100: 0.3142





In [29]:
als_model.save('collaborative_filtering.npz')