In [None]:
import implicit
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd

In [23]:
ratings = pd.read_parquet('../datasets/ratings_clean.parquet')
print(ratings.info())
ratings = ratings.drop(columns=['age_days', 'day_of_week'])

print(ratings.info())
print('Number of unique movieIds:', ratings['movieId'].nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30594215 entries, 0 to 30594214
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   day_of_week  int64  
 4   age_days     float32
dtypes: float32(1), float64(1), int64(3)
memory usage: 1.0 GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30594215 entries, 0 to 30594214
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 700.2 MB
None
Number of unique movieIds: 84133


# Prepare LOOCV

In [24]:
holdout_df = pd.read_parquet('../datasets/ratings_LOOCV.parquet')
holdout_df = holdout_df.rename(columns={'holdout_movieId': 'movieId'})
holdout_df['to_remove'] = True
print(holdout_df.info())

print(len(ratings))
merged = ratings.merge(holdout_df, on=['userId', 'movieId'], how='left')
ratings_loocv = merged[merged['to_remove'].isna()].drop(columns='to_remove')
print(ratings_loocv.info())

<class 'pandas.core.frame.DataFrame'>
Index: 157023 entries, 59 to 30594056
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   userId     157023 non-null  int64
 1   movieId    157023 non-null  int64
 2   to_remove  157023 non-null  bool 
dtypes: bool(1), int64(2)
memory usage: 3.7 MB
None
30594215
<class 'pandas.core.frame.DataFrame'>
Index: 30437192 entries, 0 to 30594214
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 928.9 MB
None


In [25]:
print(f'userIds ratings: {ratings['userId'].nunique()}, ratings_loocv: {ratings_loocv['userId'].nunique()}')
assert ratings['userId'].nunique() == ratings_loocv['userId'].nunique()
print(f'Movieids ratings: {ratings['movieId'].nunique()}, ratings_loocv: {ratings_loocv['movieId'].nunique()}')
# assert ratings['movieId'].nunique() == ratings_loocv['movieId'].nunique()
# Ignore the 9 movies that are missing, remove them from the matrix used for CF

userIds ratings: 157023, ratings_loocv: 157023
Movieids ratings: 84133, ratings_loocv: 84124


In [26]:
movies = pd.read_csv('../datasets/Movies_final_ML.csv')
print(movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               87585 non-null  int64  
 1   imdbId                87585 non-null  object 
 2   vote_average          86493 non-null  float64
 3   vote_count            86493 non-null  float64
 4   status                86493 non-null  object 
 5   release_date          87455 non-null  object 
 6   revenue               86493 non-null  float64
 7   runtime               86493 non-null  float64
 8   adult                 86493 non-null  object 
 9   backdrop_path         70442 non-null  object 
 10  budget                86493 non-null  float64
 11  homepage              17774 non-null  object 
 12  original_language     86493 non-null  object 
 13  original_title        86493 non-null  object 
 14  overview              85667 non-null  object 
 15  popularity         

In [27]:
user_map = {user_id: idx for idx, user_id in enumerate(ratings_loocv['userId'].unique())}
movie_map = {movie_id: idx for idx, movie_id in enumerate(movies['movieId'].unique())}

reverse_user_map = {v: k for k, v in user_map.items()}
reverse_movie_map = {v: k for k, v in movie_map.items()}

user_idx = ratings_loocv['userId'].map(user_map)
movie_idx = ratings_loocv['movieId'].map(movie_map)

sparse_matrix = csr_matrix((ratings_loocv['rating'].values, (user_idx, movie_idx)), shape=(len(user_map), len(movie_map)))
print(sparse_matrix.shape)

(157023, 87585)


In [28]:
# Print values from the sparse matrix and from the ratings DataFrame to compare if they are correct
triplets = zip(*sparse_matrix.nonzero(), sparse_matrix.data)

for r, c, v in list(triplets)[:5]:
    print(f"Row {r}, Col {c} → {v}")

print()
print(ratings[ratings['movieId'] == reverse_movie_map[0]]['rating'].head(5))

Row 0, Col 16 → 4.0
Row 0, Col 24 → 1.0
Row 0, Col 28 → 2.0
Row 0, Col 29 → 5.0
Row 0, Col 31 → 5.0

908     2.5
1665    4.0
1854    3.0
1964    5.0
2146    3.0
Name: rating, dtype: float64


In [29]:
idx_to_title = {movie_map[movieId]: title for i, (movieId, title) in enumerate(zip(movies['movieId'], movies['title']))}

print(idx_to_title)



In [30]:
als_model = implicit.als.AlternatingLeastSquares(factors=32, regularization=0.01, iterations=50)
als_model.fit(sparse_matrix)

  check_blas_config()


  0%|          | 0/50 [00:00<?, ?it/s]

In [31]:
recommended_item_ids, scores = als_model.recommend(
    userid=0,
    user_items=sparse_matrix[0],
    N=5,
    filter_already_liked_items=True
)

print("We recommend:")
for item_idx, score in zip(recommended_item_ids, scores):
    print(f"- {idx_to_title[item_idx]} (Score: {score:.3f})")

We recommend:
- The Godfather (Score: 1.024)
- One Flew Over the Cuckoo's Nest (Score: 1.011)
- American Beauty (Score: 0.926)
- Good Will Hunting (Score: 0.901)
- The Shawshank Redemption (Score: 0.900)


# Evaluation

In [36]:
from tqdm import tqdm

K=20

recalls, mrrs, ndcgs = [], [], []

print(len(holdout_df))
for _, userId in tqdm(enumerate(holdout_df['userId'])):
    idx = user_map[userId]

    recommended_item_ids, scores = als_model.recommend(
        userid=idx,
        user_items=sparse_matrix[idx],
        N=K,
        filter_already_liked_items=True
    )

    likedMovies = ratings[ratings['userId'] == userId]['movieId'].values

    # Recall@K
    recalls.append(int(any(reverse_movie_map[r] in likedMovies for r in recommended_item_ids)))

    # MRR@K
    rr = 0.0
    for rank, idx in enumerate(recommended_item_ids, 1):
        if reverse_movie_map[idx] in likedMovies:
            rr = 1.0 / rank
            break
    mrrs.append(rr)

    # nDCG@K
    relevance_scores = [1.0 if reverse_movie_map[movie_idx] in likedMovies else 0.0 for movie_idx in recommended_item_ids]
    dcg = sum(rel / np.log2(rank + 1) for rank, rel in enumerate(relevance_scores, 1) if rel > 0)

    ideal_relevance = [1.0] * min(len(likedMovies), K)
    idcg = sum(rel / np.log2(rank + 1) for rank, rel in enumerate(ideal_relevance, 1))

    ndcg = dcg / idcg if idcg > 0 else 0.0
    ndcgs.append(ndcg)

print(f'Recall@{K}: {float(np.mean(recalls))}, "MRR@{K}: {float(np.mean(mrrs))}, nDCG@{K}: {float(np.mean(ndcgs))}')

157023


157023it [43:13, 60.54it/s]


Recall@20: 0.4796876890646593, "MRR@20: 0.21231583259557352, nDCG@20: 0.03861870448500913
