In [66]:
import os

import numpy as np
import pandas as pd

from item_based import ItemBasedRecommender
from metrics_knn_based import KnnMetricsCalculator, KnnTestMetricsCalculator
from rating import (
    get_explicit_rating,
    get_implicit_rating_out_of_positive_ratings_df,
    sanity_check_explicit_matrix,
    sanity_check_explicit_split,
    sanity_check_implicit_rating,
    split_matrix_csr,
)
from tune_k_item_based import tune_k


In [67]:
# Load the MovieLens 1M ratings (CSV format).
# The file location can be overridden through the MOVIELENS_RATINGS_PATH
# environment variable; when it is not set, the original local path is used,
# so existing runs behave exactly as before.

movielens_file_path = os.environ.get(
    "MOVIELENS_RATINGS_PATH",
    "/Users/masoud/Downloads/MovieLens_1M_Dataset/ratings.csv",
)
movielens_df = pd.read_csv(movielens_file_path)
movielens_df.head()




Unnamed: 0,user_id,movie_id,Rating,Date
0,1,1193,5,2000-12-31
1,1,661,3,2000-12-31
2,1,914,3,2000-12-31
3,1,3408,4,2000-12-31
4,1,2355,5,2001-01-06


In [68]:
# Rename the MovieLens columns to the generic names the rest of the notebook
# passes to the rating helpers ("business_id", "stars", "date").
# 'user_id' already has the expected name, so it needs no entry here.
movielens_df = movielens_df.rename(
    columns={"movie_id": "business_id", "Rating": "stars", "Date": "date"}
)

In [69]:
# Convert the 'date' column to Unix epoch seconds stored as int64.
#
# The guard makes the cell safe to re-run: once the column holds integers,
# passing it through pd.to_datetime again would interpret the seconds as
# nanoseconds and collapse every value to 0.
#
# The epoch arithmetic (difference to 1970-01-01, floor-divided by one second)
# does not depend on the datetime resolution pandas picks when parsing, unlike
# casting the datetimes to int64 and dividing by 10 ** 9.
if not pd.api.types.is_integer_dtype(movielens_df["date"]):
    parsed_dates = pd.to_datetime(movielens_df["date"])
    seconds_since_epoch = (parsed_dates - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
    movielens_df["date"] = seconds_since_epoch.astype(np.int64)
movielens_df

Unnamed: 0,user_id,business_id,stars,date
0,1,1193,5,978220800
1,1,661,3,978220800
2,1,914,3,978220800
3,1,3408,4,978220800
4,1,2355,5,978739200
...,...,...,...,...
1000204,6040,1091,1,956707200
1000205,6040,1094,5,956620800
1000206,6040,562,5,956620800
1000207,6040,1096,4,956707200


In [70]:
# Build the explicit-rating structures from the renamed DataFrame.
# get_explicit_rating is given the DataFrame followed by the user, item,
# rating and date column names, and its result is unpacked into four objects.
(explicit_ratings, last_dates, user_mapping, item_mapping) = get_explicit_rating(
    movielens_df,
    "user_id",
    "business_id",
    "stars",
    "date",
)

# Densify both matrices for display.
(explicit_ratings.toarray(), last_dates.toarray())

(array([[5., 3., 3., ..., 0., 0., 0.],
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.]]),
 array([[978220800, 978220800, 978220800, ...,         0,         0,
                 0],
        [978220800,         0,         0, ...,         0,         0,
                 0],
        [        0,         0,         0, ...,         0,         0,
                 0],
        ...,
        [        0,         0,         0, ...,         0,         0,
                 0],
        [        0, 956620800, 956620800, ...,         0,         0,
                 0],
        [957657600,         0,         0, ...,         0,         0,
                 0]]))

In [71]:
# Cross-check the two matrices against the source DataFrame; the helper's
# return value is the cell output.
sanity_check_explicit_matrix(
    review_df=movielens_df,
    explicit_ratings=explicit_ratings,
    last_dates=last_dates,
)

Unnamed: 0,Source,Calculated metrics,Value
0,Explicit ratings matrix,Non-zero entries,1000209
1,Last dates matrix,Non-zero entries,1000209
2,Filtered review DataFrame,"Unique (user_id, business_id) pairs",1000209


In [72]:
# Split fractions handed to split_matrix_csr. The call site unpacks the result
# as (test, validation, train), so these presumably map to 10% test,
# 20% validation and 70% train — confirm against split_matrix_csr.
DIVISIONS = [0.1, 0.2, 0.7]

In [73]:
# Split the explicit ratings into three matrices using DIVISIONS.
# last_dates is passed alongside the ratings; presumably the split is
# time-aware — confirm in split_matrix_csr.
# Note the unpacking order: test first, then validation, then train.
(test_matrix, validation_matrix, train_matrix) = split_matrix_csr(
    explicit_ratings,
    last_dates,
    DIVISIONS,
)

# Display order is train, validation, test.
(train_matrix.toarray(), validation_matrix.toarray(), test_matrix.toarray())

(array([[5., 3., 3., ..., 0., 0., 0.],
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 3., 4., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [74]:
# Compare the three splits with the full explicit matrix; the helper's return
# value is the cell output.
sanity_check_explicit_split(
    explicit_matrix=explicit_ratings,
    train_matrix=train_matrix,
    validation_matrix=validation_matrix,
    test_matrix=test_matrix,
)

Unnamed: 0,Split,Number of interactions,Part of factual interactions
0,Train,701042,70.09%
1,Validation,199516,19.95%
2,Test,99651,9.96%
3,Explicit total,1000209,100.0%
4,Factual total,1000209,100%


In [75]:
# Rating threshold passed as `implicit_threshold` to the implicit-rating
# helpers below. Presumably ratings with stars >= this value count as positive
# interactions — confirm in get_implicit_rating_out_of_positive_ratings_df.
IMPLICIT_THRESHOLD = 4

In [76]:
# Derive implicit ratings from the explicit ones using IMPLICIT_THRESHOLD.
implicit_ratings = get_implicit_rating_out_of_positive_ratings_df(
    df=movielens_df,
    user_field='user_id',
    item_field='business_id',
    rating_field='stars',
    implicit_threshold=IMPLICIT_THRESHOLD,
)

# Number of keys in implicit_ratings next to the number of distinct users in
# the full DataFrame.
(
    len(implicit_ratings.keys()),
    movielens_df['user_id'].nunique(),
)

(6038, 6040)

In [77]:
# Cross-check the implicit ratings against the source DataFrame for the same
# threshold; the helper's return value is the cell output.
sanity_check_implicit_rating(
    implicit_ratings=implicit_ratings,
    initial_df=movielens_df,
    implicit_threshold=IMPLICIT_THRESHOLD,
)

Unnamed: 0,Metric,Value
0,Number of reviews (stars >= threshold),575281
1,Number of reviews in implicit_ratings,575281
2,Unique users in initial reviews,6038
3,Unique users in implicit_ratings,6038
4,Unique businesses in initial reviews,3533
5,Unique businesses in implicit_ratings,3533


In [79]:
# Choose K for the item-based recommender.
# tune_k's result is unpacked into best_k, elbow_k and results_df; any further
# returned values are collected into `_` and ignored.
# NOTE(review): test_matrix is also passed to tune_k — confirm it is not used
# when selecting K, otherwise the test metrics below are optimistic.
best_k, elbow_k, results_df, *_ = tune_k(
    train_matrix, validation_matrix, test_matrix,
    user_mapping=user_mapping, item_mapping=item_mapping,
)

print(f"Best K (lowest validation RMSE): {best_k}")
print(f"Elbow K (stability cutoff): {elbow_k}")



Starting K tuning for Item-Based Collaborative Filtering...


Testing different K values: 100%|██████████| 20/20 [26:58<00:00, 80.94s/it]


Tuning Complete
Best K (lowest validation RMSE): 100
Elbow Point K: 30
Best K (lowest validation RMSE): 100
Elbow K (stability cutoff): 30





In [80]:
# Fit the item-based recommender on the training split.
# K is set to elbow_k (the elbow-point value from tuning), not to best_k
# (the value with the lowest validation RMSE).
recommender = ItemBasedRecommender(
    k=elbow_k,
)
recommender.fit(
    train_matrix,
    user_mapping,
    item_mapping,
)


In [81]:
# Ask the fitted recommender for its prediction matrix.
# Presumably this covers all unrated (user, item) pairs — confirm in
# ItemBasedRecommender.predict_matrix.
# NOTE(review): prediction_matrix is not referenced by the later cells shown
# here; the metric calculators receive `recommender` itself.
prediction_matrix = recommender.predict_matrix()


In [82]:
# Compute RMSE of the fitted recommender on the held-out test matrix.
# The calculator receives the index -> id lookups from both mappings.
rmse_calculator = KnnMetricsCalculator(
    model=recommender,
    test_matrix=test_matrix,
    idx_to_item_id=item_mapping['idx_to_id'],
    idx_to_user_id=user_mapping['idx_to_id'],
)

rmse = rmse_calculator.calculate_rmse()
print(f"Test RMSE: {rmse:.4f}")


Test RMSE: 1.0892


In [83]:
# Compute the unexpectedness metric on the test matrix.
# n=10 is passed to the calculator; presumably it is the length of each
# top-N recommendation list — confirm in KnnTestMetricsCalculator.
top_n_metrics = KnnTestMetricsCalculator(
    model=recommender,
    test_matrix=test_matrix,
    n=10,
    idx_to_item_id=item_mapping['idx_to_id'],
    idx_to_user_id=user_mapping['idx_to_id'],
)

unexpectedness = top_n_metrics.calculate_unexpectedness()
print(f"Unexpectedness: {unexpectedness:.4f}")


Unexpectedness: 0.0172
