In [26]:
import os

import pandas as pd
import numpy as np

from item_based import ItemBasedRecommender
from metrics_knn_based import KnnMetricsCalculator, KnnTestMetricsCalculator
from tune_k_item_based import tune_k
from rating import get_explicit_rating, get_implicit_rating_out_of_positive_ratings_df, split_matrix_csr, sanity_check_implicit_rating, sanity_check_explicit_split, sanity_check_explicit_matrix


In [27]:
# Load the MovieLens 1M ratings (CSV format) into a DataFrame.
# The file location can be overridden with the MOVIELENS_RATINGS_CSV environment variable so the
# notebook can run on another machine; when the variable is unset, the original local path is used.

movielens_file_path = os.environ.get(
    "MOVIELENS_RATINGS_CSV",
    "/Users/masoud/Downloads/MovieLens_1M_Dataset/ratings.csv",
)
movielens_df = pd.read_csv(movielens_file_path)
# Preview the first rows as the cell output.
movielens_df.head()




Unnamed: 0,user_id,movie_id,Rating,Date
0,1,1193,5,2000-12-31
1,1,661,3,2000-12-31
2,1,914,3,2000-12-31
3,1,3408,4,2000-12-31
4,1,2355,5,2001-01-06


In [28]:
# Convert the "Date" column from date strings to integer Unix timestamps (whole seconds since
# 1970-01-01).
#
# The dtype guard keeps this cell idempotent: if the column is already integer (the cell was run
# before), it is left alone. Without the guard, a second run would pass the integer seconds back
# into pd.to_datetime, which reads plain integers as nanoseconds, and the floor division would
# then collapse every value to 0.
#
# Subtracting the epoch and floor-dividing by a one-second Timedelta yields seconds regardless of
# the datetime resolution of the parsed column, unlike casting to int64 and dividing by 10**9,
# which is only correct for nanosecond resolution.
if not pd.api.types.is_integer_dtype(movielens_df["Date"]):
    movielens_df["Date"] = (
        pd.to_datetime(movielens_df["Date"]) - pd.Timestamp("1970-01-01")
    ) // pd.Timedelta(seconds=1)
movielens_df

Unnamed: 0,user_id,movie_id,Rating,Date
0,1,1193,5,978220800
1,1,661,3,978220800
2,1,914,3,978220800
3,1,3408,4,978220800
4,1,2355,5,978739200
...,...,...,...,...
1000204,6040,1091,1,956707200
1000205,6040,1094,5,956620800
1000206,6040,562,5,956620800
1000207,6040,1096,4,956707200


In [None]:
# rating_matrix, date_matrix, user_mapping, item_mapping = get_explicit_rating(
#     movielens_df,
#     user_field='user_id',
#     item_field='movie_id',
#     rating_field='Rating',
#     date_field='Date'
# )

In [29]:
# Build the explicit-rating data from the ratings frame. The helper is called positionally with
# the frame followed by the user, item, rating and date column names, and its result is unpacked
# into four objects: the rating matrix, a date matrix of the same layout, and the user / item
# mappings (later cells index these mappings with 'idx_to_id').
# NOTE(review): presumably last_dates holds the timestamp of each stored rating — confirm in rating.py.
explicit_ratings, last_dates, user_mapping, item_mapping = get_explicit_rating(movielens_df, "user_id",
                                                                               "movie_id", "Rating", "Date")

# Show both matrices as dense arrays. NOTE(review): .toarray() materialises the full
# users x items grid for each matrix just for display; consider inspecting a small slice instead.
explicit_ratings.toarray(), last_dates.toarray()

(array([[5., 3., 3., ..., 0., 0., 0.],
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.]]),
 array([[978220800, 978220800, 978220800, ...,         0,         0,
                 0],
        [978220800,         0,         0, ...,         0,         0,
                 0],
        [        0,         0,         0, ...,         0,         0,
                 0],
        ...,
        [        0,         0,         0, ...,         0,         0,
                 0],
        [        0, 956620800, 956620800, ...,         0,         0,
                 0],
        [957657600,         0,         0, ...,         0,         0,
                 0]]))

In [30]:
# Cross-check the rating and date matrices against the source DataFrame.
# NOTE(review): the recorded output of this cell is `KeyError: 'business_id'`. movielens_df has the
# columns user_id, movie_id, Rating and Date, so the helper presumably falls back to Yelp-style
# column names — confirm its signature in rating.py and pass the MovieLens field names (or rename
# the columns) before relying on this check. As recorded, the check did not complete.
sanity_check_explicit_matrix(explicit_ratings=explicit_ratings, last_dates=last_dates, review_df=movielens_df)

KeyError: 'business_id'

In [None]:
# Split into training, validation, and test sets based on time.
# The inputs are the rating matrix and date matrix returned by get_explicit_rating
# (explicit_ratings / last_dates). The previous names rating_matrix / date_matrix are never
# assigned by any executable cell, so this cell raised NameError on a fresh kernel.
# ratios gives the train / validation / test proportions, in that order.
train_matrix, val_matrix, test_matrix = split_matrix_csr(
    explicit_ratings,
    last_dates,
    ratios=[0.7, 0.15, 0.15]
)


In [None]:
# Tune k using the elbow method and lowest validation RMSE.
# tune_k returns at least three values; the first three are kept (best_k, elbow_k, results_df)
# and any further return values are discarded by the starred `*_` target.
# NOTE(review): test_matrix is passed to the tuning routine as well — confirm in
# tune_k_item_based.py that it is used for reporting only and not for choosing k.
best_k, elbow_k, results_df, *_ = tune_k(
    train_matrix,
    val_matrix,
    test_matrix,
    user_mapping=user_mapping,
    item_mapping=item_mapping
)

# Report both candidate values of k.
print(f"Best K (lowest validation RMSE): {best_k}")
print(f"Elbow K (stability cutoff): {elbow_k}")


In [None]:
# Train the item-based recommender on the training matrix.
# The model is built with elbow_k (reported by the tuning cell as the "stability cutoff" K),
# not with best_k (the K with the lowest validation RMSE).
recommender = ItemBasedRecommender(k=elbow_k)
recommender.fit(train_matrix, user_mapping, item_mapping)


In [None]:
# Generate predictions for all unrated items.
# NOTE(review): prediction_matrix is not referenced by any later cell shown in this notebook;
# the metric calculators below are given the recommender object itself. Confirm whether
# predict_matrix() must run first (e.g. it caches state on the model) or the cell can be dropped.
prediction_matrix = recommender.predict_matrix()


In [None]:
# Calculate RMSE of the fitted recommender against the test matrix.
# The calculator receives the test matrix, the model, and the 'idx_to_id' lookups taken from
# the user and item mappings.
rmse_calculator = KnnMetricsCalculator(
    test_matrix=test_matrix,
    model=recommender,
    idx_to_user_id=user_mapping['idx_to_id'],
    idx_to_item_id=item_mapping['idx_to_id']
)
rmse = rmse_calculator.calculate_rmse()
# Print the score rounded to four decimal places.
print(f"Test RMSE: {rmse:.4f}")


In [None]:
# Evaluate Top-N unexpectedness of the fitted recommender against the test matrix.
# The calculator takes the same inputs as the RMSE calculator plus n=10.
# NOTE(review): n is presumably the length of each user's Top-N recommendation list — confirm
# in metrics_knn_based.py.
top_n_metrics = KnnTestMetricsCalculator(
    test_matrix=test_matrix,
    model=recommender,
    idx_to_user_id=user_mapping['idx_to_id'],
    idx_to_item_id=item_mapping['idx_to_id'],
    n=10
)
unexpectedness = top_n_metrics.calculate_unexpectedness()
# Print the score rounded to four decimal places.
print(f"Unexpectedness: {unexpectedness:.4f}")


## ✅ Final Results
- **RMSE**: Printed above
- **Unexpectedness**: Printed above
- Model: Item-Based Collaborative Filtering (KNN)
- Data: MovieLens 1M ratings dataset
