In [2]:
# Re-import edited project modules automatically so changes are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
import logging

# INFO level makes the progress messages logged during tuning and evaluation visible in the cell outputs
logging.basicConfig(level=logging.INFO)
# Notebook-level logger
logger = logging.getLogger(__name__)

In [4]:
import pandas as pd

pd.options.mode.chained_assignment = None  # Silence pandas' SettingWithCopyWarning (global option: also affects imported project modules)

import pickle

from metrics import RmseCalculator, TestMetricsCalculator
from rating import get_explicit_rating, get_implicit_rating_out_of_positive_ratings_df, split_matrix_csr, \
    sanity_check_implicit_rating, sanity_check_explicit_split, sanity_check_explicit_matrix
from tuning import GridSearchSvdPP

In [29]:
# Remove pandas' display limits (None = no limit) for rows, columns, cell width and total width
for display_option_name in ('display.max_rows', 'display.max_columns', 'display.max_colwidth', 'display.width'):
    pd.set_option(display_option_name, None)

# Feature selection

The only dataset that is necessary for our purposes is **review** dataset since:
- it contains the information about explicit ratings (the mean of the field **Rating** for pairs of users and items, check the chapter **Feature engineering** for more details)
- it contains the information for implicit rating (check the chapter **Feature engineering** for more details)
- it already contains only those users who provided at least one review and those items that received at least one rating

In [7]:
# Location of the parquet file with the MovieLens reviews (relative to this notebook)
PATH = '../../eda/dataset_samples/df_movie_lens.parquet'

For this dataset **no** sampling approach was applied, since the final `User` x `Item` interaction matrix will have size `6040 x 3706`, which can be processed with our **compute power**

Moreover, there are no features to drop since all of them are necessary to create **explicit** and **implicit** ratings

In [8]:
# Load the review table; every later step (explicit and implicit ratings) is derived from it
review_df = pd.read_parquet(PATH)
review_df

Unnamed: 0,user_id,movie_id,Rating,Date,YearMonth
0,5621,358,1,2000-05-23,2000-05
1,5112,2450,1,2000-06-30,2000-06
2,203,147,1,2000-12-16,2000-12
3,4387,1007,1,2000-08-05,2000-08
4,5980,2414,1,2000-04-28,2000-04
...,...,...,...,...,...
9941,1041,2186,5,2000-11-23,2000-11
9942,3842,1963,5,2000-08-10,2000-08
9943,1324,1213,5,2000-11-21,2000-11
9944,3580,353,5,2000-08-19,2000-08


In [9]:
# Convert the review date to Unix epoch seconds.
# Dividing the offset from the epoch by a one-second Timedelta gives whole seconds for any
# datetime resolution (s/ms/us/ns), whereas `.astype("int64") // 10**9` is only correct
# when the column happens to be stored with nanosecond resolution.
# NOTE(review): assumes "Date" is timezone-naive — confirm against the parquet schema.
review_dates = pd.to_datetime(review_df["Date"])
# `.assign` rebinds review_df to a new frame instead of mutating the loaded one,
# so re-running this cell is idempotent.
review_df = review_df.assign(
    timestamp=(review_dates - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
)
review_df

Unnamed: 0,user_id,movie_id,Rating,Date,YearMonth,timestamp
0,5621,358,1,2000-05-23,2000-05,959040000
1,5112,2450,1,2000-06-30,2000-06,962323200
2,203,147,1,2000-12-16,2000-12,976924800
3,4387,1007,1,2000-08-05,2000-08,965433600
4,5980,2414,1,2000-04-28,2000-04,956880000
...,...,...,...,...,...,...
9941,1041,2186,5,2000-11-23,2000-11,974937600
9942,3842,1963,5,2000-08-10,2000-08,965865600
9943,1324,1213,5,2000-11-21,2000-11,974764800
9944,3580,353,5,2000-08-19,2000-08,966643200


# Explicit rating extracting

Calculating the **explicit rating** for the MovieLens dataset. 

The output consists of two CSR matrices with identical structure: the first matrix contains **the mean review rating** given by user *u_i* to movie *b_i*, and the second matrix stores **the timestamp of the latest review** at the same positions. 

Additionally, two utility dictionaries are provided, containing mappings **between IDs and matrix indices** (and vice versa).

In [10]:
# Build the user x item matrix of ratings, the matching matrix of latest-review timestamps,
# and the id <-> index mappings for users and items.
explicit_ratings, last_dates, user_mapping, item_mapping = get_explicit_rating(
    review_df, "user_id", "movie_id", "Rating", "timestamp"
)

# Dense preview of both sparse matrices
explicit_ratings.toarray(), last_dates.toarray()

(array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[959040000,         0,         0, ...,         0,         0,
                 0],
        [        0, 962323200,         0, ...,         0,         0,
                 0],
        [        0,         0, 976924800, ...,         0,         0,
                 0],
        ...,
        [        0,         0,         0, ...,         0,         0,
                 0],
        [        0,         0,         0, ...,         0,         0,
                 0],
        [        0,         0,         0, ...,         0,         0,
                 0]]))

**Sanity check**:
* the amount of filled cells in the sparse matrices (`.nnz`) must be the same as **the number of unique pairs** of users and items
* the amount is **the same**

In [10]:
# Compare the number of stored entries in both matrices with the number of unique (user, item) pairs
sanity_check_explicit_matrix(
    explicit_ratings=explicit_ratings,
    last_dates=last_dates,
    review_df=review_df,
    user_field="user_id",
    item_field="movie_id",
)

Unnamed: 0,Source,Calculated metrics,Value
0,Explicit ratings matrix,Non-zero entries,9946
1,Last dates matrix,Non-zero entries,9946
2,Filtered review DataFrame,"Unique (user_id, business_id) pairs",9946


# Train / validation / test split

Define the divisions within the initial matrix (**test / validation / train** according to the documentation of split function)

In [11]:
# Split fractions, in the order expected by the split function: test, validation, train
DIVISIONS = [0.1, 0.2, 0.7]

Split matrix in proportions `0.1, 0.2, 0.7` for **test**, **validation** and **train** set.

In [12]:
# The split returns the parts in the same order as DIVISIONS: test, validation, train
test_matrix, validation_matrix, train_matrix = split_matrix_csr(
    explicit_ratings, last_dates, DIVISIONS
)

# Dense preview (note the display order: train, validation, test)
train_matrix.toarray(), validation_matrix.toarray(), test_matrix.toarray()

(array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

**Sanity check** (verify that the explicit matrix has been correctly split into **train, validation, and test** subsets):
* The total number of interactions (nnz) in the splits matches the original explicit matrix.
* The proportions of data in each split (Train, Validation, Test) **almost** align with the intended ratios.
* No interactions are lost during the split.

In [13]:
# Report the number and share of interactions in each split against the full explicit matrix
sanity_check_explicit_split(
    train_matrix=train_matrix,
    validation_matrix=validation_matrix,
    test_matrix=test_matrix,
    explicit_matrix=explicit_ratings,
)

Unnamed: 0,Split,Number of interactions,Part of factual interactions
0,Train,7013,70.51%
1,Validation,1981,19.92%
2,Test,952,9.57%
3,Explicit total,9946,100.0%
4,Factual total,9946,100%


# Implicit rating extraction for train dataset

**The threshold** for implicit ratings calculations (only positive ratings are considered and explicit ratings are `from 1 to 5`)

In [43]:
# Minimum rating for a review to count as positive feedback (the sanity check below uses "stars >= threshold")
IMPLICIT_THRESHOLD = 4

Calculate the **implicit rating** in the following way:
* count the reviews from `u_i` to `b_i` whose number of stars is at or above `IMPLICIT_THRESHOLD`

Final artifact:
* dict in the format `{<user_id>: {<item_id>: <amount of positive reviews>} }`

In [44]:
# Build the nested dict {user_id: {item_id: number of positive reviews}} from the review table.
# NOTE(review): the full review_df is passed here, not only the train split, so positive
# feedback on validation/test pairs may reach the model — confirm this is intended.
implicit_ratings = get_implicit_rating_out_of_positive_ratings_df(
    df=review_df,
    user_field='user_id',
    item_field='movie_id',
    rating_field='Rating',
    implicit_threshold=IMPLICIT_THRESHOLD,
)

# Users that have implicit ratings vs. all users in the review table
len(implicit_ratings), review_df['user_id'].nunique()

(579, 597)

**Sanity check** (verify the correctness of the `implicit_ratings` matrix creation from the filtered review DataFrame using a specified `IMPLICIT_THRESHOLD`):

Metrics:
* Total number of reviews in the original dataset that meet or exceed the implicit threshold.
* Confirms that all qualifying reviews were included in the final implicit ratings' matrix.
* Number of distinct users present in the MovieLens dataset before conversion.
* Ensures that no user information was lost during transformation.
* Total movies that were reviewed in the original dataset.
* Confirms that all relevant movie interactions are retained.

Results: 
* All user and movie counts match between the initial and processed datasets.
* The number of implicit ratings is equal to the number of qualifying reviews — indicating a correct threshold-based transformation.

In [45]:
# Compare review, user and item counts between the review table and the implicit ratings
sanity_check_implicit_rating(
    initial_df=review_df,
    implicit_ratings=implicit_ratings,
    implicit_threshold=IMPLICIT_THRESHOLD,
    user_field='user_id',
    item_field='movie_id',
    rating_field='Rating',
)

Unnamed: 0,Metric,Value
0,Number of reviews (stars >= threshold),5667
1,Number of reviews in implicit_ratings,5667
2,Unique users in initial reviews,579
3,Unique users in implicit_ratings,579
4,Unique businesses in initial reviews,1607
5,Unique businesses in implicit_ratings,1607


# Hyperparameters tuning

Define potential values of hyperparameters for the implementation of **SVD++** 

In [46]:
# Candidate hyperparameter values for the SVD++ grid search
svd_pp_param_grid = dict(
    n_factors=[20, 40, 70, 100, 150],    # number of latent factors
    n_epochs=[10, 20, 30, 50],           # number of training iterations
    lr_all=[0.002, 0.005, 0.007, 0.01],  # learning rate
    reg_all=[0.02, 0.05, 0.1, 0.2],      # regularization strength
)

Using the **train and validation** datasets, conduct a grid search based on the **RMSE** metric and extract the hyperparameters that give the best value of the target metric on the validation dataset. 

Best hyperparameters (based on validation matrix):
* **learning rate**: 0.005
* **number of epochs**: 30
* **number of hidden factors**: 20
* **regularization term**: 0.02

Best RMSE: **1.054** (on average, the model's rating estimate is off by about 1.054 points)

In [47]:
# Tune SVD++ on the train matrix, scoring each candidate on the validation matrix
grid_search_svd_pp = GridSearchSvdPP(
    train_matrix=train_matrix,
    val_matrix=validation_matrix,
    implicit_rating=implicit_ratings,
    user_mapping=user_mapping,
    item_mapping=item_mapping,
    param_grid=svd_pp_param_grid,
)

# run() yields the winning parameter set, its score and the corresponding model
best_params, best_score, best_svdpp_model = grid_search_svd_pp.run()

print(f"Best params: {best_params}")
print(f"Best RMSE: {best_score}")

INFO:root:Try number: 1
INFO:root:Train with params: {'lr_all': 0.002, 'n_epochs': 10, 'n_factors': 20, 'reg_all': 0.02}
INFO:root:Epoch: 1
INFO:root:Epoch: 2
INFO:root:Epoch: 3
INFO:root:Epoch: 4
INFO:root:Epoch: 5
INFO:root:Epoch: 6
INFO:root:Epoch: 7
INFO:root:Epoch: 8
INFO:root:Epoch: 9
INFO:root:Epoch: 10
INFO:root:Current common score: 1.1082709944887077
INFO:root:Try number: 2
INFO:root:Train with params: {'lr_all': 0.002, 'n_epochs': 10, 'n_factors': 20, 'reg_all': 0.05}
INFO:root:Epoch: 1
INFO:root:Epoch: 2
INFO:root:Epoch: 3
INFO:root:Epoch: 4
INFO:root:Epoch: 5
INFO:root:Epoch: 6
INFO:root:Epoch: 7
INFO:root:Epoch: 8
INFO:root:Epoch: 9
INFO:root:Epoch: 10
INFO:root:Current common score: 1.1071043865468924
INFO:root:Try number: 3
INFO:root:Train with params: {'lr_all': 0.002, 'n_epochs': 10, 'n_factors': 20, 'reg_all': 0.1}
INFO:root:Epoch: 1
INFO:root:Epoch: 2
INFO:root:Epoch: 3
INFO:root:Epoch: 4
INFO:root:Epoch: 5
INFO:root:Epoch: 6
INFO:root:Epoch: 7
INFO:root:Epoch: 8
IN

Best params: {'lr_all': 0.005, 'n_epochs': 30, 'n_factors': 20, 'reg_all': 0.02}
Best RMSE: 1.0543983585112708


The following code saves the result object to reuse **the trained model** in the service


In [50]:
# Persist the best model found by the grid search so it can be reused later
with open("../../models/svd_pp_movie_lens.pkl", "wb") as model_file:
    pickle.dump(best_svdpp_model, model_file)

# Model testing

Load the saved model back from disk

In [13]:
# Read the pickled model back from disk.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
with open("../../models/svd_pp_movie_lens.pkl", 'rb') as model_file:
    test_model = pickle.load(model_file)

test_model

<svdpp.SVDpp at 0x14d7a6f30>

Create a metrics evaluator

In [14]:
# Evaluator for the loaded model on the test matrix; judging by the log output,
# the top-10 recommendation lists are built when it is created.
metrics_calculator = TestMetricsCalculator(
    test_matrix=test_matrix,
    model=test_model,
    idx_to_user_id=user_mapping['idx_to_id'],
    idx_to_item_id=item_mapping['idx_to_id'],
)

INFO:root:Create top-10 recommendations' list
INFO:root:User: 0 -- top 10 list -- [(577, 4.085757560271237), (1453, 3.9710130758063005), (288, 3.950557307239543), (569, 3.919254417464998), (2059, 3.8819107839488947), (714, 3.880120346091093), (1168, 3.8173667654016024), (416, 3.7931380053287516), (205, 3.7805416111705474), (1155, 3.7709653798619165)]
INFO:root:User: 1 -- top 10 list -- [(569, 4.0972832307538445), (1639, 4.095396730761019), (714, 3.993313570535505), (1098, 3.9623029633845306), (800, 3.9523884482530294), (205, 3.900272469524115), (1804, 3.873630523370993), (819, 3.854056314711756), (1784, 3.8218534187165014), (1777, 3.8057119449644237)]
INFO:root:User: 2 -- top 10 list -- [(577, 3.3525279483775217), (819, 3.236750056079246), (1144, 3.2322702205930343), (1804, 3.227008555366919), (1639, 3.226897491683676), (1483, 3.2057528353524125), (569, 3.20025304407452), (904, 3.183589676452457), (800, 3.151548102018377), (1385, 3.1514488753439034)]
INFO:root:User: 3 -- top 10 list --

As it's possible to see, we're in the **cold start** state with our `test set`, because:
- maximum popularity is less than **0.02** (the maximum percentage of users that consider a particular item as **relevant** is less than **2%**) 
- Percentage of filled pairs `user - item` (this is considered as a flag for **relevance**) is only about **0.07%**

It means that most probably we'll get **novel** and **relevant** system, but it'll have bad **diversity** and **coverage** (in the cases when it's linked to `relevance flag`)

In [16]:
# Test-set summary: item popularity plus the number and share of filled / relevant user-item pairs
metrics_calculator.get_test_set_statistic()

Mean popularity               0.001875
Max popularity                0.010050
Min popularity                0.000000
Number of pairs         1345638.000000
Non-null pairs (u-i)        952.000000
% of non-null pairs           0.070747
Relevant pairs (u-i)        952.000000
% of relevant pairs           0.070747
dtype: object

## Relevance metrics

RMSE on `test dataset`: **0.988** (on average, the model's rating estimate is off by about 0.988 points) 

We assume that this is RMSE on the real data (the newest one)

In [18]:
# RMSE of the loaded model on the held-out test matrix
rmse_calculator = RmseCalculator(
    matrix=test_matrix,
    model=test_model,
    idx_to_user_id=user_mapping['idx_to_id'],
    idx_to_item_id=item_mapping['idx_to_id'],
)

rmse_calculator.calculate_rmse()

0.9878035344405094

**Recovery** checks how close relevant* items are to the top of the RLs. 

However, since we're in the **cold start** position with our test dataset, **not a single** relevant* item appeared in any of the RLs 

\* **relevant item `j`** - item that has rating from user `i` in the test dataset

In [19]:
# Recovery: how early relevant items appear in the top-N recommendation lists
metrics_calculator.calculate_recovery()

There is no one relevant item in the top-10 recommendation list => Recovery can't be calculated


## Diversity metric
As a diversity metric **(normalized) aggregation diversity** was chosen. This metric can be used for both purposes - **inter-user diversity** and **coverage** since initially it calculates amount of unique items among all the RLs:
- we normalize by the `amount of recommendations` to calculate the level of **diversity** (which percent of recommendations is unique)
- we normalize by the `amount of available items` to calculate the level of **coverage** (which percent of all items has appeared in the RLs)

Final result:
-  **0.015** => our system mostly recommends **the same items** across the lists (the value can't reach **1** in the current setup, since the size of the catalog < the amount of recommendations)

In [20]:
# Aggregate diversity normalized by the number of recommendations (inter-user diversity)
metrics_calculator.calculate_agg_div()

0.01541038525963149

## Coverage metric
As a coverage metric **(normalized) Item space coverage** was used. This metric serves for 2 purposes:
- check how many unique items appear in the RLs (also can be considered as **coverage**)
- check how uniformly items are distributed across the RLs

We can't directly conclude which behaviour our model has based on this metric, it serves for comparison between models

In [21]:
# Item space coverage: has no fixed upper bound, so it is meant for comparing models with each other
metrics_calculator.calculate_item_space_coverage()

10.087078263531248

Apart from that, as already was mentioned, AggDiv can be used as **coverage metric**

Its result is **0.041**, which highlights that most of the catalog **wasn't used** in the RLs 

In [22]:
# Same aggregate diversity, normalized by the catalog size instead (coverage)
metrics_calculator.calculate_agg_div(is_coverage=True)

0.04081632653061224

## Novelty metric
As novelty metric **(normalized) Item degree-based Novelty** is used: it shows the level of **unpopular** items that are in the RLs. 

The final result is disappointing - **0.29** - which means the system mostly recommends popular items

In [25]:
# Normalized item-degree novelty in [0, 1]: 0 = only the most popular items, 1 = only the least popular ones
metrics_calculator.calculate_normalized_item_deg()

0.29035925502908155

## Serendipity metric
Two metrics are used:
- **Unexpectedness** (`False` flag) == proportion of recommended items below `mean popularity` across all the users
- **Serendipity** == proportion of recommended items that are below `mean popularity` and relevant at the same time across all the users

1. Since the test set contains almost no relevant items (**cold start**), serendipity is **0** == there are no items that are below mean popularity and relevant at the same time
2. However, the system shows the "ok" level of `unexpectedness` - **54.81%**

In [23]:
# Serendipity: share of recommended items that are both unexpected and relevant
metrics_calculator.calculate_serendipity()

0.0

In [24]:
# Per the notebook text, the positional False drops the relevance requirement -> unexpectedness only
metrics_calculator.calculate_serendipity(False)

0.5480737018425453

## Key takeaways

Final conclusions about the **SVD++** with **MovieLens**:
- Since the test dataset is in the **cold start** state, most of the items in the set are **irrelevant** => the system doesn't produce relevant and serendipitous recommendations at all 
- The accuracy of the system remains debatable since the deviation is close to **1 point** in explicit rating (RMSE close to 1)
- The system also doesn't cover most of the items from the catalog and mostly doesn't recommend unique lists
- The system also recommends mostly **popular items** (however, more than 50% of them are still below the mean popularity)

In [30]:
# One table with all test metrics; the RMSE value comes from the separate RMSE calculator
test_rmse = rmse_calculator.calculate_rmse()
metrics_calculator.generate_metrics_summary_df(test_rmse)

There is no one relevant item in the top-10 recommendation list => Recovery can't be calculated


Unnamed: 0,Metric,Area,Value,Value Range,Meaning
0,Recovery,Relevance,,"[0, 0.9]",How early relevant items appear in top-N recommendations
1,Normalized AggDiv (diversity),Inter-user diversity,0.01541,"[0, 1]",Proportion of unique items recommended across all users divided by the amount of recommendations
2,Normalized AggDiv (coverage),Coverage,0.040816,"[0, 1]",Proportion of unique items recommended across all users divided by the size of catalog
3,Item Space Coverage,Coverage,10.087,"[0, Not defined]",Shows how many unique items and how often appears in the RLs (ideally a lot of different items recommended uniformly)
4,Normalized ItemDeg,Novelty,0.29,"[0, 1]",Novelty of recommended items based on inverse (log) item popularity
5,Unexpectedness (no relevance),Serendipity,0.548,"[0, 1]",Proportion of items that are unexpected (less popular than average)
6,Serendipity (with relevance),Serendipity,0.0,"[0, 1]",Proportion of unexpected and relevant items in top-N recommendations
7,RMSE,Relevance,0.988,"[0, 6]",Root Mean Square Error between predicted and actual ratings


The meanings of the metrics and their ranges

In [33]:
# Reference table: value range and interpretation of every metric used above
metrics_calculator.get_range_of_metrics()

Unnamed: 0,Metric,Min,Max,Explanation
0,Item space coverage,0,Not defined,"small - recommendations focuses on several item only or aren't balanced, big - recommendations are distributed uniformly across a lot of items"
1,Recovery,0,0.9,"0 - all the relevant items on the top of the list, 0.9 - all relevant items in the bottom of the list, None - no relevant items in the RLs"
2,Normalized AggDiv (diversity),0,1,"0 - only 1 item was recommended for everyone, 1 - all recommendations are different"
3,Normalized AggDiv (coverage),0,1,"0 - only 1 item was recommended, 1 - all the items from catalog were recommended"
4,Unexpectedness (with_relevance=False),0,1,"0 - there is no unexpected item (popularity below the average) in all RLs, 1 - all the items are unexpected"
5,Serendipity (with_relevance=True),0,1,"0 - there is no serendipitous item (popularity below the average + relevant) in all RLs, 1 - all the items are serendipitous"
6,Normalized ItemDeg,0,1,"0 - the most popular items are used (no novelty), 1 - all items are the most unpopular (the best novelty)"
