In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import os
import sys

# Absolute location of the sibling "library" directory, resolved against the
# current working directory.
library_path = os.path.abspath("../library")

# Register the directory on the import path exactly once, so that re-running
# this cell does not append duplicate entries.
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

In [33]:
import logging

# Logger named after the current module; used for messages emitted by this notebook.
logger = logging.getLogger(__name__)

# Configure the root logger at INFO level so that informational messages
# are shown in the cell output.
logging.basicConfig(level=logging.INFO)

In [34]:
import pandas as pd

# Setting this option to None silences pandas' chained-assignment
# (SettingWithCopy) warning for the whole notebook session.
pd.options.mode.chained_assignment = None  # Disable the warning

import pickle
import numpy as np

# Project-local modules (metrics, rating, tuning).
# NOTE(review): presumably resolved through the "../library" entry added to sys.path — confirm.
from metrics import RmseCalculator, TestMetricsCalculator
from rating import get_explicit_rating, get_implicit_rating_out_of_positive_ratings_df, split_matrix_csr, \
    sanity_check_implicit_rating, sanity_check_explicit_split, sanity_check_explicit_matrix
from tuning import GridSearchSvdPP

# Feature selection

The only dataset necessary for our purposes is the **review** dataset, since:
- it contains the information about explicit ratings (the mean of the field **Rating** for pairs of users and items, check the chapter **Feature engineering** for more details)
- it contains the information for implicit rating (check the chapter **Feature engineering** for more details)
- it already contains only those users who provided at least one review and those items that received at least one rating

In [35]:
# Relative path of the parquet file that holds the MovieLens review records.
PATH: str = "../../../eda/dataset_samples/df_movie_lens.parquet"

For this dataset **no** sampling was applied, since the final `User` x `Item` interaction matrix has size `6040 x 3706`, which can be processed with our **compute power**

Moreover, there are no features to drop, since all of them are necessary to create the **explicit** and **implicit** ratings

In [36]:
# Load the review records from the parquet file referenced by PATH.
review_df = pd.read_parquet(path=PATH)

# A bare name on the last line renders the frame as the cell output.
review_df

Unnamed: 0,user_id,movie_id,Rating,Date,YearMonth
0,5621,358,1,2000-05-23,2000-05
1,5112,2450,1,2000-06-30,2000-06
2,203,147,1,2000-12-16,2000-12
3,4387,1007,1,2000-08-05,2000-08
4,5980,2414,1,2000-04-28,2000-04
...,...,...,...,...,...
9941,1041,2186,5,2000-11-23,2000-11
9942,3842,1963,5,2000-08-10,2000-08
9943,1324,1213,5,2000-11-21,2000-11
9944,3580,353,5,2000-08-19,2000-08


In [37]:
# Convert the review date into whole seconds since the Unix epoch.
# The offset from the epoch is floor-divided by a one-second Timedelta, which is
# independent of the datetime resolution of the column (s/ms/us/ns). Casting to
# int64 and dividing by 10**9 is only correct for nanosecond-resolution values.
# utc=True treats naive dates as UTC, matching the epoch integers of the previous cast.
review_dates_utc = pd.to_datetime(review_df["Date"], utc=True)
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
review_df['timestamp'] = (review_dates_utc - unix_epoch) // pd.Timedelta(seconds=1)
review_df

Unnamed: 0,user_id,movie_id,Rating,Date,YearMonth,timestamp
0,5621,358,1,2000-05-23,2000-05,959040000
1,5112,2450,1,2000-06-30,2000-06,962323200
2,203,147,1,2000-12-16,2000-12,976924800
3,4387,1007,1,2000-08-05,2000-08,965433600
4,5980,2414,1,2000-04-28,2000-04,956880000
...,...,...,...,...,...,...
9941,1041,2186,5,2000-11-23,2000-11,974937600
9942,3842,1963,5,2000-08-10,2000-08,965865600
9943,1324,1213,5,2000-11-21,2000-11,974764800
9944,3580,353,5,2000-08-19,2000-08,966643200


# Explicit rating extracting

Calculating the **explicit rating** for the MovieLens dataset. 

The output consists of two CSR matrices with identical structure: the first matrix contains **the mean review rating** given by user *u_i* to movie *b_i*, and the second matrix stores **the timestamp of the latest review** at the same positions. 

Additionally, two utility dictionaries are provided, containing mappings **between IDs and matrix indices** (and vice versa).

In [38]:
# Build the explicit-rating matrix, the matching last-review-timestamp matrix and
# the user / item mapping dictionaries from the review DataFrame.
explicit_ratings, last_dates, user_mapping, item_mapping = get_explicit_rating(
    review_df, "user_id", "movie_id", "Rating", "timestamp"
)

# Dense preview of both matrices as the cell output.
explicit_ratings.toarray(), last_dates.toarray()

(array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[959040000,         0,         0, ...,         0,         0,
                 0],
        [        0, 962323200,         0, ...,         0,         0,
                 0],
        [        0,         0, 976924800, ...,         0,         0,
                 0],
        ...,
        [        0,         0,         0, ...,         0,         0,
                 0],
        [        0,         0,         0, ...,         0,         0,
                 0],
        [        0,         0,         0, ...,         0,         0,
                 0]]))

**Sanity check**:
* the amount of filled cells in the sparse matrices (`.nnz`) must be the same as **the number of unique pairs** of users and items
* the amount is **the same**

In [39]:
# Compare the number of stored entries in both matrices with the number of
# unique (user, item) pairs in the review DataFrame.
sanity_check_explicit_matrix(
    explicit_ratings=explicit_ratings,
    last_dates=last_dates,
    review_df=review_df,
    user_field="user_id",
    item_field="movie_id",
)

Unnamed: 0,Source,Calculated metrics,Value
0,Explicit ratings matrix,Non-zero entries,9946
1,Last dates matrix,Non-zero entries,9946
2,Filtered review DataFrame,"Unique (user_id, business_id) pairs",9946


# Train / validation / test split

Define the divisions within the initial matrix (**test / validation / train** according to the documentation of split function)

In [40]:
# Share of interactions assigned to each split, in the order expected by the split function.
DIVISIONS = [
    0.1,  # test
    0.2,  # validation
    0.7,  # train
]

Split matrix in proportions `0.1, 0.2, 0.7` for **test**, **validation** and **train** set.

In [41]:
# The parts are unpacked in the same order as DIVISIONS: test, validation, train.
test_matrix, validation_matrix, train_matrix = split_matrix_csr(
    explicit_ratings, last_dates, DIVISIONS
)

# Dense preview of the three splits (train, validation, test) as the cell output.
train_matrix.toarray(), validation_matrix.toarray(), test_matrix.toarray()

(array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

**Sanity check** (verify that the explicit matrix has been correctly split into **train, validation, and test** subsets):
* The total number of interactions (nnz) in the splits matches the original explicit matrix.
* The proportions of data in each split (Train, Validation, Test) **almost** align with the intended ratios.
* No interactions are lost during the split.

In [42]:
# Report how many interactions ended up in each split and how they compare
# with the full explicit matrix.
sanity_check_explicit_split(
    train_matrix=train_matrix,
    validation_matrix=validation_matrix,
    test_matrix=test_matrix,
    explicit_matrix=explicit_ratings,
)

Unnamed: 0,Split,Number of interactions,Part of factual interactions
0,Train,7013,70.51%
1,Validation,1981,19.92%
2,Test,952,9.57%
3,Explicit total,9946,100.0%
4,Factual total,9946,100%


# Implicit rating extraction for train dataset

**The threshold** for implicit ratings calculations (only positive ratings are considered and explicit ratings are `from 1 to 5`)

In [43]:
# Rating threshold passed to the implicit-rating extraction and its sanity check.
IMPLICIT_THRESHOLD: int = 4

Calculate the **implicit rating** in the following way:
* calculate the number of reviews from `u_i` to `b_i` whose number of stars is at or above `IMPLICIT_THRESHOLD`

Final artifact:
* dict in the format `{<user_id>: {<item_id>: <amount of positive reviews>} }`

In [44]:
# Derive the implicit ratings from the reviews, using IMPLICIT_THRESHOLD as the cut-off.
implicit_ratings = get_implicit_rating_out_of_positive_ratings_df(
    df=review_df,
    user_field='user_id',
    item_field='movie_id',
    rating_field='Rating',
    implicit_threshold=IMPLICIT_THRESHOLD,
)

# Number of users with implicit ratings vs. number of distinct users in the reviews.
len(implicit_ratings.keys()), review_df['user_id'].nunique()

(579, 597)

**Sanity check** (verify the correctness of the `implicit_ratings` matrix creation from the filtered review DataFrame using a specified `IMPLICIT_THRESHOLD`):

Metrics:
* Total number of reviews in the original dataset that meet or exceed the implicit threshold.
* Confirms that all qualifying reviews were included in the final implicit ratings' matrix.
* Number of distinct users present in the MovieLens dataset before conversion.
* Ensures that no user information was lost during transformation.
* Total movies that were reviewed in the original dataset.
* Confirms that all relevant movie interactions are retained.

Results: 
* All user and movies counts match between the initial and processed datasets.
* The number of implicit ratings is equal to the number of qualifying reviews — indicating a correct threshold-based transformation.

In [45]:
# Compare review, user and item counts between the source DataFrame and
# the extracted implicit ratings.
sanity_check_implicit_rating(
    initial_df=review_df,
    implicit_ratings=implicit_ratings,
    implicit_threshold=IMPLICIT_THRESHOLD,
    user_field='user_id',
    item_field='movie_id',
    rating_field='Rating',
)

Unnamed: 0,Metric,Value
0,Number of reviews (stars >= threshold),5667
1,Number of reviews in implicit_ratings,5667
2,Unique users in initial reviews,579
3,Unique users in implicit_ratings,579
4,Unique businesses in initial reviews,1607
5,Unique businesses in implicit_ratings,1607


# Hyperparameters tuning

Define potential values of hyperparameters for the implementation of **SVD++** 

In [46]:
# Candidate hyperparameter values explored by the SVD++ grid search.
svd_pp_param_grid = dict(
    # Number of latent factors
    n_factors=[20, 40, 70, 100, 150],
    # Number of training iterations
    n_epochs=[10, 20, 30, 50],
    # Learning rate
    lr_all=[0.002, 0.005, 0.007, 0.01],
    # Regularization strength
    reg_all=[0.02, 0.05, 0.1, 0.2],
)

Using **train and validation** dataset conduct grid search based on **RMSE** metric and extract the best hyperparameters for the target metric on the validation dataset. 

Best hyperparameters (based on validation matrix):
* **learning rate**: 0.005
* **number of epochs**: 30
* **number of hidden factors**: 20
* **regularization term**: 0.02

Best RMSE: **1.054** (on the validation set, the model's rating estimate is typically off by about 1.054 points)

In [47]:
# Configure the grid search with the train / validation matrices, the implicit
# ratings, the id mappings and the candidate hyperparameter values.
grid_search_svd_pp = GridSearchSvdPP(
    train_matrix=train_matrix,
    val_matrix=validation_matrix,
    implicit_rating=implicit_ratings,
    user_mapping=user_mapping,
    item_mapping=item_mapping,
    param_grid=svd_pp_param_grid,
)

# Run the search and keep the best parameters, their score and the fitted model.
best_params, best_score, best_svdpp_model = grid_search_svd_pp.run()

print(f"Best params: {best_params}")
print(f"Best RMSE: {best_score}")

INFO:root:Try number: 1
INFO:root:Train with params: {'lr_all': 0.002, 'n_epochs': 10, 'n_factors': 20, 'reg_all': 0.02}
INFO:root:Epoch: 1
INFO:root:Epoch: 2
INFO:root:Epoch: 3
INFO:root:Epoch: 4
INFO:root:Epoch: 5
INFO:root:Epoch: 6
INFO:root:Epoch: 7
INFO:root:Epoch: 8
INFO:root:Epoch: 9
INFO:root:Epoch: 10
INFO:root:Current common score: 1.1082709944887077
INFO:root:Try number: 2
INFO:root:Train with params: {'lr_all': 0.002, 'n_epochs': 10, 'n_factors': 20, 'reg_all': 0.05}
INFO:root:Epoch: 1
INFO:root:Epoch: 2
INFO:root:Epoch: 3
INFO:root:Epoch: 4
INFO:root:Epoch: 5
INFO:root:Epoch: 6
INFO:root:Epoch: 7
INFO:root:Epoch: 8
INFO:root:Epoch: 9
INFO:root:Epoch: 10
INFO:root:Current common score: 1.1071043865468924
INFO:root:Try number: 3
INFO:root:Train with params: {'lr_all': 0.002, 'n_epochs': 10, 'n_factors': 20, 'reg_all': 0.1}
INFO:root:Epoch: 1
INFO:root:Epoch: 2
INFO:root:Epoch: 3
INFO:root:Epoch: 4
INFO:root:Epoch: 5
INFO:root:Epoch: 6
INFO:root:Epoch: 7
INFO:root:Epoch: 8
IN

Best params: {'lr_all': 0.005, 'n_epochs': 30, 'n_factors': 20, 'reg_all': 0.02}
Best RMSE: 1.0543983585112708


# Model saving

The following code saves the result object to reuse **the trained model** in the service


In [50]:
# Serialize the best model found by the grid search so that it can be reused elsewhere.
model_path = "./models/svd_pp_movie_lens.pkl"

# Create the target directory on demand: open(..., "wb") raises FileNotFoundError
# when "./models" does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(best_svdpp_model, f)

# Model testing

In [49]:
# Evaluate the selected model on the held-out test matrix; the index-to-id
# mappings are taken from the dictionaries produced with the explicit ratings.
metrics_calculator = RmseCalculator(
    test_matrix=test_matrix,
    model=best_svdpp_model,
    idx_to_user_id=user_mapping['idx_to_id'],
    idx_to_item_id=item_mapping['idx_to_id'],
)

# RMSE on the test split, shown as the cell output.
metrics_calculator.calculate_rmse()

0.9878035344405094