In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell runs, so edits to the local library are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Expose the sibling `library` folder to the import system.
# The membership guard keeps the cell idempotent when it is re-run.
library_path = os.path.abspath("../library")
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

In [3]:
import logging

# Configure the root logger at INFO level so that progress messages appear in the cell output.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module (__name__) for messages emitted from this notebook.
logger = logging.getLogger(__name__)

In [4]:
import pandas as pd

# Switch off pandas' chained-assignment check so that SettingWithCopyWarning is not emitted.
pd.options.mode.chained_assignment = None

import pickle
import numpy as np

from metrics import RmseCalculator, SvdTestMetricsCalculator
from rating import get_explicit_rating, get_implicit_rating_out_of_positive_ratings_df, split_matrix_csr, \
    sanity_check_implicit_rating, sanity_check_explicit_split, sanity_check_explicit_matrix
from tuning import GridSearchSvdPP

# Feature selection

The only dataset needed for our purposes is the **review** dataset, since:
- it contains the information for explicit ratings (the mean of the field **Rating** for each pair of user and item; see the chapter **Feature engineering** for more details)
- it contains the information for implicit ratings (see the chapter **Feature engineering** for more details)
- it already contains only those users who provided at least one review and those items that received at least one rating

In [5]:
# Relative path to the MovieLens ratings sample (parquet) that is loaded into `review_df`.
PATH = '../../../eda/dataset_samples/df_movie_lens.parquet'

No sampling was applied to this dataset: the final `User` x `Item` interaction matrix has size `6040 x 3706`, which can be processed with our **compute power**.

Moreover, there are no features to drop, since all of them are necessary to create the **explicit** and **implicit** ratings.

In [6]:
# Load the MovieLens ratings sample from the parquet file referenced by PATH.
review_df = pd.read_parquet(PATH)
# Bare last expression: the notebook renders the frame as the cell output.
review_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


# Explicit rating extracting

Calculating the **explicit rating** for the MovieLens dataset. 

The output consists of two CSR matrices with identical structure: the first matrix contains **the mean review rating** given by user *u_i* to item *b_i* (here, a movie), and the second matrix stores **the timestamp of the latest review** at the same positions.

Additionally, two utility dictionaries are provided, containing mappings **between IDs and matrix indices** (and vice versa).

In [7]:
# Build the explicit-rating matrix and the matching last-timestamp matrix,
# together with the ID <-> matrix-index mappings for users and items.
explicit_ratings, last_dates, user_mapping, item_mapping = get_explicit_rating(
    review_df,
    "UserID",
    "MovieID",
    "Rating",
    "Timestamp",
)

# NOTE(review): .toarray() materialises both sparse matrices as dense arrays
# only to show a truncated preview in the cell output.
explicit_ratings.toarray(), last_dates.toarray()

(array([[5., 3., 3., ..., 0., 0., 0.],
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.]]),
 array([[978300760, 978302109, 978301968, ...,         0,         0,
                 0],
        [978298413,         0,         0, ...,         0,         0,
                 0],
        [        0,         0,         0, ...,         0,         0,
                 0],
        ...,
        [        0,         0,         0, ...,         0,         0,
                 0],
        [        0, 956706019, 956705872, ...,         0,         0,
                 0],
        [957716612,         0,         0, ...,         0,         0,
                 0]]))

**Sanity check**:
* the number of stored entries in the sparse matrices (`.nnz`) must equal **the number of unique pairs** of users and items
* result: the numbers are **the same** (see the table below)

In [8]:
# Compare the stored-entry counts of both matrices with the number of
# unique user/item pairs found in the review frame.
sanity_check_explicit_matrix(
    explicit_ratings=explicit_ratings,
    last_dates=last_dates,
    review_df=review_df,
    user_field="UserID",
    item_field="MovieID",
)

Unnamed: 0,Source,Calculated metrics,Value
0,Explicit ratings matrix,Non-zero entries,1000209
1,Last dates matrix,Non-zero entries,1000209
2,Filtered review DataFrame,"Unique (user_id, business_id) pairs",1000209


# Train / validation / test split

Define the split proportions for the initial matrix, listed in the order **test / validation / train** (the order required by the split function, according to its documentation)

In [9]:
# Split fractions, in the same order in which the split results are unpacked: test, validation, train.
DIVISIONS = [0.1, 0.2, 0.7]

Split the matrix in proportions `0.1, 0.2, 0.7` for the **test**, **validation** and **train** sets.

In [10]:
# The unpack order mirrors DIVISIONS (test, validation, train); last_dates is passed alongside the ratings.
# NOTE(review): presumably last_dates drives a time-based split — confirm in split_matrix_csr.
test_matrix, validation_matrix, train_matrix = split_matrix_csr(explicit_ratings, last_dates, DIVISIONS)
# Preview is shown in train / validation / test order.
# NOTE(review): .toarray() builds three dense arrays only for this truncated preview.
train_matrix.toarray(), validation_matrix.toarray(), test_matrix.toarray()

(array([[5., 3., 3., ..., 0., 0., 0.],
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 3., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

**Sanity check** (verify that the explicit matrix has been correctly split into **train, validation, and test** subsets):
* The total number of interactions (nnz) in the splits matches the original explicit matrix.
* The proportions of data in each split (Train, Validation, Test) **almost** align with the intended ratios.
* No interactions are lost during the split.

In [11]:
# Report how many interactions ended up in each split and compare the total
# with the full explicit matrix.
sanity_check_explicit_split(
    train_matrix=train_matrix,
    validation_matrix=validation_matrix,
    test_matrix=test_matrix,
    explicit_matrix=explicit_ratings,
)

Unnamed: 0,Split,Number of interactions,Part of factual interactions
0,Train,701042,70.09%
1,Validation,199516,19.95%
2,Test,99651,9.96%
3,Explicit total,1000209,100.0%
4,Factual total,1000209,100%


# Implicit rating extraction for train dataset

**The threshold** for the implicit rating calculation: explicit ratings range `from 1 to 5`, and only positive ratings — those at or above the threshold — are considered

In [12]:
# Minimum rating that counts as positive feedback; the sanity check for the
# implicit ratings compares against reviews with stars >= threshold, i.e. the bound is inclusive.
IMPLICIT_THRESHOLD = 4

Calculate the **implicit rating** in the following way:
* count the reviews from user `u_i` for item `b_i` whose number of stars is at or above `IMPLICIT_THRESHOLD`

Final artifact:
* dict in the format `{<user_id>: {<item_id>: <amount of positive reviews>} }`

In [13]:
# Derive the implicit ratings from the reviews that pass IMPLICIT_THRESHOLD.
implicit_ratings = get_implicit_rating_out_of_positive_ratings_df(
    df=review_df,
    user_field='UserID',
    item_field='MovieID',
    rating_field='Rating',
    implicit_threshold=IMPLICIT_THRESHOLD,
)

# Users present in the implicit ratings vs. all users in the review frame.
len(implicit_ratings.keys()), review_df['UserID'].nunique()

(6038, 6040)

**Sanity check** (verify the correctness of the `implicit_ratings` matrix creation from the filtered review DataFrame using a specified `IMPLICIT_THRESHOLD`):

Metrics:
* Total number of reviews in the original dataset that meet or exceed the implicit threshold.
* Confirms that all qualifying reviews were included in the final implicit ratings.
* Number of distinct users with at least one qualifying review (6038 of the 6040 users in the dataset; the remaining two have no rating at or above the threshold).
* Ensures that no such user was lost during the transformation.
* Number of distinct movies with at least one qualifying review.
* Confirms that all relevant movie interactions are retained.

Results: 
* The user and movie counts match between the qualifying reviews and the implicit ratings.
* The number of implicit ratings is equal to the number of qualifying reviews — indicating a correct threshold-based transformation.

In [14]:
# Cross-check review, user and item counts between the review frame and the implicit ratings.
sanity_check_implicit_rating(
    initial_df=review_df,
    implicit_ratings=implicit_ratings,
    implicit_threshold=IMPLICIT_THRESHOLD,
    user_field='UserID',
    item_field='MovieID',
    rating_field='Rating',
)

Unnamed: 0,Metric,Value
0,Number of reviews (stars >= threshold),575281
1,Number of reviews in implicit_ratings,575281
2,Unique users in initial reviews,6038
3,Unique users in implicit_ratings,6038
4,Unique businesses in initial reviews,3533
5,Unique businesses in implicit_ratings,3533


# Hyperparameters tuning

Define potential values of hyperparameters for the implementation of **SVD++** 

In [15]:
# Candidate hyperparameter values explored by the SVD++ grid search.
svd_pp_param_grid = dict(
    n_factors=[20, 40, 70, 100, 150],    # number of latent factors
    n_epochs=[10, 20, 30, 50],           # number of training iterations
    lr_all=[0.002, 0.005, 0.007, 0.01],  # learning rate
    reg_all=[0.02, 0.05, 0.1, 0.2],      # regularization strength
)

Using the **train and validation** datasets, run a grid search based on the **RMSE** metric and select the hyperparameters that give the best value of this metric on the validation dataset.

Best hyperparameters (based on validation matrix):
* **learning rate**: _
* **number of epochs**: _
* **number of hidden factors**: _
* **regularization term**: _

Best RMSE: **1.111** (on average, the model's rating estimate is off by roughly 1.111 points)

In [16]:
# Set up the SVD++ grid search with the train and validation matrices, the implicit
# ratings, the ID/index mappings and the candidate values from svd_pp_param_grid.
# NOTE(review): the grid holds 5 * 4 * 4 * 4 = 320 combinations; presumably each one is
# trained in full (confirm in GridSearchSvdPP), which makes this cell very long-running.
grid_search_svd_pp = GridSearchSvdPP(train_matrix=train_matrix, val_matrix=validation_matrix,
                                     implicit_rating=implicit_ratings, user_mapping=user_mapping,
                                     item_mapping=item_mapping, param_grid=svd_pp_param_grid)

# run() yields three values, unpacked here as the best parameters, the best score and the best model;
# best_svdpp_model is reused by the testing and saving cells.
best_params, best_score, best_svdpp_model = grid_search_svd_pp.run()

print(f"Best params: {best_params}")
print(f"Best RMSE: {best_score}")

INFO:root:Try number: 1
INFO:root:Train with params: {'lr_all': 0.002, 'n_epochs': 10, 'n_factors': 20, 'reg_all': 0.02}
INFO:root:Epoch: 1


KeyboardInterrupt: 

# Model testing

In [None]:
# Evaluate the model selected by the grid search on the held-out test matrix.
# The 'idx_to_id' mappings are passed so that matrix indices can be related to the
# original user / item IDs (presumably inside the calculator — confirm in RmseCalculator).
metrics_calculator = RmseCalculator(test_matrix=test_matrix, model=best_svdpp_model,
                                    idx_to_user_id=user_mapping['idx_to_id'],
                                    idx_to_item_id=item_mapping['idx_to_id'])

# Bare last expression: the returned value is shown as the cell output.
metrics_calculator.calculate_rmse()

# Model saving

The following code saves the result object to reuse **the trained model** in the service


In [None]:
# Persist the model selected by the grid search so that it can be reused without retraining.
MODEL_PATH = "./models/svd_pp_movie_lens.pkl"

# open(..., "wb") does not create missing parent folders and would raise
# FileNotFoundError on a fresh checkout, so make sure the target directory exists first.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

with open(MODEL_PATH, "wb") as f:
    pickle.dump(best_svdpp_model, f)