In [20]:
import time
from copy import deepcopy

import numpy as np
import pandas as pd
import rectools
from rectools import Columns
from rectools.dataset import Dataset
from rectools.dataset import Interactions
from rectools.metrics import (
    Precision,
    Recall,
    MAP,
    NDCG,
    calc_metrics,
    Serendipity,
    MeanInvUserFreq
)
from rectools.model_selection import TimeRangeSplitter
from rectools.models import RandomModel, PopularModel
from tqdm import tqdm

In [21]:
df_interactions = pd.read_csv('../data/interactions.csv')
df_interactions.rename(
    columns={
        "last_watch_dt": rectools.Columns.Datetime,
        "total_dur": rectools.Columns.Weight},
    inplace=True
)

interactions = Interactions(df_interactions)

items = pd.read_csv('../data/items.csv')

In [22]:
class RecoEvaluation:
    def __init__(self, models, metrics, splitter, k):
        self.models = models
        self.metrics = metrics
        self.splitter = splitter
        self.k = k

    def evaluate(self, interactions):
        n_splits = self.splitter.n_splits
        results = []

        fold_iterator = self.splitter.split(interactions, collect_fold_stats=True)

        for train_ids, test_ids, fold_info in tqdm(fold_iterator, total=n_splits):
            print(f"Fold {fold_info['i_split']}")
            print(fold_info)

            df_train = interactions.df.iloc[train_ids]
            dataset = Dataset.construct(df_train)

            df_test = interactions.df.iloc[test_ids][Columns.UserItem]
            test_users = np.unique(df_test[Columns.User])

            catalog = df_train[Columns.Item].unique()

            for model_name, model in self.models.items():
                model_copy = deepcopy(model)  # Ensure a deep copy of the model
                start = time.time()
                model_copy.fit(dataset)
                end = time.time()
                recos = model_copy.recommend(
                    users=test_users,
                    dataset=dataset,
                    k=self.k,
                    filter_viewed=True,
                )

                metric_values = calc_metrics(
                    self.metrics,
                    reco=recos,
                    interactions=df_test,
                    prev_interactions=df_train,
                    catalog=catalog,
                )

                res = {"fold": fold_info["i_split"], "model": model_name, "time": end - start}
                res.update(metric_values)
                results.append(res)

        return pd.DataFrame(results)  #.groupby("model").mean()

In [23]:
SEED = 42
N_SPLITS = 3
K = 10

models = {
    'RandomModel': RandomModel(random_state=SEED),
    'PopularModel': PopularModel()
}

metrics = {
    'Precision@1': Precision(k=1),
    'Precision@5': Precision(k=5),
    'Precision@10': Precision(k=10),
    'Recall@1': Recall(k=1),
    'Recall@5': Recall(k=5),
    'Recall@10': Recall(k=10),
    'MAP@1': MAP(k=1, divide_by_k=False),
    'MAP@5': MAP(k=5, divide_by_k=False),
    'MAP@10': MAP(k=10, divide_by_k=False),
    'NDCG@1': NDCG(k=1, log_base=3),
    'NDCG@5': NDCG(k=5, log_base=3),
    'NDCG@10': NDCG(k=10, log_base=3),
    'MIUF@1': MeanInvUserFreq(k=1),
    'MIUF@5': MeanInvUserFreq(k=5),
    'MIUF@10': MeanInvUserFreq(k=10),
    'Serendipity@1': Serendipity(k=1),
    'Serendipity@5': Serendipity(k=5),
    'Serendipity@10': Serendipity(k=10)
}

splitter = TimeRangeSplitter(
    "7D",
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [24]:
evaluator = RecoEvaluation(models, metrics, splitter, K)
results_df = evaluator.evaluate(interactions)

  0%|          | 0/3 [00:00<?, ?it/s]

Fold 0
{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}


 33%|███▎      | 1/3 [00:15<00:31, 15.52s/it]

Fold 1
{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}


 67%|██████▋   | 2/3 [00:31<00:16, 16.03s/it]

Fold 2
{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}


100%|██████████| 3/3 [00:49<00:00, 16.60s/it]


In [25]:
results_df.groupby("model").mean()

Unnamed: 0_level_0,fold,time,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
PopularModel,1.0,1.380344,0.076432,0.04272,0.052402,0.137413,0.033903,0.173492,0.076432,0.057932,0.043084,0.04272,0.078295,0.084109,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06
RandomModel,1.0,3.6e-05,0.000169,6.9e-05,0.000176,0.000345,0.000176,0.000687,0.000169,0.000173,0.000175,6.9e-05,0.000155,0.0002,15.616147,15.611575,15.610409,4e-06,6e-06,6e-06


In [27]:
pivot_results = pd.DataFrame(results_df).groupby(["fold" ,"model"], sort=False).agg(
    ["mean"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,time,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
fold,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
0,RandomModel,3.6e-05,0.000163,5e-05,0.000163,0.0003,0.000181,0.000726,0.000163,0.000161,0.000175,5e-05,0.000124,0.000178,15.588202,15.588625,15.587408,5e-06,5e-06,6e-06
0,PopularModel,1.201503,0.084026,0.047435,0.054225,0.143095,0.035483,0.181957,0.084026,0.060611,0.045331,0.047435,0.08319,0.089605,2.401723,3.079837,3.711584,2e-06,3e-06,2e-06
1,RandomModel,3.4e-05,0.000145,6e-05,0.000185,0.000361,0.000165,0.000615,0.000145,0.000178,0.000167,6e-05,0.000158,0.000193,15.621959,15.61366,15.610649,4e-06,6e-06,5e-06
1,PopularModel,1.298657,0.074466,0.04191,0.051848,0.136661,0.033572,0.172428,0.074466,0.056829,0.042315,0.04191,0.076906,0.082607,2.373249,3.065811,3.712928,2e-06,3e-06,2e-06
2,RandomModel,3.9e-05,0.0002,9.7e-05,0.00018,0.000372,0.000183,0.000719,0.0002,0.000182,0.000183,9.7e-05,0.000182,0.000228,15.638278,15.632439,15.63317,5e-06,6e-06,7e-06
2,PopularModel,1.640873,0.070806,0.038816,0.051134,0.132482,0.032655,0.166089,0.070806,0.056357,0.041607,0.038816,0.074789,0.080114,2.356194,3.055289,3.715659,2e-06,2e-06,2e-06


In [16]:
class RecoVisualisation:
    def __init__(self, model, dataset, user_ids, item_data):
        self.model = model
        self.dataset = dataset
        self.user_ids = user_ids
        self.item_data = item_data

    def visualise(self):
        for user_id in self.user_ids:
            user_history = self.dataset.interactions.df.loc[
                self.dataset.interactions.df.user_id == user_id]
            user_recos = self.model.recommend(
                users=[user_id],
                dataset=self.dataset,
                k=K,
                filter_viewed=True
            )

            if not user_history.empty:
                user_history = user_history.merge(
                    self.item_data,
                    left_on='item_id',
                    right_on='item_id',
                ).sort_values(by='datetime').tail(10)
                print(f"\nUser ID {user_id} - Watched items:")
                display(user_history)

            if not user_recos.empty:
                user_recos = user_recos.merge(
                    self.item_data,
                    left_on='item_id',
                    right_on='item_id',
                )
                print(f"\nUser ID {user_id} - Recommended items:")
                display(user_recos)

In [17]:
user_ids = [666262, 672861, 955527]

model = RandomModel(random_state=42)
dataset = Dataset.construct(df_interactions)
model.fit(dataset)

reco_viz = RecoVisualisation(model, dataset, user_ids,
                             items[['item_id', 'title', 'genres', 'release_year']])

In [18]:
reco_viz.visualise()


User ID 666262 - Watched items:


Unnamed: 0,user_id,item_id,weight,datetime,title,genres,release_year
0,666262,93,2435.0,2021-07-21,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1959.0



User ID 666262 - Recommended items:


Unnamed: 0,user_id,item_id,score,rank,title,genres,release_year
0,666262,7419,10,1,Ода радости,комедии,2019.0
1,666262,9109,9,2,Последняя битва,"драмы, военные",2017.0
2,666262,13917,8,3,Преисподняя,"драмы, детективы, триллеры, вестерн",2016.0
3,666262,13332,7,4,Лихорадка,ужасы,2003.0
4,666262,1331,6,5,Вечность,драмы,2016.0
5,666262,15448,5,6,Леший,"триллеры, криминал, детективы",1997.0
6,666262,1123,4,7,Богема,"драмы, мюзиклы, мелодрамы",2005.0
7,666262,9933,3,8,Шедевр,"драмы, комедии",2018.0
8,666262,3287,2,9,Единоборства для детей (3-6 лет) Школа героев ...,"единоборства, фитнес, для детей",2020.0
9,666262,8478,1,10,Американский пирог: Все в сборе,комедии,2012.0



User ID 672861 - Watched items:


Unnamed: 0,user_id,item_id,weight,datetime,title,genres,release_year
0,672861,25,110883.0,2021-07-26,Медвежонок Винни и его друзья,"мюзиклы, мультфильм, приключения, комедии",2011.0
1,672861,32,12662.0,2021-08-01,В ритме сердца,"драмы, мюзиклы, мелодрамы",2011.0



User ID 672861 - Recommended items:


Unnamed: 0,user_id,item_id,score,rank,title,genres,release_year
0,672861,7419,10,1,Ода радости,комедии,2019.0
1,672861,9109,9,2,Последняя битва,"драмы, военные",2017.0
2,672861,13917,8,3,Преисподняя,"драмы, детективы, триллеры, вестерн",2016.0
3,672861,13332,7,4,Лихорадка,ужасы,2003.0
4,672861,1331,6,5,Вечность,драмы,2016.0
5,672861,15448,5,6,Леший,"триллеры, криминал, детективы",1997.0
6,672861,1123,4,7,Богема,"драмы, мюзиклы, мелодрамы",2005.0
7,672861,9933,3,8,Шедевр,"драмы, комедии",2018.0
8,672861,3287,2,9,Единоборства для детей (3-6 лет) Школа героев ...,"единоборства, фитнес, для детей",2020.0
9,672861,8478,1,10,Американский пирог: Все в сборе,комедии,2012.0



User ID 955527 - Watched items:


Unnamed: 0,user_id,item_id,weight,datetime,title,genres,release_year
0,955527,21,19820.0,2021-07-20,Признание 5,для взрослых,2014.0



User ID 955527 - Recommended items:


Unnamed: 0,user_id,item_id,score,rank,title,genres,release_year
0,955527,7419,10,1,Ода радости,комедии,2019.0
1,955527,9109,9,2,Последняя битва,"драмы, военные",2017.0
2,955527,13917,8,3,Преисподняя,"драмы, детективы, триллеры, вестерн",2016.0
3,955527,13332,7,4,Лихорадка,ужасы,2003.0
4,955527,1331,6,5,Вечность,драмы,2016.0
5,955527,15448,5,6,Леший,"триллеры, криминал, детективы",1997.0
6,955527,1123,4,7,Богема,"драмы, мюзиклы, мелодрамы",2005.0
7,955527,9933,3,8,Шедевр,"драмы, комедии",2018.0
8,955527,3287,2,9,Единоборства для детей (3-6 лет) Школа героев ...,"единоборства, фитнес, для детей",2020.0
9,955527,8478,1,10,Американский пирог: Все в сборе,комедии,2012.0
