## Импорты

In [90]:
import os
import random
import zipfile as zf
from copy import deepcopy
from pprint import pprint
from time import time
from typing import Any

import numpy as np
import pandas as pd
import requests
from IPython.display import display
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Precision, Recall, Serendipity, calc_metrics
from rectools.metrics.base import MetricAtK
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models import PopularModel, RandomModel
from rectools.models.base import ModelBase
from tqdm import tqdm

In [8]:
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [9]:
K_RECOS = 10
N_SPLITS = 3

## Инициализация датасета

In [10]:
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [11]:
req = requests.get(url, stream=True)

with open("kion.zip", "wb") as fd:
    total_size_in_bytes = int(req.headers.get("Content-Length", 0))
    progress_bar = tqdm(desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True)
    for chunk in req.iter_content(chunk_size=2**20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78.8M/78.8M [02:01<00:00, 747kiB/s]

In [12]:
files = zf.ZipFile("kion.zip", "r")
files.extractall()
files.close()

Немного предобработаем датасет

In [13]:
interactions = pd.read_csv("data_original/interactions.csv", parse_dates=["last_watch_dt"])

interactions.rename(columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight}, inplace=True)

In [15]:
interactions = Interactions(interactions)

In [16]:
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [17]:
users = pd.read_csv("data_original/users.csv")
items = pd.read_csv("data_original/items.csv")

## Подсчет метрик

Инициализируем модели

In [36]:
models: dict[str, ModelBase] = {"random": RandomModel(random_state=RANDOM_STATE), "popular": PopularModel()}

Инициализиуем метрики

In [37]:
metrics: dict[str, MetricAtK] = {}
for k in [1, 5, 10]:
    metrics.update(
        {
            f"top@{k}_precision": Precision(k=k),
            f"top@{k}_recall": Recall(k=k),
            f"top@{k}_ndcg": NDCG(k=k),
            f"top@{k}_map": MAP(k=k),
            f"top@{k}_serendipity": Serendipity(k=k),
            f"top@{k}_mean_inv_user_freq": MeanInvUserFreq(k=k),
        }
    )

Инициализиурем splitter

In [38]:
splitter: Splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [43]:
def calculate_metrics(
    metrics: dict[str, MetricAtK], models: dict[str, ModelBase], splitter: Splitter, k_recos: int
) -> dict[str, Any]:
    results = []
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=N_SPLITS):
        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            init_model = deepcopy(model)
            start_train_time = time()
            init_model.fit(dataset)
            train_time = time() - start_train_time
            recos = init_model.recommend(
                users=test_users,
                dataset=dataset,
                k=k_recos,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            res = {"fold": fold_info["i_split"], "model": model_name, "train_time (sec)": train_time}
            res.update(metric_values)
            results.append(res)
    return results

In [44]:
report = calculate_metrics(metrics, models, splitter, K_RECOS)


  0%|                                                                                                                                                                                                     | 0/3 [00:00<?, ?it/s][A
 33%|███████████████████████████████████████████████████████████████                                                                                                                              | 1/3 [00:24<00:49, 24.85s/it][A
 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                               | 2/3 [00:51<00:25, 25.86s/it][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:18<00:00, 26.23s/it][A


In [48]:
pivot_results = pd.DataFrame(report).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
mean_metric_subset = [
    (metric, agg) for metric, agg in pivot_results.columns if metric != "train_time" and agg == "mean"
]
(
    pivot_results.style.highlight_min(subset=mean_metric_subset, color="lightcoral", axis=0).highlight_max(
        subset=mean_metric_subset, color="lightgreen", axis=0
    )
)

Unnamed: 0_level_0,train_time (sec),top@1_precision,top@1_recall,top@5_precision,top@5_recall,top@10_precision,top@10_recall,top@1_ndcg,top@5_ndcg,top@10_ndcg,top@1_map,top@5_map,top@10_map,top@1_mean_inv_user_freq,top@5_mean_inv_user_freq,top@10_mean_inv_user_freq,top@1_serendipity,top@5_serendipity,top@10_serendipity
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
random,2.8e-05,0.000169,6.9e-05,0.000176,0.000345,0.000176,0.000687,0.000169,0.000173,0.000175,6.9e-05,0.000155,0.0002,15.616147,15.611575,15.610409,4e-06,6e-06,6e-06
popular,1.772211,0.076432,0.04272,0.052402,0.137413,0.033903,0.173492,0.076432,0.057932,0.043084,0.04272,0.078295,0.084109,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06


## Визуализация результатов обучения

In [115]:
def visualize(model: ModelBase, dataset, user_ids: list[int], item_data: dict, k_recos: int):
    recos = model.recommend(users=user_ids, k=k_recos, dataset=dataset, filter_viewed=True)
    recos["type"] = "reco"
    recos.drop("score", axis=1, inplace=True)
    history = dataset.interactions.df[dataset.interactions.df["user_id"].isin(user_ids)].sort_values(
        ["user_id", "datetime"]
    )
    history["rank"] = history.sort_values("datetime").groupby(["user_id"]).datetime.rank()
    history["type"] = "history"
    history.drop(["datetime", "weight"], axis=1, inplace=True)

    report = pd.concat([recos, history])

    count_views = interactions.df.groupby("item_id").count()["user_id"]
    report = report.merge(item_data, how="inner", on="item_id")
    count_views.name = "count"
    report = report.merge(count_views, how="inner", on="item_id")

    report.sort_values(["user_id", "type"], inplace=True)
    report.set_index(["user_id", "item_id"], inplace=True)

    return report

In [116]:
user_ids = [666262, 672861, 955527]
dataset = Dataset.construct(interactions.df)
item_data = items[["item_id", "title", "genres"]]

In [117]:
reports = {}
for model_name, model in models.items():
    init_model = deepcopy(model)
    init_model.fit(dataset)
    reports[model_name] = visualize(init_model, dataset, user_ids, item_data, K_RECOS)

In [118]:
for model_name, report in reports.items():
    pprint(model_name)
    display(report)

'random'


Unnamed: 0_level_0,Unnamed: 1_level_0,rank,type,title,genres,count
user_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
666262,93,1.0,history,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1
666262,7419,1.0,reco,Ода радости,комедии,6
666262,9109,2.0,reco,Последняя битва,"драмы, военные",239
666262,13917,3.0,reco,Преисподняя,"драмы, детективы, триллеры, вестерн",877
666262,13332,4.0,reco,Лихорадка,ужасы,7
666262,1331,5.0,reco,Вечность,драмы,190
666262,15448,6.0,reco,Леший,"триллеры, криминал, детективы",166
666262,1123,7.0,reco,Богема,"драмы, мюзиклы, мелодрамы",136
666262,9933,8.0,reco,Шедевр,"драмы, комедии",323
666262,3287,9.0,reco,Единоборства для детей (3-6 лет) Школа героев ...,"единоборства, фитнес, для детей",6


'popular'


Unnamed: 0_level_0,Unnamed: 1_level_0,rank,type,title,genres,count
user_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
666262,93,1.0,history,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1
666262,10440,1.0,reco,Хрустальный,"триллеры, детективы",202457
666262,15297,2.0,reco,Клиника счастья,"драмы, мелодрамы",193123
666262,9728,3.0,reco,Гнев человеческий,"боевики, триллеры",132865
666262,13865,4.0,reco,Девятаев,"драмы, военные, приключения",122119
666262,4151,5.0,reco,Секреты семейной жизни,комедии,91167
666262,3734,6.0,reco,Прабабушка легкого поведения,комедии,74803
666262,2657,7.0,reco,Подслушано,"драмы, триллеры",68581
666262,4880,8.0,reco,Афера,комедии,55043
666262,142,9.0,reco,Маша,"драмы, триллеры",45367
