In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import sys

# Extend the module search path with the project's sibling directories so the
# local helper modules imported in the next cell can be resolved. The entries
# are relative, i.e. they are interpreted against the current working directory.
sys.path.extend(['../../evaluation/', '../../models'])

In [3]:
import os
import random
import warnings
import zipfile as zf
from copy import deepcopy
from pprint import pprint

from itertools import product
import numpy as np
import pandas as pd
import requests
from IPython.display import display
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Precision, Recall, Serendipity
from rectools.metrics.base import MetricAtK
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models import PopularModel, RandomModel
from rectools.models.base import ModelBase
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from tqdm import tqdm

from metrics import calculate_metrics
from user_knn import UserKnn
from visualization import visualize_metrics, visualize_training_result

In [4]:
# Single seed shared by every source of randomness seeded in this notebook.
RANDOM_STATE = 42

# Seed the global generators of the stdlib `random` module and of NumPy.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# NOTE(review): hash randomization of the running interpreter is presumably
# fixed at start-up, so this assignment is only visible to code that reads the
# variable afterwards (e.g. child processes) -- confirm this is the intent.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_STATE}"

In [5]:
# K_RECOS is forwarded to the evaluation helper as `k_recos`;
# N_SPLITS is the number of folds requested from the time-range splitter.
K_RECOS, N_SPLITS = 10, 3

## Инициализация датасета

In [6]:
# Location of the KION dataset archive; the URL embeds a commit hash, so it
# always points at the same revision of the file.
url = (
    "https://github.com/irsafilo/KION_DATASET/raw/"
    "f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"
)

In [7]:
# Fetch the KION archive once. The download is skipped when either the archive
# or the already-extracted "data_original" directory is present, so re-running
# the notebook stays offline while a fresh checkout can still Run All.
if not (os.path.exists("kion.zip") or os.path.isdir("data_original")):
    req = requests.get(url, stream=True, timeout=60)
    # Fail loudly on an HTTP error instead of saving the error page as a zip.
    req.raise_for_status()

    total_size_in_bytes = int(req.headers.get("Content-Length", 0))
    # Stream into a temporary name and rename at the end, so an interrupted
    # download never leaves a truncated "kion.zip" that later runs would trust.
    with open("kion.zip.part", "wb") as fd, tqdm(
        desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2**20):
            progress_bar.update(len(chunk))
            fd.write(chunk)
    os.replace("kion.zip.part", "kion.zip")

In [8]:
# Unpack the archive once; skipped when the data directory already exists or
# when there is no archive to unpack.
# NOTE(review): assumes the archive unpacks into ./data_original (the path the
# loading cell reads from) -- confirm against the archive layout.
if os.path.exists("kion.zip") and not os.path.isdir("data_original"):
    # The context manager closes the archive even if extraction fails.
    with zf.ZipFile("kion.zip", "r") as files:
        files.extractall()

In [9]:
# Load the raw interaction log (parsing the watch date) and rename the date and
# duration columns to the names exposed by rectools' `Columns`.
interactions = pd.read_csv(
    "data_original/interactions.csv",
    parse_dates=["last_watch_dt"],
).rename(
    columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight},
)

In [10]:
# Wrap the frame in rectools' Interactions container. The name `interactions`
# is rebound from a DataFrame to an Interactions object here, so without the
# guard a second run of this cell would pass the wrapper into its own
# constructor; the isinstance check keeps the cell safe to re-run.
if not isinstance(interactions, Interactions):
    interactions = Interactions(interactions)

In [11]:
# Show the first five rows of the wrapped frame as a quick sanity check.
interactions.df.head(n=5)

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [12]:
# Load the user and item tables that ship alongside the interaction log.
users, items = map(
    pd.read_csv,
    ("data_original/users.csv", "data_original/items.csv"),
)

## Эксперименты

1. Реализация тюнинга гиперпараметров

Какие параметры будем тюнить:
 - k соседей для knn backbone класса UserKnn
 - типы knn backbone


In [13]:
# Metric classes keyed by the suffix used in the metric names below.
metric_classes = {
    "precision": Precision,
    "recall": Recall,
    "ndcg": NDCG,
    "map": MAP,
    "serendipity": Serendipity,
    "mean_inv_user_freq": MeanInvUserFreq,
}

# Instantiate every metric at each cut-off; keys look like "top@5_recall".
metrics: dict[str, MetricAtK] = {}
for k in (1, 5, 10):
    for suffix, metric_cls in metric_classes.items():
        metrics[f"top@{k}_{suffix}"] = metric_cls(k=k)

Инициализируем сплиттер

In [14]:
# Time-based cross-validation: N_SPLITS folds with a "7D" test window each,
# with all three filtering options of the splitter switched on.
splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

Инициализируем набор моделей с разными типами knn backbone (второй аргумент UserKnn пока зафиксирован и равен 50)

In [21]:
# One UserKnn wrapper per implicit backbone. Every wrapper receives the same
# second positional argument (50).
# NOTE(review): presumably this is the neighbourhood size used by UserKnn --
# confirm against the UserKnn signature.
knn_backbones = (CosineRecommender, BM25Recommender, TFIDFRecommender)
models = [UserKnn(backbone_cls(), 50) for backbone_cls in knn_backbones]

Запускаем перебор гиперпараметров

In [None]:
# Evaluate every candidate model with the shared splitter and metric set,
# accumulating whatever calculate_metrics returns into one flat list.
model_metrics = []
for model in tqdm(models, total=len(models)):
    result = calculate_metrics(
        interactions=interactions,
        metrics=metrics,
        model=model,
        splitter=splitter,
        k_recos=K_RECOS,
    )
    # In-place concatenation: appends each element of `result` to the list.
    model_metrics += result

  0%|                                                                                                                                                                                                    | 0/3 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

In [None]:
# Render the collected evaluation results with the project's visualization helper.
visualize_metrics(model_metrics)