In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
import sys

# Put the project's helper directories on the import path (relative to the
# notebook's working directory). Presumably this is where `metrics`,
# `visualization` and `user_knn`, imported below, live -- TODO confirm.
sys.path.extend(["../../evaluation/", "../../service/models/"])

In [5]:
import os
import pickle
import random
import warnings
import zipfile as zf
from copy import deepcopy
from itertools import product
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from IPython.display import display
from implicit.nearest_neighbours import BM25Recommender, CosineRecommender, TFIDFRecommender
from metrics import calculate_metrics
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Precision, Recall, Serendipity
from rectools.metrics.base import MetricAtK
from rectools.model_selection import Splitter, TimeRangeSplitter, cross_validate
from rectools.models import PopularModel, RandomModel
from rectools.models.base import ModelBase
from tqdm import tqdm
from user_knn import UserKnn
from visualization import visualize_metrics, visualize_training_result

ImportError: cannot import name 'cross' from 'rectools.metrics' (/home/starminalush/study/itmo/itmo-mts-recsys-2023/.venv/lib/python3.10/site-packages/rectools/metrics/__init__.py)

In [4]:
# Single seed shared by every random number source used in this notebook.
RANDOM_STATE = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here likely only affects child processes -- confirm if hash
# determinism of the current kernel matters.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_STATE}"

# Seed NumPy's legacy global generator and the stdlib generator.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [5]:
# Experiment configuration.
K_RECOS = 10  # passed as `k_recos` to calculate_metrics in the evaluation loop
N_SPLITS = 3  # number of folds given to TimeRangeSplitter
MODEL_PATH = "userknn.pkl"  # file the final fitted model is pickled to

## Инициализация датасета

In [6]:
# Dataset archive location; the URL points at a specific commit hash of the
# repository rather than a branch, so the downloaded file stays the same.
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [7]:
# Stream the archive to "kion.zip" in 1 MiB chunks so it never has to fit in
# memory. The response, the output file and the progress bar are all
# context-managed: they are released even if the download fails midway, and
# the bar is closed so it is not redrawn in the output of later cells.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on HTTP errors instead of saving an error page as "kion.zip".
    req.raise_for_status()
    # Content-Length may be absent; 0 makes tqdm show a bar without a known total.
    total_size_in_bytes = int(req.headers.get("Content-Length", 0))
    with open("kion.zip", "wb") as fd, tqdm(
        desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2**20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:  98%|█████████████▊| 77.6M/78.8M [00:01<00:00, 51.3MiB/s]

In [8]:
# Unpack the downloaded archive into the current working directory. The
# context manager closes the archive handle even if extraction raises.
# The cells below read from "data_original/", presumably the archive's
# top-level folder.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

In [None]:
# Load the raw interaction log, parsing the watch date as a datetime, and map
# the dataset's column names onto the ones rectools expects.
interactions_df = pd.read_csv(
    "data_original/interactions.csv",
    parse_dates=["last_watch_dt"],
).rename(
    columns={
        "last_watch_dt": Columns.Datetime,
        "total_dur": Columns.Weight,
    }
)

In [None]:
# Wrap the renamed frame in rectools' Interactions container, which is what
# the evaluation loop below passes to calculate_metrics.
interactions = Interactions(interactions_df)

In [None]:
# Sanity check: show the first rows of the frame held by the container.
interactions.df.head()

In [None]:
# Load the user and item tables from the extracted archive.
# NOTE(review): neither frame is referenced by the cells visible in this
# notebook -- confirm whether they are still needed.
users = pd.read_csv("data_original/users.csv")
items = pd.read_csv("data_original/items.csv")

## Эксперименты

1. Реализация тюнинга гиперпараметров

Какие параметры будем тюнить:
 - типы knn backbone


In [None]:
# Metric registry: the same six metrics are evaluated at cut-offs 1, 5 and 10.
# Keys follow the pattern "top@<k>_<metric name>".
metrics: dict[str, MetricAtK] = {}
for k in (1, 5, 10):
    metrics[f"top@{k}_precision"] = Precision(k=k)
    metrics[f"top@{k}_recall"] = Recall(k=k)
    metrics[f"top@{k}_ndcg"] = NDCG(k=k)
    metrics[f"top@{k}_map"] = MAP(k=k)
    metrics[f"top@{k}_serendipity"] = Serendipity(k=k)
    metrics[f"top@{k}_mean_inv_user_freq"] = MeanInvUserFreq(k=k)

Инициализируем сплиттер

In [14]:
# Time-based cross-validation: N_SPLITS folds, each with test_size="7D".
# All three filter flags are switched on.
splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

Инициализируем набор моделей с дефолтным k=20 в backbone модели UserKnn

In [15]:
# One UserKnn candidate per backbone type; each backbone is built with its
# default arguments and every UserKnn receives the same second argument (50).
models = [
    UserKnn(backbone_cls(), 50)
    for backbone_cls in (CosineRecommender, BM25Recommender, TFIDFRecommender)
]

Запускаем перебор гиперпараметров

In [None]:
# Evaluate every candidate with the shared splitter and metric set, collecting
# the per-model results into one flat list for visualize_metrics.
model_metrics = []
for model in tqdm(models, total=len(models)):
    result = calculate_metrics(
        interactions=interactions,
        metrics=metrics,
        model=model,
        splitter=splitter,
        k_recos=K_RECOS,
    )
    model_metrics += result


kion dataset download: 100%|██████████████| 78.8M/78.8M [00:20<00:00, 51.3MiB/s][A

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]


 33%|█████████████                          | 1/3 [1:01:08<2:02:17, 3668.73s/it][A

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

In [23]:
# Render the collected results; as the last expression of the cell, the
# return value of visualize_metrics is what the notebook displays.
visualize_metrics(model_metrics)

Unnamed: 0_level_0,top@1,top@1,top@1,top@1,top@1,top@1,top@10,top@10,top@10,top@10,top@10,top@10,top@5,top@5,top@5,top@5,top@5,top@5,train time (sec)
Unnamed: 0_level_1,map,mean_inv_user_freq,ndcg,precision,recall,serendipity,map,mean_inv_user_freq,ndcg,precision,recall,serendipity,map,mean_inv_user_freq,ndcg,precision,recall,serendipity,Unnamed: 19_level_1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
User KNN with CosineRecommender,0.00076,0.000291,0.002603,0.006465,0.004279,0.020965,0.00076,0.002137,0.003467,0.000291,0.001994,0.003905,9.655836,8.247415,7.540823,4.7e-05,5.9e-05,6e-05,1191.553616
User KNN with BM25Recommender,0.000577,0.000234,0.001951,0.00446,0.00305,0.013828,0.000577,0.001626,0.002502,0.000234,0.00146,0.002699,11.016226,9.899981,9.283169,4.3e-05,8.8e-05,9.9e-05,1192.426256
User KNN with TFIDFRecommender,0.000281,8.7e-05,0.00477,0.012715,0.006529,0.033253,0.000281,0.003668,0.005292,8.7e-05,0.003548,0.006334,10.007046,8.415495,7.638598,2e-05,5.9e-05,6.5e-05,1193.784267


Вывод: судя по экспериментам, качество по MAP@10 у всех трёх моделей получается очень низким, и найти ошибку мне не удалось. Поэтому нет существенной разницы, какую модель обучать для прода; возьмём TFIDFRecommender в качестве backbone — просто потому, что TFIDF нравится больше.

In [None]:
# Train the model chosen for production on the full interaction log.
# Note that fit() is given the raw DataFrame here, not the Interactions wrapper.
model = UserKnn(TFIDFRecommender(), 50)
# Binding the return value keeps the cell from echoing it as its output.
result = model.fit(interactions_df)

  0%|          | 0/962179 [00:00<?, ?it/s]

Сохраним модель в файл для дальнейшего использования

In [None]:
# Serialize the fitted model to MODEL_PATH. The context manager guarantees the
# file is flushed and closed, so the pickle on disk is complete before any
# other process reads it.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(model, model_file)