In [1]:
import sys

sys.path.append("../../service/models/")
sys.path.append("../../service/utils/")

In [54]:
import warnings

warnings.filterwarnings("ignore")

In [55]:
import cupy

cupy.is_available()

True

In [56]:
import os
import pickle
import random
import time
import zipfile as zf
from copy import deepcopy
from functools import partial
from typing import Any, Dict, Sequence

import implicit
import ngtpy
import numpy as np
import optuna
import pandas as pd
import requests
from implicit.gpu.als import AlternatingLeastSquares
from lightfm import LightFM
from ngt_recommender import UserToItemNGTRecommender
from optuna.samplers import TPESampler
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, calc_metrics
from rectools.metrics.base import MetricAtK
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.tools import UserToItemAnnRecommender
from tqdm import tqdm
from unpickler import load

In [57]:
# Restrict OpenBLAS to a single thread for implicit ALS.
# NOTE(review): numpy and implicit are already imported by the previous cell; the variable
# may only be read when the BLAS library is loaded — confirm it still takes effect here.
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [58]:
# Seed the random sources used in the notebook so that runs are repeatable.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up; setting it here
# presumably does not change hashing in the running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [59]:
LIGHTFM_MODEL_PATH = "lightfm.pkl"
ALS_MODEL_PATH = "als.pkl"
ANN_MODEL_PATH = "ann.pkl"
K_RECOS = 10
N_EPOCHS = 1  # Lightfm

## Инициализация датасета

In [60]:
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [8]:
# Stream the dataset archive to "kion.zip" in 1 MiB chunks with a progress bar.
# The response and the progress bar are context-managed so the connection and the
# bar are released even if the transfer fails midway.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on an HTTP error instead of saving an error page as "kion.zip".
    req.raise_for_status()
    total_size_in_bytes = int(req.headers.get("Content-Length", 0))
    with open("kion.zip", "wb") as fd, tqdm(
        desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2**20):
            progress_bar.update(len(chunk))
            fd.write(chunk)

kion dataset download:  61%|██████    | 48.2M/78.8M [00:00<00:00, 120MiB/s] 

In [9]:
# Unpack the downloaded archive into the current working directory; the context
# manager closes the archive even if extraction raises.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

In [61]:
interactions_df = pd.read_csv("data_original/interactions.csv")

In [62]:
users = pd.read_csv("data_original/users.csv")
items = pd.read_csv("data_original/items.csv")

В семинаре по теме было EDA и были графики с выбросами. Берем препроцессинг данных из лекции

In [63]:
# Use the dataset's own "last_watch_dt" column as the datetime column name.
# NOTE(review): this overrides an attribute on the imported Columns class for the whole
# kernel — confirm that the downstream rectools calls rely on it.
Columns.Datetime = "last_watch_dt"
# Drop (in place) the rows whose date string is not exactly 10 characters long.
interactions_df.drop(interactions_df[interactions_df[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions_df[Columns.Datetime] = pd.to_datetime(interactions_df[Columns.Datetime], format="%Y-%m-%d")
# The same value is recomputed at the top of the next cell, before the train/test split.
max_date = interactions_df[Columns.Datetime].max()
# Interaction weight: 3 when watched_pct > 10, otherwise 1.
interactions_df[Columns.Weight] = np.where(interactions_df["watched_pct"] > 10, 3, 1)

In [64]:
# Time-based split: the last 7 days (inclusive of the boundary date) form the test
# set, everything strictly earlier forms the training set.
watch_dates = interactions_df[Columns.Datetime]
max_date = watch_dates.max()
split_date = max_date - pd.Timedelta(days=7)
train = interactions_df[watch_dates < split_date].copy()
test = interactions_df[watch_dates >= split_date].copy()

for split_name, split_frame in (("train", train), ("test", test)):
    print(f"{split_name}: {split_frame.shape}")

train: (4985269, 6)
test: (490982, 6)


In [65]:
# Remove (in place) the training interactions whose total_dur is below 300.
train.drop(train.query("total_dur < 300").index, inplace=True)

In [66]:
# Users that occur in the test split but never in the training split are "cold":
# remove their rows from the test split in place.
train_user_ids = set(train[Columns.User])
cold_users = set(test[Columns.User]) - train_user_ids
is_cold = test[Columns.User].isin(cold_users)
test.drop(index=test.loc[is_cold].index, inplace=True)

В том же семинаре был раздел про фичи юзеров и айтемов. Напишем функции, которые будут доставать из датасета фичи пользователей и айтемов

In [67]:
def get_users_features(users: pd.DataFrame, interactions: pd.DataFrame, features: Sequence[str]) -> pd.DataFrame:
    """Build a long-format ``(id, value, feature)`` table of user features.

    Missing values are replaced with the string "Unknown" and only the users whose
    ``Columns.User`` id occurs in ``interactions`` are kept. The frame passed as
    ``users`` is never modified.

    Args:
        users: Users table; must contain the ``Columns.User`` column.
        interactions: Interactions table whose ``Columns.User`` values select the users to keep.
        features: Names of the ``users`` columns to export. A name that is not a
            column of ``users`` produces NaN values (``reindex`` semantics).

    Returns:
        A frame with the columns ``id``, ``value`` and ``feature`` holding one row per
        (kept user, feature) pair; an empty frame with those columns when ``features`` is empty.
    """
    # fillna without inplace returns a new frame, so the caller's data stays intact.
    filled_users = users.fillna("Unknown")
    known_users = filled_users.loc[filled_users[Columns.User].isin(interactions[Columns.User])]
    user_features_frames = []
    for feature in features:
        feature_frame = known_users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    if not user_features_frames:
        # pd.concat raises on an empty list; return an empty table of the same layout instead.
        return pd.DataFrame(columns=["id", "value", "feature"])
    return pd.concat(user_features_frames)

In [68]:
features = ["sex", "age", "income"]
user_features = get_users_features(users=users, interactions=train, features=features)

In [69]:
def get_items_features(items: pd.DataFrame, interactions: pd.DataFrame, features: Sequence[str]) -> pd.DataFrame:
    """Build a long-format ``(id, value, feature)`` table of item features.

    Only the items whose ``Columns.Item`` id occurs in ``interactions`` are kept. A
    derived ``genre`` column (lower-cased, comma-split ``genres``) is available as a
    feature name; list-valued features are exploded into one row per value. The
    ``content_type`` column is always exported, exactly once, after the requested
    features. The frame passed as ``items`` is not modified (a filtered copy is used).

    Args:
        items: Items table; must contain ``Columns.Item``, ``genres`` and ``content_type``.
        interactions: Interactions table whose ``Columns.Item`` values select the items to keep.
        features: Column names to export (``"genre"`` refers to the derived column).

    Returns:
        A frame with the columns ``id``, ``value`` and ``feature``.
    """
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    item_features_frames = []
    for feature in features:
        feature_frame = items[[Columns.Item, feature]].explode(feature)
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)
    # Built once, outside the loop: appending it per feature would duplicate the
    # content_type rows whenever more than one feature is requested.
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features_frames.append(content_feature)
    return pd.concat(item_features_frames)

In [70]:
features = ["genre"]
item_features = get_items_features(items=items, interactions=train, features=features)

## Эксперименты

* Реализовать тюнинг гиперпараметров для моделей из implicit, lightfm или rectools

Подготовим датасет, инициализируем метрики

In [82]:
# Build the rectools dataset from the training interactions, declaring the user and
# item feature names produced by the helper functions as categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [83]:
TEST_USERS = test[Columns.User].unique()

In [84]:
metrics: Dict[str, MetricAtK] = {"top@10_map": MAP(k=10)}

Для подбора гиперпараметров будем использовать optuna

In [74]:
def train_model(
    metrics: Dict[str, MetricAtK], model: Any, dataset: Dataset, train: pd.DataFrame, test: pd.DataFrame
) -> float:
    """Fit ``model`` on ``dataset`` and return its "top@10_map" score on ``test``.

    Args:
        metrics: Metrics passed to ``calc_metrics``; must contain the "top@10_map" key.
        model: Object exposing ``fit(dataset)`` and
            ``recommend(users, dataset, k, filter_viewed)``.
        dataset: Dataset used both for fitting and for recommending.
        train: Training interactions, forwarded to ``calc_metrics``.
        test: Held-out interactions; its ``Columns.User`` values define who gets recommendations.

    Returns:
        The "top@10_map" value computed by ``calc_metrics``.
    """
    model.fit(dataset)
    # Take the users from the ``test`` frame that was actually passed in instead of the
    # notebook-level TEST_USERS global, so the function does not depend on which cells
    # happened to run before it.
    test_users = test[Columns.User].unique()
    recos = model.recommend(users=test_users, dataset=dataset, k=K_RECOS, filter_viewed=True)
    metrics_result = calc_metrics(metrics, recos, test, train)
    return metrics_result["top@10_map"]

In [75]:
def objective_als(trial, dataset: Dataset, train: pd.DataFrame, test: pd.DataFrame, metrics: dict[str, MetricAtK]):
    """Optuna objective for ALS: sample hyperparameters, train, return the metric.

    The trial chooses the number of factors, whether features are fitted together
    with the interactions, the regularization strength and the iteration count.
    The value returned by ``train_model`` is what the study optimises.
    """
    # The suggest_* calls are kept in their original order (n_factors,
    # is_fit_features_together, regularization, iterations).
    factors = trial.suggest_categorical("n_factors", [8, 16, 24])
    fit_together = trial.suggest_categorical("is_fit_features_together", [True, False])
    als_kwargs = {
        "factors": factors,
        "regularization": trial.suggest_float("regularization", 0.01, 0.05),
        "iterations": trial.suggest_int("iterations", 10, 20),
        "random_state": RANDOM_STATE,
    }

    base_model = AlternatingLeastSquares(**als_kwargs)
    wrapper = ImplicitALSWrapperModel(base_model, fit_features_together=fit_together)

    return train_model(metrics=deepcopy(metrics), model=deepcopy(wrapper), dataset=dataset, train=train, test=test)

In [76]:
%%time
# Tune the ALS hyperparameters with a seeded TPE sampler, maximising the value
# returned by objective_als.
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(study_name="als", direction="maximize", sampler=sampler)
study.optimize(
    partial(objective_als, dataset=dataset, train=train, test=test, metrics=deepcopy(metrics)),
    n_trials=15,  # more trials seem unnecessary
)

# Keep the best trial and its parameters; the parameters are reused for the final fit.
best_trial_als = study.best_trial
best_params_als = study.best_params

[I 2023-12-10 13:24:55,020] A new study created in memory with name: als
[I 2023-12-10 13:25:48,068] Trial 0 finished with value: 0.06754806792357114 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.016239780813448106, 'iterations': 10}. Best is trial 0 with value: 0.06754806792357114.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-10 13:26:28,982] Trial 1 finished with value: 0.06897613643803299 and parameters: {'n_factors': 8, 'is_fit_features_together': False, 'regularization': 0.043297705632016875, 'iterations': 12}. Best is trial 1 with value: 0.06897613643803299.
[I 2023-12-10 13:27:36,689] Trial 2 finished with value: 0.07426023870452046 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.021649165607921676, 'iterations': 16}. Best is trial 2 with value: 0.07426023870452046.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-10 13:28:20,408] Trial 3 finished with value: 0.06338409478998744 and parameters: {'n_factors': 24, 'is_fit_features_together': False, 'regularization': 0.017986951286334388, 'iterations': 15}. Best is trial 2 with value: 0.07426023870452046.
[I 2023-12-10 13:29:37,457] Trial 4 finished with value: 0.06729626498848423 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.047955421490133335, 'iterations': 20}. Best is trial 2 with value: 0.07426023870452046.
[I 2023-12-10 13:30:37,111] Trial 5 finished with value: 0.07396872454521762 and parameters: {'n_factors': 8, 'is_fit_features_together': True, 'regularization': 0.014881529393791153, 'iterations': 15}. Best is trial 2 with value: 0.07426023870452046.
[I 2023-12-10 13:31:42,851] Trial 6 finished with value: 0.07172145298385821 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.030802720847112434, 'iterations': 16}. Best is trial 2 with value: 0.0742602

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-10 13:33:42,774] Trial 8 finished with value: 0.06269827513375514 and parameters: {'n_factors': 16, 'is_fit_features_together': False, 'regularization': 0.02085396127095584, 'iterations': 19}. Best is trial 2 with value: 0.07426023870452046.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-10 13:34:29,142] Trial 9 finished with value: 0.06341894245400542 and parameters: {'n_factors': 24, 'is_fit_features_together': False, 'regularization': 0.012982025747190833, 'iterations': 20}. Best is trial 2 with value: 0.07426023870452046.
[I 2023-12-10 13:35:39,200] Trial 10 finished with value: 0.07389551172724378 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.024749150908843894, 'iterations': 17}. Best is trial 2 with value: 0.07426023870452046.
[I 2023-12-10 13:36:36,890] Trial 11 finished with value: 0.07197575445837431 and parameters: {'n_factors': 8, 'is_fit_features_together': True, 'regularization': 0.01062379198633824, 'iterations': 14}. Best is trial 2 with value: 0.07426023870452046.
[I 2023-12-10 13:37:33,041] Trial 12 finished with value: 0.07376788785496687 and parameters: {'n_factors': 8, 'is_fit_features_together': True, 'regularization': 0.022710966805040204, 'iterations': 13}. Best is trial 2 with value: 0.074260

CPU times: user 38min 7s, sys: 25min 25s, total: 1h 3min 32s
Wall time: 14min 56s


За счет перебора параметров удалось получить модель с MAP@10 ≈ 0.0743 (лучший trial — 0.07426)

Подберем гиперпараметры еще для модели LightFM

In [85]:
def objective_lightfm(
    trial, dataset: Dataset, train: pd.DataFrame, test: pd.DataFrame, metrics: dict[str, MetricAtK]
) -> float:
    """Optuna objective for LightFM: sample hyperparameters, train, return the metric.

    The trial chooses ``no_components``, ``k``, ``n`` and the loss function; the
    model is trained for ``N_EPOCHS`` epochs and the value returned by
    ``train_model`` is what the study optimises.
    """
    # Dict literals evaluate in order, so the suggest_* calls keep their original
    # sequence (no_components, k, n, loss).
    lightfm_kwargs = {
        "no_components": trial.suggest_categorical("no_components", [10, 20, 30]),
        "k": trial.suggest_categorical("k", [10, 15, 20]),
        "n": trial.suggest_categorical("n", [10, 15, 20]),
        "loss": trial.suggest_categorical("loss", ["logistic", "bpr", "warp"]),
    }

    base_model = LightFM(random_state=RANDOM_STATE, **lightfm_kwargs)
    wrapper = LightFMWrapperModel(base_model, epochs=N_EPOCHS)
    return train_model(metrics=deepcopy(metrics), model=deepcopy(wrapper), dataset=dataset, train=train, test=test)

In [86]:
%%time
# Tune the LightFM hyperparameters with a seeded TPE sampler, maximising the value
# returned by objective_lightfm.
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(study_name="lightfm", direction="maximize", sampler=sampler)
study.optimize(
    partial(objective_lightfm, dataset=dataset, train=train, test=test, metrics=deepcopy(metrics)),
    n_trials=15,  # more trials seem unnecessary
)

# Keep the best trial and its parameters; the parameters are reused for the final fit.
best_trial_lightfm = study.best_trial
best_params_lightfm = study.best_params

[I 2023-12-10 13:44:20,132] A new study created in memory with name: lightfm
[I 2023-12-10 13:45:13,621] Trial 0 finished with value: 0.07614792956291809 and parameters: {'no_components': 20, 'k': 10, 'n': 15, 'loss': 'warp'}. Best is trial 0 with value: 0.07614792956291809.
[I 2023-12-10 13:46:01,295] Trial 1 finished with value: 0.07696009627117308 and parameters: {'no_components': 10, 'k': 20, 'n': 20, 'loss': 'warp'}. Best is trial 1 with value: 0.07696009627117308.
[I 2023-12-10 13:47:07,946] Trial 2 finished with value: 0.026250662707474736 and parameters: {'no_components': 20, 'k': 15, 'n': 10, 'loss': 'bpr'}. Best is trial 1 with value: 0.07696009627117308.
[I 2023-12-10 13:47:53,932] Trial 3 finished with value: 0.00025809604071657267 and parameters: {'no_components': 30, 'k': 20, 'n': 15, 'loss': 'logistic'}. Best is trial 1 with value: 0.07696009627117308.
[I 2023-12-10 13:48:44,145] Trial 4 finished with value: 0.07577691687989935 and parameters: {'no_components': 30, 'k': 

CPU times: user 35min, sys: 28min 38s, total: 1h 3min 38s
Wall time: 11min 46s


Получили метрику MAP@10 ≈ 0.077.

Обучим модели с лучшими параметрами на всем датасете

In [88]:
# Rebuild the feature tables from the full interactions frame (not only the train
# split) for the final fit. Note that the total_dur < 300 filter was applied to
# `train` only, so interactions_df still contains those rows.
user_features = get_users_features(users, interactions_df, ["sex", "age", "income"])
item_features = get_items_features(items, interactions_df, ["genre"])

In [89]:
dataset = Dataset.construct(
    interactions_df=interactions_df,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [90]:
# Refit ALS on the full dataset with the best hyperparameters found by the ALS study.
model_als = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=best_params_als["n_factors"],
        regularization=best_params_als["regularization"],
        iterations=best_params_als["iterations"],
        random_state=RANDOM_STATE,
    ),
    fit_features_together=best_params_als["is_fit_features_together"],
)
model_als.fit(dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f22e6560400>

In [91]:
# Refit LightFM on the full dataset. The keys of best_params_lightfm
# (no_components, k, n, loss) are passed straight through as LightFM keyword arguments.
model_lightfm = LightFMWrapperModel(
    LightFM(**best_params_lightfm, random_state=RANDOM_STATE),
    epochs=N_EPOCHS,
    num_threads=2,
)
model_lightfm.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f22e6560be0>

In [92]:
with open(LIGHTFM_MODEL_PATH, "wb") as f:
    pickle.dump(model_lightfm, f)

In [93]:
with open(ALS_MODEL_PATH, "wb") as f:
    pickle.dump(model_als, f)

## Приближенный поиск

Для приближенного поиска возьмем модель ALS, потому что по какой-то непонятной причине (возможно, потому, что lightfm у меня нормально не ставится) при использовании lightfm UserToItemAnnRecommender возвращает одинаковые айтемы для всех user_id

Достаем данные из модели

In [94]:
user_vectors, item_vectors = model_als.get_vectors()

Сначала попробуем UserToItemANNRecommender

In [95]:
ann_als = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [96]:
ann_als.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7f22dd4cb160>

In [97]:
ann_als.get_item_list_for_user(973171, top_n=K_RECOS)

array([ 9728, 13865,  6809, 10440, 11237,  4740,  9996,  2657,  3734,
         142])

In [98]:
with open(ANN_MODEL_PATH, "wb") as f:
    pickle.dump(ann_als, f)

In [99]:
def compute_avg_time_of_inference(model, user_id=973171, n_runs=1000):
    """Measure the mean wall-clock latency of one single-user recommendation request.

    Args:
        model: Object exposing ``get_item_list_for_user(user_id, top_n=...)`` whose
            result has a ``tolist()`` method.
        user_id: Id of the user to query; defaults to the user inspected in the notebook.
        n_runs: Number of timed requests to average over; must be at least 1.

    Returns:
        Mean duration of one request, in seconds.

    Raises:
        ValueError: If ``n_runs`` is less than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be a positive integer")
    total = 0.0
    for _ in range(n_runs):
        # perf_counter is a monotonic, high-resolution clock intended for timing short
        # intervals; time.time() can jump when the system clock is adjusted.
        start = time.perf_counter()
        model.get_item_list_for_user(user_id, top_n=K_RECOS).tolist()
        total += time.perf_counter() - start
    return total / n_runs

In [101]:
print(f"Среднее время получения результата от одного юзера: {compute_avg_time_of_inference(ann_als)}")

Среднее время получения результата от одного юзера: 0.028218224763870238


Посмотрим результаты для юзера 973171

In [102]:
ann_als.get_item_list_for_user(973171, top_n=K_RECOS)

array([ 9728, 13865,  6809, 10440, 11237,  4740,  9996,  2657,  3734,
         142])

Среднее время ANN ≈ 0.028 секунды на юзера, что в принципе достаточно быстро

Я еще сделала свой кастомный поиск на NGT, потому что:
 - по этому [бейзлайну](https://ann-benchmarks.com/glove-100-angular_10_angular.html) он всех бьет
 - (почти все остальное я уже видела)

In [103]:
ngt_user_to_item = UserToItemNGTRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [104]:
ngt_user_to_item.fit()

In [105]:
print(f"Среднее время получения результата от одного юзера: {compute_avg_time_of_inference(ngt_user_to_item)}")

Среднее время получения результата от одного юзера: 0.03617359113693237


Посмотрим на результаты от юзера 973171

In [106]:
ngt_user_to_item.get_item_list_for_user(973171, 10)

array([ 9728, 13865,  6809, 10440, 11237,  4740,  9996,  2657,  3734,
         142])

Вывод: NGT работает чуть медленнее, чем UserToItemAnnRecommender. Результаты обеих моделей при этом одинаковые.
В проде я ее, конечно, использовать не буду, потому что либа из pypi на моем компьютере тоже работает медленнее, и [тут](https://github.com/yahoojapan/NGT/tree/main/python) об этом тоже писали

[Установка](https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/algorithms/onng_ngt/Dockerfile)


## Эксперименты с pickle

Заметила ситуацию, что если мы загружаем модель ANN и делаем deepcopy, то у нас модель выдает только пустоту. Ниже вывод того, почему так происходит

In [110]:
model = load(ALS_MODEL_PATH)

In [112]:
user_v, item_v = model.get_vectors()

Создаем и сохраняем модель

In [113]:
# Build a fresh ANN recommender from the vectors of the reloaded ALS model and fit its index.
ann = UserToItemAnnRecommender(
    user_vectors=user_v, item_vectors=item_v, user_id_map=dataset.user_id_map, item_id_map=dataset.item_id_map
)
ann.fit()
# inspect the result
ann.get_item_list_for_user(973171, 10).tolist()

[9728, 13865, 6809, 10440, 11237, 4740, 9996, 2657, 3734, 142]

Сохраняем модель

In [114]:
with open(ANN_MODEL_PATH, "wb") as f:
    pickle.dump(ann, f)

Некоторый код из класса UserToItemAnnRecommender

In [115]:
user_id = 973171

In [116]:
user_id_ = dataset.user_id_map.convert_to_internal([user_id])
user_vectors_ = user_v[user_id_, :]

Загружаем модель

In [117]:
loaded_model = load(ANN_MODEL_PATH)
loaded_model.get_item_list_for_user(973171, 10).tolist()

[9728, 13865, 6809, 10440, 11237, 4740, 9996, 2657, 3734, 142]

In [118]:
loaded_model.index.knnQueryBatch(user_vectors_, 10)

[(array([ 32,  16, 122,  25,  68, 112,  42,  93,  84, 370], dtype=int32),
  array([0.93543667, 0.94249934, 0.94361097, 0.9589372 , 0.96892065,
         0.9721973 , 0.9752374 , 0.9752375 , 0.9757492 , 0.97618383],
        dtype=float32))]

In [119]:
loaded_model.index_query_time_params

{'efSearch': 100}

In [120]:
loaded_model.__getstate__()

{'item_vectors': array([[ 5.0902204e-03,  4.9755834e-03,  3.6894567e-03, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00],
        [ 1.2697601e-03,  3.4679758e-04,  9.1035658e-04, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00],
        [ 2.9750766e-02,  1.2863990e-02,  2.3153631e-02, ...,
          0.0000000e+00,  0.0000000e+00,  1.0000000e+00],
        ...,
        [ 8.4862811e-05, -2.7174139e-04,  3.4651509e-04, ...,
          0.0000000e+00,  0.0000000e+00,  1.0000000e+00],
        [ 5.3647887e-05,  3.3514379e-04,  1.1376514e-04, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00],
        [-4.9132295e-04, -2.4625447e-04, -5.5438816e-04, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00]], dtype=float32),
 'item_id_map': IdMap(external_ids=array([ 9506,  1659,  7107, ..., 10064, 13019, 10542])),
 'index_top_k': 0,
 'index_init_params': {'method': 'hnsw', 'space': 'cosinesimil'},
 'index_query_time_params': {'efSearch': 100},
 'create_i

Делаем deepcopy

In [121]:
deepcopy_loaded_model = deepcopy(loaded_model)
deepcopy_loaded_model.get_item_list_for_user(973171, 10).tolist()

[]

In [122]:
deepcopy_loaded_model.__getstate__()

{'item_vectors': array([[ 5.0902204e-03,  4.9755834e-03,  3.6894567e-03, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00],
        [ 1.2697601e-03,  3.4679758e-04,  9.1035658e-04, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00],
        [ 2.9750766e-02,  1.2863990e-02,  2.3153631e-02, ...,
          0.0000000e+00,  0.0000000e+00,  1.0000000e+00],
        ...,
        [ 8.4862811e-05, -2.7174139e-04,  3.4651509e-04, ...,
          0.0000000e+00,  0.0000000e+00,  1.0000000e+00],
        [ 5.3647887e-05,  3.3514379e-04,  1.1376514e-04, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00],
        [-4.9132295e-04, -2.4625447e-04, -5.5438816e-04, ...,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00]], dtype=float32),
 'item_id_map': IdMap(external_ids=array([ 9506,  1659,  7107, ..., 10064, 13019, 10542])),
 'index_top_k': 0,
 'index_init_params': {'method': 'hnsw', 'space': 'cosinesimil'},
 'index_query_time_params': {'efSearch': 100},
 'create_i

По какой-то причине после deepcopy не сохранился граф индекса (HNSW). Если заново обучить индекс (fit), все будет ок.

In [123]:
# fit does not preserve the training state (translated from the original note);
# calling fit() again on the deep-copied recommender rebuilds it.
deepcopy_loaded_model.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7f234e724a00>

In [124]:
deepcopy_loaded_model.index.knnQueryBatch(user_vectors_, 10)

[(array([ 32,  16, 122,  25,  68, 112,  42,  93,  84, 370], dtype=int32),
  array([0.93543667, 0.94249934, 0.94361097, 0.9589372 , 0.96892065,
         0.9721973 , 0.9752374 , 0.9752375 , 0.9757492 , 0.97618383],
        dtype=float32))]

In [125]:
deepcopy_loaded_model.index_query_time_params

{'efSearch': 100}

По итогу получилось только локализовать проблему - что-то не так с состоянием графа