In [10]:
import sys
# Extend the module search path with the service's source folders
# (presumably so project-local modules imported below, e.g. `ngt_recommender`, resolve — verify).
sys.path.append('../../service/models/')
sys.path.append('../../service/utils/')

In [11]:
import warnings
# Silence every warning for the rest of the notebook. This keeps cell output short,
# but it also hides any deprecation notices raised by the code below.
warnings.filterwarnings('ignore')

In [12]:
import os
import pickle
import random
import zipfile as zf

import implicit
import numpy as np
import time
import pandas as pd
import requests
from implicit.gpu.als import AlternatingLeastSquares
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from copy import deepcopy
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.metrics import  MAP, calc_metrics
from rectools.metrics.base import MetricAtK
from functools import partial
from rectools.tools import UserToItemAnnRecommender
import optuna
from utils.unpickler import load
from ngt_recommender import UserToItemNGTRecommender
from typing import Sequence, Any, Dict
import ngtpy
from optuna.samplers import TPESampler
from tqdm import tqdm

In [13]:
# Limit OpenBLAS to a single thread.
# NOTE(review): numpy and implicit are already imported in the previous cell —
# confirm the variable still takes effect when it is set this late.
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [14]:
# One seed shared by every source of randomness used in this notebook.
RANDOM_STATE = 42

# Seed the NumPy and stdlib generators and export the seed as PYTHONHASHSEED.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
os.environ["PYTHONHASHSEED"] = f"{RANDOM_STATE}"

In [15]:
# Length of every recommendation list, and the pickle files this notebook writes.
K_RECOS = 10
MODEL_PATH = "lightfm.pkl"
ANN_MODEL_PATH = "ann_lightfm.pkl"

## Инициализация датасета

In [7]:
# KION dataset archive on GitHub; the URL is pinned to a specific commit hash.
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [8]:
# Stream the dataset archive to disk in 1 MiB chunks with a progress bar.
# The response and the progress bar are context-managed so both are always
# released, the request cannot hang forever (timeout), and an HTTP error fails
# fast instead of being written into kion.zip as if it were the archive.
with requests.get(url, stream=True, timeout=60) as req:
    req.raise_for_status()
    # Falls back to 0 (unknown total) when the server sends no Content-Length.
    total_size_in_bytes = int(req.headers.get("Content-Length", 0))
    with open("kion.zip", "wb") as fd, tqdm(
        desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2**20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 78.6M/78.8M [00:16<00:00, 3.23MiB/s]

In [9]:
# Unpack the archive into the current working directory. The context manager
# closes the file handle even when extraction raises.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

In [16]:
# Raw user–item interaction log from the extracted archive.
interactions_df = pd.read_csv("data_original/interactions.csv")

In [17]:
# User and item metadata tables; the feature helpers below read from them.
users = pd.read_csv("data_original/users.csv")
items = pd.read_csv("data_original/items.csv")

В семинаре по теме было EDA и были графики с выбросами. Берем препроцессинг данных из лекции

In [18]:
# Point the `Columns.Datetime` name at the raw KION date column.
Columns.Datetime = 'last_watch_dt'

# Discard rows whose date string is not exactly 10 characters long, then parse the rest.
# NB: this cell is not re-runnable — after parsing, the column no longer holds strings.
malformed_dates = interactions_df[Columns.Datetime].str.len() != 10
rows_to_drop = interactions_df.loc[malformed_dates].index
interactions_df.drop(rows_to_drop, inplace=True)
interactions_df[Columns.Datetime] = pd.to_datetime(interactions_df[Columns.Datetime], format="%Y-%m-%d")
max_date = interactions_df[Columns.Datetime].max()

# Interaction weight: 3 when `watched_pct` is above 10, otherwise 1.
watched_enough = interactions_df["watched_pct"] > 10
interactions_df[Columns.Weight] = np.where(watched_enough, 3, 1)

In [19]:
# Time-based split: everything from `max_date - 7 days` onward is held out as test.
max_date = interactions_df[Columns.Datetime].max()
split_date = max_date - pd.Timedelta(days=7)
is_before_split = interactions_df[Columns.Datetime] < split_date
is_from_split = interactions_df[Columns.Datetime] >= split_date
train = interactions_df[is_before_split].copy()
test = interactions_df[is_from_split].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [20]:
# Remove training interactions whose `total_dur` is below 300
# (presumably seconds, i.e. under 5 minutes — confirm against the dataset description).
train.drop(train.query("total_dur < 300").index, inplace=True)

In [21]:
# Cold users appear in the test split but never in train; their test rows are removed.
train_user_ids = set(train[Columns.User])
cold_users = set(test[Columns.User]) - train_user_ids
is_cold = test[Columns.User].isin(cold_users)
test.drop(test.loc[is_cold].index, inplace=True)

В том же семинаре был раздел про фичи юзеров и айтемов. Напишем функции, которые будут доставать из датасета фичи для пользователей и айтемов.

In [22]:
def get_users_features(users: pd.DataFrame, interactions: pd.DataFrame, features: Sequence[str]) -> pd.DataFrame:
    """Build a long-format user feature table with columns ``id``, ``value``, ``feature``.

    Only users that occur in ``interactions`` are kept, and missing values are
    replaced with the string ``'Unknown'``. The caller's ``users`` frame is left
    untouched (the previous version filled it in place as a side effect).

    Args:
        users: user table containing the ``Columns.User`` column and the feature columns.
        interactions: interactions whose ``Columns.User`` values select the users to keep.
        features: names of the user columns to turn into feature rows.

    Returns:
        One row per (user, feature) pair, stacked feature by feature. When
        ``features`` is empty, an empty frame with the three columns is returned.
    """
    known_users = users.loc[users[Columns.User].isin(interactions[Columns.User])]
    # fillna returns a new frame, so the argument is never mutated.
    known_users = known_users.fillna('Unknown')

    user_features_frames = []
    for feature in features:
        # reindex yields a fresh two-column frame (id + this feature).
        feature_frame = known_users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)

    if not user_features_frames:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=["id", "value", "feature"])
    return pd.concat(user_features_frames)

In [23]:
# User columns to turn into features; only users present in the train split are kept.
features = ["sex", "age", "income"]
user_features = get_users_features(users=users, interactions = train, features=features)

In [24]:
def get_items_features(items: pd.DataFrame, interactions: pd.DataFrame, features: Sequence[str]) -> pd.DataFrame:
    """Build a long-format item feature table with columns ``id``, ``value``, ``feature``.

    Only items that occur in ``interactions`` are kept. A list-valued ``genre``
    column is derived from ``genres`` (lower-cased, split on commas) on an
    internal copy, so the caller's ``items`` frame is not modified. Each
    requested feature is exploded to one row per value, and ``content_type``
    rows are appended exactly once. The previous version appended them inside
    the loop, which duplicated them for every extra feature and failed when
    ``features`` was empty.

    Args:
        items: item table with ``Columns.Item``, ``genres`` and ``content_type`` columns.
        interactions: interactions whose ``Columns.Item`` values select the items to keep.
        features: item columns to unpivot (``"genre"`` refers to the derived column).

    Returns:
        Feature rows for every requested feature, followed by the ``content_type`` rows.
    """
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    # "Drama, Comedy" -> ["drama", "comedy"]
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")

    item_features_frames = []
    for feature in features:
        # explode turns list-valued cells into one row per element.
        feature_frame = items[[Columns.Item, feature]].explode(feature)
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)

    # content_type is always emitted, independently of the requested features.
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features_frames.append(content_feature)

    return pd.concat(item_features_frames)

In [25]:
# Item features for items seen in train: genres (the helper also emits `content_type` rows).
features = ["genre"]
item_features = get_items_features(items=items, interactions = train, features=features)

## Эксперименты

* Реализовать тюнинг гиперпараметров для моделей из implicit, lightfm или rectools

Подготовим датасет, инициализируем метрики

In [26]:
# Training dataset: train-split interactions plus user and item features,
# all of which are declared categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [27]:
# Distinct users remaining in the test split; recommendations are generated for them.
TEST_USERS = test[Columns.User].unique()

In [28]:
# Metrics handed to calc_metrics; train_model returns the value stored under 'top@10_map'.
metrics: Dict[str, MetricAtK] = {'top@10_map': MAP(k=10)}

Для подбора гиперпараметров будем использовать optuna

In [29]:
def train_model(metrics:Dict[str, MetricAtK], model: Any, dataset:Dataset, train:pd.DataFrame, test:pd.DataFrame) -> float:
    """Fit ``model`` on ``dataset`` and score its top-``K_RECOS`` recommendations.

    Args:
        metrics: metrics passed to ``calc_metrics``; must contain the key ``'top@10_map'``.
        model: object exposing ``fit(dataset)`` and ``recommend(users, dataset, k, filter_viewed)``.
        dataset: dataset the model is fitted on and recommends from.
        train: training interactions, forwarded to ``calc_metrics``.
        test: held-out interactions; its users are the ones recommendations are made for.

    Returns:
        The value of the ``'top@10_map'`` metric.

    Raises:
        KeyError: if ``calc_metrics`` returns no ``'top@10_map'`` entry.
    """
    model.fit(dataset)
    # Take the users from the `test` argument rather than from the notebook-level
    # TEST_USERS, so the recommendations always match the interactions they are
    # scored against, whatever test frame the caller passes.
    test_users = test[Columns.User].unique()
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True
    )
    metrics_result = calc_metrics(metrics, recos, test, train)
    return metrics_result['top@10_map']

In [49]:
def objective_als(trial, dataset:Dataset, train:pd.DataFrame, test:pd.DataFrame, metrics:dict[str, MetricAtK]):
    """Optuna objective for the implicit ALS wrapper.

    Samples the number of factors, the feature-fitting mode, the regularization
    and the iteration count from ``trial``, trains the resulting model through
    ``train_model`` and returns the score it reports.
    """
    # Suggestions are requested in a fixed order: factors, mode, regularization, iterations.
    factors = trial.suggest_categorical("n_factors", [8, 16, 24])
    fit_together = trial.suggest_categorical("is_fit_features_together", [True, False])
    reg = trial.suggest_float('regularization', 0.01, 0.05)
    n_iter = trial.suggest_int('iterations', 10, 20)

    als = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=n_iter,
        random_state=RANDOM_STATE,
    )
    wrapped = ImplicitALSWrapperModel(als, fit_features_together=fit_together)

    # Copies keep the caller's metrics and the freshly built model isolated per trial.
    return train_model(
        metrics=deepcopy(metrics),
        model=deepcopy(wrapped),
        dataset=dataset,
        train=train,
        test=test,
    )

In [50]:
%%time
# Tune the ALS wrapper with a seeded TPE sampler, maximising the value returned by objective_als.
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(study_name="als", direction="maximize", sampler=sampler)
study.optimize(
    partial(objective_als, dataset=dataset, train=train, test=test, metrics=deepcopy(metrics)),
    n_trials=15, ## more trials seemed unnecessary
)

# NOTE(review): `study`, `best_trial` and `best_params` are rebound by the LightFM study cell further down.
best_trial = study.best_trial
best_params  = study.best_params

[I 2023-12-08 21:26:53,924] A new study created in memory with name: als
[I 2023-12-08 21:27:31,623] Trial 0 finished with value: 0.06741110056557463 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.016239780813448106, 'iterations': 10}. Best is trial 0 with value: 0.06741110056557463.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-08 21:28:00,779] Trial 1 finished with value: 0.06897491937615315 and parameters: {'n_factors': 8, 'is_fit_features_together': False, 'regularization': 0.043297705632016875, 'iterations': 12}. Best is trial 1 with value: 0.06897491937615315.
[I 2023-12-08 21:28:50,166] Trial 2 finished with value: 0.07404851168029583 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.021649165607921676, 'iterations': 16}. Best is trial 2 with value: 0.07404851168029583.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-08 21:29:19,748] Trial 3 finished with value: 0.0633863831473171 and parameters: {'n_factors': 24, 'is_fit_features_together': False, 'regularization': 0.017986951286334388, 'iterations': 15}. Best is trial 2 with value: 0.07404851168029583.
[I 2023-12-08 21:30:14,906] Trial 4 finished with value: 0.06401524405433189 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.047955421490133335, 'iterations': 20}. Best is trial 2 with value: 0.07404851168029583.
[I 2023-12-08 21:31:00,360] Trial 5 finished with value: 0.0736660319170565 and parameters: {'n_factors': 8, 'is_fit_features_together': True, 'regularization': 0.014881529393791153, 'iterations': 15}. Best is trial 2 with value: 0.07404851168029583.
[I 2023-12-08 21:31:48,324] Trial 6 finished with value: 0.07447957628328786 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.030802720847112434, 'iterations': 16}. Best is trial 6 with value: 0.074479576

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-08 21:33:17,249] Trial 8 finished with value: 0.06269825522990526 and parameters: {'n_factors': 16, 'is_fit_features_together': False, 'regularization': 0.02085396127095584, 'iterations': 19}. Best is trial 6 with value: 0.07447957628328786.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-08 21:33:51,220] Trial 9 finished with value: 0.06341866508800317 and parameters: {'n_factors': 24, 'is_fit_features_together': False, 'regularization': 0.012982025747190833, 'iterations': 20}. Best is trial 6 with value: 0.07447957628328786.
[I 2023-12-08 21:34:40,569] Trial 10 finished with value: 0.0724670797701254 and parameters: {'n_factors': 16, 'is_fit_features_together': True, 'regularization': 0.02989435896422701, 'iterations': 17}. Best is trial 6 with value: 0.07447957628328786.
[I 2023-12-08 21:35:31,714] Trial 11 finished with value: 0.07390739453224501 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.026499876258765737, 'iterations': 17}. Best is trial 6 with value: 0.07447957628328786.
[I 2023-12-08 21:36:16,025] Trial 12 finished with value: 0.07582936315366962 and parameters: {'n_factors': 24, 'is_fit_features_together': True, 'regularization': 0.025751526095104938, 'iterations': 13}. Best is trial 12 with value: 0.0758

CPU times: user 17min 16s, sys: 20min 44s, total: 38min
Wall time: 10min 49s


За счет перебора параметров удалось получить модель с MAP@10 = 0.0758.

Подберем гиперпараметры еще и для модели LightFM

In [30]:
def objective_lightfm(trial, dataset:Dataset, train:pd.DataFrame, test:pd.DataFrame, metrics:dict[str, MetricAtK]) -> float:
    """Optuna objective for the LightFM wrapper.

    Samples ``no_components``, ``k``, ``n`` and ``loss`` from ``trial``, trains
    the resulting model through ``train_model`` and returns the score it reports.
    """
    # NOTE(review): `k` and `n` look like k-OS WARP settings — confirm they have any
    # effect for the three losses sampled here.
    sampled = {
        'no_components': trial.suggest_categorical('no_components', [10, 20, 30]),
        'k': trial.suggest_categorical('k', [5, 10, 15]),
        'n': trial.suggest_categorical('n',[10, 15, 20]),
        'loss': trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp']),
    }

    wrapped = LightFMWrapperModel(LightFM(random_state=RANDOM_STATE, **sampled))

    # Copies keep the caller's metrics and the freshly built model isolated per trial.
    return train_model(
        metrics=deepcopy(metrics),
        model=deepcopy(wrapped),
        dataset=dataset,
        train=train,
        test=test,
    )

In [31]:
%%time
# Tune the LightFM wrapper with a seeded TPE sampler, maximising the value returned by objective_lightfm.
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(study_name="lightfm", direction="maximize", sampler=sampler)
study.optimize(
    partial(objective_lightfm, dataset=dataset, train=train, test=test, metrics=deepcopy(metrics)),
    n_trials=15, ## more trials seemed unnecessary
)

# These parameters are the ones unpacked into the final LightFM model below.
best_trial = study.best_trial
best_params  = study.best_params

[I 2023-12-09 22:52:36,589] A new study created in memory with name: lightfm
[I 2023-12-09 22:53:16,493] Trial 0 finished with value: 0.07614792956291809 and parameters: {'no_components': 20, 'k': 5, 'n': 15, 'loss': 'warp'}. Best is trial 0 with value: 0.07614792956291809.
[I 2023-12-09 22:53:57,266] Trial 1 finished with value: 0.07696009627117308 and parameters: {'no_components': 10, 'k': 15, 'n': 20, 'loss': 'warp'}. Best is trial 1 with value: 0.07696009627117308.
[I 2023-12-09 22:54:38,046] Trial 2 finished with value: 0.026250662707474736 and parameters: {'no_components': 20, 'k': 10, 'n': 10, 'loss': 'bpr'}. Best is trial 1 with value: 0.07696009627117308.
[I 2023-12-09 22:55:17,009] Trial 3 finished with value: 0.00025809604071657267 and parameters: {'no_components': 30, 'k': 15, 'n': 15, 'loss': 'logistic'}. Best is trial 1 with value: 0.07696009627117308.
[I 2023-12-09 22:55:59,622] Trial 4 finished with value: 0.07577691687989935 and parameters: {'no_components': 30, 'k': 1

CPU times: user 29min 55s, sys: 26min 27s, total: 56min 23s
Wall time: 9min 42s


Получили метрику MAP@10 ≈ 0.077. Будем использовать LightFM в качестве backbone-модели. Обучим модель с лучшими параметрами на всем датасете

In [32]:
# Rebuild the feature tables from the full interaction log (not just the train split) for the final fit.
user_features = get_users_features(users, interactions_df, ["sex", "age", "income"])
item_features = get_items_features(items, interactions_df, ['genre'])

In [33]:
# Dataset over all interactions. This rebinds `dataset`, which previously held the train split only.
dataset = Dataset.construct(
    interactions_df=interactions_df,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [34]:
# Final LightFM model built from `best_params` and fitted on the full dataset.
# NOTE(review): `best_params` is rebound by every study cell — make sure the LightFM
# study ran last, otherwise ALS parameters would be unpacked into LightFM here.
model = LightFMWrapperModel(
    LightFM(
       **best_params,
        random_state=RANDOM_STATE
    )
)
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7effe7039fc0>

In [35]:
# Pickle the fitted wrapper to MODEL_PATH.
with open(MODEL_PATH, "wb") as f:
    pickle.dump(model, f)

## Приближенный поиск

In [134]:
# Read the saved model back through the project's `utils.unpickler.load` helper.
model = load(MODEL_PATH)

Достаем данные из модели

In [135]:
# User and item vectors of the model for the current `dataset`.
user_vectors, item_vectors = model.get_vectors(dataset)

Сначала попробуем UserToItemANNRecommender

In [136]:
# ANN recommender over the model's vectors, sharing the dataset's id maps.
ann_lightfm = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [137]:
# Fit the recommender (presumably this builds its search index — see `.index` used further down).
ann_lightfm.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7effe6ebe8c0>

In [40]:
def compute_avg_time_of_inference(model, user_id=973171, n_runs=1000, top_n=None):
    """Return the mean wall-clock time, in seconds, of one recommendation request.

    Args:
        model: object exposing ``get_item_list_for_user(user_id, top_n=...)`` whose
            result has a ``tolist()`` method.
        user_id: external id of the user to query. Defaults to the user that was
            previously hard-coded.
        n_runs: number of timed repetitions. Defaults to the previous fixed 1000.
        top_n: number of items to request; ``None`` means the notebook-level ``K_RECOS``.

    Returns:
        Average duration of a single call, including the ``tolist()`` conversion.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be a positive integer")
    if top_n is None:
        top_n = K_RECOS

    total = 0.0
    for _ in range(n_runs):
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # short durations; time.time() can jump and has coarser resolution.
        start = time.perf_counter()
        model.get_item_list_for_user(user_id, top_n=top_n).tolist()
        total += time.perf_counter() - start
    return total / n_runs

In [41]:
# Report the mean per-request latency (seconds) of the ANN recommender.
print(f"Среднее время получения результата от одного юзера: {compute_avg_time_of_inference(ann_lightfm)}")

Среднее время получения результата от одного юзера: 0.023664658069610597


Посмотрим результаты для юзера 973171

In [42]:
# Top-K_RECOS item ids returned by the ANN recommender for the sample user.
ann_lightfm.get_item_list_for_user(973171, top_n=K_RECOS)

array([  400, 10124,   570, 11514,  4865,  8173, 13282,   118, 12443,
        5133])

Среднее время ANN = 0.02 секунды на юзера, что в принципе достаточно быстро

Я еще сделала свой кастомный поиск на NGT, потому что:
 - по этому [бейзлайну](https://ann-benchmarks.com/glove-100-angular_10_angular.html) он всех бьет
 - (почти все остальное я уже видела)

In [48]:
# Custom NGT-based recommender, constructed from the same vectors and id maps as the ANN one above.
ngt_user_to_item = UserToItemNGTRecommender(user_vectors=user_vectors, item_vectors=item_vectors, user_id_map=dataset.user_id_map, item_id_map=dataset.item_id_map)

In [49]:
# Fit the NGT recommender (presumably builds its index — implementation lives in `ngt_recommender`).
ngt_user_to_item.fit()

In [50]:
# Same latency measurement as for the ANN recommender, now for the NGT one.
print(f"Среднее время получения результата от одного юзера: {compute_avg_time_of_inference(ngt_user_to_item)}")

Среднее время получения результата от одного юзера: 0.0228857319355011


Посмотрим на результаты от юзера 973171

In [52]:
# Top-10 item ids from the NGT recommender for the same sample user, for comparison.
ngt_user_to_item.get_item_list_for_user(973171, 10)

array([16512,   400, 10124,   570, 11514,  4865,  8173, 13282,   118,
       12443])

Вывод: оба алгоритма работают примерно с одной и той же скоростью. При этом ранги айтемов у них разные в выдаче.
В проде я ее, конечно, использовать не буду, потому что либа из pypi на моем компьютере тоже работает медленнее, и [тут](https://github.com/yahoojapan/NGT/tree/main/python) об этом тоже писали.

[Установка](https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/algorithms/onng_ngt/Dockerfile)


Сохраняем ANN в pickle для использования в продуктиве

In [138]:
# Pickle the fitted ANN recommender to ANN_MODEL_PATH.
with open(ANN_MODEL_PATH, "wb") as f:
    pickle.dump(ann_lightfm, f)

## Эксперименты с pickle

Заметила ситуацию, что если мы загружаем модель ANN и делаем deepcopy, то у нас модель выдает только пустоту. Ниже вывод того, почему так происходит

In [54]:
# Start the experiment from the model stored at MODEL_PATH.
model = load(MODEL_PATH)

In [56]:
# User and item vectors of the reloaded model.
user_v, item_v = model.get_vectors(dataset)

Создаем и сохраняем модель

In [101]:
# Fresh ANN recommender used as the reference for the pickle/deepcopy experiment.
ann = UserToItemAnnRecommender(
    user_vectors=user_v, item_vectors=item_v, user_id_map=dataset.user_id_map, item_id_map=dataset.item_id_map
)
ann.fit()
# inspect the result
ann.get_item_list_for_user(973171, 10).tolist()

[13535, 7803, 7285, 11854, 12052, 253, 1149, 9202, 14858, 2886]

Сохраняем модель

In [102]:
# Pickle the reference recommender. This writes to the same ANN_MODEL_PATH file as the earlier save cell.
with open(ANN_MODEL_PATH, "wb") as f:
    pickle.dump(ann, f)

Некоторый код из класса UserToItemAnnRecommender

In [105]:
# External id of the sample user queried throughout this notebook.
user_id = 973171

In [106]:
# Convert the external user id to the dataset's internal index and select that user's row of vectors.
user_id_ = dataset.user_id_map.convert_to_internal([user_id])
user_vectors_ = user_v[user_id_, :]

Загружаем модель

In [108]:
# Reload the pickled recommender and query it for the sample user
# (the output matches the list obtained before saving).
loaded_model = load(ANN_MODEL_PATH)
loaded_model.get_item_list_for_user(973171, 10).tolist()

[13535, 7803, 7285, 11854, 12052, 253, 1149, 9202, 14858, 2886]

In [109]:
# Query the reloaded recommender's index directly with the sample user's vector.
loaded_model.index.knnQueryBatch(user_vectors_, 10)

[(array([13561, 10999, 15213, 12758, 10462, 14493, 10135, 15325,  9272,
         15301], dtype=int32),
  array([1.1884196, 1.2066582, 1.2102394, 1.210423 , 1.2112674, 1.2114953,
         1.2123799, 1.2130713, 1.2153078, 1.2163595], dtype=float32))]

In [122]:
# Query-time parameters stored on the reloaded recommender.
loaded_model.index_query_time_params

{'efSearch': 100}

In [126]:
# Inspect the state dictionary the reloaded recommender returns from __getstate__.
loaded_model.__getstate__()

{'item_vectors': array([[ 1.00000000e+00,  1.81841409e+00,  5.02787411e-01, ...,
          5.58219552e-01,  4.37355697e-01,  1.35657072e-01],
        [ 1.00000000e+00,  1.27003825e+00,  1.01189710e-01, ...,
          6.13796592e-01,  5.99408388e-01,  6.10280037e-01],
        [ 1.00000000e+00,  2.43296099e+00,  1.33600235e-01, ...,
         -1.60736591e-01, -4.25765634e-01, -7.41632938e-01],
        ...,
        [ 1.00000000e+00, -1.55643296e+00,  1.62893534e-03, ...,
         -1.16912150e+00,  1.30278003e+00, -5.87929547e-01],
        [ 1.00000000e+00, -1.64174604e+00, -3.65545213e-01, ...,
         -7.93600321e-01,  6.51289225e-01,  6.02822185e-01],
        [ 1.00000000e+00, -5.31409979e-01, -1.35122269e-01, ...,
         -2.80352145e-01,  1.99857190e-01,  5.62249780e-01]]),
 'item_id_map': IdMap(external_ids=array([ 9506,  1659,  7107, ..., 10064, 13019, 10542])),
 'index_top_k': 0,
 'index_init_params': {'method': 'hnsw', 'space': 'cosinesimil'},
 'index_query_time_params': {'efSear

Делаем deepcopy

In [124]:
# Deep-copy the reloaded recommender and query the copy (the output shows it returns an empty list).
deepcopy_loaded_model = deepcopy(loaded_model)
deepcopy_loaded_model.get_item_list_for_user(973171, 10).tolist()

[]

In [133]:
# State dictionary of the deep copy, for comparison with the reloaded original's.
deepcopy_loaded_model.__getstate__()

{'item_vectors': array([[ 1.00000000e+00,  1.81841409e+00,  5.02787411e-01, ...,
          5.58219552e-01,  4.37355697e-01,  1.35657072e-01],
        [ 1.00000000e+00,  1.27003825e+00,  1.01189710e-01, ...,
          6.13796592e-01,  5.99408388e-01,  6.10280037e-01],
        [ 1.00000000e+00,  2.43296099e+00,  1.33600235e-01, ...,
         -1.60736591e-01, -4.25765634e-01, -7.41632938e-01],
        ...,
        [ 1.00000000e+00, -1.55643296e+00,  1.62893534e-03, ...,
         -1.16912150e+00,  1.30278003e+00, -5.87929547e-01],
        [ 1.00000000e+00, -1.64174604e+00, -3.65545213e-01, ...,
         -7.93600321e-01,  6.51289225e-01,  6.02822185e-01],
        [ 1.00000000e+00, -5.31409979e-01, -1.35122269e-01, ...,
         -2.80352145e-01,  1.99857190e-01,  5.62249780e-01]]),
 'item_id_map': IdMap(external_ids=array([ 9506,  1659,  7107, ..., 10064, 13019, 10542])),
 'index_top_k': 0,
 'index_init_params': {'method': 'hnsw', 'space': 'cosinesimil'},
 'index_query_time_params': {'efSear

По какой-то причине не сохранился граф индекса (не граф вычислений — речь об ANN-индексе). Если все обучить заново, будет все ок.

In [118]:
# fit does not preserve the training state
deepcopy_loaded_model.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7effd48782e0>

In [120]:
# the result differs because of random seed issues
deepcopy_loaded_model.index.knnQueryBatch(user_vectors_, 10)

[(array([ 7113,  3936,  3176, 10099,  7059,  7324,  8651,  8370,  5926,
         10031], dtype=int32),
  array([1.2055503, 1.2059932, 1.2075199, 1.2093303, 1.2097838, 1.2102057,
         1.2106539, 1.210767 , 1.2109034, 1.2111441], dtype=float32))]

In [123]:
# Query-time parameters stored on the deep copy.
deepcopy_loaded_model.index_query_time_params

{'efSearch': 100}

По итогу получилось только локализовать проблему — не сохраняется внутреннее состояние графа