In [1]:
import os
import pickle
import random
import zipfile as zf

import implicit
import numpy as np
import pandas as pd
import requests
from implicit.als import AlternatingLeastSquares
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.models import ImplicitALSWrapperModel
from rectools.tools import UserToItemAnnRecommender
from tqdm import tqdm

from service.utils.unpickler import load

In [2]:
# Single seed shared by every source of randomness configured in this notebook.
RANDOM_STATE = 42

# NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up, so setting it
# here would affect child processes rather than the running kernel — confirm.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_STATE}"
# Seed NumPy's legacy global generator and the stdlib generator with the same value.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [20]:
# Path of the pickle file the fitted ANN recommender is written to further down in this notebook.
MODEL_PATH = "ann.pkl"

## Инициализация датасета

In [6]:
# Location of the zipped KION dataset that the next cell downloads.
# NOTE(review): the path contains a 40-hex ref, presumably a commit hash pinning the data revision — confirm.
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [7]:
# Stream the dataset archive to kion.zip in 1 MiB chunks instead of loading it into memory.
# The HTTP response, the output file and the progress bar are all context-managed, so the
# connection is released and the bar is closed (and rendered in its final state) even when
# the download fails part-way. Previously the bar was never closed and its output leaked
# into later cells.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on 4xx/5xx rather than silently saving an error page as kion.zip.
    req.raise_for_status()
    # Content-Length may be absent; fall back to 0 as before.
    total_size_in_bytes = int(req.headers.get("Content-Length", 0))
    with open("kion.zip", "wb") as fd, tqdm(
        desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2**20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 78.6M/78.8M [00:21<00:00, 4.97MiB/s]

In [8]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive even if extraction raises, which the
# previous manual close() call did not guarantee.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

In [9]:
# Read the raw interaction log (parsing the watch date as a datetime) and rename the
# date and duration columns to the names declared by rectools' `Columns`.
interactions_df = pd.read_csv(
    "data_original/interactions.csv",
    parse_dates=["last_watch_dt"],
).rename(
    columns={
        "total_dur": Columns.Weight,
        "last_watch_dt": Columns.Datetime,
    }
)

In [10]:
# Wrap the renamed frame in rectools' Interactions container.
# In the visible cells this object is only used for the preview below; the Dataset is
# built directly from `interactions_df`.
interactions = Interactions(interactions_df)

In [11]:
# Preview the first rows of the frame held by the Interactions object.
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [12]:
# Load the user and item metadata tables shipped in the same archive.
users, items = (
    pd.read_csv(csv_path)
    for csv_path in ("data_original/users.csv", "data_original/items.csv")
)

## Обучение модели

Обучим модель ImplicitALS, а затем поверх её векторов построим UserToItemAnnRecommender: у этого класса приятный интерфейс (￣ ￣|||), а приближённый поиск соседей работает быстро.

In [13]:
# Configure the underlying implicit ALS estimator first, then hand it to the rectools wrapper.
als_estimator = AlternatingLeastSquares(
    factors=4,
    regularization=0.05,
    alpha=2.0,
    num_threads=8,
    random_state=RANDOM_STATE,  # same seed as the rest of the notebook
)
model = ImplicitALSWrapperModel(model=als_estimator)

  check_blas_config()
kion dataset download: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78.8M/78.8M [00:39<00:00, 4.97MiB/s]

In [15]:
# Build the rectools Dataset straight from the interactions frame, then train the ALS wrapper on it.
dataset = Dataset.construct(interactions_df)
# Last expression of the cell: the recorded output is the repr of the fitted wrapper model.
model.fit(dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fd60b10d960>

In [17]:
# Pull the learned vectors out of the fitted model; they are unpacked as (user, item)
# and passed to the ANN recommender as `user_vectors` / `item_vectors` below.
user_v, item_v = model.get_vectors()

In [19]:
# Build the approximate-nearest-neighbour recommender from the ALS vectors, reusing the
# dataset's id maps so that queries can be made with the original (external) ids.
ann = UserToItemAnnRecommender(
    user_vectors=user_v,
    item_vectors=item_v,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
# Fit the recommender; as the cell's last expression its return value is displayed.
ann.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7fd6092683a0>

In [23]:
# Smoke check on the in-memory recommender: request a list of 10 items for user 973171.
ann.get_item_list_for_user(973171, 10).tolist()

[10811, 6948, 11936, 2303, 14053, 9122, 596, 13600, 734, 6106]

In [26]:
# Display the user id map stored on the recommender (external ids, per the recorded repr).
ann.user_id_map

IdMap(external_ids=array([176549, 699317, 656683, ..., 805174, 648596, 697262]))

In [27]:
# Serialize the fitted ANN recommender to MODEL_PATH so it can be loaded outside the notebook.
with open(MODEL_PATH, mode="wb") as model_file:
    pickle.dump(ann, model_file)

## Тестирование работоспособности

In [32]:
# Reload the artifact through the service's own `load` helper (service.utils.unpickler).
# Uses MODEL_PATH — the same constant the dump cell writes to — instead of repeating the
# file name, so the two cells cannot drift apart.
test = load(MODEL_PATH)

In [33]:
# Repeat the earlier probe query on the reloaded object; the recorded output matches the
# list produced by the in-memory `ann`, i.e. the pickle round-trip kept the recommendations.
test.get_item_list_for_user(973171, 10).tolist()

[10811, 6948, 11936, 2303, 14053, 9122, 596, 13600, 734, 6106]

In [34]:
# Check that the probe user id is among the external ids known to the reloaded recommender.
973171 in test.user_id_map.external_ids

True

In [47]:
# Wrap the reloaded recommender in the `ANN` class, reading the artifact from MODEL_PATH
# (the constant the dump cell writes to) rather than a repeated file name.
# NOTE(review): `ANN` is not imported or defined in any visible cell, so a fresh
# "Restart & Run All" would raise NameError here — add the missing import/definition cell.
ann_cls = ANN(load(MODEL_PATH))

In [48]:
# Query the wrapper with the same user id and list length as the probes above.
# NOTE(review): the recorded output shows a stray `1` before the list — presumably a
# leftover debug print inside `get_reco`; verify and remove it there.
ann_cls.get_reco(973171, 10)

1


[10811, 6948, 11936, 2303, 14053, 9122, 596, 13600, 734, 6106]