## Импорты

In [1]:
import warnings

# Suppress every Python warning for the rest of the session (presumably to keep cell output
# readable). NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import sys
# Make modules from ../evaluation/ importable; the local `metrics` and `visualization`
# modules imported later are presumably located there — verify. The relative path is
# resolved against the current working directory, so run the notebook from its own folder.
sys.path.append('../evaluation/')

In [3]:
import os
import random
import warnings
import zipfile as zf
from copy import deepcopy
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from IPython.display import display
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Precision, Recall, Serendipity
from rectools.metrics.base import MetricAtK
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models import PopularModel, RandomModel
from rectools.models.base import ModelBase
from tqdm import tqdm

from metrics import calculate_metrics
from visualization import visualize_metrics, visualize_training_result

In [4]:
# Single seed shared by every source of randomness used in this notebook.
RANDOM_STATE = 42

# Exported for child processes; changing PYTHONHASHSEED at runtime does not affect
# hash randomisation of the interpreter that is already running.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)

# Seed the legacy global NumPy generator and the stdlib generator.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [5]:
# Number of recommendations requested per user (passed as `k_recos` further down).
K_RECOS = 10
# Number of folds handed to the time-based splitter (`n_splits`).
N_SPLITS = 3

## Инициализация датасета

In [6]:
# Dataset archive on GitHub; the URL is pinned to a specific commit hash, so the
# downloaded content does not change when the repository is updated.
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [7]:
# Stream the archive to disk in 1 MiB chunks so it never has to fit in memory.
# The response, the output file and the progress bar are all context-managed so that
# they are closed (and the bar is flushed to its final state) even if the download fails.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail early on HTTP errors instead of saving an error page as "kion.zip".
    req.raise_for_status()

    # Content-Length may be absent; tqdm then shows a bar without a known total.
    total_size_in_bytes = int(req.headers.get("Content-Length", 0)) or None

    with open("kion.zip", "wb") as fd, tqdm(
        desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2**20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 78.6M/78.8M [00:20<00:00, 3.58MiB/s]

In [8]:
# Unpack the archive into the current working directory; the context manager closes
# the archive handle even if extraction raises.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

Немного предобработаем датасет

In [9]:
# Load the raw interaction log and, in the same expression, rename the date and duration
# columns to the names held in rectools' `Columns.Datetime` / `Columns.Weight`.
interactions = pd.read_csv(
    "data_original/interactions.csv",
    parse_dates=["last_watch_dt"],
).rename(columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight})

In [10]:
# Wrap the raw frame into a rectools `Interactions` object. The name `interactions` is
# rebound from a DataFrame to that wrapper, which is why later cells read the underlying
# frame through `interactions.df`.
# NOTE(review): re-running this cell alone would pass the wrapper back into `Interactions` — confirm that is acceptable.
interactions = Interactions(interactions)

In [11]:
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [12]:
# Side tables shipped in the same archive. `items` is used further down for genre lookup
# and for the report columns; `users` is not referenced in the cells visible here.
users = pd.read_csv("data_original/users.csv")
items = pd.read_csv("data_original/items.csv")

## Подсчет метрик

Инициализируем модели

In [13]:
# Models to evaluate: a random recommender (seeded with RANDOM_STATE) and rectools'
# PopularModel with its default settings.
models: list[ModelBase] = [RandomModel(random_state=RANDOM_STATE), PopularModel()]

Инициализируем метрики

In [14]:
# One instance of every metric per cut-off; keys follow the "top@<k>_<metric>" pattern
# and are inserted in the same order for each k.
metrics: dict[str, MetricAtK] = {}
for k in [1, 5, 10]:
    metrics[f"top@{k}_precision"] = Precision(k=k)
    metrics[f"top@{k}_recall"] = Recall(k=k)
    metrics[f"top@{k}_ndcg"] = NDCG(k=k)
    metrics[f"top@{k}_map"] = MAP(k=k)
    metrics[f"top@{k}_serendipity"] = Serendipity(k=k)
    metrics[f"top@{k}_mean_inv_user_freq"] = MeanInvUserFreq(k=k)

Инициализируем splitter

In [15]:
# Time-based cross-validation splitter shared by all models.
# NOTE(review): judging by the argument names, each test fold spans 7 days and N_SPLITS
# folds are produced; the filter_* flags presumably remove already-seen pairs and cold
# users/items from the test part — confirm against the rectools documentation.
splitter: Splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

Посчитаем метрики и визуализируем результаты обучения

In [None]:
# Collect the metric records of every model into one flat list for plotting.
model_metrics = []
for model in tqdm(models, total=len(models)):
    model_metrics.extend(
        calculate_metrics(
            interactions=interactions,
            metrics=metrics,
            model=model,
            splitter=splitter,
            k_recos=K_RECOS,
        )
    )


  0%|                                                                                                                                                                                                     | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 1/2 [00:46<00:46, 46.19s/it][A

In [None]:
visualize_metrics(model_metrics)

## Визуализация результатов обучения

Помимо пользователей, указанных в задании, выберем ещё двух пользователей по следующим признакам:
 - любит фильмы жанра "криминал"
 - посмотрел самый непопулярный фильм

In [None]:
def get_watch_unpopular_movie_user_id(interactions: pd.DataFrame) -> int:
    """Return the id of a user who watched the least frequently watched item.

    Popularity of an item is the number of rows that reference its ``item_id``.
    If several items tie for the lowest count, the item chosen follows the ordering
    produced by ``value_counts()`` / ``idxmin()``.

    Args:
        interactions: frame with at least the ``item_id`` and ``user_id`` columns.

    Returns:
        ``user_id`` taken from the first row (in frame order) that refers to the
        least common item, converted to a built-in ``int``.

    Raises:
        ValueError: if ``interactions`` has no rows.
    """
    if interactions.empty:
        raise ValueError("interactions must contain at least one row")

    films_rating = interactions["item_id"].value_counts()
    least_common_film_idx = films_rating.idxmin()

    # Select only the user_id column: extracting a whole row first would coerce all of
    # its values to one common dtype (e.g. float when every column is numeric).
    watchers = interactions.loc[interactions["item_id"] == least_common_film_idx, "user_id"]
    return int(watchers.iloc[0])

In [None]:
def get_movie_lover_by_genre(items: pd.DataFrame, interactions: pd.DataFrame, genre: str) -> int:
    """Return the id of the user with the most interactions with items of a given genre.

    An interaction counts when the item's ``genres`` string contains ``genre`` as a
    literal, case-insensitive substring. Items without genre information never count.

    Args:
        items: frame with at least the ``item_id`` and ``genres`` columns.
        interactions: frame with at least the ``user_id`` and ``item_id`` columns.
        genre: genre name to look for; it is matched literally, not as a regular expression.

    Returns:
        ``user_id`` with the highest number of matching interactions, as a built-in
        ``int``. On ties the smallest ``user_id`` wins; this also means that when
        nobody matched at all, the smallest ``user_id`` is returned.

    Raises:
        ValueError: if ``interactions`` has no rows (raised by ``idxmax``).
    """
    # Merge only the columns that are needed: cheaper, and immune to name collisions
    # between the other columns of the two frames.
    merged = pd.merge(
        interactions[["user_id", "item_id"]],
        items[["item_id", "genres"]],
        on="item_id",
        how="left",
    )

    # regex=False: genre names containing characters such as "+" or "(" must be matched as-is.
    condition = merged["genres"].str.contains(genre, case=False, na=False, regex=False)

    # Summing booleans per user gives the number of matching interactions.
    grouped_counts = condition.groupby(merged["user_id"]).sum()
    return int(grouped_counts.idxmax())

In [None]:
# Two extra users chosen with the helper functions: one who watched the least common
# item and one with the most interactions with items whose genres contain "криминал".
watch_unpopular_movie_user_id = get_watch_unpopular_movie_user_id(interactions=interactions.df)
kriminal_movie_lover_id = get_movie_lover_by_genre(items=items, interactions=interactions.df, genre="криминал")
# The first three ids are hard-coded (per the notebook text, they come from the assignment);
# the last two are the extra picks computed above.
user_ids = [666262, 672861, 955527, watch_unpopular_movie_user_id, kriminal_movie_lover_id]

In [None]:
# Build the rectools Dataset from the complete interaction frame (no train/test split here).
dataset = Dataset.construct(interactions.df)
# Item columns handed to the visualisation helper below.
item_data = items[["item_id", "title", "genres"]]

Посчитаем и отобразим результат

In [None]:
# Fit a fresh copy of every model on the full dataset and store its report under the
# model's class name; deepcopy keeps the objects in `models` themselves unfitted.
reports = {}
for model in models:
    init_model = deepcopy(model)
    init_model.fit(dataset)
    reports[type(model).__name__] = visualize_training_result(
        model=init_model,
        dataset=dataset,
        user_ids=user_ids,
        item_data=item_data,
        k_recos=K_RECOS,
        interactions=interactions,
    )

In [None]:
# Show each model's report under a plain-text header.
for model_name, report in reports.items():
    # print, not pprint: pprint emits the repr of a string, i.e. the header wrapped in quotes.
    print(f"Model name: {model_name}")
    display(report)