## Импорты

In [1]:
import warnings

# Install a blanket "ignore" filter: no warning category is shown from this point on.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import sys
# Extend the import path with ../evaluation/ — presumably where the `metrics` and
# `visualization` modules imported in the next cell live; TODO confirm.
sys.path.append('../evaluation/')

In [3]:
import os
import random
import warnings
import zipfile as zf
from copy import deepcopy
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from IPython.display import display
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Precision, Recall, Serendipity
from rectools.metrics.base import MetricAtK
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models import PopularModel, RandomModel
from rectools.models.base import ModelBase
from tqdm import tqdm

from metrics import calculate_metrics
from visualization import visualize_metrics, visualize_training_result

In [4]:
# Seed Python's `random` module and NumPy's global RNG with one shared constant;
# the same constant is also passed to RandomModel further down.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
# NOTE(review): hash randomisation is fixed when the interpreter starts, so this assignment
# likely affects only child processes, not the already-running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [5]:
# Number of recommendations per user; passed as `k_recos` to calculate_metrics and
# visualize_training_result below.
K_RECOS = 10
# Number of folds handed to TimeRangeSplitter (`n_splits`).
N_SPLITS = 3

## Инициализация датасета

In [6]:
# Archive with the KION dataset; the URL points at a specific commit hash rather than a branch.
url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

In [7]:
# Stream the dataset archive to ./kion.zip in 1 MiB chunks with a progress bar.
# The response, the output file and the progress bar are all context-managed so that
# they are released even if the download fails half-way; an unclosed tqdm bar otherwise
# lingers and gets re-rendered together with the next progress bar in the notebook.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail before touching the disk, instead of saving an HTML error page as kion.zip.
    req.raise_for_status()
    # A missing Content-Length yields None, i.e. a bar without a known total.
    total_size_in_bytes = int(req.headers.get("Content-Length", 0)) or None
    with open("kion.zip", "wb") as fd, tqdm(
        desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2**20):
            fd.write(chunk)
            # Report progress only after the chunk has actually been written.
            progress_bar.update(len(chunk))

kion dataset download: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 78.6M/78.8M [00:23<00:00, 5.76MiB/s]

In [8]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive even when extraction raises.
with zf.ZipFile("kion.zip", "r") as files:
    files.extractall()

Немного предобработаем датасет

In [9]:
# Read the raw interaction log (parsing the watch date) and give the date and duration
# columns the names stored in rectools' `Columns.Datetime` / `Columns.Weight`.
interactions = pd.read_csv(
    "data_original/interactions.csv",
    parse_dates=["last_watch_dt"],
).rename(
    columns={
        "last_watch_dt": Columns.Datetime,
        "total_dur": Columns.Weight,
    }
)

In [10]:
# Wrap the frame in rectools' Interactions container; later cells read the table back via `.df`.
# NOTE(review): the name is rebound from a DataFrame to an Interactions object, so re-running
# this cell alone passes the wrapper to its own constructor — presumably an error; confirm.
interactions = Interactions(interactions)

In [11]:
# Preview the first rows of the wrapped interaction table.
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [12]:
# Side tables from the same archive. `items` supplies titles and genres further down;
# `users` is not referenced again in the visible part of the notebook.
users = pd.read_csv("data_original/users.csv")
items = pd.read_csv("data_original/items.csv")

## Подсчет метрик

Инициализируем модели

In [13]:
# Models to compare: a seeded random recommender and a popularity recommender with default settings.
models: list[ModelBase] = [RandomModel(random_state=RANDOM_STATE), PopularModel()]

Инициализируем метрики

In [14]:
# Register the same six metrics at each cut-off; keys follow the pattern
# "top@<k>_<metric name>".
metrics: dict[str, MetricAtK] = {}
for k in (1, 5, 10):
    metrics[f"top@{k}_precision"] = Precision(k=k)
    metrics[f"top@{k}_recall"] = Recall(k=k)
    metrics[f"top@{k}_ndcg"] = NDCG(k=k)
    metrics[f"top@{k}_map"] = MAP(k=k)
    metrics[f"top@{k}_serendipity"] = Serendipity(k=k)
    metrics[f"top@{k}_mean_inv_user_freq"] = MeanInvUserFreq(k=k)

Инициализируем splitter

In [15]:
# Time-based cross-validation: N_SPLITS folds, each configured with a "7D" test size.
# The three filter_* flags are all switched on; judging by their names they drop already-seen
# pairs and cold users/items from the test part — confirm against the rectools documentation.
splitter: Splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

Посчитаем метрики и визуализируем результаты обучения

In [16]:
# Evaluate every model with the shared splitter and metric set, collecting all
# records returned by calculate_metrics into one flat list.
model_metrics = []
for model in tqdm(models, total=len(models)):
    current_model_metrics = calculate_metrics(
        interactions=interactions,
        metrics=metrics,
        model=model,
        splitter=splitter,
        k_recos=K_RECOS,
    )
    model_metrics.extend(current_model_metrics)


kion dataset download: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78.8M/78.8M [00:40<00:00, 5.76MiB/s][A
 50%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                              | 1/2 [00:55<00:55, 55.24s/it][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:32<00:00, 46.20s/it][A


In [17]:
# Render the collected records with the project's visualize_metrics helper.
visualize_metrics(model_metrics)

Unnamed: 0_level_0,top@1,top@1,top@1,top@1,top@1,top@1,top@10,top@10,top@10,top@10,top@10,top@10,top@5,top@5,top@5,top@5,top@5,top@5,train time (sec)
Unnamed: 0_level_1,map,mean_inv_user_freq,ndcg,precision,recall,serendipity,map,mean_inv_user_freq,ndcg,precision,recall,serendipity,map,mean_inv_user_freq,ndcg,precision,recall,serendipity,Unnamed: 19_level_1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
RandomModel,0.000169,6.9e-05,0.000176,0.000345,0.000176,0.000687,0.000169,0.000173,0.000175,6.9e-05,0.000155,0.0002,15.616147,15.611575,15.610409,4e-06,6e-06,6e-06,5.2e-05
PopularModel,0.076432,0.04272,0.052402,0.137413,0.033903,0.173492,0.076432,0.057932,0.043084,0.04272,0.078295,0.084109,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06,1.730997


## Визуализация результатов обучения

Помимо пользователей, указанных в задании, выберем ещё двоих по следующим признакам:
 - любит фильмы жанра "криминал"
 - посмотрел самый непопулярный фильм

In [18]:
def get_watch_unpopular_movie_user_id(interactions: pd.DataFrame) -> int:
    """Return the id of a user who interacted with the least-watched item.

    Popularity is the number of rows per ``item_id``. The item reported by
    ``value_counts().idxmin()`` is used, and the ``user_id`` of its first row
    (in frame order) is returned.

    Args:
        interactions: Frame with at least ``user_id`` and ``item_id`` columns.

    Returns:
        The ``user_id`` value exactly as stored in the column.

    Raises:
        ValueError: If there are no ``item_id`` values to rank.
    """
    films_rating = interactions["item_id"].value_counts()
    if films_rating.empty:
        raise ValueError("interactions contains no item_id values to rank")
    least_common_film_idx = films_rating.idxmin()
    # Pick the column before the row: materialising a whole row first forces all
    # columns into one dtype, which turns integer ids into floats in numeric frames.
    watchers = interactions.loc[interactions["item_id"] == least_common_film_idx, "user_id"]
    return watchers.iloc[0]

In [19]:
def get_movie_lover_by_genre(items: pd.DataFrame, interactions: pd.DataFrame, genre: str) -> int:
    """Return the id of the user with the most interactions in a given genre.

    Every interaction is joined with its item's ``genres`` text and counted when that
    text contains ``genre`` (case-insensitive, literal substring). Items without genre
    information never match.

    Args:
        items: Frame with at least ``item_id`` and ``genres`` columns.
        interactions: Frame with at least ``user_id`` and ``item_id`` columns.
        genre: Genre name to look for; taken literally, not as a regular expression.

    Returns:
        The ``user_id`` with the highest number of matching interactions. If nothing
        matches, all counts are zero and the first user in group order is returned.
    """
    # Join only the columns that are needed: this avoids suffix clashes when both
    # frames share other column names and keeps the merged frame small.
    merged = pd.merge(
        interactions[["user_id", "item_id"]],
        items[["item_id", "genres"]],
        on="item_id",
        how="left",
    )
    # regex=False: genre names such as "c++" must not be interpreted as patterns.
    condition = merged["genres"].str.contains(genre, case=False, na=False, regex=False)
    grouped_counts = condition.groupby(merged["user_id"]).sum()
    return grouped_counts.astype(int).idxmax()

In [20]:
# Two data-driven picks: a viewer of the least-watched item and the user with the most
# interactions whose genres mention "криминал".
watch_unpopular_movie_user_id = get_watch_unpopular_movie_user_id(interactions=interactions.df)
kriminal_movie_lover_id = get_movie_lover_by_genre(items=items, interactions=interactions.df, genre="криминал")
# The first three ids are hard-coded (per the text above, they are given by the assignment).
user_ids = [666262, 672861, 955527, watch_unpopular_movie_user_id, kriminal_movie_lover_id]

In [21]:
# Build a rectools Dataset from the complete interaction table (no train/test split here).
dataset = Dataset.construct(interactions.df)
# Item attributes handed to visualize_training_result as `item_data`.
item_data = items[["item_id", "title", "genres"]]

Посчитаем и отобразим результат

In [22]:
# Fit a fresh copy of each model on the full dataset and store its report under the
# model's class name; the objects in `models` themselves are left unfitted.
reports = {}
for model in models:
    init_model = deepcopy(model)
    init_model.fit(dataset)
    report_key = type(model).__name__
    reports[report_key] = visualize_training_result(
        model=init_model,
        dataset=dataset,
        user_ids=user_ids,
        item_data=item_data,
        k_recos=K_RECOS,
        interactions=interactions,
    )

In [23]:
# Show each model's report under a heading with the model name.
for model_name, report in reports.items():
    # print, not pprint: pprint renders the repr of a str, i.e. wraps the heading in quotes.
    print(f"Model name: {model_name}")
    display(report)

'Model name: RandomModel'


Unnamed: 0_level_0,Unnamed: 1_level_0,item_id,rank,title,genres,total_views,datetime,weight,watched_pct
user_id,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
666262,history,93,1,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1,1,1,1
666262,reco,7419,1,Ода радости,комедии,6,6,6,6
666262,reco,9109,2,Последняя битва,"драмы, военные",239,239,239,239
666262,reco,13917,3,Преисподняя,"драмы, детективы, триллеры, вестерн",877,877,877,877
666262,reco,13332,4,Лихорадка,ужасы,7,7,7,7
666262,reco,1331,5,Вечность,драмы,190,190,190,190
666262,reco,15448,6,Леший,"триллеры, криминал, детективы",166,166,166,166
666262,reco,1123,7,Богема,"драмы, мюзиклы, мелодрамы",136,136,136,136
666262,reco,9933,8,Шедевр,"драмы, комедии",323,323,323,323
666262,reco,3287,9,Единоборства для детей (3-6 лет) Школа героев (3-6 лет) Выпуск 6,"единоборства, фитнес, для детей",6,6,6,6


'Model name: PopularModel'


Unnamed: 0_level_0,Unnamed: 1_level_0,item_id,rank,title,genres,total_views,datetime,weight,watched_pct
user_id,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
666262,history,93,1,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1,1,1,1
666262,reco,10440,1,Хрустальный,"триллеры, детективы",202457,202457,202457,202457
666262,reco,15297,2,Клиника счастья,"драмы, мелодрамы",193123,193123,193123,193123
666262,reco,9728,3,Гнев человеческий,"боевики, триллеры",132865,132865,132865,132865
666262,reco,13865,4,Девятаев,"драмы, военные, приключения",122119,122119,122119,122119
666262,reco,4151,5,Секреты семейной жизни,комедии,91167,91167,91167,91167
666262,reco,3734,6,Прабабушка легкого поведения,комедии,74803,74803,74803,74803
666262,reco,2657,7,Подслушано,"драмы, триллеры",68581,68581,68581,68581
666262,reco,4880,8,Афера,комедии,55043,55043,55043,55043
666262,reco,142,9,Маша,"драмы, триллеры",45367,45367,45367,45367
