# Подготовим агрегированный рейтинг по всем интеракциям: транзакциям, добавлениям в закладки, проставлению рейтинга.

In [None]:
import pandas as pd
import os
import numpy as np
import json
import math
from tqdm import tqdm
from scipy import sparse as sp

from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Directory holding the input files (catalogue.json, transactions.csv,
# bookmarks.csv, ratings.csv); the pickles built with
# os.path.join(DATA_PATH, ...) in later cells are written here as well.
DATA_PATH = "data"

# Catalog

In [None]:
# Load the item catalogue: a JSON object keyed by element id, one record per title.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's default encoding.
with open(os.path.join(DATA_PATH, 'catalogue.json'), 'r', encoding='utf-8') as f:
    catalogue = json.load(f)

# JSON object keys are always strings; cast them to int so the index can be
# joined against the integer `element_uid` columns of the interaction tables.
# The dict-of-dicts constructor is column-oriented, hence the transpose to get
# one row per element.
catalog = pd.DataFrame({int(k): v for k, v in catalogue.items()}).transpose()

# Expand the `availability` collection into one 0/1 indicator column per mode.
for feature in ["purchase", "rent", "subscription"]:
    catalog[feature] = catalog["availability"].apply(lambda modes: feature in modes).astype(int)
catalog.drop(columns=["availability", "attributes"], inplace=True)

# Shift every duration by a constant 5.
# NOTE(review): presumably this keeps the later watched_time / duration ratio
# finite for zero-length entries — confirm the intent and the unit.
catalog["duration"] += 5

# Transactions

In [None]:
# Read the raw transaction log with an explicit, compact dtype for every column.
transactions_dtypes = {
    'element_uid': np.uint16,
    'user_uid': np.uint32,
    'consumption_mode': 'category',
    'ts': np.float64,
    'watched_time': np.uint64,
    'device_type': np.uint8,
    'device_manufacturer': np.uint8,
}
transactions_csv = os.path.join(DATA_PATH, 'transactions.csv')
transactions = pd.read_csv(transactions_csv, dtype=transactions_dtypes)

In [None]:
# Rescale watched_time by 1/60.
# NOTE(review): presumably seconds -> minutes so that it matches the unit of the
# catalog `duration` — confirm against the data description.
transactions["watched_time"] = transactions["watched_time"] / 60

# Attach the catalog attributes of every watched element. The left join keeps
# transactions whose element is missing from the catalog (their catalog columns
# are NaN).
transactions = transactions.merge(catalog, left_on="element_uid", right_index=True, how="left")

# Fraction of the title's duration that was watched. `catalog` is produced by
# transposing a mixed-type frame, which leaves its columns (including
# `duration`) as object dtype; cast it so the ratio is a real float64 column.
# Later steps (groupby quantiles, clipping) need a numeric dtype.
transactions["percent_watched"] = (
    transactions["watched_time"] / transactions["duration"].astype("float64")
)

# Bookmarks

In [None]:
# Read the bookmark events with an explicit dtype for every column.
bookmarks_dtypes = {
    'element_uid': np.uint16,
    'user_uid': np.uint32,
    'ts': np.float64,
}
bookmarks_csv = os.path.join(DATA_PATH, 'bookmarks.csv')
bookmarks = pd.read_csv(bookmarks_csv, dtype=bookmarks_dtypes)

In [None]:
# Enrich bookmarks with the catalog attributes and add the columns the
# transactions carry, filled with constants that identify a bookmark row.
bookmarks = bookmarks.merge(
    catalog, how="left", left_on="element_uid", right_index=True
).assign(
    consumption_mode="B",  # marker value used for bookmark rows
    percent_watched=0.5,   # fixed pseudo watch ratio assigned to a bookmark
    label=-1,              # bookmarks get the label -1
)

# Ratings

In [None]:
# Read the explicit ratings with an explicit dtype for every column.
ratings_dtypes = {
    'element_uid': np.uint16,
    'user_uid': np.uint32,
    'ts': np.float64,
    'rating': np.uint8,
}
ratings_csv = os.path.join(DATA_PATH, 'ratings.csv')
ratings = pd.read_csv(ratings_csv, dtype=ratings_dtypes)

In [None]:
# Persist the raw ratings before they are merged with the catalog.
# NOTE(review): unlike transactions.pkl and bookmarks.pkl, this file is written
# to the current working directory rather than DATA_PATH — confirm that readers
# of ratings.pkl expect it there.
ratings_export_columns = ["user_uid", "element_uid", "ts", "rating"]
ratings[ratings_export_columns].to_pickle("ratings.pkl")

In [None]:
# Enrich ratings with the catalog attributes and add the columns the
# transactions carry. The explicit rating is mapped to a pseudo watch ratio
# (rating + 2) / 10 and then dropped; every rating row gets label 1.
# (A filter keeping only ratings above 4 was tried here and is disabled.)
# NOTE(review): the marker "R" is also matched by the isin(["P", "R"]) checks
# applied to consumption_mode elsewhere in this notebook — confirm the overlap
# is intended.
ratings = (
    ratings.merge(catalog, how="left", left_on="element_uid", right_index=True)
    .assign(
        consumption_mode="R",
        percent_watched=lambda frame: (frame["rating"] + 2) / 10,
        label=1,
    )
    .drop(columns=["rating"])
)

Transactions and bookmarks overlap about 240k times.

# prepare label

Некоторые фильмы пользователь мог посмотреть несколько раз. <br>
Неизвестно, сколько серий в каждом из сериалов и сколько частей в каждом многосерийном фильме.

In [None]:
# Normalise `percent_watched` per content type.
#
# Every write goes through `transactions.loc[mask, column]`. The previous form,
# `transactions.percent_watched[mask] = ...`, is chained indexing: pandas may
# apply the assignment to a temporary object instead of the frame. The per-row
# `apply(axis=1)` calls are replaced by an equivalent vectorised
# map / divide / clip.


def _relative_to_reference(rows, reference):
    """Return percent_watched divided by the title's reference value, capped at 1.

    rows: frame slice holding the `percent_watched` and `element_uid` columns.
    reference: dict or Series mapping element_uid to that title's reference ratio.
    Rows whose element_uid is absent from `reference` yield NaN.
    """
    per_row_reference = rows["element_uid"].map(reference)
    return (rows["percent_watched"] / per_row_reference).clip(upper=1)


# Movies: limit the ratio to the range [0, 4].
is_movie = transactions["type"] == "movie"
transactions.loc[is_movie, "percent_watched"] = \
    transactions.loc[is_movie, "percent_watched"].clip(0, 4)

# Series: use each title's 0.5 quantile (median) of the watch ratio as its
# reference value. The `q75` suffix is kept only so that the variable name stays
# stable; the quantile taken is 0.5.
# NOTE(review): catalog durations were shifted by +5 when the catalog was built,
# so the `duration != 1` filter can only exclude titles with a raw duration of
# -4 — confirm whether it is still needed.
is_series = transactions["type"] == "series"
series_q75 = transactions[is_series & (transactions["duration"] != 1)] \
    .groupby("element_uid")["percent_watched"].quantile(0.5)
series_q75 = dict(series_q75)
transactions.loc[is_series, "percent_watched"] = _relative_to_reference(
    transactions.loc[is_series, ["percent_watched", "element_uid"]], series_q75
)

# Multipart movies: same idea, with each title's 0.2 quantile as the reference
# (again, the `q75` suffix does not match the quantile used).
is_multipart = transactions["type"] == "multipart_movie"
multipart_q75 = transactions[is_multipart] \
    .groupby("element_uid")["percent_watched"].quantile(0.2)
transactions.loc[is_multipart, "percent_watched"] = _relative_to_reference(
    transactions.loc[is_multipart, ["percent_watched", "element_uid"]], multipart_q75
)

Считаем, что пользователь потребил контент, если он:
- Посмотрел больше половины фильма
- Больше трети сериала
- Купил или взял в аренду

In [None]:
# A transaction is labelled 1 when at least one condition holds:
#   - its consumption mode is "P" or "R";
#   - it is not a series and more than half of it was watched;
#   - it is a series and more than a third of it was watched.
content_type = transactions["type"]
watch_ratio = transactions["percent_watched"]

paid_access = transactions["consumption_mode"].isin(["P", "R"])
non_series_watched = (watch_ratio > 0.5) & (content_type != "series")
series_watched = (watch_ratio > 1 / 3) & (content_type == "series")

transactions["label"] = (paid_access | non_series_watched | series_watched).astype(int)
# Share of transactions labelled 1.
print(transactions["label"].mean())

Объединим все интеракции

In [None]:
# Stack transactions, bookmarks and ratings into a single interaction table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. As with the chained appends, each frame keeps its
# own index labels (so labels repeat across the three sources) and columns that
# a source lacks are filled with NaN.
transactions = pd.concat([transactions, bookmarks, ratings])

Составим агрегированный рейтинг

In [None]:
# Aggregated rating per interaction:
#   movie / multipart_movie -> percent_watched * 5
#   series                  -> percent_watched * 10
#   consumption mode P or R -> fixed 15 (overrides the values above)
#   anything else           -> 0
#
# The column starts as float so that the fractional values below do not have to
# be written into an integer column. Writes go through
# `transactions.loc[mask, "rating"]`; the previous `transactions.rating.loc[mask] = ...`
# is chained assignment, which pandas may apply to a temporary object instead
# of the frame.
transactions["rating"] = 0.0

# The right-hand sides are taken as plain arrays selected with the same mask, so
# the assignment is positional and unaffected by the repeated index labels of
# the stacked frame.
film_rows = transactions["type"].isin(["movie", "multipart_movie"])
transactions.loc[film_rows, "rating"] = \
    transactions.loc[film_rows, "percent_watched"].to_numpy(dtype="float64") * 5

series_rows = transactions["type"] == "series"
transactions.loc[series_rows, "rating"] = \
    transactions.loc[series_rows, "percent_watched"].to_numpy(dtype="float64") * 10

# NOTE(review): rows coming from the ratings table carry consumption_mode "R" as
# well, so they are all set to 15 here and their percent_watched-based value is
# discarded — confirm this is intended.
transactions.loc[transactions["consumption_mode"].isin(["P", "R"]), "rating"] = 15

Выкинем из выборки неактивных пользователей

In [None]:
# Count interactions per user and keep only users with at least 3 of them.
user_cnt = transactions["user_uid"].value_counts()
enough_interactions = user_cnt >= 3
user_good = list(user_cnt.loc[enough_interactions].index)

keep_rows = transactions["user_uid"].isin(user_good)
transactions = transactions.loc[keep_rows]

# Displayed by the notebook: (users before filtering, users kept).
len(user_cnt), len(user_good)

In [None]:
# Index the interactions by (element_uid, user_uid) and keep only the columns
# that are written to disk.
transactions = transactions.set_index(["element_uid", "user_uid"])

output_columns = [
    "device_type", "device_manufacturer",
    "feature_1", "feature_2", "feature_3", "feature_4", "feature_5",
    "type", "purchase", "rent", "subscription",
    "label", "rating", "ts",
]
transactions = transactions[output_columns]

In [None]:
# Write the prepared interaction table next to the input data.
transactions_pickle_path = os.path.join(DATA_PATH, "transactions.pkl")
transactions.to_pickle(transactions_pickle_path)

Сохраним закладки для оставшихся в выборке пользователей

In [None]:
# Save the bookmarks of the users that survived the activity filter, keeping
# only the identifying columns and the timestamp.
kept_bookmarks = bookmarks.loc[
    bookmarks["user_uid"].isin(user_good),
    ["user_uid", "element_uid", "ts"],
]
kept_bookmarks.to_pickle(os.path.join(DATA_PATH, "bookmarks.pkl"))