In [1]:
import warnings

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation / SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
import cupy

# Ask CuPy whether a usable GPU is available (the recorded output below is True).
# Presumably a prerequisite for the `implicit.gpu` ALS model imported later — TODO confirm.
cupy.is_available()

True

In [3]:
import re

import optuna
import pickle
import pandas as pd
from implicit.gpu.als import AlternatingLeastSquares
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MAP
from rectools.metrics import calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.tools import UserToItemAnnRecommender
from rectools.model_selection import cross_validate
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

In [4]:
# Load the three raw tables from CSV files that live next to the notebook directory.
interactions = pd.read_csv('../data/interactions.csv')
users = pd.read_csv('../data/users.csv')
items = pd.read_csv('../data/items.csv')

# Point `Columns.Datetime` at this dataset's date column so later cells can address it
# through the rectools constant.
# NOTE(review): this overwrites an attribute on the imported library class for the whole
# process; anything in rectools that captured the previous value before this line presumably
# keeps it. Renaming the DataFrame column to the library's own name would avoid the patch —
# confirm against the installed rectools version.
Columns.Datetime = 'last_watch_dt'

# Preview the first rows of the interactions table.
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [5]:
# Earliest and latest watch dates. The column has not been parsed yet, so the recorded
# output shows string values (min/max are taken on the raw column as loaded).
interactions[Columns.Datetime].min(), interactions[Columns.Datetime].max()

('2021-03-13', '2021-08-22')

## Preprocess

In [6]:
import numpy as np

# Remove rows whose date string is not exactly 10 characters long (the length of
# 'YYYY-MM-DD'); rows with a missing date also fail this check and are removed.
has_bad_date = interactions[Columns.Datetime].str.len() != 10
interactions.drop(index=interactions.index[has_bad_date.to_numpy()], inplace=True)

# Parse the remaining date strings into real timestamps.
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime],
    format='%Y-%m-%d',
)

# Latest interaction date; the train/test cut-off is derived from it in the next cell.
max_date = interactions[Columns.Datetime].max()

# Interaction weight: 3 when more than 10 (percent) of the item was watched, otherwise 1.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [7]:
# Hold out the final 7 days (counted back from the latest interaction date) as the test set;
# everything strictly before that cut-off is used for training.
split_date = max_date - pd.Timedelta(days=7)
is_train_row = interactions[Columns.Datetime] < split_date
is_test_row = interactions[Columns.Datetime] >= split_date

train = interactions[is_train_row].copy()
test = interactions[is_test_row].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [8]:
# Drop training interactions whose `total_dur` is below 300.
short_view_index = train.index[(train["total_dur"] < 300).to_numpy()]
train.drop(index=short_view_index, inplace=True)

# Users that appear in the test period but have no remaining training interactions
# are "cold"; remove their rows from the test set.
cold_users = set(test[Columns.User]).difference(train[Columns.User])
cold_row_index = test.index[test[Columns.User].isin(cold_users).to_numpy()]
test.drop(index=cold_row_index, inplace=True)

## Features

### User features

In [9]:
def get_user_features(users, data):
    """Build a long-format user feature table.

    Keeps only the users that occur in ``data[Columns.User]`` and stacks the
    ``sex``, ``age`` and ``income`` columns on top of each other.

    Args:
        users: DataFrame with a ``Columns.User`` column and the feature columns.
        data: DataFrame whose ``Columns.User`` values select the users to keep.

    Returns:
        DataFrame with columns ``id``, ``value`` and ``feature`` (the name of the
        source column). The original row index of ``users`` is preserved, so
        index labels repeat once per feature.
    """
    known_users = users[users[Columns.User].isin(data[Columns.User])].copy()

    def _as_long(feature_name):
        # reindex (rather than plain selection) yields an all-NaN column when the
        # feature is absent from `users` instead of raising.
        frame = known_users.reindex(columns=[Columns.User, feature_name])
        frame.columns = ["id", "value"]
        frame["feature"] = feature_name
        return frame

    return pd.concat([_as_long(name) for name in ("sex", "age", "income")])


# User features restricted to users that have interactions in the training split.
user_features = get_user_features(users, train)

In [10]:
# Display the long-format user feature table (id / value / feature).
user_features

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex
...,...,...,...
840184,529394,income_40_60,income
840186,80113,income_40_60,income
840188,312839,income_60_90,income
840189,191349,income_40_60,income


### Item features

In [11]:
def get_item_features(items, data):
    """Build a long-format item feature table.

    Keeps only the items that occur in ``data[Columns.Item]``. The comma-separated
    ``genres`` string is lower-cased and split so every genre becomes its own row;
    ``content_type`` contributes one row per item.

    Args:
        items: DataFrame with ``item_id``, ``genres`` and ``content_type`` columns.
        data: DataFrame whose ``Columns.Item`` values select the items to keep.

    Returns:
        DataFrame with columns ``id``, ``value`` and ``feature``: all ``genre`` rows
        first, followed by all ``content_type`` rows. The original row index of
        ``items`` is preserved (and repeated for multi-genre items).
    """
    catalog = items[items[Columns.Item].isin(data[Columns.Item])].copy()

    # "Drama, Comedy" -> ["drama", "comedy"]
    catalog["genre"] = (
        catalog["genres"]
        .str.lower()
        .str.replace(", ", ",", regex=False)
        .str.split(",")
    )

    genre_rows = catalog[["item_id", "genre"]].explode("genre")
    genre_rows.columns = ["id", "value"]
    genre_rows["feature"] = "genre"

    type_rows = catalog.reindex(columns=[Columns.Item, "content_type"])
    type_rows.columns = ["id", "value"]
    type_rows["feature"] = "content_type"

    return pd.concat([genre_rows, type_rows])


# Item features restricted to items that have interactions in the training split.
item_features = get_item_features(items, train)

In [12]:
# Display the long-format item feature table (id / value / feature).
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [13]:
# Assemble the rectools Dataset used for tuning and cross-validation: training interactions
# plus user/item features, with every feature declared as categorical.
train_dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

## Metrics

In [14]:
from rectools.metrics import NDCG, MeanInvUserFreq, Serendipity

# Every metric is evaluated at cut-offs 1, 5 and 10 and stored under the key "<Name>@<k>".
# Keys are generated metric by metric, so the insertion order is
# Precision@1, Precision@5, Precision@10, Recall@1, ... Serendipity@10.
metrics = {
    f"{metric_name}@{k}": metric_cls(k=k)
    for metric_name, metric_cls in (
        ("Precision", Precision),
        ("Recall", Recall),
        ("MAP", MAP),
        ("NDCG", NDCG),
        ("MIUF", MeanInvUserFreq),
        ("Serendipity", Serendipity),
    )
    for k in (1, 5, 10)
}

In [15]:
def show_pivot(results, group=False):
    pivot_results = results.groupby(["model"]).mean()

    if group:
        new_columns = sorted([
            (re.split("@", col)[0], int(re.split("@", col)[1])) if "@" in col else (col, "")
            for col in pivot_results.columns])
        pivot_results.columns = pd.MultiIndex.from_tuples(new_columns, names=["Metric", "Value"])

    display(
        pivot_results.style
        .highlight_min(color='lightcoral', axis=0)
        .highlight_max(color='lightgreen', axis=0)
    )

In [None]:
# Shared configuration for tuning, cross-validation and final training.
K_RECOS = 10  # number of recommendations requested per user
RANDOM_STATE = 42  # seed passed to the ALS and LightFM models
N_EPOCHS = 1  # NOTE(review): not referenced by any other cell visible in this notebook
N_THREADS = 6  # num_threads passed to the LightFM wrapper
N_SPLITS = 5  # number of folds for TimeRangeSplitter
TEST_SIZE = "7D"  # test_size passed to TimeRangeSplitter

## ALS tuning

In [None]:
def als_objective(trial, dataset, train, test):
    """Optuna objective for the ALS model: fit once and score MAP@10 on ``test``.

    Args:
        trial: Optuna trial used to sample ``factors``, ``iterations`` and
            ``fit_features_together``.
        dataset: rectools Dataset the model is fitted on and recommends from.
        train: interactions DataFrame passed to the metrics as previous interactions
            and used to derive the item catalog.
        test: interactions DataFrame with the held-out interactions; its unique users
            are the ones recommendations are generated for.

    Returns:
        The ``'MAP@10'`` entry of the metrics computed with the notebook-level
        ``metrics`` dict.
    """
    sampled = {
        "factors": trial.suggest_categorical('factors', [64, 128, 256]),
        "iterations": trial.suggest_categorical('iterations', [15, 30, 50]),
        "fit_features_together": trial.suggest_categorical('fit_features_together', [True, False]),
    }

    base_model = AlternatingLeastSquares(
        factors=sampled["factors"],
        iterations=sampled["iterations"],
        random_state=RANDOM_STATE,
    )
    wrapper = ImplicitALSWrapperModel(
        model=base_model,
        fit_features_together=sampled["fit_features_together"],
    )
    wrapper.fit(dataset)

    eval_users = test[Columns.User].unique()
    recos = wrapper.recommend(
        users=eval_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    train_catalog = train[Columns.Item].unique()
    scores = calc_metrics(
        metrics,
        recos,
        interactions=test,
        prev_interactions=train,
        catalog=train_catalog,
    )
    return scores['MAP@10']

In [None]:
# Search ALS hyper-parameters, maximising MAP@10 on the hold-out week.
# The sampler is seeded explicitly: an unseeded TPE sampler proposes different trials on
# every run, so the "best" parameters (and everything downstream) would not be reproducible.
study_als = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_als.optimize(lambda trial: als_objective(trial, train_dataset, train, test), n_trials=20)

In [None]:
# Keep the best ALS trial's parameters; later cells read "factors", "iterations" and
# "fit_features_together" from this dict.
best_params_ALS = study_als.best_params
print(best_params_ALS)

## LightFM tuning

In [None]:
def lfm_objective(trial, dataset, train, test):
    """Optuna objective for the LightFM model: fit once and score MAP@10 on ``test``.

    Args:
        trial: Optuna trial used to sample ``no_components``, ``loss``,
            ``learning_rate``, ``user_alpha``, ``item_alpha`` and ``n_epochs``.
        dataset: rectools Dataset the model is fitted on and recommends from.
        train: interactions DataFrame passed to the metrics as previous interactions
            and used to derive the item catalog.
        test: interactions DataFrame with the held-out interactions; its unique users
            are the ones recommendations are generated for.

    Returns:
        The ``'MAP@10'`` entry of the metrics computed with the notebook-level
        ``metrics`` dict.
    """
    # NOTE(review): user_alpha / item_alpha are sampled from [0, 1]; that looks like a very
    # wide range for regularisation penalties — confirm against the LightFM docs.
    lightfm_kwargs = {
        "no_components": trial.suggest_categorical('no_components', [64, 128, 256]),
        "loss": trial.suggest_categorical("loss", ["logistic", "bpr", "warp"]),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.01, log=True),
        "user_alpha": trial.suggest_float("user_alpha", 0, 1),
        "item_alpha": trial.suggest_float("item_alpha", 0, 1),
    }
    epochs = trial.suggest_int("n_epochs", 3, 7)

    wrapper = LightFMWrapperModel(
        model=LightFM(random_state=RANDOM_STATE, **lightfm_kwargs),
        epochs=epochs,
        num_threads=N_THREADS,
    )
    wrapper.fit(dataset)

    eval_users = test[Columns.User].unique()
    recos = wrapper.recommend(
        users=eval_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    train_catalog = train[Columns.Item].unique()
    scores = calc_metrics(
        metrics,
        recos,
        interactions=test,
        prev_interactions=train,
        catalog=train_catalog,
    )
    return scores['MAP@10']

In [None]:
# Search LightFM hyper-parameters, maximising MAP@10 on the hold-out week.
# The sampler is seeded explicitly: an unseeded TPE sampler proposes different trials on
# every run, so the "best" parameters (and everything downstream) would not be reproducible.
study_LFM = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_LFM.optimize(lambda trial: lfm_objective(trial, train_dataset, train, test), n_trials=20)

In [None]:
# Keep the best LightFM trial's parameters; the cross-validation cell reads the individual
# keys ("no_components", "loss", "learning_rate", "user_alpha", "item_alpha", "n_epochs").
best_params_LFM = study_LFM.best_params
print(best_params_LFM)

## Cross validation

In [None]:
# Candidate models for cross-validation, each configured with its tuned parameters.
# `.get` returns None for a missing key, so a typo in a parameter name would be passed to
# the model as None rather than raising here.
models = {
    "ALS": ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=best_params_ALS.get("factors"),
            iterations=best_params_ALS.get("iterations"),
            random_state=RANDOM_STATE,
        ),
        fit_features_together=best_params_ALS.get("fit_features_together"),
    ),
    "LightFM": LightFMWrapperModel(
        LightFM(
            no_components=best_params_LFM.get("no_components"),
            loss=best_params_LFM.get("loss"),
            learning_rate=best_params_LFM.get("learning_rate"),
            user_alpha=best_params_LFM.get("user_alpha"),
            item_alpha=best_params_LFM.get("item_alpha"),
            random_state=RANDOM_STATE,
        ),
        epochs=best_params_LFM.get("n_epochs"),
        num_threads=N_THREADS,
    ),
}

In [None]:
# Time-based cross-validation: N_SPLITS folds, each with a TEST_SIZE test window; all three
# filter flags (already-seen interactions, cold items, cold users) are switched on.
splitter = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [None]:
# Evaluate every candidate model on every fold. K_RECOS is the notebook-wide recommendation
# list length (10), the same value used during tuning.
results = cross_validate(
    train_dataset,
    splitter,
    metrics,
    models,
    k=K_RECOS,
    filter_viewed=True,
)

In [None]:
# One row per (model, fold) with a column per metric; the fold number is dropped because
# only per-model averages are reported below.
fold_metrics = results["metrics"]
df_results = pd.DataFrame.from_dict(fold_metrics).drop(columns="i_split")

In [None]:
# Display the raw per-fold metric table.
df_results

In [None]:
# Save the per-fold metrics next to the notebook (the DataFrame index is written as well).
df_results.to_csv('df_results.csv')

In [None]:
# Per-model averages, with metrics grouped under a two-level (Metric, Value) header.
show_pivot(df_results, group=True)

Модель ALS справилась лучше, будем использовать её

## Train

In [None]:
# Rebuild the feature tables against the full interaction log (train + test) for the final fit.
user_features_all = get_user_features(users, interactions)
item_features_all = get_item_features(items, interactions)

In [None]:
# Dataset for the production model, built from all interactions.
# NOTE(review): unlike `train`, `interactions` was not filtered by `total_dur < 300` in any
# visible cell, so the final model sees short views that tuning did not — confirm this is intended.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features_all,
    item_features_df=item_features_all,
    cat_user_features=["sex", "age", "income"],
    cat_item_features=["genre", "content_type"],
)

In [None]:
# Final ALS model configured with the tuned hyper-parameters.
# The parameters are passed explicitly instead of unpacking `best_params_ALS`: that dict also
# holds "fit_features_together", which is an argument of the rectools wrapper and not of
# implicit's AlternatingLeastSquares, so unpacking it into the inner model would fail and the
# tuned flag would never reach the wrapper.
ALS_model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=best_params_ALS["factors"],
        iterations=best_params_ALS["iterations"],
        random_state=RANDOM_STATE,
    ),
    fit_features_together=best_params_ALS["fit_features_together"],
)

In [None]:
# Train the final model on the full dataset.
ALS_model.fit(dataset)

In [None]:
# Smoke test: top-10 unseen items for the user in the first row of the interactions table.
ALS_model.recommend(interactions.user_id.head(1), dataset, k=10, filter_viewed=True)

In [None]:
# Persist the fitted model. The context manager closes the file handle (and flushes the data
# to disk) even if pickling fails, which a bare `open(...)` argument does not guarantee.
with open('../saved_models/als.pkl', "wb") as model_file:
    pickle.dump(ALS_model, model_file)

## ANN

In [None]:
# Extract the learned user and item embedding matrices from the fitted ALS model.
user_vectors, item_vectors = ALS_model.get_vectors()

In [None]:
# Approximate-nearest-neighbour recommender built on the ALS embeddings; the dataset's id maps
# are passed along with the vectors (presumably to translate external ids to rows — TODO confirm).
ALS_ANN = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [None]:
# Fit the ANN recommender (presumably builds the search index over item vectors — TODO confirm).
ALS_ANN.fit()

In [None]:
# Smoke test: 10 nearest items for the user in the first row of the interactions table.
ALS_ANN.get_item_list_for_user(interactions.user_id.head(1).values[0], top_n=10).tolist()

In [None]:
# Persist the ANN recommender. The context manager closes the file handle (and flushes the
# data to disk) even if pickling fails, which a bare `open(...)` argument does not guarantee.
with open('../saved_models/als_ann.pkl', "wb") as ann_file:
    pickle.dump(ALS_ANN, ann_file)