In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import cupy

cupy.is_available()

True

In [3]:
import re

import optuna
import pickle
import pandas as pd
from implicit.gpu.als import AlternatingLeastSquares
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MAP
from rectools.metrics import calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.tools import UserToItemAnnRecommender
from rectools.model_selection import cross_validate
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

In [4]:
interactions = pd.read_csv('../data/interactions.csv')
users = pd.read_csv('../data/users.csv')
items = pd.read_csv('../data/items.csv')

Columns.Datetime = 'last_watch_dt'

interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [5]:
interactions[Columns.Datetime].min(), interactions[Columns.Datetime].max()

('2021-03-13', '2021-08-22')

## Preprocess

In [6]:
import numpy as np

interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [7]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [8]:
train.drop(train.query("total_dur < 300").index, inplace=True)
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

## Features

### User features

In [9]:
def get_user_features(users, data):
    users = users.loc[users[Columns.User].isin(data[Columns.User])].copy()
    user_features_frames = []
    for feature in ["sex", "age", "income"]:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    return pd.concat(user_features_frames)


user_features = get_user_features(users, train)

In [10]:
user_features

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex
...,...,...,...
840184,529394,income_40_60,income
840186,80113,income_40_60,income
840188,312839,income_60_90,income
840189,191349,income_40_60,income


### Item features

In [11]:
def get_item_features(items, data):
    items = items.loc[items[Columns.Item].isin(data[Columns.Item])].copy()
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    return pd.concat((genre_feature, content_feature))


item_features = get_item_features(items, train)

In [12]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [13]:
train_dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

## Metrics

In [14]:
from rectools.metrics import NDCG, MeanInvUserFreq, Serendipity

metrics = {
    'Precision@1': Precision(k=1),
    'Precision@5': Precision(k=5),
    'Precision@10': Precision(k=10),
    'Recall@1': Recall(k=1),
    'Recall@5': Recall(k=5),
    'Recall@10': Recall(k=10),
    'MAP@1': MAP(k=1),
    'MAP@5': MAP(k=5),
    'MAP@10': MAP(k=10),
    'NDCG@1': NDCG(k=1),
    'NDCG@5': NDCG(k=5),
    'NDCG@10': NDCG(k=10),
    'MIUF@1': MeanInvUserFreq(k=1),
    'MIUF@5': MeanInvUserFreq(k=5),
    'MIUF@10': MeanInvUserFreq(k=10),
    'Serendipity@1': Serendipity(k=1),
    'Serendipity@5': Serendipity(k=5),
    'Serendipity@10': Serendipity(k=10)
}

In [15]:
def show_pivot(results, group=False):
    pivot_results = results.groupby(["model"]).mean()

    if group:
        new_columns = sorted([
            (re.split("@", col)[0], int(re.split("@", col)[1])) if "@" in col else (col, "")
            for col in pivot_results.columns])
        pivot_results.columns = pd.MultiIndex.from_tuples(new_columns, names=["Metric", "Value"])

    display(
        pivot_results.style
        .highlight_min(color='lightcoral', axis=0)
        .highlight_max(color='lightgreen', axis=0)
    )

In [16]:
K_RECOS = 10
RANDOM_STATE = 42
N_EPOCHS = 1
N_THREADS = 4
N_SPLITS = 5
TEST_SIZE = "7D"

## ALS tuning

In [17]:
def als_objective(trial, dataset, train, test):
    factors = trial.suggest_categorical('factors', [10, 50, 100])
    iterations = trial.suggest_categorical('iterations', [15, 50])

    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            random_state=RANDOM_STATE,
            factors=factors,
            iterations=iterations,
        )
    )

    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metrics_results = calc_metrics(
        metrics, recos, interactions=test, prev_interactions=train,
        catalog=train[Columns.Item].unique()
    )

    return metrics_results['MAP@10']

In [18]:
study_als = optuna.create_study(direction='maximize')
study_als.optimize(lambda trial: als_objective(trial, train_dataset, train, test), n_trials=10)

[I 2023-12-06 11:22:36,973] A new study created in memory with name: no-name-beee96c6-2e98-474a-91a2-67e10ae9905f


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:26:39,536] Trial 0 finished with value: 0.06385558065679472 and parameters: {'factors': 100, 'iterations': 50}. Best is trial 0 with value: 0.06385558065679472.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:27:32,637] Trial 1 finished with value: 0.063848353336508 and parameters: {'factors': 100, 'iterations': 50}. Best is trial 0 with value: 0.06385558065679472.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:28:03,733] Trial 2 finished with value: 0.0633030984260848 and parameters: {'factors': 50, 'iterations': 15}. Best is trial 0 with value: 0.06385558065679472.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:28:35,182] Trial 3 finished with value: 0.0632984523818297 and parameters: {'factors': 50, 'iterations': 15}. Best is trial 0 with value: 0.06385558065679472.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:29:28,822] Trial 4 finished with value: 0.06384966222269303 and parameters: {'factors': 100, 'iterations': 50}. Best is trial 0 with value: 0.06385558065679472.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:29:59,369] Trial 5 finished with value: 0.06330106890242164 and parameters: {'factors': 50, 'iterations': 15}. Best is trial 0 with value: 0.06385558065679472.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:30:30,601] Trial 6 finished with value: 0.06329584596546124 and parameters: {'factors': 50, 'iterations': 15}. Best is trial 0 with value: 0.06385558065679472.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:31:23,856] Trial 7 finished with value: 0.06386032755729885 and parameters: {'factors': 100, 'iterations': 50}. Best is trial 7 with value: 0.06386032755729885.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:31:53,651] Trial 8 finished with value: 0.06330474805696186 and parameters: {'factors': 50, 'iterations': 15}. Best is trial 7 with value: 0.06386032755729885.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 11:32:22,949] Trial 9 finished with value: 0.06427611676949428 and parameters: {'factors': 10, 'iterations': 15}. Best is trial 9 with value: 0.06427611676949428.


In [19]:
best_params_ALS = study_als.best_params
print(best_params_ALS)

{'factors': 10, 'iterations': 15}


## LightFM tuning

In [20]:
def lfm_objective(trial, dataset, train, test):
    no_components = trial.suggest_categorical('no_components', [10, 50, 100])
    loss = trial.suggest_categorical("loss", ["logistic", "bpr", "warp"])
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.01, log=True)
    user_alpha = trial.suggest_float("user_alpha", 0, 1)
    item_alpha = trial.suggest_float("item_alpha", 0, 1)

    model = LightFMWrapperModel(
        model=LightFM(
            random_state=RANDOM_STATE,
            no_components=no_components,
            loss=loss,
            learning_rate=learning_rate,
            user_alpha=user_alpha,
            item_alpha=item_alpha
        ),
        epochs=N_EPOCHS,
        num_threads=N_THREADS
    )

    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metrics_vals = calc_metrics(
        metrics, recos, interactions=test, prev_interactions=train,
        catalog=train[Columns.Item].unique()
    )

    return metrics_vals['MAP@10']

In [21]:
study_LFM = optuna.create_study(direction='maximize')
study_LFM.optimize(lambda trial: lfm_objective(trial, train_dataset, train, test), n_trials=10)

[I 2023-12-06 11:32:22,977] A new study created in memory with name: no-name-59307908-8931-476e-b563-e3cac1f33685
[I 2023-12-06 11:32:50,606] Trial 0 finished with value: 0.07265809812765157 and parameters: {'no_components': 50, 'loss': 'warp', 'learning_rate': 0.007742501843344539, 'user_alpha': 0.5123587441430492, 'item_alpha': 0.31760021207828404}. Best is trial 0 with value: 0.07265809812765157.
[I 2023-12-06 11:33:16,687] Trial 1 finished with value: 0.00019057919356641156 and parameters: {'no_components': 50, 'loss': 'logistic', 'learning_rate': 0.0002072808212971728, 'user_alpha': 0.14051583996398875, 'item_alpha': 0.8812806102886452}. Best is trial 0 with value: 0.07265809812765157.
[I 2023-12-06 11:33:51,058] Trial 2 finished with value: 0.06429750033772968 and parameters: {'no_components': 100, 'loss': 'bpr', 'learning_rate': 0.0012994186678969763, 'user_alpha': 0.3091429174489484, 'item_alpha': 0.15665319799912714}. Best is trial 0 with value: 0.07265809812765157.
[I 2023-12

In [22]:
best_params_LFM = study_LFM.best_params
print(best_params_LFM)

{'no_components': 50, 'loss': 'warp', 'learning_rate': 0.007742501843344539, 'user_alpha': 0.5123587441430492, 'item_alpha': 0.31760021207828404}


## Cross validation

In [23]:
models = {
    "ALS": ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            **best_params_ALS,
            random_state=RANDOM_STATE,
        ),
        fit_features_together=True,
    ),
    "LightFM": LightFMWrapperModel(
        LightFM(
            **best_params_LFM,
            random_state=RANDOM_STATE,
        ),
        epochs=N_EPOCHS,
        num_threads=N_THREADS,
    ),
}

In [24]:
splitter = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [25]:
results = cross_validate(train_dataset, splitter, metrics, models, k=10, filter_viewed=True)

In [26]:
df_results = pd.DataFrame.from_dict(results["metrics"]).drop("i_split", axis=1)

In [27]:
df_results

Unnamed: 0,model,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
0,ALS,0.107981,0.06842,0.061768,0.182906,0.039268,0.226803,0.107981,0.071828,0.052614,0.06842,0.113138,0.120289,2.950321,3.739064,4.493684,1.6e-05,1.6e-05,1.9e-05
1,LightFM,0.108641,0.071841,0.065664,0.194812,0.040819,0.236831,0.108641,0.074041,0.053917,0.071841,0.117047,0.124457,2.466258,3.197221,3.844187,8e-06,5e-06,3e-06
2,ALS,0.088842,0.054981,0.053463,0.156332,0.034176,0.19489,0.088842,0.061216,0.045071,0.054981,0.094053,0.100251,3.18787,4.116845,4.9979,2e-05,1.8e-05,2e-05
3,LightFM,0.086931,0.056031,0.057832,0.168967,0.037326,0.211986,0.086931,0.0639,0.047547,0.056031,0.097665,0.104865,2.419423,3.15571,3.894346,7e-06,4e-06,3e-06
4,ALS,0.070986,0.042121,0.047901,0.13595,0.032059,0.175974,0.070986,0.05328,0.040363,0.042121,0.077316,0.08363,2.901963,3.619253,4.385052,1.3e-05,1.4e-05,1.7e-05
5,LightFM,0.00078,0.000303,0.00961,0.028881,0.004994,0.029423,0.00078,0.008275,0.005501,0.000303,0.009833,0.009902,8.882222,11.167473,12.505207,2e-05,1.3e-05,1.5e-05
6,ALS,0.082509,0.04928,0.05025,0.140407,0.032747,0.176789,0.082509,0.057229,0.04259,0.04928,0.083862,0.08974,2.956104,3.663775,4.505235,1.6e-05,1.5e-05,1.9e-05
7,LightFM,0.001369,0.000589,0.011505,0.03381,0.005914,0.034447,0.001369,0.009985,0.006602,0.000589,0.011672,0.011771,8.906238,13.073777,12.757168,3.6e-05,1.1e-05,1.3e-05
8,ALS,0.076388,0.045692,0.04633,0.130041,0.03015,0.164334,0.076388,0.052875,0.039289,0.045692,0.077742,0.083134,2.834639,3.621053,4.542421,1.4e-05,1.7e-05,2.1e-05
9,LightFM,0.076814,0.047112,0.048487,0.136905,0.030927,0.170296,0.076814,0.053875,0.039729,0.047112,0.079398,0.084726,2.358067,3.078199,3.75544,2e-06,2e-06,2e-06


In [28]:
df_results.to_csv('df_results.csv')

In [29]:
show_pivot(df_results, group=True)

Metric,MAP,MAP,MAP,MIUF,MIUF,MIUF,NDCG,NDCG,NDCG,Precision,Precision,Precision,Recall,Recall,Recall,Serendipity,Serendipity,Serendipity
Value,1,5,10,1,5,10,1,5,10,1,5,10,1,5,10,1,5,10
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
ALS,0.085341,0.052099,0.051942,0.149127,0.03368,0.187758,0.085341,0.059286,0.043985,0.052099,0.089222,0.095409,2.96618,3.751998,4.584858,1.6e-05,1.6e-05,1.9e-05
LightFM,0.054907,0.035175,0.038619,0.112675,0.023996,0.136597,0.054907,0.042015,0.030659,0.035175,0.063123,0.067144,5.006442,6.734476,7.35127,1.4e-05,7e-06,7e-06


Модель ALS справилась лучше, будем использовать её

## Train

In [30]:
user_features_all = get_user_features(users, interactions)
item_features_all = get_item_features(items, interactions)

In [31]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features_all,
    item_features_df=item_features_all,
    cat_user_features=["sex", "age", "income"],
    cat_item_features=["genre", "content_type"],
)

In [32]:
ALS_model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        **best_params_ALS,
        random_state=RANDOM_STATE,
    )
)

In [33]:
ALS_model.fit(dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f1282b92f20>

In [34]:
ALS_model.recommend(interactions.user_id.head(1), dataset, k=10, filter_viewed=True)

Unnamed: 0,user_id,item_id,score,rank
0,176549,7571,1.314032,1
1,176549,16270,0.808068,2
2,176549,11749,0.740079,3
3,176549,13159,0.704768,4
4,176549,1844,0.704624,5
5,176549,15266,0.703281,6
6,176549,13915,0.687961,7
7,176549,12743,0.66119,8
8,176549,11985,0.658377,9
9,176549,11310,0.623677,10


In [35]:
pickle.dump(ALS_model, open('../saved_models/als.pkl', "wb"))

## ANN

In [36]:
user_vectors, item_vectors = ALS_model.get_vectors()

In [37]:
ALS_ANN = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [38]:
ALS_ANN.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7f127faa0460>

In [39]:
ALS_ANN.get_item_list_for_user(interactions.user_id.head(1).values[0], top_n=10).tolist()

[13018, 10761, 7582, 16166, 16270, 3182, 11749, 5411, 12965, 11985]

In [40]:
pickle.dump(ALS_ANN, open('../saved_models/als_ann.pkl', "wb"))