In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import cupy

cupy.is_available()

True

In [3]:
import re

import optuna
import pickle
import pandas as pd
from implicit.gpu.als import AlternatingLeastSquares
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MAP
from rectools.metrics import calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.tools import UserToItemAnnRecommender
from rectools.model_selection import cross_validate
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

In [4]:
interactions = pd.read_csv('../data/interactions.csv')
users = pd.read_csv('../data/users.csv')
items = pd.read_csv('../data/items.csv')

Columns.Datetime = 'last_watch_dt'

interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [5]:
interactions[Columns.Datetime].min(), interactions[Columns.Datetime].max()

('2021-03-13', '2021-08-22')

## Preprocess

In [6]:
import numpy as np

interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [7]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [8]:
train.drop(train.query("total_dur < 300").index, inplace=True)
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

## Features

### User features

In [9]:
def get_user_features(users, data):
    users = users.loc[users[Columns.User].isin(data[Columns.User])].copy()
    user_features_frames = []
    for feature in ["sex", "age", "income"]:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    return pd.concat(user_features_frames)


user_features = get_user_features(users, train)

In [10]:
user_features

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex
...,...,...,...
840184,529394,income_40_60,income
840186,80113,income_40_60,income
840188,312839,income_60_90,income
840189,191349,income_40_60,income


### Item features

In [11]:
def get_item_features(items, data):
    items = items.loc[items[Columns.Item].isin(data[Columns.Item])].copy()
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    return pd.concat((genre_feature, content_feature))


item_features = get_item_features(items, train)

In [12]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [13]:
train_dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

## Metrics

In [14]:
from rectools.metrics import NDCG, MeanInvUserFreq, Serendipity

metrics = {
    'Precision@1': Precision(k=1),
    'Precision@5': Precision(k=5),
    'Precision@10': Precision(k=10),
    'Recall@1': Recall(k=1),
    'Recall@5': Recall(k=5),
    'Recall@10': Recall(k=10),
    'MAP@1': MAP(k=1),
    'MAP@5': MAP(k=5),
    'MAP@10': MAP(k=10),
    'NDCG@1': NDCG(k=1),
    'NDCG@5': NDCG(k=5),
    'NDCG@10': NDCG(k=10),
    'MIUF@1': MeanInvUserFreq(k=1),
    'MIUF@5': MeanInvUserFreq(k=5),
    'MIUF@10': MeanInvUserFreq(k=10),
    'Serendipity@1': Serendipity(k=1),
    'Serendipity@5': Serendipity(k=5),
    'Serendipity@10': Serendipity(k=10)
}

In [15]:
def show_pivot(results, group=False):
    pivot_results = results.groupby(["model"]).mean()

    if group:
        new_columns = sorted([
            (re.split("@", col)[0], int(re.split("@", col)[1])) if "@" in col else (col, "")
            for col in pivot_results.columns])
        pivot_results.columns = pd.MultiIndex.from_tuples(new_columns, names=["Metric", "Value"])

    display(
        pivot_results.style
        .highlight_min(color='lightcoral', axis=0)
        .highlight_max(color='lightgreen', axis=0)
    )

In [16]:
K_RECOS = 10
RANDOM_STATE = 42
N_EPOCHS = 1
N_THREADS = 6
N_SPLITS = 5
TEST_SIZE = "7D"

## ALS tuning

In [17]:
def als_objective(trial, dataset, train, test):
    factors = trial.suggest_categorical('factors', [64, 128])
    iterations = trial.suggest_categorical('iterations', [15, 30])
    fit_features_together = trial.suggest_categorical('fit_features_together', [True, False])
    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            random_state=RANDOM_STATE,
            factors=factors,
            iterations=iterations,
        ),
        fit_features_together=fit_features_together
    )

    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metrics_results = calc_metrics(
        metrics, recos, interactions=test, prev_interactions=train,
        catalog=train[Columns.Item].unique()
    )

    return metrics_results['MAP@10']

In [18]:
study_als = optuna.create_study(direction='maximize')
study_als.optimize(lambda trial: als_objective(trial, train_dataset, train, test), n_trials=10)

[I 2023-12-06 14:25:53,577] A new study created in memory with name: no-name-498a71aa-1034-4727-b63a-21a0db0754b1


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 14:26:33,568] Trial 0 finished with value: 0.06399587552855357 and parameters: {'factors': 128, 'iterations': 15, 'fit_features_together': False}. Best is trial 0 with value: 0.06399587552855357.
[I 2023-12-06 14:28:09,714] Trial 1 finished with value: 0.05530208285539027 and parameters: {'factors': 128, 'iterations': 30, 'fit_features_together': True}. Best is trial 0 with value: 0.06399587552855357.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 14:28:51,009] Trial 2 finished with value: 0.06340623599713466 and parameters: {'factors': 64, 'iterations': 30, 'fit_features_together': False}. Best is trial 0 with value: 0.06399587552855357.
[I 2023-12-06 14:30:18,783] Trial 3 finished with value: 0.05390091348316562 and parameters: {'factors': 128, 'iterations': 30, 'fit_features_together': True}. Best is trial 0 with value: 0.06399587552855357.
[I 2023-12-06 14:31:44,765] Trial 4 finished with value: 0.05506882315456147 and parameters: {'factors': 128, 'iterations': 30, 'fit_features_together': True}. Best is trial 0 with value: 0.06399587552855357.
[I 2023-12-06 14:33:08,917] Trial 5 finished with value: 0.05605362375057268 and parameters: {'factors': 128, 'iterations': 30, 'fit_features_together': True}. Best is trial 0 with value: 0.06399587552855357.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 14:33:54,092] Trial 6 finished with value: 0.06374667976805742 and parameters: {'factors': 128, 'iterations': 30, 'fit_features_together': False}. Best is trial 0 with value: 0.06399587552855357.
[I 2023-12-06 14:34:40,645] Trial 7 finished with value: 0.07590319840504346 and parameters: {'factors': 64, 'iterations': 15, 'fit_features_together': True}. Best is trial 7 with value: 0.07590319840504346.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 14:35:20,524] Trial 8 finished with value: 0.06340421085851533 and parameters: {'factors': 64, 'iterations': 30, 'fit_features_together': False}. Best is trial 7 with value: 0.07590319840504346.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 14:36:00,432] Trial 9 finished with value: 0.06340054768569103 and parameters: {'factors': 64, 'iterations': 30, 'fit_features_together': False}. Best is trial 7 with value: 0.07590319840504346.


In [19]:
best_params_ALS = study_als.best_params
print(best_params_ALS)

{'factors': 64, 'iterations': 15, 'fit_features_together': True}


## LightFM tuning

In [20]:
def lfm_objective(trial, dataset, train, test):
    no_components = trial.suggest_categorical('no_components', [10, 20, 30])
    loss = trial.suggest_categorical("loss", ["logistic", "bpr", "warp"])
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.01, log=True)
    user_alpha = trial.suggest_float("user_alpha", 0, 1)
    item_alpha = trial.suggest_float("item_alpha", 0, 1)

    n_epochs = trial.suggest_int("n_epochs", 3, 7)

    model = LightFMWrapperModel(
        model=LightFM(
            random_state=RANDOM_STATE,
            no_components=no_components,
            loss=loss,
            learning_rate=learning_rate,
            user_alpha=user_alpha,
            item_alpha=item_alpha
        ),
        epochs=n_epochs,
        num_threads=N_THREADS
    )

    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metrics_vals = calc_metrics(
        metrics, recos, interactions=test, prev_interactions=train,
        catalog=train[Columns.Item].unique()
    )

    return metrics_vals['MAP@10']

In [21]:
study_LFM = optuna.create_study(direction='maximize')
study_LFM.optimize(lambda trial: lfm_objective(trial, train_dataset, train, test), n_trials=20)

[I 2023-12-06 14:44:11,990] A new study created in memory with name: no-name-d388b942-4436-476d-8e9c-ca35ec5db73c
[I 2023-12-06 14:44:54,043] Trial 0 finished with value: 0.00019070489853690797 and parameters: {'no_components': 20, 'loss': 'logistic', 'learning_rate': 0.0025215926229963615, 'user_alpha': 0.6366322081937311, 'item_alpha': 0.7332202384832438, 'n_epochs': 5}. Best is trial 0 with value: 0.00019070489853690797.
[I 2023-12-06 14:45:31,797] Trial 1 finished with value: 0.03228420127425345 and parameters: {'no_components': 10, 'loss': 'bpr', 'learning_rate': 0.00046525051427849924, 'user_alpha': 0.7666916259805426, 'item_alpha': 0.44348686594959597, 'n_epochs': 4}. Best is trial 1 with value: 0.03228420127425345.
[I 2023-12-06 14:46:01,122] Trial 2 finished with value: 0.000191074853466198 and parameters: {'no_components': 10, 'loss': 'logistic', 'learning_rate': 0.00013524569852223927, 'user_alpha': 0.5071465111844451, 'item_alpha': 0.5176606388957853, 'n_epochs': 4}. Best i

In [22]:
best_params_LFM = study_LFM.best_params
print(best_params_LFM)

{'no_components': 30, 'loss': 'warp', 'learning_rate': 0.005435663137995463, 'user_alpha': 0.12482318873553576, 'item_alpha': 0.10577830153990038, 'n_epochs': 7}


 ## Cross validation

In [23]:
models = {
    "ALS": ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=best_params_ALS.get("factors"),
            iterations=best_params_ALS.get("iterations"),
            random_state=RANDOM_STATE,
        ),
        fit_features_together=best_params_ALS.get("fit_features_together"),
    ),
    "LightFM": LightFMWrapperModel(
        LightFM(
            no_components=best_params_LFM.get("no_components"),
            loss=best_params_LFM.get("loss"),
            learning_rate=best_params_LFM.get("learning_rate"),
            user_alpha=best_params_LFM.get("user_alpha"),
            item_alpha=best_params_LFM.get("item_alpha"),
            random_state=RANDOM_STATE,
        ),
        epochs=best_params_LFM.get("n_epochs"),
        num_threads=N_THREADS,
    ),
}

In [24]:
splitter = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [25]:
results = cross_validate(train_dataset, splitter, metrics, models, k=10, filter_viewed=True)

In [26]:
df_results = pd.DataFrame.from_dict(results["metrics"]).drop("i_split", axis=1)

In [27]:
df_results

Unnamed: 0,model,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
0,ALS,0.110871,0.070279,0.061746,0.18325,0.039187,0.226363,0.110871,0.072231,0.052823,0.070279,0.114314,0.121325,2.940106,3.712729,4.427078,1.7e-05,1.4e-05,1.7e-05
1,LightFM,0.109591,0.072632,0.066996,0.199032,0.041624,0.241599,0.109591,0.074497,0.054246,0.072632,0.117435,0.124739,2.484068,3.220469,3.804679,8e-06,6e-06,4e-06
2,ALS,0.09579,0.059589,0.058551,0.170933,0.037117,0.210199,0.09579,0.066608,0.048802,0.059589,0.102096,0.108446,2.926304,3.65336,4.432708,1.4e-05,1.7e-05,1.9e-05
3,LightFM,0.086998,0.056075,0.061402,0.179055,0.03846,0.218891,0.086998,0.066432,0.048749,0.056075,0.100362,0.107178,2.419433,3.151028,3.776357,7e-06,5e-06,4e-06
4,ALS,0.079956,0.049218,0.051152,0.146559,0.03317,0.183745,0.079956,0.057718,0.042849,0.049218,0.086453,0.092333,2.935342,3.634549,4.442654,2e-05,1.7e-05,2e-05
5,LightFM,0.082057,0.052285,0.053741,0.154652,0.034554,0.19316,0.082057,0.058774,0.043709,0.052285,0.088137,0.094597,2.393639,3.110923,3.761679,2e-06,3e-06,3e-06
6,ALS,0.086459,0.051891,0.05151,0.144509,0.033551,0.182176,0.086459,0.058903,0.043789,0.051891,0.086983,0.093016,2.945773,3.631755,4.470478,1.8e-05,1.6e-05,2e-05
7,LightFM,0.081392,0.050193,0.052507,0.147997,0.033605,0.183914,0.081392,0.058151,0.042993,0.050193,0.085565,0.091404,2.37287,3.082924,3.764795,2e-06,3e-06,2e-06
8,ALS,0.079176,0.047634,0.047727,0.134362,0.031203,0.170488,0.079176,0.054457,0.040556,0.047634,0.08049,0.086191,2.905897,3.583883,4.478456,1.5e-05,1.7e-05,2.2e-05
9,LightFM,0.076837,0.047117,0.048722,0.137553,0.030625,0.168277,0.076837,0.054054,0.039566,0.047117,0.079584,0.084598,2.357215,3.066911,3.752297,2e-06,2e-06,2e-06


In [30]:
df_results.groupby("model").mean()

Unnamed: 0_level_0,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALS,0.09045,0.055722,0.054137,0.155923,0.034846,0.194594,0.09045,0.061983,0.045764,0.055722,0.094067,0.100262,2.930684,3.643255,4.450275,1.7e-05,1.6e-05,2e-05
LightFM,0.087375,0.05566,0.056674,0.163658,0.035774,0.201168,0.087375,0.062381,0.045853,0.05566,0.094216,0.100503,2.405445,3.126451,3.771961,4e-06,4e-06,3e-06


In [31]:
df_results.to_csv('df_results2.csv')

In [29]:
show_pivot(df_results, group=True)

Metric,MAP,MAP,MAP,MIUF,MIUF,MIUF,NDCG,NDCG,NDCG,Precision,Precision,Precision,Recall,Recall,Recall,Serendipity,Serendipity,Serendipity
Value,1,5,10,1,5,10,1,5,10,1,5,10,1,5,10,1,5,10
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
ALS,0.09045,0.055722,0.054137,0.155923,0.034846,0.194594,0.09045,0.061983,0.045764,0.055722,0.094067,0.100262,2.930684,3.643255,4.450275,1.7e-05,1.6e-05,2e-05
LightFM,0.087375,0.05566,0.056674,0.163658,0.035774,0.201168,0.087375,0.062381,0.045853,0.05566,0.094216,0.100503,2.405445,3.126451,3.771961,4e-06,4e-06,3e-06


Модель ALS справилась лучше, будем использовать её

## Train

In [34]:
user_features_all = get_user_features(users, interactions)
item_features_all = get_item_features(items, interactions)

In [35]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features_all,
    item_features_df=item_features_all,
    cat_user_features=["sex", "age", "income"],
    cat_item_features=["genre", "content_type"],
)

## ALS

In [36]:
ALS_model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=best_params_ALS.get("factors"),
        iterations=best_params_ALS.get("iterations"),
        random_state=RANDOM_STATE,
    )
)

In [37]:
ALS_model.fit(dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fa992777910>

In [38]:
ALS_model.recommend(interactions.user_id.head(1), dataset, k=10, filter_viewed=True)

Unnamed: 0,user_id,item_id,score,rank
0,176549,11749,0.92116,1
1,176549,16270,0.857889,2
2,176549,1287,0.833349,3
3,176549,11985,0.800137,4
4,176549,10073,0.798379,5
5,176549,9900,0.777797,6
6,176549,334,0.763961,7
7,176549,12173,0.720251,8
8,176549,13159,0.669694,9
9,176549,14942,0.669363,10


In [None]:
pickle.dump(ALS_model, open('../saved_models/als.pkl', "wb"))

## LightFM

In [39]:
LFM_model = LightFMWrapperModel(
    LightFM(
        no_components=best_params_LFM.get("no_components"),
        loss=best_params_LFM.get("loss"),
        learning_rate=best_params_LFM.get("learning_rate"),
        user_alpha=best_params_LFM.get("user_alpha"),
        item_alpha=best_params_LFM.get("item_alpha"),
        random_state=RANDOM_STATE,
    ),
    epochs=best_params_LFM.get("n_epochs"),
    num_threads=N_THREADS,
)

In [40]:
LFM_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fa977944c10>

In [52]:
LFM_model.recommend([interactions.user_id.head(1).values[0]], dataset, k=10, filter_viewed=True)

Unnamed: 0,user_id,item_id,score,rank
0,176549,15297,-0.000789,1
1,176549,10440,-0.000806,2
2,176549,13865,-0.000872,3
3,176549,4151,-0.00088,4
4,176549,3734,-0.0009,5
5,176549,2657,-0.000927,6
6,176549,4880,-0.000935,7
7,176549,6809,-0.000949,8
8,176549,142,-0.000962,9
9,176549,8636,-0.000966,10


## ANN

In [53]:
user_vectors_als, item_vectors_als = ALS_model.get_vectors()

In [54]:
ALS_ANN = UserToItemAnnRecommender(
    user_vectors=user_vectors_als,
    item_vectors=item_vectors_als,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [57]:
user_vectors_lfm, item_vectors_lfm = LFM_model.get_vectors(dataset)

In [58]:
LFM_ANN = UserToItemAnnRecommender(
    user_vectors=user_vectors_lfm,
    item_vectors=item_vectors_lfm,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)

In [59]:
ALS_ANN.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7fa976201bd0>

In [60]:
ALS_ANN.get_item_list_for_user(interactions.user_id.head(1).values[0], top_n=10).tolist()

[14470, 7582, 7102, 101, 12324, 10761, 9506, 9728, 13243, 11749]

In [61]:
LFM_ANN.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7fa976200fa0>

In [62]:
LFM_ANN.get_item_list_for_user(interactions.user_id.head(1).values[0], top_n=10).tolist()

[15297, 10440, 9728, 13865, 4151, 3734, 2657, 4880, 6809, 142]

In [63]:
pickle.dump(ALS_ANN, open('../saved_models/als_ann.pkl', "wb"))

In [64]:
pickle.dump(LFM_ANN, open('../saved_models/lfm_ann.pkl', "wb"))