## Imports

In [6]:
import numpy as np
import pandas as pd
from rectools import Columns
from rectools.dataset import Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import Precision, Recall, MAP, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.dataset.interactions import Interactions
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel
from tqdm.notebook import tqdm

In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)

## Data

In [8]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

In [9]:
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight},
                    inplace=True)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

## Prepare train

In [10]:
n_folds=5
unit='W' 
n_units=1

periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + n_units, unit=unit)  
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Start date and last date of the test fold: {start_date, last_date}")
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Start date and last date of the test fold: (Timestamp('2021-07-11 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-07-11' '2021-07-18' '2021-07-25' '2021-08-01' '2021-08-08'
 '2021-08-15']
Real number of folds: 5


In [11]:
(train_ids, test_ids, fold_info) = cv.split(Interactions(interactions), collect_fold_stats=True).__next__()
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

In [12]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

In [13]:
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [14]:
print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 640144
items_mapping amount: 14711


In [15]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=None
)

## CosineRecommender, TFIDFRecommender, UserKNN BMP25 with CV

In [16]:
k = [10, 20, 30, 50, 100]

models = {}

for i in k:
    
    models[f"CosineRecommender_k_{i}"] = ImplicitItemKNNWrapperModel(model=CosineRecommender(K=i))
    models[f"TFIDFRecommender_k_{i}"] = ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=i))
    models[f"BM25Recommender_k_{i}"] = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=i))   

In [17]:
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

In [19]:
metrics_result = {}

fold_iter = cv.split(Interactions(interactions), collect_fold_stats=True)
    
for idx_fold, (train_ids, test_ids, fold_info) in tqdm(enumerate(fold_iter)):
    
    idx_fold += 1
    
    train = interactions.loc[train_ids]
    test = interactions.loc[test_ids]
    
    catalog = train['user_id'].unique()

    dataset = Dataset.construct(
        interactions_df=train,
        user_features_df=None,
        item_features_df=None

    )
        
    for model_name in models:
        

        model = models[model_name]
        model.fit(dataset)
        
        recs = model.recommend(
                test['user_id'].unique(), 
                dataset=dataset, 
                k=10, 
                filter_viewed=False
        )

        metric_values_itemknn = calc_metrics(
                metrics,
                reco=recs,
                interactions=test,
                prev_interactions=train,
                catalog=catalog
            )

        metrics_result[f"{model_name}_fold_{idx_fold}"] = metric_values_itemknn
        metric_values_itemknn.update({"fold" : idx_fold})

0it [00:00, ?it/s]

In [20]:
df_metrics = pd.DataFrame.from_dict(metrics_result, orient='index')

In [21]:
df_metrics

Unnamed: 0,prec@10,recall@10,novelty,serendipity,fold
CosineRecommender_k_10_fold_1,0.021501,0.126566,8.041791,1.055256e-06,1
TFIDFRecommender_k_10_fold_1,0.027819,0.158146,6.794618,1.672625e-06,1
BM25Recommender_k_10_fold_1,0.040183,0.22201,4.04386,5.120128e-07,1
CosineRecommender_k_20_fold_1,0.022033,0.129254,7.946546,1.087935e-06,1
TFIDFRecommender_k_20_fold_1,0.028068,0.159188,6.724442,1.673228e-06,1
BM25Recommender_k_20_fold_1,0.040361,0.222558,4.004138,5.016682e-07,1
CosineRecommender_k_30_fold_1,0.022143,0.129775,7.91411,1.095616e-06,1
TFIDFRecommender_k_30_fold_1,0.028116,0.159716,6.698602,1.649734e-06,1
BM25Recommender_k_30_fold_1,0.040307,0.222317,4.000939,4.712224e-07,1
CosineRecommender_k_50_fold_1,0.022225,0.13032,7.879579,1.10855e-06,1
