In [1]:
import warnings

import pandas as pd
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools import Columns
from rectools.dataset import Interactions
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from rectools.metrics import Precision, Recall, NDCG, Serendipity
from rectools.model_selection import TimeRangeSplitter

from models.userknn import UserKnn

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)
import pickle
import re



## Data

In [28]:
interactions_df = pd.read_csv('../data/interactions.csv')

interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
                                'total_dur': Columns.Weight}, inplace=True)

# interactions = Interactions(interactions_df)
interactions = Interactions(interactions_df.sample(frac=0.05))

interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
4095320,923232,3935,2021-06-14,6159.0,100.0
1135938,405953,1112,2021-06-15,235.0,5.0
3883989,876404,6066,2021-03-20,348.0,6.0
5014988,519719,13959,2021-06-03,5526.0,20.0
4965468,991247,1168,2021-04-17,7403.0,100.0


## Cross validation

In [29]:
N_SPLITS = 5
TEST_SIZE = '7D'

In [30]:
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [31]:
metrics = {
    'Precision@10': Precision(k=10),
    'Recall@10': Recall(k=10),
    'NDCG@10': NDCG(k=10),
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10),
    'Serendipity@10': Serendipity(k=10)
}

models = {
    'cosine_userknn': CosineRecommender(),
    'tfidf_userknn': TFIDFRecommender(),
    'BM25_userknn': BM25Recommender()
}

In [32]:
def show_pivot(results, group=False):
    pivot_results = results.drop("fold", axis=1).groupby(["model"]).mean()

    if group:
        new_columns = sorted([
            (re.split("@", col)[0], int(re.split("@", col)[1])) if "@" in col else (col, "")
            for col in pivot_results.columns])
        pivot_results.columns = pd.MultiIndex.from_tuples(new_columns, names=["Metric", "Value"])

    display(
        pivot_results.style
        .highlight_min(color='lightcoral', axis=0)
        .highlight_max(color='lightgreen', axis=0)
    )

In [33]:
results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)

        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'end': Timestamp('2021-07-26 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-07-19 00:00:00', freq='7D'),
 'test': 4620,
 'test_items': 1698,
 'test_users': 3988,
 'train': 178209,
 'train_items': 7316,
 'train_users': 128389}


  0%|          | 0/128389 [00:00<?, ?it/s]

  0%|          | 0/128389 [00:00<?, ?it/s]

  0%|          | 0/128389 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-07-26 00:00:00', freq='7D'),
 'test': 5075,
 'test_items': 1758,
 'test_users': 4420,
 'train': 194855,
 'train_items': 7487,
 'train_users': 139288}


  0%|          | 0/139288 [00:00<?, ?it/s]

  0%|          | 0/139288 [00:00<?, ?it/s]

  0%|          | 0/139288 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 5357,
 'test_items': 1749,
 'test_users': 4667,
 'train': 213415,
 'train_items': 7664,
 'train_users': 151495}


  0%|          | 0/151495 [00:00<?, ?it/s]

  0%|          | 0/151495 [00:00<?, ?it/s]

  0%|          | 0/151495 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 3,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 5848,
 'test_items': 1932,
 'test_users': 5070,
 'train': 232606,
 'train_items': 7872,
 'train_users': 163936}


  0%|          | 0/163936 [00:00<?, ?it/s]

  0%|          | 0/163936 [00:00<?, ?it/s]

  0%|          | 0/163936 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 4,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 6312,
 'test_items': 1979,
 'test_users': 5434,
 'train': 252663,
 'train_items': 8068,
 'train_users': 176665}


  0%|          | 0/176665 [00:00<?, ?it/s]

  0%|          | 0/176665 [00:00<?, ?it/s]

  0%|          | 0/176665 [00:00<?, ?it/s]

In [34]:
#df_metrics = pd.read_csv("df_metrics.csv")

In [35]:
df_metrics=pd.DataFrame(results)

In [36]:
show_pivot(df_metrics, True)

Metric,NDCG,Precision,Recall,Serendipity,map,novelty
Value,10,10,10,10,10,Unnamed: 6_level_1
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
BM25_userknn,0.000266,0.002275,0.000227,0.000462,10.096834,5e-06
cosine_userknn,0.000206,0.001776,0.000178,0.000371,9.851301,3e-06
tfidf_userknn,0.000332,0.002845,0.000302,0.000654,9.754532,3e-06


Ориентируемся на MAP@10, будем использовать BM25

## Train

In [37]:
uknn = UserKnn(BM25Recommender(), N_users=50)

In [38]:
uknn.fit(interactions.df)

  0%|          | 0/189929 [00:00<?, ?it/s]

In [39]:
interactions.df.head(2).user_id.values[0]

923232

In [40]:
uknn.recommend(interactions.df.head(2).user_id.values[0], N_recs=10)

[3935, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142]

In [41]:
uknn.recommend(1000000000, N_recs=10)

[10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]

## Saving

In [42]:
pickle.dump(uknn, open('../saved_models/userknn.pkl', "wb"))

In [43]:
with open('../saved_models/userknn.pkl', 'rb') as f:
    uknn_pkl = pickle.load(f)

In [44]:
uknn_pkl.recommend(interactions.df.head(2).user_id.values[0], N_recs=10)

[3935, 10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142]