In [1]:
import warnings
from pprint import pprint

import pandas as pd
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools import Columns
from rectools.dataset import Interactions
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from rectools.metrics import Precision, Recall, NDCG, Serendipity
from rectools.model_selection import TimeRangeSplitter

from models.userknn import UserKnn

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)
import pickle

In [2]:
interactions_df = pd.read_csv('../data/interactions.csv')
users = pd.read_csv('../data/users.csv')
items = pd.read_csv('../data/items.csv')

interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
                                'total_dur': Columns.Weight}, inplace=True)
# will cast types and save new pd.DataFrame inside in Interactions.df
# interactions = Interactions(interactions_df)

# ! если хотите быстро прогнать этот ноутбук - раскомментируйте эту строку - она уменьшает данные
interactions = Interactions(interactions_df.sample(frac=0.01))

interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
2706061,222607,11697,2021-05-15,34353.0,100.0
773945,878038,4740,2021-07-25,4.0,0.0
2122652,133168,3125,2021-08-05,891.0,0.0
2380760,1062626,8337,2021-05-16,414.0,0.0
799959,915203,11310,2021-06-18,1953.0,26.0


In [3]:
N_SPLITS = 1
TEST_SIZE = '14D'

In [4]:
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [5]:
metrics = {
    'Precision@10': Precision(k=10),
    'Recall@10': Recall(k=10),
    'NDCG@10': NDCG(k=10),
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10),
    'Serendipity@10': Serendipity(k=10)
}

# few simple models to compare
models = {
    'cosine_userknn': CosineRecommender(),  # implicit 
    # 'tfidf_userknn': TFIDFRecommender(),
    # 'BM25_userknn': BM25Recommender()
}

In [6]:
results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)

        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'end': Timestamp('2021-08-23 00:00:00', freq='14D'),
 'i_split': 0,
 'start': Timestamp('2021-08-09 00:00:00', freq='14D'),
 'test': 724,
 'test_items': 529,
 'test_users': 683,
 'train': 46523,
 'train_items': 4803,
 'train_users': 42121}


  0%|          | 0/42121 [00:00<?, ?it/s]

In [7]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,Precision@10,Recall@10,NDCG@10,map@10,novelty,Serendipity@10
0,0,cosine_userknn,0.000293,0.002928,0.0003,0.000854,10.252009,8.741356e-07


In [8]:
df_metrics.groupby('model').mean()[metrics.keys()]


Unnamed: 0_level_0,Precision@10,Recall@10,NDCG@10,map@10,novelty,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cosine_userknn,0.000293,0.002928,0.0003,0.000854,10.252009,8.741356e-07


In [9]:
uknn = UserKnn(TFIDFRecommender(), N_users=50)

In [10]:
uknn.fit(interactions.df)

  0%|          | 0/49318 [00:00<?, ?it/s]

In [11]:
interactions.df.head(2).user_id.values[0]

222607

In [12]:
uknn.predict(pd.DataFrame([{'user_id': interactions.df.head(2).user_id.values[0]}]))

Unnamed: 0,user_id,item_id,score,rank
0,222607,11697,6.941464,1
1,222607,10440,0.01,2
2,222607,15297,0.01,3
3,222607,9728,0.01,4
4,222607,13865,0.01,5
5,222607,4151,0.01,6
6,222607,2657,0.01,7
7,222607,3734,0.01,8
8,222607,4880,0.01,9
9,222607,142,0.01,10


In [13]:
uknn.recommend_user(interactions.df.head(2).user_id.values[1], N_recs=10)

AttributeError: 'UserKnn' object has no attribute 'recommend_user'

In [None]:
pickle.dump(uknn, open('../saved_models/userknn.pkl', "wb"))

In [None]:
with open('../saved_models/userknn.pkl', 'rb') as f:
    uknn_pkl = pickle.load(f)

In [None]:
uknn.recommend_user(interactions.df.head(2).user_id.values[0], N_recs=10)