# Itempop and two-stage recommender on MTS data

## Setup

In [None]:
!pip install --upgrade pip setuptools wheel
!git clone https://github.com/benfred/implicit
!cd implicit && pip install .
!pip install -q catboost
!pip install recohut

In [None]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp

import random
import datetime

import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from implicit import nearest_neighbours as NN
from implicit.nearest_neighbours import TFIDFRecommender

from catboost import CatBoostClassifier

from recohut.datasets.mts import MTSDataset
from recohut.utils.common_utils import get_coo_matrix
from recohut.transforms.splitting import TimeRangeSplit
from recohut.models.itempop import ItemPop as PopularRecommender

In [None]:
ds = MTSDataset(data_dir='/content/data', sample_frac=0.1)

In [None]:
users_df = pd.read_csv(os.path.join(ds.processed_dir, 'users_processed.csv'))
items_df = pd.read_csv(os.path.join(ds.processed_dir, 'items_processed.csv'))
interactions_df = pd.read_csv(os.path.join(ds.processed_dir, 'interactions_processed.csv'))

In [None]:
interactions_df['last_watch_dt'] = pd.to_datetime(interactions_df['last_watch_dt'])
interactions_df.sort_values(by='last_watch_dt', inplace=True)

## Winning Solution

This solution includes a two-stage model. I used item-item CF from implicit library to generate candidates with their scores and Catboost classifier to predict final ranks with classification objective. Recommendations for cold users were made with Popular items.

Implicit model parameters were chosen on sliding time window cross validation. The best scores were achieved by Cosine recommender model, taking only last 20 interactions for each user. 100 candidates with their scores were generated for each user, filtering all items that user had interactions with.

Implicit candidates were calculated for the last 14 days of the interactions. Then catboost model was trained on positive interactions from the candidates list on last 14 days. Random negative sampling was applied.

For final submission implicit candidates and catboost predictions were recalculated on the whole dataset.

Ref: [Daria](https://github.com/blondered/ods_MTS_RecSys_Challenge_solution)

In [None]:
# Creating items and users mapping
users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [None]:
# Preparing data
last_date_df = interactions_df['last_watch_dt'].max()
boosting_split_date = last_date_df - pd.Timedelta(days=14)
boosting_data = interactions_df[(interactions_df['last_watch_dt'] >
                                 boosting_split_date)].copy()
boost_idx = boosting_data['user_id'].unique() 
before_boosting = interactions_df[(interactions_df['last_watch_dt'] <=
                                   boosting_split_date)].copy()
before_boosting_known_items = before_boosting.groupby(
    'user_id')['item_id'].apply(list).to_dict()

before_boosting_known_items_mapped = {}
for user, recommend in before_boosting_known_items.items():
    before_boosting_known_items_mapped[user] = list(map(lambda x:
                                                        items_mapping[x],
                                                        recommend))
before_boosting['order_from_recent'] = before_boosting.sort_values(
    by=['last_watch_dt'], ascending=False).groupby('user_id').cumcount() + 1
boost_warm_idx = np.intersect1d(before_boosting['user_id'].unique(),
                                boosting_data['user_id'].unique())

 Calculates top candidates from implicit model with their scores. Implicit parameters were chosen on time range split cross-validation. History offset stands for taking only lask X items from user history. Day offset stands for taking items from last X days of user history.

In [None]:
k_neighbours = 200
day_offset = 170
history_offset = 20
distance = 'Cosine'
num_candidates = 100

In [None]:
before_boosting['order_from_recent'] = before_boosting.sort_values(
    by=['last_watch_dt'], ascending=False).groupby('user_id').cumcount() + 1
train = before_boosting.copy()
date_window = train['last_watch_dt'].max() - pd.DateOffset(days=day_offset)
train = train[train['last_watch_dt'] >= date_window]

In [None]:
if history_offset:
    train = train[train['order_from_recent'] < history_offset]
    
if distance == 'Cosine':
    model = NN.CosineRecommender(K=k_neighbours)
    weights = None
else:
    model = NN.TFIDFRecommender(K=k_neighbours)
    weights = None

In [None]:
train_mat = get_coo_matrix(
    train,
    users_mapping=users_mapping,
    items_mapping=items_mapping,
    weight_col=weights
).tocsr()

In [None]:
model.fit(train_mat.T, show_progress=True)

  0%|          | 0/266854 [00:00<?, ?it/s]

In [None]:
def generate_implicit_recs_mapper(
        model,
        train_matrix,
        top_N,
        user_mapping,
        item_inv_mapping,
        filter_already_liked_items,
        known_items=None,
        filter_items=None,
        return_scores=False
):
    def _recs_mapper(user):
        user_id = user_mapping[user]
        if filter_items:
            if user in known_items:
                filtering = set(known_items[user]).union(set(filter_items))
            else:
                filtering = filter_items
        else:
            if known_items and user in known_items:
                filtering = known_items[user]
            else:
                filtering = None
        recs = model.recommend(user_id,
                               train_matrix,
                               N=top_N,
                               filter_already_liked_items=filter_already_liked_items,
                               filter_items=filtering)
        if return_scores:
            return recs
        return recs[0]

    return _recs_mapper

In [None]:
mapper = generate_implicit_recs_mapper(
    model,
    train_mat,
    num_candidates,
    users_mapping,
    items_inv_mapping,
    filter_already_liked_items=False,
    known_items=before_boosting_known_items_mapped,
    filter_items=None,
    return_scores=True
)

In [None]:
recs = pd.DataFrame({'user_id': boost_warm_idx})
recs['item_id_score'] = recs['user_id'].map(mapper)
recs['item_id'] = recs['item_id_score'].apply(lambda x: x[0])
recs['implicit_score'] = recs['item_id_score'].apply(lambda x: x[1])
recs['tmp'] = recs.apply(lambda row: list(zip(row['item_id'], row['implicit_score'])), axis=1) 
recs = recs.explode('tmp')
recs[['item_id','implicit_score']] = pd.DataFrame(recs['tmp'].tolist(), index=recs.index)
recs.drop(columns='tmp', inplace=True)
recs.drop(['item_id_score'], axis=1, inplace=True)
recs

Unnamed: 0,user_id,item_id,implicit_score
0,30,199262.0,0.707107
0,30,203105.0,0.707107
0,30,199886.0,0.707107
0,30,219904.0,0.707107
0,30,203206.0,0.707107
...,...,...,...
22231,1097544,263721.0,0.577350
22231,1097544,227113.0,0.577350
22231,1097544,239830.0,0.577350
22231,1097544,139002.0,0.577350


In [None]:
recs.to_csv(os.path.join(ds.processed_dir, 'impl_scores.csv'), index=False)

In [None]:
# taking candidates from implicit model and generating positive samples
candidates = pd.read_csv(os.path.join(ds.processed_dir, 'impl_scores.csv'))
candidates['item_id'] = candidates['item_id'].fillna(0.).astype('int64')
candidates['id'] = candidates.index
pos = candidates.merge(boosting_data[['user_id', 'item_id']], 
                       on=['user_id', 'item_id'], how='inner')
pos['target'] = 1

In [None]:
pos

Unnamed: 0,user_id,item_id,implicit_score,id,target
0,109925,5543,1.0,211288,1
1,126087,5518,1.0,240448,1
2,131803,7807,0.707107,250989,1
3,140179,5011,0.707107,264967,1
4,223763,2780,1.0,425032,1
5,316074,7033,1.0,604543,1
6,419536,10267,1.0,806723,1
7,482854,13237,1.0,923066,1
8,484834,7558,0.5,927130,1
9,487160,3784,1.0,931333,1


In [None]:
# Generating negative samples
num_negatives = 3
pos_group = pos.groupby('user_id')['item_id'].count()
neg = candidates[~candidates['id'].isin(pos['id'])].copy()
neg_sampling = pd.DataFrame(neg.groupby('user_id')['id'].apply(
    list)).join(pos_group, on='user_id',  rsuffix='p', how='right')
neg_sampling['num_choices'] = np.clip(neg_sampling['item_id'] * num_negatives, 
                                      a_min=0, a_max=25)
func = lambda row: np.random.choice(row['id'],
                                    size=row['num_choices'],
                                    replace=False)
neg_sampling['sample_idx'] = neg_sampling.apply(func, axis=1)
idx_chosen = neg_sampling['sample_idx'].explode().values
neg = neg[neg['id'].isin(idx_chosen)]
neg['target'] = 0

In [None]:
neg

Unnamed: 0,user_id,item_id,implicit_score,id,target
211232,109925,12948,1.0,211232,0
211234,109925,31205,1.0,211234,0
211287,109925,251132,1.0,211287,0
240482,126087,38859,1.0,240482,0
240493,126087,65257,1.0,240493,0
240494,126087,41067,1.0,240494,0
250980,131803,207587,0.57735,250980,0
250988,131803,6113,0.707107,250988,0
251041,131803,107381,1.0,251041,0
265003,140179,30433,1.0,265003,0


In [None]:
# Creating training data sample and early stopping data sample
boost_idx_train = np.intersect1d(boost_idx, pos['user_id'].unique())
boost_train_users, boost_eval_users = train_test_split(boost_idx_train, 
                                                       test_size=0.1,
                                                       random_state=345)
select_col = ['user_id', 'item_id', 'implicit_score', 'target']
boost_train = shuffle(
    pd.concat([
               pos[pos['user_id'].isin(boost_train_users)],
               neg[neg['user_id'].isin(boost_train_users)]
    ])[select_col]
)
boost_eval = shuffle(
    pd.concat([
               pos[pos['user_id'].isin(boost_eval_users)],
               neg[neg['user_id'].isin(boost_eval_users)]
    ])[select_col]
)

In [None]:
user_col = ['user_id','age','income','sex','kids_flg','boost_user_watch_cnt_all',
            'boost_user_watch_cnt_last_14']

item_col = ['item_id','content_type','countries_max','for_kids','age_rating',
            'studios_max','genres_max','genres_min','genres_med','release_novelty']

item_stats_col = ['item_id','watched_in_7_days','watch_ts_std','trend_slope',
                  'watch_ts_quantile_95_diff','watch_ts_median_diff',
                  'watched_in_all_time','male_watchers_fraction',
                  'female_watchers_fraction','younger_35_fraction','older_35_fraction']
                  
cat_col = ['age','income','sex','content_type']

In [None]:
train_feat = boost_train.merge(users_df[user_col],
                               on=['user_id'],
                               how='left')\
                               .merge(items_df[item_col],
                                      on=['item_id'],
                                      how='left')
                               
eval_feat = boost_eval.merge(users_df[user_col],
                             on=['user_id'],
                             how='left') \
                               .merge(items_df[item_col],
                                      on=['item_id'],
                                      how='left')
                               
eval_feat

Unnamed: 0,user_id,item_id,implicit_score,target,age,income,sex,kids_flg,boost_user_watch_cnt_all,boost_user_watch_cnt_last_14,content_type,countries_max,for_kids,age_rating,studios_max,genres_max,genres_min,genres_med,release_novelty
0,316074,7033,1.0,1,age_18_24,income_20_40,F,False,4.0,0.0,series,4340.0,False,16.0,14898.0,3858.0,2778.0,3318.0,5.0
1,131803,6113,0.707107,0,age_35_44,income_20_40,M,False,0.0,0.0,film,5065.0,False,12.0,14898.0,3503.0,1820.0,1877.0,1.0
2,316074,11829,1.0,0,age_18_24,income_20_40,F,False,4.0,0.0,film,5065.0,False,18.0,14898.0,1820.0,1033.0,1426.5,6.0
3,131803,207587,0.57735,0,age_35_44,income_20_40,M,False,0.0,0.0,,,,,,,,,
4,316074,7107,1.0,0,age_18_24,income_20_40,F,False,4.0,0.0,series,4340.0,False,12.0,14898.0,5431.0,626.0,1877.0,6.0
5,131803,7807,0.707107,1,age_35_44,income_20_40,M,False,0.0,0.0,film,4340.0,False,16.0,14898.0,3858.0,3858.0,3858.0,5.0
6,316074,73997,1.0,0,age_18_24,income_20_40,F,False,4.0,0.0,,,,,,,,,
7,131803,107381,1.0,0,age_35_44,income_20_40,M,False,0.0,0.0,,,,,,,,,


In [None]:
item_stats = pd.read_csv(os.path.join(ds.processed_dir, 'item_stats.csv'))
item_stats = item_stats[item_stats_col]
train_feat = train_feat.join(item_stats.set_index('item_id'), 
                             on='item_id', how='left')
eval_feat = eval_feat.join(item_stats.set_index('item_id'), 
                           on='item_id', how='left')
drop_col = ['user_id', 'item_id']
target_col = ['target']

X_train = train_feat.drop(drop_col + target_col, axis=1)
y_train = train_feat[target_col]
X_val = eval_feat.drop(drop_col + target_col, axis=1)
y_val = eval_feat[target_col]
X_train.fillna('None', inplace=True)
X_val.fillna('None', inplace=True)
X_train[cat_col] = X_train[cat_col].astype('category')
X_val[cat_col] = X_val[cat_col].astype('category')

X_train

Unnamed: 0,implicit_score,age,income,sex,kids_flg,boost_user_watch_cnt_all,boost_user_watch_cnt_last_14,content_type,countries_max,for_kids,age_rating,studios_max,genres_max,genres_min,genres_med,release_novelty,watched_in_7_days,watch_ts_std,trend_slope,watch_ts_quantile_95_diff,watch_ts_median_diff,watched_in_all_time,male_watchers_fraction,female_watchers_fraction,younger_35_fraction,older_35_fraction
0,1.000000,age_35_44,income_20_40,F,False,2,1,film,5065,False,16,14898,2418,1820,2119,3,46,0.787585,0.195783,0,1,46,0.422222,0.355556,0.311111,0.466667
1,1.000000,age_unknown,income_unknown,sex_unknown,False,1,1,series,4340,False,12,14898,1339,1339,1339,4,0,0,0,0,0,0,0,0,0,0
2,0.500000,age_18_24,income_20_40,M,False,2,1,film,5065,False,18,14898,5431,1224,2418,5,5,46.3813,-0.0692771,5,74,89,0.431818,0.409091,0.420455,0.420455
3,1.000000,age_25_34,income_20_40,M,True,1,1,,,,,,,,,,,,,,,,,,,
4,0.707107,age_18_24,income_20_40,M,False,3,1,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.707107,age_35_44,income_20_40,F,False,2,1,,,,,,,,,,,,,,,,,,,
68,0.447214,age_18_24,income_20_40,M,False,2,1,film,295,False,18,14898,3858,31,3140.5,4,0,0,0,0,0,0,0,0,0,0
69,0.500000,age_25_34,income_40_60,M,True,1,1,film,1272,False,18,14898,5431,254,3503,5,0,0,0,68,68,1,0,0,0,0
70,1.000000,age_45_54,income_40_60,M,True,2,0,film,5065,False,16,14898,3858,2778,3503,4,0,0,0,0,0,0,0,0,0,0


In [None]:
# Training CatBoost classifier with parameters previously chosen on cross validation
params = {
    'subsample': 0.97, 
    'max_depth': 9,
    'n_estimators': 2000,
    'learning_rate': 0.03, 
    'scale_pos_weight': num_negatives, 
    'l2_leaf_reg': 27, 
    'thread_count': -1,
    'verbose': 200,
    'task_type': "CPU",
    'devices': '0:1',
    # 'bootstrap_type': 'Poisson'
}
boost_model = CatBoostClassifier(**params)
boost_model.fit(X_train,
                y_train,
                eval_set=(X_val, y_val),
                early_stopping_rounds=200,
                cat_features=cat_col,
                plot=False)

0:	learn: 0.6814278	test: 0.6853672	best: 0.6853672 (0)	total: 57.5ms	remaining: 1m 54s
200:	learn: 0.1793975	test: 0.5471784	best: 0.5422113 (146)	total: 1.19s	remaining: 10.7s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.5422113159
bestIteration = 146

Shrink model to first 147 iterations.


<catboost.core.CatBoostClassifier at 0x7f157243ac90>

In [None]:
with open("catboost_trained.pkl", 'wb') as f:
    pickle.dump(boost_model, f)

In [None]:
# with open("catboost_trained.pkl", 'rb') as f:
#     boost_model = pickle.load(f)
boost_model

<catboost.core.CatBoostClassifier at 0x7f157243ac90>

In [None]:
random_items = list(np.random.choice(interactions_df['user_id'], size=5, replace=False))
cold_items = [10000, 20000]
random_items.extend(cold_items)

In [None]:
warm_idx = np.intersect1d(random_items, interactions_df['user_id'].unique())
warm_idx

array([ 20000, 133452, 332832, 341075, 622570, 728808])

In [None]:
_candidates = candidates.copy()
_candidates.dropna(subset=['item_id'], axis=0, inplace=True)

In [None]:
submit_feat = _candidates.merge(users_df[user_col],
                               on=['user_id'],
                               how='left') \
    .merge(items_df[item_col],
           on=['item_id'],
           how='left')
submit_feat

Unnamed: 0,user_id,item_id,implicit_score,id,age,income,sex,kids_flg,boost_user_watch_cnt_all,boost_user_watch_cnt_last_14,content_type,countries_max,for_kids,age_rating,studios_max,genres_max,genres_min,genres_med,release_novelty
0,30,199262,0.707107,0,age_unknown,income_unknown,sex_unknown,False,2.0,1.0,,,,,,,,,
1,30,203105,0.707107,1,age_unknown,income_unknown,sex_unknown,False,2.0,1.0,,,,,,,,,
2,30,199886,0.707107,2,age_unknown,income_unknown,sex_unknown,False,2.0,1.0,,,,,,,,,
3,30,219904,0.707107,3,age_unknown,income_unknown,sex_unknown,False,2.0,1.0,,,,,,,,,
4,30,203206,0.707107,4,age_unknown,income_unknown,sex_unknown,False,2.0,1.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109148,1097544,263721,0.577350,2109148,age_25_34,income_20_40,F,True,1.0,1.0,,,,,,,,,
2109149,1097544,227113,0.577350,2109149,age_25_34,income_20_40,F,True,1.0,1.0,,,,,,,,,
2109150,1097544,239830,0.577350,2109150,age_25_34,income_20_40,F,True,1.0,1.0,,,,,,,,,
2109151,1097544,139002,0.577350,2109151,age_25_34,income_20_40,F,True,1.0,1.0,,,,,,,,,


In [None]:
full_train = submit_feat.fillna('None')
full_train[cat_col] = full_train[cat_col].astype('category')
# item_stats = pd.read_csv('data/item_stats_for_submit.csv')
full_train = full_train.join(item_stats.set_index('item_id'),
                             on='item_id', how='left')

In [None]:
full_train

Unnamed: 0,user_id,item_id,implicit_score,id,age,income,sex,kids_flg,boost_user_watch_cnt_all,boost_user_watch_cnt_last_14,content_type,countries_max,for_kids,age_rating,studios_max,genres_max,genres_min,genres_med,release_novelty,watched_in_7_days,watch_ts_std,trend_slope,watch_ts_quantile_95_diff,watch_ts_median_diff,watched_in_all_time,male_watchers_fraction,female_watchers_fraction,younger_35_fraction,older_35_fraction
0,30,199262,0.707107,0,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
1,30,203105,0.707107,1,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
2,30,199886,0.707107,2,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
3,30,219904,0.707107,3,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
4,30,203206,0.707107,4,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109148,1097544,263721,0.57735,2109148,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,
2109149,1097544,227113,0.57735,2109149,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,
2109150,1097544,239830,0.57735,2109150,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,
2109151,1097544,139002,0.57735,2109151,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,


In [None]:
cols

['user_id',
 'item_id',
 'implicit_score',
 'age',
 'income',
 'sex',
 'kids_flg',
 'user_watch_cnt_all',
 'user_watch_cnt_last_14',
 'content_type',
 'countries_max',
 'for_kids',
 'age_rating',
 'studios_max',
 'genres_max',
 'genres_min',
 'genres_med',
 'release_novelty',
 'watched_in_7_days',
 'watch_ts_std',
 'trend_slope',
 'watch_ts_quantile_95_diff',
 'watch_ts_median_diff',
 'watched_in_all_time',
 'male_watchers_fraction',
 'female_watchers_fraction',
 'younger_35_fraction',
 'older_35_fraction']

In [None]:
# Renaming columns to match classifier feature names
cols = ['user_id', 'item_id']
cols.extend(boost_model.feature_names_)
cols = cols[:7] + ['boost_user_watch_cnt_all', 'boost_user_watch_cnt_last_14'] + cols[9:]
full_train = full_train[cols]
full_train_new_names = ['user_id', 'item_id'] + boost_model.feature_names_
full_train.columns = full_train_new_names
full_train

Unnamed: 0,user_id,item_id,implicit_score,age,income,sex,kids_flg,boost_user_watch_cnt_all,boost_user_watch_cnt_last_14,content_type,countries_max,for_kids,age_rating,studios_max,genres_max,genres_min,genres_med,release_novelty,watched_in_7_days,watch_ts_std,trend_slope,watch_ts_quantile_95_diff,watch_ts_median_diff,watched_in_all_time,male_watchers_fraction,female_watchers_fraction,younger_35_fraction,older_35_fraction
0,30,199262,0.707107,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
1,30,203105,0.707107,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
2,30,199886,0.707107,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
3,30,219904,0.707107,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
4,30,203206,0.707107,age_unknown,income_unknown,sex_unknown,False,2,1,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109148,1097544,263721,0.57735,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,
2109149,1097544,227113,0.57735,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,
2109150,1097544,239830,0.57735,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,
2109151,1097544,139002,0.57735,age_25_34,income_20_40,F,True,1,1,,,,,,,,,,,,,,,,,,,


In [None]:
# Making predictions for warm users
y_pred_all = boost_model.predict_proba(full_train.drop(
    ['user_id', 'item_id'], axis=1))
full_train['boost_pred'] = y_pred_all[:, 1]
full_train = full_train[['user_id', 'item_id', 'boost_pred']]
full_train = full_train.sort_values(by=['user_id', 'boost_pred'],
                                    ascending=[True, False])
full_train['rank'] = full_train.groupby('user_id').cumcount() + 1
full_train = full_train[full_train['rank'] <= 10].drop('boost_pred', axis=1)
full_train['item_id'] = full_train['item_id'].astype('int64')
boost_recs = full_train.groupby('user_id')['item_id'].apply(list)
boost_recs = pd.DataFrame(boost_recs)
boost_recs.reset_index(inplace=True)
boost_recs

Unnamed: 0,user_id,item_id
0,30,"[16986, 199262, 203105, 199886, 219904, 203206..."
1,55,"[12232, 7634, 6489, 15987, 14556, 5573, 15058,..."
2,106,"[8821, 10700, 10497, 3399, 9154, 3629, 12189, ..."
3,144,"[79668, 85771, 79780, 100360, 87071, 80158, 14..."
4,155,"[10747, 2236, 67784, 78954, 139975, 137705, 22..."
...,...,...
22227,1097444,"[7300, 16181, 110702, 114582, 113097, 86716, 1..."
22228,1097459,"[68578, 71663, 68642, 74552, 71682, 68811, 777..."
22229,1097470,"[196242, 201115, 196364, 201461, 203105, 19904..."
22230,1097508,"[207809, 210545, 208388, 212164, 213627, 21296..."


In [None]:
# Making predictions for cold users with Popular Recommender
idx_for_popular = list(set(pd.Series(random_items).unique()).difference(
    set(boost_recs['user_id'].unique())))
idx_for_popular

[20000, 728808, 622570, 133452, 10000, 341075]

In [None]:
interactions_df

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,917575,10353,2021-03-13,11131,58
1060,275080,15574,2021-03-13,670,11
1059,120517,9550,2021-03-13,32456,100
1058,15045,6115,2021-03-13,22830,100
1057,92904,10135,2021-03-13,3709,71
...,...,...,...,...,...
542914,484870,9157,2021-08-22,9435,6
542913,8428,5732,2021-08-22,6570,100
542912,818134,11505,2021-08-22,60,0
542923,314358,14111,2021-08-22,2590,35


In [None]:
pop_model = PopularRecommender(days=30, dt_column='last_watch_dt',
                               with_filter=True)
pop_model.fit(interactions_df)

In [None]:
recs_popular = pop_model.recommend_with_filter(interactions_df, idx_for_popular, top_K=10)
recs_popular

Unnamed: 0,user_id,item_id
4,10000,"[10440, 9728, 15297, 13865, 3734, 12192, 4151,..."
0,20000,"[10440, 9728, 15297, 13865, 3734, 12192, 4151,..."
1,728808,"[10440, 9728, 15297, 13865, 12192, 4151, 11863..."
2,622570,"[10440, 9728, 15297, 13865, 12192, 4151, 11863..."
3,133452,"[10440, 9728, 15297, 13865, 3734, 12192, 4151,..."
5,341075,"[10440, 9728, 15297, 13865, 3734, 12192, 4151,..."


In [None]:
all_recs = pd.concat([boost_recs, recs_popular], axis=0)

In [None]:
def fill_with_popular(recs, pop_model_fitted, interactions_df, top_K=10):
    """
    Fills missing recommendations with Popular Recommender.
    Takes top_K first recommendations if length of recs exceeds top_K
    """
    recs['len'] = recs['item_id'].apply(lambda x: len(x))
    recs_good = recs[recs['len'] >= top_K].copy()
    recs_good.loc[(recs_good['len'] > top_K), 'item_id'] = recs_good.loc[
        (recs_good['len'] > 10), 'item_id'].apply(lambda x: x[:10])
    recs_bad = recs[recs['len'] < top_K].copy()
    recs_bad['num_popular'] = top_K - recs_bad.len
    idx_for_filling = recs_bad['user_id'].unique()
    filling_recs = pop_model_fitted.recommend_with_filter(
        interactions_df, idx_for_filling, top_K=top_K)
    recs_bad = recs_bad.join(filling_recs.set_index('user_id'),
                             on='user_id', how='left', rsuffix='1')
    recs_bad.loc[(recs_bad['len'] > 0), 'item_id'] = \
        recs_bad.loc[(recs_bad['len'] > 0), 'item_id'] + \
        recs_bad.loc[(recs_bad['len'] > 0), 'item_id1']
    recs_bad.loc[(recs_bad['len'] == 0), 'item_id'] = recs_bad.loc[
        (recs_bad['len'] == 0), 'item_id1']
    recs_bad['item_id'] = recs_bad['item_id'].apply(lambda x: x[:top_K])
    total_recs = pd.concat([recs_good[['user_id', 'item_id']],
                            recs_bad[['user_id', 'item_id']]], axis=0)
    return total_recs

In [None]:
# Filling short recommendations woth popular items
all_recs = fill_with_popular(all_recs, pop_model, interactions_df)
all_recs

Unnamed: 0,user_id,item_id
0,30,"[16986, 199262, 203105, 199886, 219904, 203206..."
1,55,"[12232, 7634, 6489, 15987, 14556, 5573, 15058,..."
2,106,"[8821, 10700, 10497, 3399, 9154, 3629, 12189, ..."
3,144,"[79668, 85771, 79780, 100360, 87071, 80158, 14..."
4,155,"[10747, 2236, 67784, 78954, 139975, 137705, 22..."
...,...,...
22054,1087746,"[366, 4784, 33316, 63977, 10440, 9728, 15297, ..."
22137,1092833,"[15355, 198132, 191636, 50599, 177761, 10440, ..."
22159,1093784,"[296, 124311, 20002, 219743, 10440, 9728, 1529..."
22160,1093836,"[1343, 11710, 3254, 1967, 3356, 5292, 70331, 2..."


## Baseline

Popularity based model

Ref: [Official baseline tutorial](https://github.com/recohut/notebooks/blob/main/extras/mts_baseline.ipynb)

In [None]:
def calculate_novelty(train_interactions, recommendations, top_n): 
    users = recommendations['user_id'].unique()
    n_users = train_interactions['user_id'].nunique()
    n_users_per_item = train_interactions.groupby('item_id')['user_id'].nunique()

    recommendations = recommendations.loc[recommendations['rank'] <= top_n].copy()
    recommendations['n_users_per_item'] = recommendations['item_id'].map(n_users_per_item)
    recommendations['n_users_per_item'] = recommendations['n_users_per_item'].fillna(1)
    recommendations['item_novelty'] = -np.log2(recommendations['n_users_per_item'] / n_users)

    item_novelties = recommendations[['user_id', 'rank', 'item_novelty']]
    
    miuf_at_k = item_novelties.loc[item_novelties['rank'] <= top_n, ['user_id', 'item_novelty']]
    miuf_at_k = miuf_at_k.groupby('user_id').agg('mean').squeeze()

    return miuf_at_k.reindex(users).mean()

In [None]:
def compute_metrics(train, test, recs, top_N):
    result = {}
    test_recs = test.set_index(['user_id', 'item_id']).join(recs.set_index(['user_id', 'item_id']))
    test_recs = test_recs.sort_values(by=['user_id', 'rank'])

    test_recs['users_item_count'] = test_recs.groupby(level='user_id')['rank'].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)
    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']
    
    users_count = test_recs.index.get_level_values('user_id').nunique()

    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs['rank'] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count
        
    result[f'MAP@{top_N}'] = (test_recs['cumulative_rank'] / test_recs['users_item_count']).sum() / users_count
    result[f'Novelty@{top_N}'] = calculate_novelty(train, recs, top_N)
    
    return pd.Series(result)

### Example on one fold

In [None]:
test = interactions_df[interactions_df['last_watch_dt'] == interactions_df['last_watch_dt'].max()]
train = interactions_df[interactions_df['last_watch_dt'] < interactions_df['last_watch_dt'].max()]

In [None]:
pop_model = PopularRecommender(days=7, dt_column='last_watch_dt')
pop_model.fit(train)

In [None]:
top10_recs = pop_model.recommend()
top10_recs

array([ 9728, 15297, 10440, 13865, 12360, 14488, 12192,   512,   341,
        3734])

In [None]:
item_titles = pd.Series(items_df['title'].values, index=items_df['item_id']).to_dict()

In [None]:
list(map(item_titles.get, top10_recs))

['гнев человеческий',
 'клиника счастья',
 'хрустальный',
 'девятаев',
 'круэлла',
 'мастер меча',
 'фемида видит',
 'рядовой чээрин',
 'лето - это море',
 'прабабушка легкого поведения']

In [None]:
recs = pd.DataFrame({'user_id': test['user_id'].unique()})
top_N = 10
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs.head()

Unnamed: 0,user_id,item_id
0,936370,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
1,279776,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
2,321739,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
3,98693,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."
4,267998,"[9728, 15297, 10440, 13865, 12360, 14488, 1219..."


In [None]:
recs = recs.explode('item_id')

In [None]:
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs.head(top_N + 2)

Unnamed: 0,user_id,item_id,rank
0,936370,9728,1
0,936370,15297,2
0,936370,10440,3
0,936370,13865,4
0,936370,12360,5
0,936370,14488,6
0,936370,12192,7
0,936370,512,8
0,936370,341,9
0,936370,3734,10


In [None]:
compute_metrics(train, test, recs, 10)

Precision@1     0.034862
Recall@1        0.033231
Precision@2     0.033945
Recall@2        0.065418
Precision@3     0.032875
Recall@3        0.095387
Precision@4     0.029128
Recall@4        0.112564
Precision@5     0.023425
Recall@5        0.113175
Precision@6     0.022273
Recall@6        0.128721
Precision@7     0.021669
Recall@7        0.145846
Precision@8     0.019897
Recall@8        0.152727
Precision@9     0.018926
Recall@9        0.163532
Precision@10    0.018211
Recall@10       0.174618
MAP@10          0.071974
Novelty@10      6.242784
dtype: float64

### Folder validation

Let's take the last 3 weeks from our data and test them sequentially (1 test fold - 1 week). Don't forget about the cold start problem.

In [None]:
last_date = interactions_df['last_watch_dt'].max().normalize()
folds = 3
start_date = last_date - pd.Timedelta(days=folds*7)
start_date, last_date

(Timestamp('2021-08-01 00:00:00'), Timestamp('2021-08-22 00:00:00'))

In [None]:
cv = TimeRangeSplit(start_date=start_date, periods=folds+1, freq='W')

cv.max_n_splits, cv.get_n_splits(interactions_df, datetime_column='last_watch_dt')

(3, 3)

In [None]:
cv.date_range

DatetimeIndex(['2021-08-01', '2021-08-08', '2021-08-15', '2021-08-22'], dtype='datetime64[ns]', freq='W-SUN')

In [None]:
folds_with_stats = list(cv.split(
    interactions_df, 
    user_column='user_id',
    item_column='item_id',
    datetime_column='last_watch_dt',
    fold_stats=True
))

folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])

Already seen number: 0
Already seen number: 0
Already seen number: 0


In [None]:
folds_info_with_stats

Unnamed: 0,Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
0,2021-08-01,2021-08-08,420915,19360,22608,166,907,0,14717
1,2021-08-08,2021-08-15,459147,19615,22955,136,609,0,15979
2,2021-08-15,2021-08-22,498690,20501,24032,99,476,0,17371


### Popular on folds

In [None]:
top_N = 10
last_n_days = 7

In [None]:
final_results = []
validation_results = pd.DataFrame()

for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
        
    pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')
    pop_model.fit(train)

    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1

    fold_result = compute_metrics(train, test, recs, top_N)

    validation_results = validation_results.append(fold_result, ignore_index=True)

In [None]:
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})

MAP@10        0.039814
Novelty@10    5.778481
dtype: float64

### Popular Prediction

Let's see if it makes sense to predict the popular depending on the social group

In [None]:
train_idx, test_idx, info = folds_with_stats[0]
train = interactions_df.loc[train_idx]
test = interactions_df.loc[test_idx]
date_window_for_popular = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)
train_slice = pd.merge(train[train['last_watch_dt'] >= date_window_for_popular], users_df, on='user_id', how='left')

we have users without features, so we need to define padding for them

In [None]:
train_slice.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,age,income,sex,kids_flg,boost_user_watch_cnt_all,boost_user_watch_cnt_last_14,user_watch_cnt_all,user_watch_cnt_last_14
0,689871,6404,2021-07-24,905,16,age_45_54,income_20_40,M,False,1.0,0.0,1.0,0.0
1,482718,2624,2021-07-24,1898,25,age_18_24,income_40_60,F,False,1.0,0.0,4.0,3.0
2,183195,11239,2021-07-24,1037,14,age_35_44,income_20_40,F,True,5.0,0.0,5.0,0.0
3,1077534,4457,2021-07-24,151,2,age_25_34,income_20_40,M,False,0.0,0.0,0.0,0.0
4,274241,16228,2021-07-24,19306,18,age_65_inf,income_20_40,F,False,4.0,0.0,4.0,0.0


In [None]:
train_slice.fillna({'age':'age_unknown',
                    'sex':'sex_unknown',
                    'income': 'income_unknown',
                    'kids_flg': False
                   }, inplace=True)

For example, you can watch popular by age, gender and presence of children

In [None]:
train_slice.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,age,income,sex,kids_flg,boost_user_watch_cnt_all,boost_user_watch_cnt_last_14,user_watch_cnt_all,user_watch_cnt_last_14
0,689871,6404,2021-07-24,905,16,age_45_54,income_20_40,M,False,1.0,0.0,1.0,0.0
1,482718,2624,2021-07-24,1898,25,age_18_24,income_40_60,F,False,1.0,0.0,4.0,3.0
2,183195,11239,2021-07-24,1037,14,age_35_44,income_20_40,F,True,5.0,0.0,5.0,0.0
3,1077534,4457,2021-07-24,151,2,age_25_34,income_20_40,M,False,0.0,0.0,0.0,0.0
4,274241,16228,2021-07-24,19306,18,age_65_inf,income_20_40,F,False,4.0,0.0,4.0,0.0


In [None]:
soc_dem_recommendations = train_slice.groupby(
    ['age', 'sex', 'income', 'item_id']
).size().to_frame().reset_index()

In [None]:
soc_dem_recommendations

Unnamed: 0,age,sex,income,item_id,0
0,age_18_24,F,income_0_20,14,1
1,age_18_24,F,income_0_20,111,1
2,age_18_24,F,income_0_20,162,1
3,age_18_24,F,income_0_20,288,1
4,age_18_24,F,income_0_20,334,1
...,...,...,...,...,...
18651,age_unknown,sex_unknown,income_unknown,16488,1
18652,age_unknown,sex_unknown,income_unknown,16498,1
18653,age_unknown,sex_unknown,income_unknown,16499,3
18654,age_unknown,sex_unknown,income_unknown,16509,21


Now you just need to select for each user the most popular top_n objects in his group

We can check this option on folds


In [None]:
validation_results = pd.DataFrame()

for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)
    train_slice = pd.merge(train[train['last_watch_dt'] >= date_window], users_df, on='user_id', how='left')
    
    train_slice.fillna({
        'age':'age_unknown',
        'sex':'sex_unknown',
        'income': 'income_unknown',
        'kids_flg': False
    },inplace=True)
    
    soc_dem_recommendations = train_slice.groupby(
        ['age', 'sex', 'income', 'item_id']
    ).size().to_frame().reset_index()
    
    top_soc_dem = []

    for age in soc_dem_recommendations.age.unique():
        for income in soc_dem_recommendations.income.unique():
            for sex in soc_dem_recommendations.sex.unique():
                top_items = soc_dem_recommendations[
                (soc_dem_recommendations.age == age)
                & (soc_dem_recommendations.income == income)
                & (soc_dem_recommendations.sex == sex)].sort_values(0, ascending=False).head(10).item_id.values
                top_soc_dem.append([age, income, sex, top_items])

    top_soc_dem = pd.DataFrame(top_soc_dem, columns = ['age', 'income', 'sex', 'item_id'])
    
    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs = pd.merge(recs[['user_id']], users_df, on='user_id', how='left')
    recs.fillna({
        'age':'age_unknown',
        'sex':'sex_unknown',
        'income': 'income_unknown',
        'kids_flg': False
    }, inplace=True)
    
    recs = pd.merge(recs, top_soc_dem, on = ['age', 'sex', 'income'], how = 'left')
    recs = recs.drop(columns = ['age', 'sex', 'income'])
    
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1
    fold_result = compute_metrics(train, test, recs, top_N)
    
    validation_results = validation_results.append(fold_result, ignore_index=True)

In [None]:
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})

MAP@10        0.040677
Novelty@10    6.050588
dtype: float64

In this case, the features by which you build the popular are selected, as well as the number of days that you take to calculate the popular

### Tfidf

In [None]:
users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [None]:
validation_results = pd.DataFrame()

for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]

    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=60)
    train = train[train['last_watch_dt'] >= date_window]

    test = interactions_df.loc[test_idx]

    train_mat = get_coo_matrix(
        train,
        users_mapping=users_mapping,
        items_mapping=items_mapping,
    ).tocsr()

    model = TFIDFRecommender(K=top_N)
    model.fit(train_mat.T, show_progress=False) 

    mapper = generate_implicit_recs_mapper( 
        model,
        train_mat,
        top_N,
        users_mapping,
        items_inv_mapping,
        filter_already_liked_items=True
    )

    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs['item_id'] = recs['user_id'].map(mapper)
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1
    fold_result = compute_metrics(train, test, recs, top_N)

    validation_results = validation_results.append(fold_result, ignore_index=True)

In [None]:
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean',})

MAP@10         0.698575
Novelty@10    17.440547
dtype: float64

Simply using the code above for submission won't work due to cold users. We'll have to figure out how to process them.

### Predictions

In [None]:
random_items = list(np.random.choice(interactions_df['user_id'], size=5, replace=False))
cold_items = [10000, 20000]
random_items.extend(cold_items)

In [None]:
random_items

[754950, 758416, 83485, 636568, 669127, 10000, 20000]

In [None]:
train = interactions_df
test = random_items

pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')
pop_model.fit(train)

recs = pd.DataFrame({'user_id': pd.Series(test).unique()})
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs = recs.explode('item_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs = recs.groupby('user_id').agg({'item_id': list}).reset_index()

In [None]:
recs.head()

Unnamed: 0,user_id,item_id
0,10000,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
1,20000,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
2,83485,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
3,636568,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."
4,669127,"[9728, 15297, 10440, 14488, 13865, 12192, 341,..."


---

In [None]:
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p implicit,catboost,recohut

numpy  1.19.5
pandas 1.1.5
Sparsh A. 
last updated: 2022-01-14 19:35:09 

implicit 0.4.8
catboost 1.0.4
recohut 0.0.11

compiler   : GCC 7.5.0
system     : Linux
release    : 5.4.144+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit


---

**END**