In [None]:
import numpy as np 
import pandas as pd 
import scipy.sparse as sp
from itertools import islice, cycle, product
from more_itertools import pairwise
import copy

from tqdm import tqdm_notebook
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from collections import Counter

import os
# Kaggle starter boilerplate: walk /kaggle/input and print the full path of
# every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import pickle

In [None]:
# Load the raw tables from the `input/` directory (relative to the working directory).
# df        - interactions
# df_users  - user table
# df_items  - item table
# submit    - sample submission template
df = pd.read_csv('input/interactions.csv')
df_users = pd.read_csv('input/users.csv')
df_items = pd.read_csv('input/items.csv')
submit = pd.read_csv('input/sample_submission.csv')

# Preprocessing
### Interactions

In [None]:
# Parse the interaction start dates.
df['start_date'] = pd.to_datetime(df.start_date)
# Sanity check: all twelve calendar months must be present. The months are
# sorted first because unique() returns values in order of appearance, so an
# unsorted comparison would depend on the row order of the file.
assert np.array_equal(np.sort(df.start_date.dt.month.unique()), np.arange(1, 13))
print(f'df ini.shape: {df.shape}')

# If a user-book pair occurs more than once, collapse it into a single row,
# using the max/min of the feature values.
duplicates = df.duplicated(subset=['user_id', 'item_id'], keep=False)
df_duplicates = df[duplicates].sort_values(by=['user_id', 'item_id'])
print(f'df_duplicates ini.shape: {df_duplicates.shape}')

# Keep only the pairs that occur exactly once ...
df = df[~duplicates]

# ... and aggregate the repeated pairs: furthest progress, highest rating,
# earliest start date.
df_duplicates = df_duplicates.groupby(['user_id', 'item_id']).agg({
    'progress': 'max',
    'rating': 'max',
    'start_date': 'min'
})
print(f'df_duplicates after agg.shape: {df_duplicates.shape}')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
df = pd.concat([df, df_duplicates.reset_index()], ignore_index=True)
print(f'df.shape: {df.shape}')

df.info()

In [None]:
# Count ratings below 0 and above 5, then plot the rating histogram with
# missing ratings shown as -1.
print((df.rating < 0).sum(), (df.rating > 5).sum())
df.rating.fillna(-1).plot(kind='hist');

In [None]:
# Mean rating per user, broadcast back to every row of that user.
# The string alias 'mean' is used instead of np.mean: passing the NumPy
# callable to transform() is deprecated in recent pandas releases.
df['rating_user_mean'] = df.groupby('user_id').rating.transform('mean')
# Missing ratings are replaced by the user's own mean (still NaN when the user
# has no ratings at all).
df['rating_fillna_user_mean'] = np.where(df.rating.isna(), df.rating_user_mean, df.rating)
# Histogram of the filled ratings; remaining NaNs are shown as -1.
df.rating_fillna_user_mean.fillna(-1).plot(kind='hist');

In [None]:
# Example: a user who never rates a book above 3 - their 3 is comparable to
# another user's 5. Remove such offsets by subtracting the user's mean rating.
# The string alias 'mean' replaces np.mean, whose use in transform() is
# deprecated in recent pandas releases.
df['rating_norm_by_user'] = df.rating - df.groupby('user_id').rating.transform('mean')
# Histogram of the centred ratings; missing values are shown as -5.
df.rating_norm_by_user.fillna(-5).plot(kind='hist');

In [None]:
# Share of rows with zero progress, and share of rows that have zero progress
# but still carry a rating.
(df.progress == 0).sum()/ df.shape[0],\
((df.progress == 0) & (df.rating.notna())).sum()/df.shape[0]

In [None]:
# Flag rows with zero progress that nevertheless have a rating.
# NOTE(review): the column name ends in "rating_na" but the mask uses notna() - confirm which one is intended.
df['progress0_rating_na'] = (df.progress == 0) & (df.rating.notna())

In [None]:
# Count progress values below 0 and above 100, then cap progress at 100
# (only the upper bound is clipped).
print((df.progress < 0).sum(), (df.progress > 100).sum())
df.progress = df.progress.clip(upper=100)

In [None]:
# Share of missing values per column.
df.isna().sum()/df.shape[0]

In [None]:
# Shrink memory: progress as int8, rating as a sparse float32 column whose fill value is NaN.
df['progress'] = df['progress'].astype(np.int8)
df['rating'] = df['rating'].astype(pd.SparseDtype(np.float32, np.nan))
df.info()

### Users

In [None]:
display(df_users, df_users.info())

In [None]:
df_users.nunique()

In [None]:
def add_nan_to_cat(df_user, feature):
    """Turn missing values of ``feature`` into an explicit category, in place.

    If the column contains at least one NaN it is converted to a categorical
    column, the category ``'<feature>_nan'`` is added, and every NaN is
    replaced by it. Columns without missing values are left untouched.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Frame to modify. The function operates on this argument (not on any
        module-level frame).
    feature : str
        Name of the column to process.

    Returns
    -------
    None
    """
    if df_user[feature].isna().sum() != 0:
        nan_label = feature + '_nan'
        # The category has to exist before fillna can assign it.
        df_user[feature] = df_user[feature].astype('category').cat.add_categories(nan_label)
        df_user[feature] = df_user[feature].fillna(nan_label)
        
        
def compate_dfs_by_id(df, df_users, feature_id):
    """Print how the ids in column ``feature_id`` overlap between two frames.

    Four lines are printed: the total number of distinct ids, the number
    present in both frames, the number present only in ``df`` and the number
    present only in ``df_users``; the last three also show their share of the
    total as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        First frame (the interactions in this notebook).
    df_users : pandas.DataFrame
        Second frame (the feature table in this notebook).
    feature_id : str
        Name of the id column present in both frames.
    """
    ids_left = df[feature_id].unique()
    ids_right = df_users[feature_id].unique()

    n_both = len(np.intersect1d(ids_left, ids_right))
    n_left_only = len(np.setdiff1d(ids_left, ids_right))
    n_right_only = len(np.setdiff1d(ids_right, ids_left))
    n_total = n_both + n_right_only + n_left_only

    def _pct(count):
        # Share of the total, as a percentage.
        return count / n_total * 100

    print(f'Кол-во пользователей - {n_total}')
    print(f'Кол-во пользователей cвзаимодействиями и фичами - {n_both} ({_pct(n_both):.2f}%)')
    print(f'Кол-во пользователей только c взаимодействиями - {n_left_only} ({_pct(n_left_only):.2f}%)')
    print(f'Кол-во пользователей только c фичами - {n_right_only} ({_pct(n_right_only):.2f}%)')

In [None]:
# Compare the user ids found in the interactions with those in the user table.
compate_dfs_by_id(df, df_users, 'user_id')

In [None]:
# Add users that occur in the interactions but are missing from df_users, so
# that every interacting user has a row (with empty features) in the user table.
new_users = np.setdiff1d(df.user_id.unique(), df_users.user_id.unique())
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True avoids duplicated index labels in the combined frame.
df_users = pd.concat([df_users, pd.DataFrame(new_users, columns=['user_id'])], ignore_index=True)
assert df_users.user_id.nunique() >= df.user_id.nunique()
df_users

In [None]:
# Share of missing values per user column.
df_users.isna().sum()/df_users.shape[0]

In [None]:
# Replace missing 'age' and 'sex' values with an explicit '<feature>_nan'
# category, then show the remaining NaN counts and the column dtypes.
add_nan_to_cat(df_users, 'age')
add_nan_to_cat(df_users, 'sex')
df_users.isna().sum(), df_users.dtypes

### Items

In [None]:
display(df_items.head(), df_items.info())

In [None]:
df_items.year.value_counts().tail()

In [None]:
def num_bytes_format(num_bytes, float_prec=4):
    """Format a byte count as a human-readable string.

    The value is divided by 1000 until it drops below 1000 (or the largest
    unit is reached) and is rendered with ``float_prec`` decimals followed by
    the unit name.

    Parameters
    ----------
    num_bytes : int or float
        Size in bytes.
    float_prec : int, default 4
        Number of digits after the decimal point. It applies to every unit,
        including the largest one.

    Returns
    -------
    str
        For example ``'1.5000 Kb'``.
    """
    units = ['bytes', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb', 'Eb']
    for unit in units[:-1]:
        if abs(num_bytes) < 1000:
            return f'{num_bytes:.{float_prec}f} {unit}'
        num_bytes /= 1000
    # Anything that is still >= 1000 Pb is reported in the largest unit.
    return f'{num_bytes:.{float_prec}f} {units[-1]}'

In [None]:
# Total memory footprint of df_items, formatted as a human-readable string.
num_bytes = df_items.memory_usage(deep=True).sum()
num_bytes_format(num_bytes)

In [None]:
# Distinct values per item column, alongside the table shape.
df_items.nunique(), df_items.shape

In [None]:
# First rows of the item table.
df_items.head()

In [None]:
# Look at the raw `genres` column.
df_items['genres']

### Test/ to_submit

In [None]:
TEST_N_DAY = 2 # predictions are required for 2 days of observations; keep this parameter for validation
# Number of items requested per user (passed as N to the recommenders below).
TOP_N = 10

# Validation

In [None]:
CV_FOLDS = 7  # number of validation folds


class TimeRangeSplit():
    """Time-based cross-validation splitter for interaction logs.

    Fold boundaries are generated with ``pandas.date_range``
    (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html).
    Every pair of consecutive boundaries ``(start, end)`` defines one fold:
    the test part holds the rows dated in ``[start, end)`` and the train part
    holds the rows dated before ``start`` (and not earlier than
    ``train_min_date`` when it is given).

    Parameters
    ----------
    start_date, end_date, freq, periods, tz, normalize
        Forwarded to ``pandas.date_range``.
    closed : {None, 'left', 'right'}
        Which boundary of the date range to include. ``None`` keeps both.
        It is forwarded as ``inclusive=`` on pandas versions that support it
        and as the legacy ``closed=`` otherwise.
    train_min_date : datetime-like, optional
        Rows dated before this value are never used for training.
    filter_cold_users : bool, default True
        Drop test rows of users that do not occur in the train part.
    filter_cold_items : bool, default True
        Drop test rows of items that do not occur in the train part.
    filter_already_seen : bool, default True
        Drop test rows whose (user, item) pair already occurs in the train part.

    Raises
    ------
    ValueError
        If neither ``end_date`` nor ``periods`` is given, or if the resulting
        date range has fewer than two boundaries.
    """
    def __init__(self,
                 start_date,
                 end_date=None,
                 freq='D',
                 periods=None,
                 tz=None,
                 normalize=False,
                 closed=None,
                 train_min_date=None,
                 filter_cold_users=True,
                 filter_cold_items=True,
                 filter_already_seen=True):

        self.start_date = start_date
        if end_date is None and periods is None:
            raise ValueError("Either 'end_date' or 'periods' must be specified.")

        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.tz = tz
        self.normalize = normalize
        self.closed = closed
        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen

        range_kwargs = dict(start=start_date, end=end_date, freq=freq,
                            periods=periods, tz=tz, normalize=normalize)
        if closed is None:
            # Both boundaries included: the default in every pandas version.
            self.date_range = pd.date_range(**range_kwargs)
        else:
            try:
                # pandas >= 1.4 spells this argument `inclusive`;
                # `closed` was removed in pandas 2.0.
                self.date_range = pd.date_range(inclusive=closed, **range_kwargs)
            except TypeError:
                # Older pandas only knows the `closed` keyword.
                self.date_range = pd.date_range(closed=closed, **range_kwargs)

        print(self.date_range)

        self.max_n_splits = max(0, len(self.date_range) - 1)
        if self.max_n_splits == 0:
            raise ValueError("Provided parametrs set an empty date range.")

    def _clip_date_range(self, df_datetime):
        """Return the fold boundaries that lie within the span of ``df_datetime``."""
        return self.date_range[(self.date_range >= df_datetime.min()) &
                               (self.date_range <= df_datetime.max())]

    @staticmethod
    def _drop_cold(df, column, train_idx, test_idx, test_mask):
        """Remove test rows whose ``column`` value never occurs in the train part.

        Returns the reduced ``test_idx``, the matching boolean ``test_mask``,
        the array of cold values and the index of the removed rows.
        """
        cold_values = np.setdiff1d(
            df.loc[test_idx, column].unique(),
            df.loc[train_idx, column].unique())
        cold_idx = df.index[test_mask & df[column].isin(cold_values)]
        test_idx = np.setdiff1d(test_idx, cold_idx)
        test_mask = df.index.isin(test_idx)
        return test_idx, test_mask, cold_values, cold_idx

    def split(self,
              df,
              user_column='user_id',
              item_column='item_id',
              datetime_column='date',
              fold_stats=False):
        """Yield ``(train_idx, test_idx, fold_info)`` for every fold.

        Parameters
        ----------
        df : pandas.DataFrame
            Interaction log; the returned indices are labels of ``df.index``.
        user_column, item_column, datetime_column : str
            Column names holding user ids, item ids and timestamps.
        fold_stats : bool, default False
            When True, ``fold_info`` also contains the train/test sizes and
            the number of rows removed by each enabled filter.

        Yields
        ------
        tuple
            ``train_idx`` and ``test_idx`` are index labels of ``df``;
            ``fold_info`` is a dict that always holds 'Start date' and
            'End date'.
        """
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()
        date_range = self._clip_date_range(df_datetime)

        # Consecutive boundary pairs: (d0, d1), (d1, d2), ...
        for start, end in zip(date_range[:-1], date_range[1:]):
            fold_info = {
                'Start date': start,
                'End date': end
            }
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            if fold_stats:
                fold_info['Train'] = len(train_idx)

            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]

            if self.filter_cold_users:
                test_idx, test_mask, new, new_idx = self._drop_cold(
                    df, user_column, train_idx, test_idx, test_mask)
                if fold_stats:
                    fold_info['New users'] = len(new)
                    fold_info['New users interactions'] = len(new_idx)

            if self.filter_cold_items:
                test_idx, test_mask, new, new_idx = self._drop_cold(
                    df, item_column, train_idx, test_idx, test_mask)
                if fold_stats:
                    fold_info['New items'] = len(new)
                    fold_info['New items interactions'] = len(new_idx)

            if self.filter_already_seen:
                user_item = [user_column, item_column]
                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index
                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index
                intersection = train_pairs.intersection(test_pairs)
                # test_pairs is aligned with test_idx, so the mask can be applied directly.
                test_idx = test_idx[~test_pairs.isin(intersection)]
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)

            if fold_stats:
                fold_info['Test'] = len(test_idx)

            yield (train_idx, test_idx, fold_info)

    def get_n_splits(self, df, datetime_column='date'):
        """Return the number of folds available for ``df``.

        Only the rows dated on or after ``train_min_date`` (when set) are used
        to determine the time span covered by the data.
        """
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            df_datetime = df_datetime[df_datetime >= self.train_min_date]

        date_range = self._clip_date_range(df_datetime)

        return max(0, len(date_range) - 1)

In [None]:
# Order the interactions chronologically (index labels are kept as they are).
df = df.sort_values('start_date')

# CV_FOLDS folds of TEST_N_DAY days each, ending at the last (normalised) day of the data.
start_date = df['start_date'].max().normalize() - pd.Timedelta(days=CV_FOLDS) * TEST_N_DAY
# The frequency uses the canonical upper-case 'D' alias; the lower-case 'd'
# spelling is deprecated in recent pandas releases.
cv = TimeRangeSplit(start_date=start_date, periods=CV_FOLDS+1, freq=f'{TEST_N_DAY}D',
                    filter_cold_users=False) # the test set contains new users, so keep them in validation to keep validation and test scores correlated
# Pull the first fold once as a smoke test of the splitter.
next(cv.split(df, datetime_column='start_date'));

In [None]:
# Materialise all folds once, collecting per-fold statistics.
cvs = list(cv.split(
    df, 
    user_column='user_id',
    item_column='item_id',
    datetime_column='start_date',
    fold_stats=True
))

# One row of statistics per fold.
folds_info_with_stats = pd.DataFrame([info for _, _, info in cvs])
folds_info_with_stats

In [None]:
# Print the shape and the number of distinct users of the train and validation parts of every fold.
for train_idx, val_idx, _ in cv.split(df.sort_values('start_date'), datetime_column='start_date'):
    train, val = df.loc[train_idx], df.loc[val_idx]
    print(train.shape, train.user_id.nunique(), end='\t')
    print(val.shape, val.user_id.nunique(), end='\n------------\n')

In [None]:
# Shape of the sample submission and number of distinct ids in it.
submit.shape, submit.Id.nunique()

In [None]:
# Use the first fold as the working train/validation pair and check its sizes
# against the collected fold statistics.
train_idx, val_idx, _ = cvs[0]
train, val = df.loc[train_idx], df.loc[val_idx]

print(train.start_date.max(), val.start_date.min())
assert folds_info_with_stats['Train'][0] == train.shape[0]
assert folds_info_with_stats['Test'][0] == val.shape[0]
train.head()

# Popular model

In [None]:
def metrics_map(val, recs):
    """Compute mean average precision of ``recs`` against ``val`` and print it.

    Parameters
    ----------
    val : pandas.DataFrame
        Held-out interactions with ``user_id`` and ``item_id`` columns.
    recs : pandas.DataFrame
        One row per user: ``user_id`` and ``item_id``, where ``item_id`` holds
        the ordered list of recommended items.

    Returns
    -------
    float
        Sum over users of the average precision at the hit positions, divided
        by the number of distinct users in ``val``.
    """
    n_users = val.user_id.nunique()
    pair_keys = ['user_id', 'item_id']

    # One row per (user, recommended item); rank is the 1-based position in the list.
    ranked = recs.explode('item_id')
    ranked['rank'] = ranked.groupby('user_id').cumcount() + 1

    # Attach the rank to every held-out pair (NaN when the item was not
    # recommended) and order the hits of each user by rank.
    scored = (
        val.set_index(pair_keys)
           .join(ranked.set_index(pair_keys))
           .sort_values(by=['user_id', 'rank'])
    )
    scored['users_item_count'] = scored.groupby(['user_id'], sort=False)['rank'].transform(np.size)

    # Precision at each hit: (number of hits so far) / (rank of this hit).
    hits_so_far = scored.groupby(level='user_id').cumcount() + 1
    scored['cumulative_rank'] = hits_so_far
    scored['cumulative_rank'] = scored['cumulative_rank'] / scored['rank']

    mapN = (scored["cumulative_rank"] / scored["users_item_count"]).sum() / n_users
    print(f"MAP@{TOP_N} = {mapN}")
    return mapN

In [None]:
class PopularRecommender():
    """Recommend the most frequent items of the most recent ``days`` days.

    Without ``groupby`` every user receives the same global top list. With
    ``groupby`` (user-feature column names) a separate top list is built for
    every group of users and padded with the global list.

    Parameters
    ----------
    max_K : int, default 100
        Maximum number of items kept per popularity list.
    days : int, default 30
        Length of the look-back window, counted from the last day in the data.
    user_column, item_column, dt_column : str
        Column names of the interaction frame.
    groupby : list of str or str, optional
        User-feature columns that define the groups.
    fit_na_as_common : bool, default True
        Give groups whose key is a missing-value label (``'nan'`` or
        ``'<column>_nan'``) the global list instead of a group-specific one.
    """
    def __init__(self, max_K=100, days=30, user_column='user_id', item_column='item_id', dt_column='date',
                groupby=None, fit_na_as_common=True):
        self.max_K = max_K
        self.days = days
        self.user_column = user_column
        self.item_column = item_column
        self.dt_column = dt_column
        self.groupby = groupby
        self.fit_na_as_common = fit_na_as_common
        self.recommendations = []

    def _group_columns(self):
        """Return ``self.groupby`` as a list of column names."""
        if isinstance(self.groupby, str):
            return [self.groupby]
        return list(self.groupby)

    def fit(self, df, df_users=None):
        """Build the popularity lists from the interactions in ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Interactions with the user, item and datetime columns.
        df_users : pandas.DataFrame, optional
            User features; required when ``groupby`` is set.
        """
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        data = df[df[self.dt_column] > min_date]
        recomm_common = data[self.item_column].value_counts().head(self.max_K).index.values
        self.recomm_common = recomm_common
        self.df_users = df_users

        if self.groupby is None:
            self.recommendations = recomm_common
            return None

        if df_users is None:
            print('No df_users')
            return None

        group_cols = self._group_columns()
        data = data.merge(df_users, on=self.user_column, how='left')
        grouped = data.groupby(group_cols)[self.item_column]\
                      .apply(lambda x: x.value_counts().head(self.max_K).index.values)

        # Groups whose key marks a missing feature value. Both the plain 'nan'
        # label and the '<column>_nan' label are recognised.
        na_mask = np.zeros(len(grouped), dtype=bool)
        if self.fit_na_as_common:
            keys = grouped.index.to_frame(index=False).astype(str)
            for col in group_cols:
                na_mask |= keys[col].isin(['nan', f'{col}_nan']).values

        # Filled element by element so that NumPy never tries to stack the
        # per-group arrays into a 2-D array.
        values = np.empty(len(grouped), dtype=object)
        for pos, (group_recs, is_na) in enumerate(zip(grouped.values, na_mask)):
            if is_na or not isinstance(group_recs, np.ndarray):
                # Missing-value group, or a group without any interactions.
                values[pos] = recomm_common
            else:
                # Pad short group lists with the global list; pd.unique keeps
                # the order and removes items already present in the group list.
                values[pos] = pd.unique(np.concatenate((group_recs, recomm_common)))
        self.recommendations = pd.Series(values, index=grouped.index, name=grouped.name)

    def recommend(self, users=None, N=10):
        """Return the top-``N`` recommendations.

        Parameters
        ----------
        users : pandas.Series, optional
            User ids (the Series must be named like ``user_column``). Required
            when ``groupby`` is set.
        N : int, default 10
            Number of items per user.

        Returns
        -------
        numpy.ndarray, list of numpy.ndarray or None
            The global top-``N`` when ``users`` is None, otherwise one array
            per user in the order of ``users``. ``None`` when ``groupby`` is
            set but ``users`` is missing.
        """
        if self.groupby is None:
            recs = self.recommendations[:N]
            if users is None:
                return recs
            # Every user receives the same global list.
            return [recs] * len(users)

        if users is None:
            print('For recomendations based on groupby needs used_id')
            return None

        group_cols = self._group_columns()
        recoms = self.recommendations.reset_index()
        recoms[group_cols] = recoms[group_cols].astype('category')
        # Attach the user features that define the group ...
        data = users.to_frame().merge(self.df_users, on=self.user_column, how='left')
        # ... then the recommendations of that group.
        data = data.merge(recoms, on=group_cols, how='left')
        # Users from a group that was not seen during fit get the global list.
        fallback = self.recomm_common[:N]
        return [
            recs[:N] if isinstance(recs, np.ndarray) else fallback
            for recs in data.iloc[:, -1]
        ]

## Cross-val

In [None]:
def model_Popular_cv(model):
    """Cross-validate ``model`` on the precomputed folds in ``cvs``.

    For every fold the model is fitted on the train rows of ``df`` (together
    with ``df_users``), asked for ``TOP_N`` items for every user in the
    validation rows, and scored with ``metrics_map``. The mean and standard
    deviation of the fold scores are printed.

    Returns
    -------
    list
        The score of every fold, in fold order.
    """
    fold_scores = []
    for fold_train_idx, fold_val_idx, _ in cvs:
        fold_train = df.loc[fold_train_idx]
        fold_val = df.loc[fold_val_idx]

        model.fit(fold_train, df_users)

        # One recommendation list per distinct validation user.
        fold_recs = pd.DataFrame({'user_id': fold_val['user_id'].unique()})
        fold_recs['item_id'] = model.recommend(fold_recs['user_id'], N=TOP_N)

        fold_scores.append(metrics_map(fold_val, fold_recs))

    score_mean = np.mean(fold_scores)
    print(f'mean MAP@10 = {score_mean}')
    print(f'std MAP@10 = {np.std(fold_scores)}')

    return fold_scores

def model_Popular_to_submit(model, name=''):
    """Fit ``model`` on all interactions and write a submission file.

    The model is fitted on ``df`` and ``df_users``; ``TOP_N`` items are
    requested for every id in ``submit['Id']``, stored space-separated in
    ``submit['Predicted']`` and saved to ``submit<name>.csv``.

    Parameters
    ----------
    model : object
        Recommender exposing ``fit(df, df_users)`` and ``recommend(users, N)``.
    name : str, default ''
        Suffix inserted into the output file name.

    Returns
    -------
    pandas.DataFrame
        The module-level ``submit`` frame, modified in place.
    """
    model.fit(df, df_users)

    # The recommender expects a Series of ids named 'user_id'.
    submit_users = submit['Id'].copy().rename('user_id')
    submit['Predicted'] = model.recommend(submit_users, N=TOP_N)

    def _as_text(items):
        # Space-separated item ids; an empty string when there is no list.
        if items is not np.nan:
            return ' '.join(map(str, items))
        return ''

    submit['Predicted'] = submit.Predicted.apply(_as_text)
    submit.to_csv(f'submit{name}.csv', index=False)
    return submit

In [None]:
# Switch off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# results = []
# for days in tqdm(range(21, 33)):
#     params['days'] = days
#     model = PopularRecommender(**params)
#     metrics = model_Popular_cv(model, temp, drop_known=True)
#     to_add = dict(days=days, 
#                   map_mean = np.mean(metrics),
#                   map_std=np.std(metrics))
#     results.append(to_add)
    
# pd.DataFrame(results).sort_values('map_mean', ascending=False)

In [None]:
# Global popularity baseline: no grouping, 21-day look-back window.
params = dict(groupby=None,
              fit_na_as_common=False,
              days = 21)

model = PopularRecommender(dt_column='start_date', 
                           **params)
# Per-fold scores of the baseline.
metrics = model_Popular_cv(model)

In [None]:
# Popularity model grouped by 'genre', fitted on all interactions.
# NOTE(review): grouping happens on df merged with df_users, so df_users must provide a
# 'genre' column; the user preprocessing visible in this notebook only handles 'age' and 'sex' - confirm.
model = PopularRecommender(days= 21 , groupby=['genre'], dt_column='start_date')
model.fit(df, df_users)

## BM25Recommender model

In [None]:
# users_inv_mapping: dense index -> user id (in order of first appearance in df);
# users_mapping is the reverse lookup, user id -> dense index.
users_inv_mapping = dict(enumerate(df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

# The same pair of lookups for items.
items_inv_mapping = dict(enumerate(df['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

# Item id -> title, taken from the item table.
item_titles = pd.Series(df_items['title'].values, index=df_items['id']).to_dict()

len(users_mapping), len(items_mapping)

In [None]:
def get_coo_matrix(df,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None,
                   users_mapping=users_mapping,
                   items_mapping=items_mapping):
    """Build a sparse user-item interaction matrix in COO format.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions.
    user_col, item_col : str
        Columns holding the raw user and item ids.
    weight_col : str, optional
        Column with the interaction weights (cast to float32). When omitted
        every interaction gets weight 1.
    users_mapping, items_mapping : dict
        Raw id -> row / column index. The defaults are the module-level
        mappings as they exist when this function is defined.

    Returns
    -------
    scipy.sparse.coo_matrix
        The shape is not passed explicitly, so it is inferred by SciPy from
        the row and column indices that are present.
    """
    if weight_col is not None:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    row_idx = df[user_col].map(users_mapping.get)
    col_idx = df[item_col].map(items_mapping.get)

    return sp.coo_matrix((weights, (row_idx, col_idx)))

In [None]:
# Interaction matrix of the first fold's train part, in CSR format.
train_mat = get_coo_matrix(train).tocsr()
# The shape is inferred from the data, so it must equal the largest mapped
# user / item index plus one.
assert train_mat.shape == (train['user_id'].map(users_mapping.get).max()+1,
                    train['item_id'].map(items_mapping.get).max()+1)
train_mat

In [None]:
# Interaction matrix of all interactions, in CSR format.
mat = get_coo_matrix(df).tocsr()

In [None]:
from implicit.nearest_neighbours import BM25Recommender

In [None]:
model = BM25Recommender(K=10)
# NOTE(review): the orientation expected by fit() depends on the installed `implicit` release - confirm.
model.fit(mat.T) # fit on the transposed interaction matrix

In [None]:
# For new users we fall back to the overall popular recommendation, hence no grouping.
pop_model = PopularRecommender(days=21, dt_column='start_date')
pop_model.fit(df, df_users)
recomm_mapper = pop_model.recomm_common

In [None]:
def generate_implicit_recs_mapper(model, train_matrix, N, user_mapping, item_inv_mapping, recomm_mapper,
                                  thres=20, user_to_lib=None):
    """Build a ``user -> recommended item ids`` function on top of an implicit model.

    The returned function falls back to the first ``N`` entries of
    ``recomm_mapper`` (the popular items) when

    * the user is not in ``user_mapping``,
    * the user's row index lies outside ``train_matrix``, or
    * the user has fewer than ``thres`` interactions.

    Otherwise it calls ``model.recommend`` and maps the item indices back to
    raw item ids.

    Parameters
    ----------
    model : object
        Fitted recommender exposing
        ``recommend(user_id, train_matrix, N=..., filter_already_liked_items=...)``
        that returns an iterable of ``(item_index, score)`` pairs.
    train_matrix : scipy.sparse matrix
        User-item interaction matrix.
    N : int
        Number of items to return.
    user_mapping : dict
        Raw user id -> row index.
    item_inv_mapping : dict
        Column index -> raw item id.
    recomm_mapper : sequence
        Popular items used as the fallback.
    thres : int, default 20
        Minimum number of interactions required to use the model.
    user_to_lib : dict, optional
        Raw user id -> collection of that user's items. When given, users
        present in it with fewer than ``thres`` items get the fallback. When
        omitted, the interaction count is taken from the number of stored
        entries in the user's row of ``train_matrix``.

    Returns
    -------
    callable
        ``f(user) -> list or array`` of recommended raw item ids.
    """
    fallback = recomm_mapper[:N]
    n_rows = train_matrix.shape[0]
    history_sizes = None
    if user_to_lib is None:
        # Number of stored entries per row of the CSR representation.
        history_sizes = np.diff(train_matrix.tocsr().indptr)

    def _recs_mapper(user):
        try:
            user_id = user_mapping[user]
        except KeyError:
            return fallback
        if user_id >= n_rows:
            return fallback
        if user_to_lib is not None:
            too_few = user in user_to_lib and len(user_to_lib[user]) < thres
        else:
            too_few = history_sizes[user_id] < thres
        if too_few:
            return fallback
        recs = model.recommend(user_id,
                               train_matrix,
                               N=N,
                               filter_already_liked_items=True)
        return [item_inv_mapping[item] for item, _ in recs]
    return _recs_mapper


def generate_implicit_recs_mapper_submit(model, train_matrix, N, user_mapping, item_inv_mapping, recomm_mapper,
                                         thres=20, user_to_lib=None):
    """Same as ``generate_implicit_recs_mapper``; kept as a separate name for existing callers."""
    return generate_implicit_recs_mapper(model, train_matrix, N, user_mapping, item_inv_mapping,
                                         recomm_mapper, thres=thres, user_to_lib=user_to_lib)

In [None]:
# Recommendation function for the submission: BM25 model on the full matrix,
# 10 items per user, popular items as the fallback (threshold of 10 interactions).
mapper = generate_implicit_recs_mapper(model, mat, 10, 
                                       users_mapping, items_inv_mapping, recomm_mapper, thres=10)


In [None]:
# One list of recommended item ids per submission id.
submit['Predicted'] = submit['Id'].map(mapper)

In [None]:
# Serialise every list as a space-separated string (empty string when there is no list).
submit['Predicted'] = submit.Predicted.apply(lambda x: ' '.join(map(str, x)) if x is not np.nan else '')

In [None]:
# Write the submission file.
submit.to_csv('ensemble.csv', index=False)

In [None]:
submit