In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Модель второго уровня
from lightgbm import LGBMClassifier

In [121]:
def prefilter_items(data_in, item_features, take_n_popular):
    data = data_in.copy()
    

    # Уберем самые популярные товары (их и так купят)
    data_grp_by_item = data.groupby('item_id')
    popularity = data_grp_by_item['user_id'].nunique().reset_index()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    popularity['share_unique_users'] = popularity['share_unique_users'] / data['user_id'].nunique()
    
    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]
    
    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]
    
    # Уберем товары, которые не продавались за последние 12 месяцев
    data_grp_by_item = data.groupby('item_id')

    last_week_in_data = data['week_no'].max()
    item_last_sell_week = data_grp_by_item['week_no'].max().reset_index()
    item_last_sell_week.rename(columns={'week_no': 'last_week_no'}, inplace=True)
    not_trending_items = \
        item_last_sell_week[item_last_sell_week['last_week_no'] < \
                       (last_week_in_data - 52)].item_id.to_list()
    
    data = data[~data['item_id'].isin(not_trending_items)]

    # Уберем не интересные для рекоммендаций категории (department)
    not_iteresing_department = ['MISC. TRANS.', 
                                'VIDEO RENTAL', 
                                'KIOSK-GAS', 
                                'MISC SALES TRAN', 
                                'POSTAL CENTER', 
                                'RX', 
                                'HBC']

    data_item = data.merge(item_features,
           on='item_id',
           how='inner')
    items_in_not_iteresing_department = \
        data_item[data_item['department'].isin(not_iteresing_department)].item_id.to_list()

    data = data[~data['item_id'].isin(items_in_not_iteresing_department)]

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб. 
    
    # Уберем слишком дорогие товарыs
    data_grp_by_item = data.groupby('item_id')
    item_sum_quantity = data_grp_by_item['quantity'].sum().reset_index()
    item_sum_quantity.rename(columns={'quantity': 'sum_quantity'}, inplace=True)

    item_sum_sales_value = data_grp_by_item['sales_value'].sum().reset_index()
    item_sum_sales_value.rename(columns={'sales_value': 'sum_sales_value'}, inplace=True)

    item_sum_quantity = data_grp_by_item['quantity'].sum().reset_index()
    item_sum_quantity.rename(columns={'quantity': 'sum_quantity'}, inplace=True)
    item_with_no_quantity = item_sum_quantity[item_sum_quantity['sum_quantity'] == 0].item_id.to_list()

    item_sum_quantity = \
        item_sum_quantity[~item_sum_quantity['item_id'].isin(item_with_no_quantity)]
    item_sum_sales_value = \
        item_sum_sales_value[~item_sum_sales_value['item_id'].isin(item_with_no_quantity)]

    item_mean_sales_value = item_sum_sales_value['item_id'].to_frame()
    item_mean_sales_value['mean_sum_sales_value'] = item_sum_sales_value['sum_sales_value'] / item_sum_quantity['sum_quantity']

    q1_mean_sum_sales_value = item_mean_sales_value['mean_sum_sales_value'].quantile(q=0.05)
    q2_mean_sum_sales_value = item_mean_sales_value['mean_sum_sales_value'].quantile(q=0.95)

    item_with_acceptible_price = \
        item_mean_sales_value[(item_mean_sales_value['mean_sum_sales_value'] < q2_mean_sum_sales_value) & \
                            (item_mean_sales_value['mean_sum_sales_value'] > q1_mean_sum_sales_value)]\
                            .item_id.to_list()

    data = data[data['item_id'].isin(item_with_acceptible_price)]
    

    # Топ N товаров среди оставшихся
    data_grp_by_item = data.groupby('item_id')
    popularity = data_grp_by_item['user_id'].nunique().reset_index()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    popularity['share_unique_users'] = popularity['share_unique_users'] / data['user_id'].nunique()
    top_popular_n = \
        popularity.sort_values(by='share_unique_users', ascending=False)[:take_n_popular].item_id.to_list()
    
    # data = data[data['item_id'].isin(top_popular_n)]

    return data, top_popular_n

In [122]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    
    def __init__(self, data, top_popular_n, weighting=True):
        
        # your_code. Это не обязательная часть. Но если вам удобно что-либо посчитать тут - можно это сделать
        self.data_in = data
        self.top_popular = top_popular_n

        self.user_item_matrix = self.prepare_matrix(data, top_popular_n)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)
        
        if weighting:
            bm25_weight_user_item_matrix = bm25_weight(self.user_item_matrix.T).T
            self.model_ALS = self.fit(bm25_weight_user_item_matrix)
            self.own_recommender = self.fit_own_recommender(bm25_weight_user_item_matrix)
        else:
            self.model_ALS = self.fit(self.user_item_matrix)
            self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
     
    @staticmethod
    def prepare_matrix(data_in, top_popular_n):
        
        # your_code

        data_in.loc[~data_in['item_id'].isin(top_popular_n), 'item_id'] = 999999

        user_item_matrix = pd.pivot_table(data_in, 
                                          index='user_id', columns='item_id', 
                                          values='quantity', # Можно пробоват ьдругие варианты
                                          aggfunc='count', 
                                          fill_value=0
                                         )

        user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit
        
        return user_item_matrix
    
    
    

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""

        model_ALS = AlternatingLeastSquares(factors=n_factors,
                                            regularization=regularization,
                                            iterations=iterations,
                                            num_threads=num_threads)
        model_ALS.fit(csr_matrix(user_item_matrix).tocsr())

        return model_ALS

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():
            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

#     def _extend_with_top_popular(self, recommendations, N):
#         """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

#         if len(recommendations) < N:
#             recommendations.extend(self.overall_top_purchases[:N])
#             recommendations = recommendations[:N]

#         return recommendations
    
    def _extend_with_top_popular(self, recommendations, N):
        """Если количество рекомендаций меньше N, то дополняем их топ-популярными"""
        
        max_top_popular_len = len(self.top_popular)
        recommendations = list(recommendations)
        if len(recommendations) < N:
            if N <= max_top_popular_len:
                top_popular = [rec for rec in self.top_popular[:N] if rec not in recommendations]
                recommendations.extend(top_popular)
                recommendations = recommendations[:N]
            else:
                recommendations = recommendations[:max_top_popular_len]
        return recommendations    
    

    def _get_recommendations(self, user, model, N):
        """Рекомендации через стардартные библиотеки implicit"""
        res = []

        self._update_dict(user_id=user)
        try:
            res = [self.id_to_itemid[rec] for rec in model.recommend(userid=self.userid_to_id[user],
                                                                 user_items=csr_matrix(self.user_item_matrix).tocsr()[
                                                                     self.userid_to_id[user]],
                                                                 N=N,
                                                                 filter_already_liked_items=False,
                                                                 filter_items=[self.itemid_to_id[999999]],
                                                                 recalculate_user=True)[0]]
        except IndexError:
            res = []
        except KeyError:
            res = [self.id_to_itemid[rec] for rec in model.recommend(userid=self.userid_to_id[user],
                                                                 user_items=csr_matrix(self.user_item_matrix).tocsr()[
                                                                     self.userid_to_id[user]],
                                                                 N=N,
                                                                 filter_already_liked_items=False,
                                                                 #filter_items=[self.itemid_to_id[999999]],
                                                                 recalculate_user=True)[0]]
            
        
        
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) >= N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model_ALS, N=N)

    def get_own_recommendations(self, user, N):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

In [123]:
def precision_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    

    bought_list = bought_list  # Тут нет [:k] !!

    try:
        if k < len(recommended_list):
            recommended_list = recommended_list[:k]
    except TypeError:
        recommended_list = [0]

    flags = np.isin(bought_list, recommended_list)

    precision = flags.sum() / len(recommended_list)

    return precision

In [124]:
def recall_at_k(recommended_list, bought_list, k=5):

    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)

    try:
        if k < len(recommended_list):
            recommended_list = recommended_list[:k]
    except TypeError:
        recommended_list = []

    flags = np.isin(bought_list, recommended_list)
    recall = flags.sum() / len(bought_list)

    return recall

In [126]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# # Важна схема обучения и валидации!
# # -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# # подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
# val_lvl_1_size_weeks = 6
# val_lvl_2_size_weeks = 3

# data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
# data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
#                       (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

# data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
# data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1 = data.copy()
data_train_lvl_1

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2396799,1613,41655820646,663,16102849,1,2.00,3262,-1.15,1231,95,0.0,0.0
2396800,1001,41655829421,663,13217063,1,1.69,3131,0.00,2231,95,0.0,0.0
2396801,1001,41655829421,663,13217800,1,1.69,3131,0.00,2231,95,0.0,0.0
2396802,1167,41656790510,663,6410462,22451,43.98,3385,-0.65,1059,95,0.0,0.0


In [127]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1, top_n_popular = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 10902


In [128]:
recommender = MainRecommender(data_train_lvl_1, top_n_popular)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [129]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(4)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [130]:
%%time
    
result_lvl_1['own_k50'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
result_lvl_1

CPU times: total: 6min 45s
Wall time: 6min 55s


Unnamed: 0,user_id,actual,own_k50
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 5577022, 963835, 9655212, 940947, 108..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[963835, 1075368, 911974, 922561, 827683, 8090..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[891423, 1075368, 910109, 6773204, 883932, 963..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1119051, 13003092, 963835, 1037863, 8203834, ..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[963835, 1122358, 6944571, 949836, 922561, 660..."
...,...,...,...
2149,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[963835, 855672, 1056509, 1029112, 7441210, 55..."
2150,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[1066685, 1135834, 896938, 965719, 1031864, 91..."
2151,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[963835, 1070820, 1100379, 1126899, 901776, 82..."
2152,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[1070820, 963835, 873964, 1085604, 5570048, 83..."


In [133]:
result_lvl_1.apply(lambda row: recall_at_k(row['own_k50'], row['actual']), axis=1).mean()

0.04403923938599829

In [134]:
result_lvl_1.apply(lambda row: precision_at_k(row['own_k50'], row['actual']), axis=1).mean()

0.34317548746518106

Подготовка матрицы для тренировки модели второго уровня LGBMClassifier, бинарная классификация - предсказание рекомендуемых item

In [135]:
def create_matrix_target(users_lvl_2, data_train_lvl_2, name_column = 'own_k50'):
    # сформируем матрицу user и item на основании данных первой модели itemitemrecommender + топовые item
    s = users_lvl_2.apply(lambda x: pd.Series(x[name_column]), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'
    users_lvl_2 = users_lvl_2.drop(name_column, axis=1).join(s)
    
    # матрица таргетов из тренировочной выборки
    targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
    targets_lvl_2['target'] = 1  # тут только покупки 
    
    #к матрице полученной по результатам первой модели добавим таргеты из тренировочной выборки
    targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
    
    targets_lvl_2['target'].fillna(0, inplace= True)
    targets_lvl_2['item_id'] = targets_lvl_2['item_id'].astype(int)
    
    return targets_lvl_2   

In [136]:
users_lvl_2 = result_lvl_1.drop('actual', axis=1)
users_lvl_2.head(4)

Unnamed: 0,user_id,own_k50
0,1,"[856942, 5577022, 963835, 9655212, 940947, 108..."
1,2,"[963835, 1075368, 911974, 922561, 827683, 8090..."
2,4,"[891423, 1075368, 910109, 6773204, 883932, 963..."
3,6,"[1119051, 13003092, 963835, 1037863, 8203834, ..."


In [137]:
targets_lvl_2 = create_matrix_target(users_lvl_2, data_train_lvl_1)
targets_lvl_2

Unnamed: 0,user_id,item_id,target
0,1,856942,1.0
1,1,856942,1.0
2,1,856942,1.0
3,1,856942,1.0
4,1,856942,1.0
...,...,...,...
481199,2500,1110572,1.0
481200,2500,1110572,1.0
481201,2500,1137507,1.0
481202,2500,849098,1.0


In [138]:
targets_lvl_2['target'].mean()

0.9812511949194105

In [139]:
def create_matrix_predict_LGB(X_train, train_preds, train_preds_proba):
    
    predict_proba_item = X_train[['user_id', 'item_id']].copy()
    predict_proba_item['predict'] = train_preds
    predict_proba_item['proba'] = train_preds_proba[:,0]
    predict_proba_item = predict_proba_item.drop(predict_proba_item[predict_proba_item['predict']==0].index, axis=0)
    predict_proba_item.sort_values(by='proba', ascending=False, inplace=True)
    return predict_proba_item.groupby('user_id')['item_id'].unique().reset_index()

Добавим к трейну матрицу с фичами из файлов

In [140]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [141]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [142]:
targets_lvl_2

Unnamed: 0,user_id,item_id,target
0,1,856942,1.0
1,1,856942,1.0
2,1,856942,1.0
3,1,856942,1.0
4,1,856942,1.0
...,...,...,...
481199,2500,1110572,1.0
481200,2500,1110572,1.0
481201,2500,1137507,1.0
481202,2500,849098,1.0


In [143]:
def features_generator(data_train_lvl_2, item_features, user_features, targets_lvl_2):
    targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
    targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')
    
    data_train_lvl_2_features = data_train_lvl_2.merge(item_features, on='item_id', how='left')
    data_train_lvl_2_features = data_train_lvl_2_features.merge(user_features, on='user_id', how='left')
    
    #Фича user_id - средний чек для пользователя
    mean_sales_value_userid = data_train_lvl_2_features.groupby('user_id')['sales_value'].mean().reset_index()
    mean_sales_value_userid.columns=['user_id', 'mean_sal_val_uid']
    
    #Фича user_id - кол-во покупок в каждой категории
    sum_quantity_department_userid = data_train_lvl_2_features.groupby(by=['user_id', 'department'])['quantity'].sum().reset_index()
    sum_quantity_department_userid.columns=['user_id', 'department', 'sum_qt_dep_uid']
    
    #Фича item_id - Среднее кол-во покупок в неделю
    mean_quantity_week_itemid = data_train_lvl_2_features.groupby(by=['item_id', 'week_no'])['quantity'].sum().reset_index()
    mean_quantity_week_itemid = mean_quantity_week_itemid.groupby(by=['item_id'])['quantity'].mean().reset_index()
    mean_quantity_week_itemid.columns=['item_id', 'mean_qt_week_item']
    
    #Фича item_id - Средняя цена товара в категории
    mean_sales_value_department_itemid = data_train_lvl_2_features.groupby(by=['department'])['sales_value'].mean().reset_index()
    mean_sales_value_department_itemid.columns=['department', 'mean_sal_val_dep']
    
    #Фичи пары user_id - item_id - Среднее кол-во покупок всеми юзерами конкретной категории в неделю
    mean_quantity_department_week_no_userid_itemid = data_train_lvl_2_features.groupby(by=['department', 'week_no'])['quantity'].sum().reset_index()
    mean_quantity_department_week_no_userid_itemid = mean_quantity_department_week_no_userid_itemid.groupby(by=['department'])['quantity'].mean().reset_index()
    mean_quantity_department_week_no_userid_itemid.columns=['department', 'mean_qt_dep_week']
    
    #Фичи пары user_id - item_id - Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    mean_sales_value_department_userid_itemid = data_train_lvl_2_features.groupby(by=['item_id', 'department'])['sales_value'].mean().reset_index()
    mean_sales_value_department_userid_itemid.columns=['item_id', 'department', 'mean_sal_val_dep_itemid']
    
    targets_lvl_2_features = targets_lvl_2.merge(mean_sales_value_userid, on='user_id', how='left')
    targets_lvl_2_features = targets_lvl_2_features.merge(sum_quantity_department_userid, on=['user_id', 'department'], how='left')
    targets_lvl_2_features = targets_lvl_2_features.merge(mean_quantity_week_itemid, on=['item_id'], how='left')
    targets_lvl_2_features = targets_lvl_2_features.merge(mean_sales_value_department_itemid, on=['department'], how='left')
    targets_lvl_2_features = targets_lvl_2_features.merge(mean_quantity_department_week_no_userid_itemid, on=['department'], how='left')
    targets_lvl_2_features = targets_lvl_2_features.merge(mean_sales_value_department_userid_itemid, on=['item_id', 'department'], how='left')
    
    X_train = targets_lvl_2_features.drop('target', axis=1)
    y_train = targets_lvl_2_features[['target']]
    cat_feats = X_train.columns[2:].tolist()
    X_train[cat_feats] = X_train[cat_feats].astype('category')

    return X_train, y_train
    
    

In [144]:
X_train, y_train = features_generator(data_train_lvl_1, item_features, user_features, targets_lvl_2)

In [145]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'mean_sal_val_uid',
 'sum_qt_dep_uid',
 'mean_qt_week_item',
 'mean_sal_val_dep',
 'mean_qt_dep_week',
 'mean_sal_val_dep_itemid']

Обучение модели с дополнительными фичами

In [146]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [147]:
train_preds = lgb.predict(X_train)
train_preds.mean()

0.981176382573711

In [148]:
train_preds_proba = lgb.predict_proba(X_train)
train_preds_proba

array([[1.68075438e-04, 9.99831925e-01],
       [1.68075438e-04, 9.99831925e-01],
       [1.68075438e-04, 9.99831925e-01],
       ...,
       [9.02768347e-05, 9.99909723e-01],
       [9.54584421e-05, 9.99904542e-01],
       [8.19851922e-05, 9.99918015e-01]])

In [149]:
predict_train = create_matrix_predict_LGB(X_train, train_preds, train_preds_proba)
predict_train

Unnamed: 0,user_id,item_id
0,1,"[1005186, 1104349, 1135834, 1105433, 911454, 1..."
1,2,"[900370, 5569230, 1106523, 909396, 989101, 108..."
2,4,"[5569230, 883404, 1096317, 1038692, 835618, 90..."
3,6,"[1024306, 845208, 849843, 5569230, 9859112, 55..."
4,7,"[949836, 1106523, 1068504, 914577, 6602729, 88..."
...,...,...
2148,2496,"[899624, 995785, 860776, 916122, 883404, 55692..."
2149,2497,"[865026, 845208, 844165, 995785, 819978, 86077..."
2150,2498,"[9859182, 961554, 1005186, 865511, 974204, 994..."
2151,2499,"[5568378, 953476, 944466, 1046689, 1014802, 94..."


In [150]:
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,own_k50
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 5577022, 963835, 9655212, 940947, 108..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[963835, 1075368, 911974, 922561, 827683, 8090..."


In [65]:
#result_train_lvl_2 = result_train_lvl_2.drop(['item_id_x', 'item_id_y'], axis=1)

In [151]:
#result_train_lvl_2 = data_train_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1 = result_lvl_1.merge(predict_train, on=['user_id'], how='left')
result_lvl_1.columns=['user_id', 'actual', 'own_k50', 'predict_train_ext_feat']
result_lvl_1

Unnamed: 0,user_id,actual,own_k50,predict_train_ext_feat
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 5577022, 963835, 9655212, 940947, 108...","[1005186, 1104349, 1135834, 1105433, 911454, 1..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[963835, 1075368, 911974, 922561, 827683, 8090...","[900370, 5569230, 1106523, 909396, 989101, 108..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[891423, 1075368, 910109, 6773204, 883932, 963...","[5569230, 883404, 1096317, 1038692, 835618, 90..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1119051, 13003092, 963835, 1037863, 8203834, ...","[1024306, 845208, 849843, 5569230, 9859112, 55..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[963835, 1122358, 6944571, 949836, 922561, 660...","[949836, 1106523, 1068504, 914577, 6602729, 88..."
...,...,...,...,...
2149,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[963835, 855672, 1056509, 1029112, 7441210, 55...","[899624, 995785, 860776, 916122, 883404, 55692..."
2150,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[1066685, 1135834, 896938, 965719, 1031864, 91...","[865026, 845208, 844165, 995785, 819978, 86077..."
2151,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[963835, 1070820, 1100379, 1126899, 901776, 82...","[9859182, 961554, 1005186, 865511, 974204, 994..."
2152,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[1070820, 963835, 873964, 1085604, 5570048, 83...","[5568378, 953476, 944466, 1046689, 1014802, 94..."


In [154]:
result_lvl_1.apply(lambda row: recall_at_k(row['own_k50'], row['actual']), axis=1).mean()

0.04403923938599829

In [155]:
result_lvl_1.apply(lambda row: recall_at_k(row['predict_train_ext_feat'], row['actual']), axis=1).mean()

0.03382525345393126

In [156]:
result_lvl_1.apply(lambda row: precision_at_k(row['own_k50'], row['actual']), axis=1).mean()

0.34317548746518106

In [158]:
result_lvl_1.apply(lambda row: precision_at_k(row['predict_train_ext_feat'], row['actual']), axis=1).mean()

0.2895078922934076

In [159]:
X_train

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_sal_val_uid,sum_qt_dep_uid,mean_qt_week_item,mean_sal_val_dep,mean_qt_dep_week,mean_sal_val_dep_itemid
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.435960,611.0,3.670588,2.187201,12464.126316,3.090036
1,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.435960,611.0,3.670588,2.187201,12464.126316,3.090036
2,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.435960,611.0,3.670588,2.187201,12464.126316,3.090036
3,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.435960,611.0,3.670588,2.187201,12464.126316,3.090036
4,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.435960,611.0,3.670588,2.187201,12464.126316,3.090036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481199,2500,1110572,1208,GROCERY,National,SOFT DRINKS,SFT DRNK MLT-PK BTL CARB (EXCP,20 OZ,,,...,,,,,2.602737,784.0,17.221053,2.187201,12464.126316,1.400166
481200,2500,1110572,1208,GROCERY,National,SOFT DRINKS,SFT DRNK MLT-PK BTL CARB (EXCP,20 OZ,,,...,,,,,2.602737,784.0,17.221053,2.187201,12464.126316,1.400166
481201,2500,1137507,531,GROCERY,National,CANNED JUICES,BLENDED JUICE&COMBINATIONS (OV,64 OZ,,,...,,,,,2.602737,784.0,2.862500,2.187201,12464.126316,2.382844
481202,2500,849098,759,GROCERY,National,YOGURT,YOGURT NOT MULTI-PACKS,7 OZ,,,...,,,,,2.602737,784.0,4.402597,2.187201,12464.126316,1.671990


Сделаем предсказания на test и посчитаем метрики

In [176]:
data_val_lvl_2 = pd.read_csv('retail_test.csv')
data_val_lvl_2

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.10,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
88729,98,41944918665,684,945779,2,2.00,421,0.0,1705,98,0.0,0.0
88730,98,41944918665,684,993617,2,2.00,421,0.0,1705,98,0.0,0.0
88731,98,41944918665,684,1128647,2,2.00,421,0.0,1705,98,0.0,0.0
88732,98,41944918665,684,9526886,2,0.60,421,0.0,1705,98,0.0,0.0


In [177]:
result_val_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_val_lvl_2.columns=['user_id', 'actual']
result_val_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [178]:
def create_matrix_target(users_lvl_2, data_train_lvl_2, name_column = 'own_k50'):
    # сформируем матрицу user и item на основании данных первой модели itemitemrecommender + топовые item
    s = users_lvl_2.apply(lambda x: pd.Series(x[name_column]), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'
    users_lvl_2 = users_lvl_2.drop(name_column, axis=1).join(s)
    
    # матрица таргетов из тренировочной выборки
    targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
    targets_lvl_2['target'] = 1  # тут только покупки 
    
    #к матрице полученной по результатам первой модели добавим таргеты и тренировочной выборки
    targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
    
    targets_lvl_2['target'].fillna(0, inplace= True)
    targets_lvl_2['item_id'] = targets_lvl_2['item_id'].astype(int)
    
    return targets_lvl_2   

In [179]:
users_lvl_2 = result_val_lvl_2
users_lvl_2

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663..."
...,...,...
1880,2496,"[829291, 862139, 912704, 933067, 933835, 95537..."
1881,2497,[6534178]
1882,2498,"[1053690, 1076875, 12386123, 858303, 920109, 1..."
1883,2499,"[826249, 895327, 9858944, 820321, 829291, 8323..."


In [164]:
targets_val_lvl_2 = create_matrix_target(users_lvl_2, data_val_lvl_2, name_column = 'actual')
targets_val_lvl_2

Unnamed: 0,user_id,item_id,target
0,1,880007,1
1,1,883616,1
2,1,931136,1
3,1,938004,1
4,1,940947,1
...,...,...,...
88729,2500,10204556,1
88730,2500,15801331,1
88731,2500,15831322,1
88732,2500,17169644,1


In [165]:
X_val, y_val = features_generator(data_val_lvl_2, item_features, user_features, targets_val_lvl_2)

In [173]:
X_val

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_sal_val_uid,sum_qt_dep_uid,mean_qt_week_item,mean_sal_val_dep,mean_qt_dep_week,mean_sal_val_dep_itemid
0,1,880007,1193,PASTRY,National,CAKES,CAKES: ANGEL FDS/CKE ROLLS,9 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.196389,1,1.000000,2.999346,582.000000,2.990000
1,1,883616,5612,GROCERY,National,SUGARS/SWEETNERS,SWEETENERS,100 CT,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.196389,19,2.666667,2.517678,25318.666667,3.990000
2,1,931136,709,GROCERY,National,MARGARINES,MARGARINE: TUBS AND BOWLS,16 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.196389,19,3.000000,2.517678,25318.666667,1.625000
3,1,938004,5724,GROCERY,National,BEANS - CANNED GLASS & MW,VARIETY BEANS - KIDNEY PINTO,15.5 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.196389,19,1.000000,2.517678,25318.666667,1.290000
4,1,940947,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,2.196389,3,20.333333,3.703863,1633.000000,3.259216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88729,2500,10204556,869,GROCERY,National,TEAS,TEA BAGS & BULK TEA,4.8 OZ,,,...,,,,,4.426792,40,2.000000,2.517678,25318.666667,2.923333
88730,2500,15801331,194,GROCERY,National,BAKING MIXES,BROWNIE MIX,18.3 OZ,,,...,,,,,4.426792,40,2.000000,2.517678,25318.666667,1.990000
88731,2500,15831322,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,40 CT,,,...,,,,,4.426792,40,1.000000,2.517678,25318.666667,5.090000
88732,2500,17169644,1136,GROCERY,National,HOUSEHOLD CLEANG NEEDS,TOOLS - FLOOR & FURNITURE,1 CT,,,...,,,,,4.426792,40,2.000000,2.517678,25318.666667,4.290000


In [166]:
train_preds_val = lgb.predict(X_val)
train_preds_val.mean()

1.0

In [167]:
train_preds_proba = lgb.predict_proba(X_val)
train_preds_proba

array([[1.63485829e-04, 9.99836514e-01],
       [1.59610480e-04, 9.99840390e-01],
       [1.81278291e-04, 9.99818722e-01],
       ...,
       [1.59264035e-04, 9.99840736e-01],
       [1.59264035e-04, 9.99840736e-01],
       [9.87473668e-05, 9.99901253e-01]])

In [168]:
predict_train = create_matrix_predict_LGB(X_val, train_preds_val, train_preds_proba)
predict_train

Unnamed: 0,user_id,item_id
0,1,"[1037863, 1004906, 986947, 1062002, 962568, 96..."
1,2,"[12302069, 1049788, 15687014, 5568228, 940947,..."
2,3,"[827683, 1080155, 13213518, 15452812, 908531, ..."
3,6,"[849843, 840361, 994928, 995242, 904521, 10632..."
4,7,"[5592694, 12695481, 15741535, 14106445, 859987..."
...,...,...
1880,2496,"[827271, 824796, 933835, 979707, 909268, 11282..."
1881,2497,[6534178]
1882,2498,"[1030455, 920109, 1053690, 1076875, 963719, 10..."
1883,2499,"[1131351, 882308, 899624, 895327, 880150, 9487..."


In [169]:
result_val_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [122]:
#result_train_lvl_2 = result_train_lvl_2.drop(['predict_train_lvl_2_ext_feat', 'item_id'], axis=1)

In [170]:
#result_train_lvl_2 = data_train_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_val_lvl_2 = result_val_lvl_2.merge(predict_train, on=['user_id'], how='left')
result_val_lvl_2.columns=['user_id', 'actual', 'predict_test_ext_feat']
result_val_lvl_2

Unnamed: 0,user_id,actual,predict_test_ext_feat
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[1037863, 1004906, 986947, 1062002, 962568, 96..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[12302069, 1049788, 15687014, 5568228, 940947,..."
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[827683, 1080155, 13213518, 15452812, 908531, ..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[849843, 840361, 994928, 995242, 904521, 10632..."
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[5592694, 12695481, 15741535, 14106445, 859987..."
...,...,...,...
1880,2496,"[829291, 862139, 912704, 933067, 933835, 95537...","[827271, 824796, 933835, 979707, 909268, 11282..."
1881,2497,[6534178],[6534178]
1882,2498,"[1053690, 1076875, 12386123, 858303, 920109, 1...","[1030455, 920109, 1053690, 1076875, 963719, 10..."
1883,2499,"[826249, 895327, 9858944, 820321, 829291, 8323...","[1131351, 882308, 899624, 895327, 880150, 9487..."


In [171]:
result_val_lvl_2.apply(lambda row: recall_at_k(row['predict_test_ext_feat'], row['actual']), axis=1).mean()

0.2930372020428399

In [172]:
result_val_lvl_2.apply(lambda row: precision_at_k(row['predict_test_ext_feat'], row['actual']), axis=1).mean()

1.0

In [174]:
result = result_val_lvl_2.drop('actual', axis=1)
result

Unnamed: 0,user_id,predict_test_ext_feat
0,1,"[1037863, 1004906, 986947, 1062002, 962568, 96..."
1,2,"[12302069, 1049788, 15687014, 5568228, 940947,..."
2,3,"[827683, 1080155, 13213518, 15452812, 908531, ..."
3,6,"[849843, 840361, 994928, 995242, 904521, 10632..."
4,7,"[5592694, 12695481, 15741535, 14106445, 859987..."
...,...,...
1880,2496,"[827271, 824796, 933835, 979707, 909268, 11282..."
1881,2497,[6534178]
1882,2498,"[1030455, 920109, 1053690, 1076875, 963719, 10..."
1883,2499,"[1131351, 882308, 899624, 895327, 880150, 9487..."


In [180]:
result.to_csv('recommendations.csv', index=False)