# Итоговый проект по рекомендательным системам


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня

import lightgbm as lgbm, catboost as catb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k, ndcg_at_k, ap_at_k
from src.utils import prefilter_items

In [2]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, weighting=True):

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # Можно пробовать другие варианты
                                          aggfunc='count',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[1][0]  # И берем второй (не товар из аргумента метода)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=self.userid_to_id[user],
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)]

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        try:
            self._update_dict(user_id=user)
            return self._get_recommendations(user, model=self.model, N=N)
        except:
            return self._extend_with_top_popular([], N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        try:
            self._update_dict(user_id=user)
            return self._get_recommendations(user, model=self.own_recommender, N=N)
        except:
            return self._extend_with_top_popular([], N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        try:
            top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

            res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
            res = self._extend_with_top_popular(res, N=N)

            assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
            return res
        except:
            return self._extend_with_top_popular([], N=N)

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        try:
            res = []

            # Находим топ-N похожих пользователей
            similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
            similar_users = [rec[0] for rec in similar_users]
            similar_users = similar_users[1:]   # удалим юзера из запроса

            for user in similar_users:
                res.extend(self.get_own_recommendations(user, N=1))

            res = self._extend_with_top_popular(res, N=N)

            assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
            return res
        except:
            return self._extend_with_top_popular([], N=N)

In [3]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

val_size_weeks = 3

data_train = data[data['week_no'] < (data['week_no'].max() - val_size_weeks)]

data_val = data[data['week_no'] >= (data['week_no'].max() - val_size_weeks)]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


## Проверим, появились ли новые пользователи, по которым отсутствуют данные ранее тестовых 6 недель

In [5]:
u_train_1 = data_train['user_id'].unique()
u_test_1 = data_val['user_id'].unique()

In [6]:
u_train_1 = pd.Series(u_train_1)
u_test_1 = pd.Series(u_test_1)

In [7]:
u_test_1.isin(u_train_1).value_counts()

True     2040
False       2
dtype: int64

Два пользователя появились только в валидационных данных (последние 3 недели), по ним будем рекомендовать популярные товары

In [8]:
recommender = MainRecommender(data_train)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [9]:
result = data_val.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [10]:
result['als_50'] = result['user_id'].apply(lambda row: recommender.get_als_recommendations(row, N=50))

In [11]:
result.head(2)

Unnamed: 0,user_id,actual,als_50
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1003649, 1037332, 920200, 1062572, 1131488, 8..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1044078, 844179, 1029743, 8090521, 1106523, 8..."


In [12]:
result.apply(lambda row: precision_at_k(row['als_50'], row['actual'], k = 50), axis=1).mean()

0.04663075416258582

In [13]:
result['own_50'] = result['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=50))

In [14]:
result.apply(lambda row: precision_at_k(row['own_50'], row['actual'], k = 50), axis=1).mean()

0.06179236043095013

In [15]:
result['similar_items_50'] = result['user_id'].apply(lambda row: recommender.get_similar_items_recommendation(row, N=50))

In [16]:
result.apply(lambda row: precision_at_k(row['similar_items_50'], row['actual'], k = 50), axis=1).mean()

0.0351518119490697

In [17]:
# Измерим precision_at_k k=5 на рекомендациях own - это базовый вариант

In [18]:
#precision_at_k k=5 train
result['own_5'] = result['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=5))
result.apply(lambda row: precision_at_k(row['own_5'], row['actual'], k = 5), axis=1).mean()

0.15602350636630624

In [None]:
#ndcg_at_5 valid
result.apply(lambda row: ndcg_at_k(row['own_5'], row['actual'], k = 5), axis=1).mean()

In [None]:
#map_at_k valid
result.apply(lambda row: ap_at_k(row['own_5'], row['actual'], k = 5), axis=1).mean()

In [None]:
result.head(2)

In [None]:
def mix(a, b):
    return list(set(a) & set(b))

In [None]:
result['mix1'] = result.apply(lambda row: mix(row['als_50'], row['similar_items_50']), axis=1)

In [None]:
result['mix1']

In [None]:
result['mix2'] = result.apply(lambda row: mix(row['als_50'], row['own_50']), axis=1)

In [None]:
result['mix2']

In [None]:
result['mix3'] = result.apply(lambda row: mix(row['own_50'], row['similar_items_50']), axis=1)

In [None]:
result['mix3']

In [None]:
result['mix'] = result.apply(lambda row: mix(row['mix1'], row['mix2']), axis=1)

In [None]:
result['mix'] = result.apply(lambda row: mix(row['mix'], row['mix3']), axis=1)

In [None]:
result['mix']

In [None]:
result['mix'] = result['mix'] + result['mix2'] + result['mix3'] + result['mix1'] + result['own_50']

In [None]:
result['mix'] = result['mix'].apply(lambda row: list(set(row))[:5])

In [None]:
result['mix']

In [None]:
result.apply(lambda row: precision_at_k(row['mix'], row['actual'], k = 5), axis=1).mean()

In [35]:
df_items = candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [36]:
df_items

0         856942
0        9297615
0        5577022
0         877391
0        9655212
          ...   
2153      890909
2153      874837
2153     1137507
2153     8292967
2153    12782182
Name: item_id, Length: 107700, dtype: int64

In [37]:
candidates = candidates.drop('candidates', axis=1).join(df_items)

In [38]:
candidates.head()

Unnamed: 0,user_id,item_id
0,1,856942
0,1,9297615
0,1,5577022
0,1,877391
0,1,9655212


In [39]:
candidates.shape

(107700, 2)

In [40]:
data_train_lvl_2 = data[data['week_no'] < (data['week_no'].max() - (val_lvl_2_size_weeks))]
data_train_lvl_2['target'] = 1
data_train_lvl_2 = data_train_lvl_2.groupby(['user_id', 'item_id', 'target'])['quantity'].count().reset_index()
data_train_lvl_2 = data_train_lvl_2[['user_id', 'item_id', 'target']]
data_train_lvl_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train_lvl_2['target'] = 1


Unnamed: 0,user_id,item_id,target
0,1,819312,1
1,1,820165,1
2,1,821815,1
3,1,823721,1
4,1,823990,1
...,...,...,...
1260013,2500,15797145,1
1260014,2500,15830972,1
1260015,2500,16059046,1
1260016,2500,16223404,1


In [41]:
candidates = candidates.merge(item_features, on='item_id', how='left')

In [42]:
candidates = candidates.merge(user_features, on='user_id', how='left')

In [43]:
candidates.shape

(107700, 15)

In [44]:
data_train_lvl_2 = candidates.merge(data_train_lvl_2, on=['user_id', 'item_id'], how='left')
data_train_lvl_2['target'].fillna(0, inplace= True)

data_train_lvl_2

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107695,2500,890909,69,GROCERY,Private,CHEESE,SHREDDED CHEESE,24 OZ,,,,,,,,1.0
107696,2500,874837,194,GROCERY,National,COLD CEREAL,ADULT CEREAL,16 OZ,,,,,,,,1.0
107697,2500,1137507,531,GROCERY,National,CANNED JUICES,BLENDED JUICE&COMBINATIONS (OV,64 OZ,,,,,,,,1.0
107698,2500,8292967,2143,GROCERY,National,DRY NOODLES/PASTA,MACARONI DRY,8 OZ,,,,,,,,1.0


In [45]:
data_train_lvl_2['target'].unique()

array([1., 0.])

In [46]:
data_train_lvl_2['target'].value_counts()

1.0    88322
0.0    19378
Name: target, dtype: int64

In [47]:
data_train_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107700 entries, 0 to 107699
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               107700 non-null  int64  
 1   item_id               107700 non-null  int64  
 2   manufacturer          107700 non-null  int64  
 3   department            107700 non-null  object 
 4   brand                 107700 non-null  object 
 5   commodity_desc        107700 non-null  object 
 6   sub_commodity_desc    107700 non-null  object 
 7   curr_size_of_product  107700 non-null  object 
 8   age_desc              39550 non-null   object 
 9   marital_status_code   39550 non-null   object 
 10  income_desc           39550 non-null   object 
 11  homeowner_desc        39550 non-null   object 
 12  hh_comp_desc          39550 non-null   object 
 13  household_size_desc   39550 non-null   object 
 14  kid_category_desc     39550 non-null   object 
 15  

In [48]:
from scipy.stats import mode

Заполним отсутствующие значения по пользователям, по которым отсутствуют user_features: категориальные заполним модой, числовые медианой

In [49]:
columns_for_users_fillna = ['age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']

In [50]:
for column in columns_for_users_fillna:
    data_train_lvl_2[column].fillna(data_train_lvl_2[column].mode()[0], inplace= True)

In [51]:
data_train_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107700 entries, 0 to 107699
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               107700 non-null  int64  
 1   item_id               107700 non-null  int64  
 2   manufacturer          107700 non-null  int64  
 3   department            107700 non-null  object 
 4   brand                 107700 non-null  object 
 5   commodity_desc        107700 non-null  object 
 6   sub_commodity_desc    107700 non-null  object 
 7   curr_size_of_product  107700 non-null  object 
 8   age_desc              107700 non-null  object 
 9   marital_status_code   107700 non-null  object 
 10  income_desc           107700 non-null  object 
 11  homeowner_desc        107700 non-null  object 
 12  hh_comp_desc          107700 non-null  object 
 13  household_size_desc   107700 non-null  object 
 14  kid_category_desc     107700 non-null  object 
 15  

In [52]:
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0


Фичи user_id: - Средний чек - Средняя сумма покупки 1 товара в каждой категории - Кол-во покупок в каждой категории - Частотность покупок раз/месяц - Долю покупок в выходные - Долю покупок утром/днем/вечером

Фичи item_id: - Кол-во покупок в неделю - Среднее ол-во покупок 1 товара в категории в неделю - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю) - Цена (Можно посчитать из retil_train.csv) - Цена / Средняя цена товара в категории

Фичи пары user_id - item_id - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id) - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю) - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

## Добавим дополнительные признаки

In [53]:
data_train_train_lvl_2_extra_features = data[data['week_no'] < (data['week_no'].max() - (val_lvl_2_size_weeks))]
data_train_train_lvl_2_extra_features.shape

(2278490, 12)

In [54]:
data_train_train_lvl_2_extra_features = data_train_train_lvl_2_extra_features.merge(item_features, on='item_id', how='left')
data_train_train_lvl_2_extra_features.shape

(2278490, 18)

In [55]:
data_train_train_lvl_2_extra_features = data_train_train_lvl_2_extra_features.merge(user_features, on='user_id', how='left')
data_train_train_lvl_2_extra_features.shape

(2278490, 25)

In [56]:
data_train_train_lvl_2_extra_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2278490 entries, 0 to 2278489
Data columns (total 25 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               int64  
 1   basket_id             int64  
 2   day                   int64  
 3   item_id               int64  
 4   quantity              int64  
 5   sales_value           float64
 6   store_id              int64  
 7   retail_disc           float64
 8   trans_time            int64  
 9   week_no               int64  
 10  coupon_disc           float64
 11  coupon_match_disc     float64
 12  manufacturer          int64  
 13  department            object 
 14  brand                 object 
 15  commodity_desc        object 
 16  sub_commodity_desc    object 
 17  curr_size_of_product  object 
 18  age_desc              object 
 19  marital_status_code   object 
 20  income_desc           object 
 21  homeowner_desc        object 
 22  hh_comp_desc          object 
 23  househo

### extra_feature 1

In [57]:
user_item_quantity = data_train_train_lvl_2_extra_features.groupby(['user_id', 'item_id'])['quantity'].sum().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(user_item_quantity, on=['user_id', 'item_id'], how='left')
data_train_lvl_2.rename(columns={'quantity': 'user_item_quantity'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,53.0
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,21.0
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,20.0
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,13.0
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,19.0


### extra_feature 2

In [58]:
user_item_quantity_a_week = data_train_train_lvl_2_extra_features.groupby(['user_id', 'item_id', 'week_no'])['quantity'].sum().reset_index()
mean_user_item_quantity_a_week = user_item_quantity_a_week.groupby(['user_id', 'item_id'])['quantity'].mean().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(mean_user_item_quantity_a_week, on=['user_id', 'item_id'], how='left')
data_train_lvl_2.rename(columns={'quantity': 'mean_user_item_quantity_a_week'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,53.0,1.06
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,21.0,1.05
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,20.0,1.0
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,13.0,1.0
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,19.0,1.117647


### extra_feature 3

In [59]:
user_mean_sales_value = data_train_train_lvl_2_extra_features.groupby(['user_id'])['sales_value'].mean().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(user_mean_sales_value, on=['user_id'], how='left')
data_train_lvl_2.rename(columns={'sales_value': 'user_mean_sales_value'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,53.0,1.06,2.494884
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,21.0,1.05,2.494884
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,20.0,1.0,2.494884
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,13.0,1.0,2.494884
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,19.0,1.117647,2.494884


### extra_feature 4

In [60]:
user_mean_sales_value_com_desc = data_train_train_lvl_2_extra_features.groupby(['user_id', 'commodity_desc'])['sales_value'].mean().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(user_mean_sales_value_com_desc, on=['user_id', 'commodity_desc'], how='left')
data_train_lvl_2.rename(columns={'sales_value': 'user_mean_sales_value_com_desc'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,53.0,1.06,2.494884,2.478421
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,21.0,1.05,2.494884,2.893721
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,20.0,1.0,2.494884,2.893721
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,13.0,1.0,2.494884,2.608235
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,19.0,1.117647,2.494884,3.0695


### extra_feature 5

In [61]:
user_quantity_com_desc = data_train_train_lvl_2_extra_features.groupby(['user_id', 'commodity_desc'])['quantity'].sum().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(user_quantity_com_desc, on=['user_id', 'commodity_desc'], how='left')
data_train_lvl_2.rename(columns={'quantity': 'user_quantity_com_desc'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,53.0,1.06,2.494884,2.478421,117.0
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,21.0,1.05,2.494884,2.893721,44.0
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,20.0,1.0,2.494884,2.893721,44.0
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,13.0,1.0,2.494884,2.608235,17.0
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,...,Homeowner,2 Adults No Kids,2,None/Unknown,1.0,19.0,1.117647,2.494884,3.0695,22.0


### extra_feature 6

In [62]:
user_quantity_a_week_com_desc = user_count_sales_value_com_desc = data_train_train_lvl_2_extra_features.groupby(['user_id', 'week_no', 'commodity_desc'])['quantity'].sum().reset_index()
user_mean_quantity_a_week_com_desc = user_quantity_a_week_com_desc.groupby(['user_id', 'commodity_desc'])['quantity'].mean().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(user_mean_quantity_a_week_com_desc, on=['user_id', 'commodity_desc'], how='left')
data_train_lvl_2.rename(columns={'quantity': 'user_mean_quantity_a_week_com_desc'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,2 Adults No Kids,2,None/Unknown,1.0,53.0,1.06,2.494884,2.478421,117.0,2.166667
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,2 Adults No Kids,2,None/Unknown,1.0,21.0,1.05,2.494884,2.893721,44.0,1.073171
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,2 Adults No Kids,2,None/Unknown,1.0,20.0,1.0,2.494884,2.893721,44.0,1.073171
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,...,2 Adults No Kids,2,None/Unknown,1.0,13.0,1.0,2.494884,2.608235,17.0,1.214286
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,...,2 Adults No Kids,2,None/Unknown,1.0,19.0,1.117647,2.494884,3.0695,22.0,1.222222


### extra_feature 7

In [63]:
purchases_a_week_com_desc = data_train_train_lvl_2_extra_features.groupby(['week_no', 'commodity_desc'])['quantity'].sum().reset_index()
mean_purchases_a_week_com_desc = purchases_a_week_com_desc.groupby(['commodity_desc'])['quantity'].mean().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(mean_purchases_a_week_com_desc, on=['commodity_desc'], how='left')
data_train_lvl_2.rename(columns={'quantity': 'mean_purchases_a_week_com_desc'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc,mean_purchases_a_week_com_desc
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,2,None/Unknown,1.0,53.0,1.06,2.494884,2.478421,117.0,2.166667,967.483516
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,2,None/Unknown,1.0,21.0,1.05,2.494884,2.893721,44.0,1.073171,372.131868
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,2,None/Unknown,1.0,20.0,1.0,2.494884,2.893721,44.0,1.073171,372.131868
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,...,2,None/Unknown,1.0,13.0,1.0,2.494884,2.608235,17.0,1.214286,16.932584
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,...,2,None/Unknown,1.0,19.0,1.117647,2.494884,3.0695,22.0,1.222222,119.340659


### extra_feature 8

In [64]:
item_quantity_a_week = data_train_train_lvl_2_extra_features.groupby(['week_no', 'item_id'])['quantity'].sum().reset_index()
mean_item_quantity_a_week = item_quantity_a_week.groupby(['item_id'])['quantity'].mean().reset_index()

data_train_lvl_2 = data_train_lvl_2.merge(mean_item_quantity_a_week, on=['item_id'], how='left')
data_train_lvl_2.rename(columns={'quantity': 'mean_item_quantity_a_week'}, inplace=True)
data_train_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc,mean_purchases_a_week_com_desc,mean_item_quantity_a_week
0,1,856942,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,...,None/Unknown,1.0,53.0,1.06,2.494884,2.478421,117.0,2.166667,967.483516,3.698795
1,1,9297615,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,None/Unknown,1.0,21.0,1.05,2.494884,2.893721,44.0,1.073171,372.131868,1.984127
2,1,5577022,1194,GROCERY,National,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,65+,A,...,None/Unknown,1.0,20.0,1.0,2.494884,2.893721,44.0,1.073171,372.131868,2.453333
3,1,877391,584,NUTRITION,National,RICE CAKES,RICE CAKES,6.57 OZ,65+,A,...,None/Unknown,1.0,13.0,1.0,2.494884,2.608235,17.0,1.214286,16.932584,1.190476
4,1,9655212,1011,GROCERY,National,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,65+,A,...,None/Unknown,1.0,19.0,1.117647,2.494884,3.0695,22.0,1.222222,119.340659,2.573529


In [65]:
data_train_lvl_2['manufacturer'] = data_train_lvl_2['manufacturer'].astype('object')

In [66]:
data_train_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107700 entries, 0 to 107699
Data columns (total 24 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   user_id                             107700 non-null  int64  
 1   item_id                             107700 non-null  int64  
 2   manufacturer                        107700 non-null  object 
 3   department                          107700 non-null  object 
 4   brand                               107700 non-null  object 
 5   commodity_desc                      107700 non-null  object 
 6   sub_commodity_desc                  107700 non-null  object 
 7   curr_size_of_product                107700 non-null  object 
 8   age_desc                            107700 non-null  object 
 9   marital_status_code                 107700 non-null  object 
 10  income_desc                         107700 non-null  object 
 11  homeowner_desc            

In [67]:
data_train_lvl_2[data_train_lvl_2['user_item_quantity'].isna()== True]

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc,mean_purchases_a_week_com_desc,mean_item_quantity_a_week
22,1,948640,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,65+,A,...,None/Unknown,0.0,,,2.494884,2.662727,11.0,1.222222,116.230769,1.289474
29,1,7152319,2627,GROCERY,National,DOMESTIC WINE,POPULAR 750ML WINES,4/187 ML,65+,A,...,None/Unknown,0.0,,,2.494884,,,,64.505495,2.095238
36,1,979674,111,DRUG GM,National,CIGARETTES,CIGARETTES,CTN,65+,A,...,None/Unknown,0.0,,,2.494884,3.920000,4.0,2.000000,165.153846,1.113636
60,2,896666,1046,GROCERY,National,COFFEE,INSTANT COFFEE FLAVORED W/SWEE,8 OZ,45-54,U,...,None/Unknown,0.0,,,2.783893,3.590000,1.0,1.000000,146.890110,1.533333
65,2,843418,1230,MEAT-PCKGD,National,DINNER SAUSAGE,SMOKED/COOKED,14 OZ,45-54,U,...,None/Unknown,0.0,,,2.783893,,,,158.219780,1.534884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107625,2499,1138596,111,DRUG GM,National,CIGARETTES,CIGARETTES,523670 CTN,25-34,U,...,1,0.0,,,2.972163,0.000000,0.0,0.000000,165.153846,1.250000
107642,2499,7443137,233,GROCERY,National,CAT FOOD,CAT TREATS,4 OZ,25-34,U,...,1,0.0,,,2.972163,2.640000,4.0,1.000000,295.945055,1.529412
107659,2500,1057168,697,GROCERY,National,BEERS/ALES,BEERALEMALT LIQUORS,12 OZ,45-54,U,...,None/Unknown,0.0,,,3.688531,17.774706,23.0,1.916667,192.626374,2.486486
107671,2500,941515,111,DRUG GM,National,CIGARETTES,CIGARETTES,972414 PK,45-54,U,...,None/Unknown,0.0,,,3.688531,,,,165.153846,1.657895


In [68]:
extra_features_na_list = ['user_item_quantity', 'mean_user_item_quantity_a_week', 'user_mean_sales_value_com_desc', 'user_quantity_com_desc', 'user_mean_quantity_a_week_com_desc']

for el in extra_features_na_list:
    data_train_lvl_2[el].fillna(0, inplace= True)

In [69]:
data_train_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107700 entries, 0 to 107699
Data columns (total 24 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   user_id                             107700 non-null  int64  
 1   item_id                             107700 non-null  int64  
 2   manufacturer                        107700 non-null  object 
 3   department                          107700 non-null  object 
 4   brand                               107700 non-null  object 
 5   commodity_desc                      107700 non-null  object 
 6   sub_commodity_desc                  107700 non-null  object 
 7   curr_size_of_product                107700 non-null  object 
 8   age_desc                            107700 non-null  object 
 9   marital_status_code                 107700 non-null  object 
 10  income_desc                         107700 non-null  object 
 11  homeowner_desc            

In [123]:
cat_features = ['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [71]:
def one_hot(cat_features, df):
    for cat_colname in cat_features:
        df = pd.concat([df, pd.get_dummies(df[cat_colname], prefix=cat_colname)], axis=1)
    return df    

In [125]:
data_train_lvl_2 = one_hot(cat_features, data_train_lvl_2)

  uniques = Index(uniques)


In [73]:
drop_features = ['user_id', 'item_id', 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [74]:
df_train = data_train_lvl_2.drop(drop_features, axis = 1)

In [77]:
df_train.shape

(107700, 2649)

In [78]:
df_train.columns

Index(['target', 'user_item_quantity', 'mean_user_item_quantity_a_week',
       'user_mean_sales_value', 'user_mean_sales_value_com_desc',
       'user_quantity_com_desc', 'user_mean_quantity_a_week_com_desc',
       'mean_purchases_a_week_com_desc', 'mean_item_quantity_a_week',
       'manufacturer_2',
       ...
       'hh_comp_desc_Unknown', 'household_size_desc_1',
       'household_size_desc_2', 'household_size_desc_3',
       'household_size_desc_4', 'household_size_desc_5+',
       'kid_category_desc_1', 'kid_category_desc_2', 'kid_category_desc_3+',
       'kid_category_desc_None/Unknown'],
      dtype='object', length=2649)

In [79]:
X_train = df_train.drop('target', axis=1)
y_train = df_train[['target']]

In [80]:
y_train.value_counts()

target
1.0       88322
0.0       19378
dtype: int64

In [150]:
# XGBclf = XGBClassifier(random_state=21, n_estimators=500, max_depth=3, eta=0.01, gamma = 0.1, alpha = 5)

# XGBclf.fit(X_train, y_train)

In [154]:
y_train_pred = XGBclf.predict(X_train)

y_train_pred_proba = XGBclf.predict_proba(X_train)[:,1]

In [155]:
# cat_model = catb.CatBoostClassifier(random_state=21, silent = True)

# cat_model.fit(X_train, y_train)

In [156]:
# y_train_pred = cat_model.predict(X_train)
# y_train_pred_proba = cat_model.predict_proba(X_train)[:, 1]
# y_train_pred_proba

In [157]:
# lgbm_model = lgbm.LGBMClassifier(random_state=21)

# lgbm_model.fit(X_train, y_train)

In [158]:
# y_train_pred = lgbm_model.predict(X_train)
# y_train_pred_proba = lgbm_model.predict_proba(X_train)[:, 1]
# y_train_pred_proba

In [159]:
classification_report(y_train, y_train_pred)

'              precision    recall  f1-score   support\n\n         0.0       1.00      1.00      1.00     19378\n         1.0       1.00      1.00      1.00     88322\n\n    accuracy                           1.00    107700\n   macro avg       1.00      1.00      1.00    107700\nweighted avg       1.00      1.00      1.00    107700\n'

In [160]:
def rerank(user_id, df):
    result = df[df['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()
    if result == []:
        result = recommender.get_own_recommendations(user = user_id, N=5)
    return result

In [161]:
df_train = data_train_lvl_2[['user_id', 'item_id']]

In [162]:
df_train

Unnamed: 0,user_id,item_id
0,1,856942
1,1,9297615
2,1,5577022
3,1,877391
4,1,9655212
...,...,...
107695,2500,890909
107696,2500,874837
107697,2500,1137507
107698,2500,8292967


In [163]:
y_train_pred_proba

array([0.9965754, 0.9965754, 0.9965754, ..., 0.9965754, 0.9965754,
       0.9965754], dtype=float32)

In [164]:
df_train['proba_item_purchase'] = y_train_pred_proba

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['proba_item_purchase'] = y_train_pred_proba


In [165]:
df_train

Unnamed: 0,user_id,item_id,proba_item_purchase
0,1,856942,0.996575
1,1,9297615,0.996575
2,1,5577022,0.996575
3,1,877391,0.996575
4,1,9655212,0.996575
...,...,...,...
107695,2500,890909,0.996575
107696,2500,874837,0.996575
107697,2500,1137507,0.996575
107698,2500,8292967,0.996575


In [166]:
data_val_lvl_2[data_val_lvl_2['user_id'] == 1]

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2292853,1,41338185846,639,821867,1,0.69,436,0.00,1230,92,0.0,0.0
2292854,1,41338185846,639,834484,1,0.50,436,0.00,1230,92,0.0,0.0
2292855,1,41338185846,639,856942,1,2.99,436,0.00,1230,92,0.0,0.0
2292856,1,41338185846,639,865456,1,3.99,436,0.00,1230,92,0.0,0.0
2292857,1,41338185846,639,889248,1,1.09,436,0.00,1230,92,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2381206,1,41613756353,660,9677939,1,4.09,436,0.00,1602,95,0.0,0.0
2381207,1,41613756353,660,10356149,1,4.69,436,0.00,1602,95,0.0,0.0
2381208,1,41613756353,660,13417048,1,1.50,436,-0.39,1602,95,0.0,0.0
2381209,1,41613756353,660,15741823,1,2.99,436,0.00,1602,95,0.0,0.0


In [190]:
result_val_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_val_lvl_2.columns=['user_id', 'actual']

In [191]:
result_val_lvl_2['candidates_2_lvl'] = result_val_lvl_2['user_id'].apply(lambda user_id: rerank(user_id, df_train))

In [192]:
result_val_lvl_2

Unnamed: 0,user_id,actual,candidates_2_lvl
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 849066, 1087411, 1044487, 911454]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1092937, 1008714, 12132312, 1075979, 998206]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13003092, 13115375, 1126786, 1123254, 819308]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[851066, 1091793, 10456164, 993838, 894360]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[12808385, 10282046, 1044500, 878636, 834435]"
...,...,...,...
2037,2496,[6534178],"[872826, 1067954, 1069003, 5580577, 1056267]"
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[870515, 833151, 1052294, 1031316, 1124971]"
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[1022066, 1123045, 933354, 1093268, 991886]"
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[7168055, 1128395, 924967, 1015280, 1038214]"


In [193]:
rerank(7, df_train)

[851066, 1091793, 10456164, 993838, 894360]

In [194]:
df_train[df_train['user_id'] == 7].sort_values(by = ['proba_item_purchase'], ascending=False)

Unnamed: 0,user_id,item_id,proba_item_purchase
249,7,851066,0.996575
221,7,1091793,0.996575
223,7,10456164,0.996575
224,7,993838,0.996575
201,7,894360,0.996575
226,7,983316,0.996575
227,7,9419422,0.996575
228,7,930918,0.996575
243,7,9803591,0.996575
230,7,966546,0.996575


In [195]:
#Уберем новых пользователей, их будем учитывать отдельно

In [196]:
result_val_lvl_2['warm'] = pd.Series(result_val_lvl_2['user_id'].unique()).isin(data_train_lvl_2['user_id'].unique())

In [197]:
result_val_lvl_2 = result_val_lvl_2[result_val_lvl_2['warm'] == True]

In [198]:
result_val_lvl_2

Unnamed: 0,user_id,actual,candidates_2_lvl,warm
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 849066, 1087411, 1044487, 911454]",True
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13003092, 13115375, 1126786, 1123254, 819308]",True
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[851066, 1091793, 10456164, 993838, 894360]",True
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[12808385, 10282046, 1044500, 878636, 834435]",True
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8...","[872146, 1070497, 5568197, 882190, 1120559]",True
...,...,...,...,...
2037,2496,[6534178],"[872826, 1067954, 1069003, 5580577, 1056267]",True
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[870515, 833151, 1052294, 1031316, 1124971]",True
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[1022066, 1123045, 933354, 1093268, 991886]",True
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[7168055, 1128395, 924967, 1015280, 1038214]",True


126 пользователей не присутствовали в тренировочном датасете, им рекомендации не делали

### Сделаем предсказания и рассчитаем precision_at_k k=5 для применения одноуровневой модели OWN на валидационных данных для сравнения результатов работы моделей

In [199]:
result_val_lvl_2['candidates_own'] = result_val_lvl_2['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_val_lvl_2['candidates_own'] = result_val_lvl_2['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=5))


In [200]:
result_val_lvl_2.apply(lambda row: precision_at_k(row['candidates_own'], row['actual'], k = 5), axis=1).mean()

0.1506263048016691

In [201]:
result_val_lvl_2.apply(lambda row: ndcg_at_k(row['candidates_own'], row['actual'], k = 5), axis=1).mean()

0.1671028636156502

In [202]:
#precision_at_k k=5
result_val_lvl_2.apply(lambda row: precision_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

0.09728601252609557

In [203]:
result_val_lvl_2.apply(lambda row: ndcg_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

0.12195193954484294