# Курсовой проект


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня

import lightgbm as lgbm, catboost as catb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k, ndcg_at_k, ap_at_k
from src.utils import prefilter_items

In [4]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, weighting=True):

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # Можно пробовать другие варианты
                                          aggfunc='count',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=100, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[1][0]  # И берем второй (не товар из аргумента метода)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=self.userid_to_id[user],
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)]

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        try:
            self._update_dict(user_id=user)
            return self._get_recommendations(user, model=self.model, N=N)
        except:
            return self._extend_with_top_popular([], N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        try:
            self._update_dict(user_id=user)
            return self._get_recommendations(user, model=self.own_recommender, N=N)
        except:
            return self._extend_with_top_popular([], N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        try:
            top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

            res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
            res = self._extend_with_top_popular(res, N=N)

            assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
            return res
        except:
            return self._extend_with_top_popular([], N=N)

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        try:
            res = []

            # Находим топ-N похожих пользователей
            similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
            similar_users = [rec[0] for rec in similar_users]
            similar_users = similar_users[1:]   # удалим юзера из запроса

            for user in similar_users:
                res.extend(self.get_own_recommendations(user, N=1))

            res = self._extend_with_top_popular(res, N=N)

            assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
            return res
        except:
            return self._extend_with_top_popular([], N=N)

In [5]:
# отчет по метрикам
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [6]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')
data_test = pd.read_csv('retail_test1.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

user_features['age_desc'].replace(
    {'19-24': 22, '25-34': 30, '35-44': 40, '45-54': 50, '55-64': 60, '65+': 70},
    inplace=True)

user_features['marital_status_code'].replace(
    {'U': 0, 'A': 1, 'B': 2}, inplace=True)

user_features['income_desc'].replace(
    {'Under 15K': 10, '15-24K': 20, '25-34K':30, '35-49K': 40,
     '50-74K': 62, '75-99K': 87, '100-124K': 112, '125-149K': 137, 
     '150-174K': 162, '175-199K': 187, '200-249K': 225, '250K+':275}, inplace=True)

user_features['homeowner_desc'].replace(
    {'Unknown': 0, 'Probable Renter': 1, 'Renter': 2,
     'Probable Owner': 3, 'Homeowner': 4}, inplace=True)

user_features['hh_comp_desc'].replace(
    {'Unknown': 0, 'Single Male': 1, 'Single Female': 2,
     '1 Adult Kids': 3, '2 Adults No Kids': 4, '2 Adults Kids':5},inplace=True)

user_features['household_size_desc'].replace({'5+': 5}, inplace=True) 

user_features['kid_category_desc'].replace(
    {'None/Unknown': 0, '3+': 3}, inplace=True)


names = ['manufacturer', 'department', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
for name in names:
    new_name = name + '_freq'
    a = item_features[name].value_counts()
    ind = a.index.tolist()
    for i in ind:
        item_features.loc[item_features[name] == i, new_name] = a[i]

item_features['brand'] = np.where(item_features['brand']=='Private', 0, 1)

commodities = item_features.commodity_desc.value_counts()
commodities_list = commodities.keys().tolist()
for i, name in enumerate(commodities_list):
    item_features.loc[item_features['commodity_desc'] == name, 'commodity_category'] = i

val_size_weeks = 3

data_train = data[data['week_no'] < (data['week_no'].max() - val_size_weeks)]
data_val = data[data['week_no'] >= (data['week_no'].max() - val_size_weeks)]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


## Проверим, появились ли новые пользователи, по которым отсутствуют данные ранее тестовых 3 недель

In [8]:
u_train = data_train['user_id'].unique()
u_test = data_val['user_id'].unique()

In [9]:
u_train = pd.Series(u_train)
u_test = pd.Series(u_test)

In [10]:
u_test.isin(u_train).value_counts()

True     2040
False       2
dtype: int64

Два пользователя появились только в валидационных данных (последние 6  недель), по ним будем рекомендовать популярные товары

In [11]:
recommender = MainRecommender(data_train)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

### Level 1.Подбор кандидатов для классификатора

In [12]:
result = data_val.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


Отберем 50 кандидатов разными способами

In [13]:
# result['als_candidates_50'] = result['user_id'].apply(lambda row: recommender.get_als_recommendations(row, N=50))

In [14]:
# result.apply(lambda row: recall_at_k(row['als_candidates_50'], row['actual'], k = 50), axis=1).mean()

In [15]:
# result['own_candidates_50'] = result['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=50))

In [16]:
# result.apply(lambda row: recall_at_k(row['own_candidates_50'], row['actual'], k = 50), axis=1).mean()

In [17]:
# result['similar_items_candidates_50'] = result['user_id'].apply(lambda row: recommender.get_similar_items_recommendation(row, N=50))

In [18]:
# result.apply(lambda row: recall_at_k(row['similar_items_candidates_50'], row['actual'], k = 50), axis=1).mean()

In [19]:
# result['similar_users_candidates_50'] = result['user_id'].apply(lambda row: recommender.get_similar_users_recommendation(row, N=50))

In [20]:
# result.apply(lambda row: recall_at_k(row['similar_users_candidates_50'], row['actual'], k = 50), axis=1).mean()

Вывод: лучший recall дают als и own recommendtions + top-popular

Проверим модели ALS и own на разном количестве отобранных кандидатов



## OWN

In [21]:
# recall_at_k_list_own = []
# number_of_candidates_list = [20, 50, 100, 200, 500]
# for number in number_of_candidates_list:
#     result['own_candidates'+str(number)] = result['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=number))
#     recall_at_k_list_own.append(result.apply(lambda row: recall_at_k(row['own_candidates'+str(number)], row['actual'], k = number), axis=1).mean())

In [22]:
# recall_at_k_list_own


## ALS

In [23]:
# recall_at_k_list_als = []
# number_of_candidates_list = [20, 50, 100, 200, 500]
# for number in number_of_candidates_list:
#     result['als_candidates'+str(number)] = result['user_id'].apply(lambda row: recommender.get_als_recommendations(row, N=number))
#     recall_at_k_list_als.append(result.apply(lambda row: recall_at_k(row['als_candidates'+str(number)], row['actual'], k = number), axis=1).mean())

In [24]:
# recall_at_k_list_als

In [25]:
# plt.plot(number_of_candidates_list, recall_at_k_list_own, label='own')
# plt.plot(number_of_candidates_list, recall_at_k_list_als, label='als')
# plt.xlabel("N_candidates")
# plt.ylabel("recall_at_k")     
# plt.legend(loc="best")
# plt.grid()
# plt.show()

На графике видно, что лучший результат дает модель own. Ее результаты будем использовать для дальнейшего обучения. 
В обеих моделях чем больше кандидатов мы выбираем первой моделью, тем больше recall на первом этапе. Но очень большой объем выборки дает дополнительную нагрузку на вычислительные мощности, поэтому нужно ограничить выборку. Возьмем 300 кандидатов own

In [26]:
result['own_candidates'] = result['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=300))

In [27]:
result = result[['user_id', 'actual', 'own_candidates']]
result.rename(columns={'own_candidates': 'candidates'}, inplace=True)

In [28]:
result.head()

Unnamed: 0,user_id,actual,candidates
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 9297615, 5577022, 877391, 9655212, 82..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1092937, 1008714, 12132312, 1075979, 998206, ..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13003092, 995598, 972416, 13115971, 923600, 1..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[998519, 7147142, 894360, 9338009, 896666, 939..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[12808385, 981660, 7410201, 939860, 847374, 64..."


In [29]:
# result['als_candidates'] = result['user_id'].apply(lambda row: recommender.get_als_recommendations(row, N=50))

In [30]:
# result['mix_own_als'] = result['candidates']+result['als_candidates']

In [31]:
# def make_mix_list(list):
#     result = []
#     for el in list:
#         if el not in result:
#             result.append(el)
#     return result

In [32]:
# result['mix_own_als'] = result['mix_own_als'].apply(lambda row: make_mix_list(row))

In [33]:
# result.apply(lambda row: recall_at_k(row['mix_own_als'], row['actual'], k = 50), axis=1).mean()

In [34]:
# result.apply(lambda row: recall_at_k(row['als_candidates'], row['actual'], k = 50), axis=1).mean()


In [35]:
# result.apply(lambda row: recall_at_k(row['candidates'], row['actual'], k = 50), axis=1).mean()


## Измерим precision@5 на модели первого уровня

In [36]:
#precision_at_k k=5
result['candidates_own_5'] = result['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=5))
result.apply(lambda row: precision_at_k(row['candidates_own_5'], row['actual'], k = 5), axis=1).mean()

0.15602350636630624

In [37]:
candidates = result[['user_id','candidates']]
candidates

Unnamed: 0,user_id,candidates
0,1,"[856942, 9297615, 5577022, 877391, 9655212, 82..."
1,3,"[1092937, 1008714, 12132312, 1075979, 998206, ..."
2,6,"[13003092, 995598, 972416, 13115971, 923600, 1..."
3,7,"[998519, 7147142, 894360, 9338009, 896666, 939..."
4,8,"[12808385, 981660, 7410201, 939860, 847374, 64..."
...,...,...
2037,2496,"[872826, 983665, 12452939, 991546, 1134296, 74..."
2038,2497,"[870515, 1102207, 1117219, 1010950, 1103513, 1..."
2039,2498,"[1100379, 1022066, 1076580, 931579, 5565356, 9..."
2040,2499,"[7168055, 1128395, 6904613, 5570048, 830202, 8..."


### Level 2. Обучение классификатора

In [38]:
df_items = candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [39]:
df_items

0         856942
0        9297615
0        5577022
0         877391
0        9655212
          ...   
2041      999270
2041      999104
2041      832678
2041      838186
2041    12301100
Name: item_id, Length: 612600, dtype: int64

In [40]:
candidates = candidates.drop('candidates', axis=1).join(df_items)

In [41]:
candidates.head()

Unnamed: 0,user_id,item_id
0,1,856942
0,1,9297615
0,1,5577022
0,1,877391
0,1,9655212


In [42]:
candidates.shape

(612600, 2)

In [43]:
data_train['target'] = 1
data_train = data_train.groupby(['user_id', 'item_id', 'target'])['quantity'].count().reset_index()
data_train = data_train[['user_id', 'item_id', 'target']]
data_train.head(2)

Unnamed: 0,user_id,item_id,target
0,1,823721,1
1,1,823990,1


In [44]:
candidates = candidates.merge(item_features, on='item_id', how='left')

In [45]:
candidates = candidates.merge(user_features, on='user_id', how='left')

In [46]:
candidates.shape

(612600, 21)

In [47]:
data_train = candidates.merge(data_train, on=['user_id', 'item_id'], how='left')
data_train['target'].fillna(0, inplace= True)

data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 612600 entries, 0 to 612599
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   user_id                    612600 non-null  int64  
 1   item_id                    612600 non-null  int64  
 2   manufacturer               612600 non-null  int64  
 3   department                 612600 non-null  object 
 4   brand                      612600 non-null  int64  
 5   commodity_desc             612600 non-null  object 
 6   sub_commodity_desc         612600 non-null  object 
 7   curr_size_of_product       612600 non-null  object 
 8   manufacturer_freq          612600 non-null  float64
 9   department_freq            612600 non-null  float64
 10  commodity_desc_freq        612600 non-null  float64
 11  sub_commodity_desc_freq    612600 non-null  float64
 12  curr_size_of_product_freq  612600 non-null  float64
 13  commodity_category         61

In [48]:
data_train['target'].unique()

array([1., 0.])

In [49]:
data_train['target'].value_counts()

0.0    317243
1.0    295357
Name: target, dtype: int64

In [50]:
from scipy.stats import mode

Заполним отсутствующие значения по пользователям, по которым отсутствуют user_features: категориальные заполним модой, числовые медианой

In [51]:
columns_for_users_fillna = ['age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']

In [52]:
for column in columns_for_users_fillna:
    data_train[column].fillna(data_train[column].mode()[0], inplace= True)

In [53]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 612600 entries, 0 to 612599
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   user_id                    612600 non-null  int64  
 1   item_id                    612600 non-null  int64  
 2   manufacturer               612600 non-null  int64  
 3   department                 612600 non-null  object 
 4   brand                      612600 non-null  int64  
 5   commodity_desc             612600 non-null  object 
 6   sub_commodity_desc         612600 non-null  object 
 7   curr_size_of_product       612600 non-null  object 
 8   manufacturer_freq          612600 non-null  float64
 9   department_freq            612600 non-null  float64
 10  commodity_desc_freq        612600 non-null  float64
 11  sub_commodity_desc_freq    612600 non-null  float64
 12  curr_size_of_product_freq  612600 non-null  float64
 13  commodity_category         61

In [54]:
data_train.head(1)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,curr_size_of_product_freq,commodity_category,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,3924.0,10.0,70.0,1.0,40.0,4.0,4.0,2,0,1.0


## Добавим дополнительные признаки

In [55]:
data_train_lvl_2_extra_features = data[data['week_no'] < (data['week_no'].max() - val_size_weeks)]
data_train_lvl_2_extra_features.shape

(2278490, 12)

In [56]:
data_train_lvl_2_extra_features = data_train_lvl_2_extra_features.merge(item_features, on='item_id', how='left')
data_train_lvl_2_extra_features.shape

(2278490, 24)

In [57]:
data_train_lvl_2_extra_features = data_train_lvl_2_extra_features.merge(user_features, on='user_id', how='left')
data_train_lvl_2_extra_features.shape

(2278490, 31)

In [58]:
data_train_lvl_2_extra_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2278490 entries, 0 to 2278489
Data columns (total 31 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   user_id                    int64  
 1   basket_id                  int64  
 2   day                        int64  
 3   item_id                    int64  
 4   quantity                   int64  
 5   sales_value                float64
 6   store_id                   int64  
 7   retail_disc                float64
 8   trans_time                 int64  
 9   week_no                    int64  
 10  coupon_disc                float64
 11  coupon_match_disc          float64
 12  manufacturer               int64  
 13  department                 object 
 14  brand                      int64  
 15  commodity_desc             object 
 16  sub_commodity_desc         object 
 17  curr_size_of_product       object 
 18  manufacturer_freq          float64
 19  department_freq            float64
 20  co

### extra_feature 1

In [59]:
user_item_quantity = data_train_lvl_2_extra_features.groupby(['user_id', 'item_id'])['quantity'].sum().reset_index()

data_train = data_train.merge(user_item_quantity, on=['user_id', 'item_id'], how='left')
data_train.rename(columns={'quantity': 'user_item_quantity'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,commodity_category,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,10.0,70.0,1.0,40.0,4.0,4.0,2,0,1.0,53.0
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,74.0,70.0,1.0,40.0,4.0,4.0,2,0,1.0,21.0
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,74.0,70.0,1.0,40.0,4.0,4.0,2,0,1.0,20.0
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,236.0,70.0,1.0,40.0,4.0,4.0,2,0,1.0,13.0
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,117.0,70.0,1.0,40.0,4.0,4.0,2,0,1.0,19.0


### extra_feature 2

In [60]:
user_item_quantity_a_week = data_train_lvl_2_extra_features.groupby(['user_id', 'item_id', 'week_no'])['quantity'].sum().reset_index()
mean_user_item_quantity_a_week = user_item_quantity_a_week.groupby(['user_id', 'item_id'])['quantity'].mean().reset_index()

data_train = data_train.merge(mean_user_item_quantity_a_week, on=['user_id', 'item_id'], how='left')
data_train.rename(columns={'quantity': 'mean_user_item_quantity_a_week'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,70.0,1.0,40.0,4.0,4.0,2,0,1.0,53.0,1.06
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,70.0,1.0,40.0,4.0,4.0,2,0,1.0,21.0,1.05
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,70.0,1.0,40.0,4.0,4.0,2,0,1.0,20.0,1.0
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,70.0,1.0,40.0,4.0,4.0,2,0,1.0,13.0,1.0
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,70.0,1.0,40.0,4.0,4.0,2,0,1.0,19.0,1.117647


### extra_feature 3

In [61]:
user_mean_sales_value = data_train_lvl_2_extra_features.groupby(['user_id'])['sales_value'].mean().reset_index()

data_train = data_train.merge(user_mean_sales_value, on=['user_id'], how='left')
data_train.rename(columns={'sales_value': 'user_mean_sales_value'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,1.0,40.0,4.0,4.0,2,0,1.0,53.0,1.06,2.494884
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,1.0,40.0,4.0,4.0,2,0,1.0,21.0,1.05,2.494884
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,1.0,40.0,4.0,4.0,2,0,1.0,20.0,1.0,2.494884
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,1.0,40.0,4.0,4.0,2,0,1.0,13.0,1.0,2.494884
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,1.0,40.0,4.0,4.0,2,0,1.0,19.0,1.117647,2.494884


### extra_feature 4

In [62]:
user_mean_sales_value_com_desc = data_train_lvl_2_extra_features.groupby(['user_id', 'commodity_desc'])['sales_value'].mean().reset_index()

data_train = data_train.merge(user_mean_sales_value_com_desc, on=['user_id', 'commodity_desc'], how='left')
data_train.rename(columns={'sales_value': 'user_mean_sales_value_com_desc'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,40.0,4.0,4.0,2,0,1.0,53.0,1.06,2.494884,2.478421
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,40.0,4.0,4.0,2,0,1.0,21.0,1.05,2.494884,2.893721
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,40.0,4.0,4.0,2,0,1.0,20.0,1.0,2.494884,2.893721
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,40.0,4.0,4.0,2,0,1.0,13.0,1.0,2.494884,2.608235
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,40.0,4.0,4.0,2,0,1.0,19.0,1.117647,2.494884,3.0695


### extra_feature 5

In [63]:
user_quantity_com_desc = data_train_lvl_2_extra_features.groupby(['user_id', 'commodity_desc'])['quantity'].sum().reset_index()

data_train = data_train.merge(user_quantity_com_desc, on=['user_id', 'commodity_desc'], how='left')
data_train.rename(columns={'quantity': 'user_quantity_com_desc'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,4.0,4.0,2,0,1.0,53.0,1.06,2.494884,2.478421,117.0
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,4.0,4.0,2,0,1.0,21.0,1.05,2.494884,2.893721,44.0
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,4.0,4.0,2,0,1.0,20.0,1.0,2.494884,2.893721,44.0
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,4.0,4.0,2,0,1.0,13.0,1.0,2.494884,2.608235,17.0
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,4.0,4.0,2,0,1.0,19.0,1.117647,2.494884,3.0695,22.0


### extra_feature 6

In [64]:
user_quantity_a_week_com_desc = data_train_lvl_2_extra_features.groupby(['user_id', 'week_no', 'commodity_desc'])['quantity'].sum().reset_index()
user_mean_quantity_a_week_com_desc = user_quantity_a_week_com_desc.groupby(['user_id', 'commodity_desc'])['quantity'].mean().reset_index()

data_train = data_train.merge(user_mean_quantity_a_week_com_desc, on=['user_id', 'commodity_desc'], how='left')
data_train.rename(columns={'quantity': 'user_mean_quantity_a_week_com_desc'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,hh_comp_desc,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,4.0,2,0,1.0,53.0,1.06,2.494884,2.478421,117.0,2.166667
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,4.0,2,0,1.0,21.0,1.05,2.494884,2.893721,44.0,1.073171
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,4.0,2,0,1.0,20.0,1.0,2.494884,2.893721,44.0,1.073171
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,4.0,2,0,1.0,13.0,1.0,2.494884,2.608235,17.0,1.214286
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,4.0,2,0,1.0,19.0,1.117647,2.494884,3.0695,22.0,1.222222



### extra_feature 7

In [65]:
purchases_a_week_com_desc = data_train_lvl_2_extra_features.groupby(['week_no', 'commodity_desc'])['quantity'].sum().reset_index()
mean_purchases_a_week_com_desc = purchases_a_week_com_desc.groupby(['commodity_desc'])['quantity'].mean().reset_index()

data_train = data_train.merge(mean_purchases_a_week_com_desc, on=['commodity_desc'], how='left')
data_train.rename(columns={'quantity': 'mean_purchases_a_week_com_desc'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,household_size_desc,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc,mean_purchases_a_week_com_desc
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,2,0,1.0,53.0,1.06,2.494884,2.478421,117.0,2.166667,967.483516
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,2,0,1.0,21.0,1.05,2.494884,2.893721,44.0,1.073171,372.131868
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,2,0,1.0,20.0,1.0,2.494884,2.893721,44.0,1.073171,372.131868
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,2,0,1.0,13.0,1.0,2.494884,2.608235,17.0,1.214286,16.932584
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,2,0,1.0,19.0,1.117647,2.494884,3.0695,22.0,1.222222,119.340659


### extra_feature 8

In [66]:
item_quantity_a_week = data_train_lvl_2_extra_features.groupby(['week_no', 'item_id'])['quantity'].sum().reset_index()
mean_item_quantity_a_week = item_quantity_a_week.groupby(['item_id'])['quantity'].mean().reset_index()

data_train = data_train.merge(mean_item_quantity_a_week, on=['item_id'], how='left')
data_train.rename(columns={'quantity': 'mean_item_quantity_a_week'}, inplace=True)
data_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc,mean_purchases_a_week_com_desc,mean_item_quantity_a_week
0,1,856942,159,GROCERY,1,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,330.0,39021.0,...,0,1.0,53.0,1.06,2.494884,2.478421,117.0,2.166667,967.483516,3.698795
1,1,9297615,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,0,1.0,21.0,1.05,2.494884,2.893721,44.0,1.073171,372.131868,1.984127
2,1,5577022,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64 OZ,130.0,39021.0,...,0,1.0,20.0,1.0,2.494884,2.893721,44.0,1.073171,372.131868,2.453333
3,1,877391,584,NUTRITION,1,RICE CAKES,RICE CAKES,6.57 OZ,308.0,2914.0,...,0,1.0,13.0,1.0,2.494884,2.608235,17.0,1.214286,16.932584,1.190476
4,1,9655212,1011,GROCERY,1,PAPER HOUSEWARES,DESIGNER PAPER: MEDIUM WEIGHT,45 CT,75.0,39021.0,...,0,1.0,19.0,1.117647,2.494884,3.0695,22.0,1.222222,119.340659,2.573529


In [67]:
data_train_lvl_2_extra_features

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,sub_commodity_desc_freq,curr_size_of_product_freq,commodity_category,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,...,67.0,216.0,171.0,,,,,,,
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,...,36.0,82.0,190.0,,,,,,,
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,...,34.0,30607.0,139.0,,,,,,,
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,...,17.0,82.0,215.0,,,,,,,
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,...,23.0,536.0,104.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2278485,222,41297772783,635,1120741,1,0.59,304,0.00,1716,91,...,343.0,236.0,4.0,40.0,1.0,40.0,0.0,4.0,2,0
2278486,462,41297773713,635,993339,1,1.99,304,0.00,2040,91,...,512.0,1011.0,30.0,70.0,0.0,112.0,0.0,1.0,1,0
2278487,462,41297773713,635,995242,1,1.00,304,-0.89,2040,91,...,194.0,30607.0,61.0,70.0,0.0,112.0,0.0,1.0,1,0
2278488,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,...,82.0,67.0,24.0,70.0,0.0,112.0,0.0,1.0,1,0


In [68]:
# item_als_vector = pd.DataFrame(recommender.model.item_factors).reset_index()
# item_als_vector['item_id'] = recommender.itemid_to_id
# item_als_vector['item_id'].unique()
# data_train = data_train.merge(item_als_vector, on=['item_id'], how='left')

In [69]:
cat_features = ['manufacturer', 'department', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'household_size_desc', 'kid_category_desc']

In [70]:
data_train[data_train['user_item_quantity'].isna()== True]

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,...,kid_category_desc,target,user_item_quantity,mean_user_item_quantity_a_week,user_mean_sales_value,user_mean_sales_value_com_desc,user_quantity_com_desc,user_mean_quantity_a_week_com_desc,mean_purchases_a_week_com_desc,mean_item_quantity_a_week
25,1,948640,1213,DRUG GM,1,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,9.0,31529.0,...,0,0.0,,,2.494884,2.662727,11.0,1.222222,116.230769,1.289474
33,1,7152319,2627,GROCERY,1,DOMESTIC WINE,POPULAR 750ML WINES,4/187 ML,71.0,39021.0,...,0,0.0,,,2.494884,,,,64.505495,2.095238
39,1,979674,111,DRUG GM,1,CIGARETTES,CIGARETTES,CTN,131.0,31529.0,...,0,0.0,,,2.494884,3.920000,4.0,2.000000,165.153846,1.113636
50,1,10285454,584,GROCERY,1,HOT CEREAL,INSTANT OATMEAL,12.6 OZ,308.0,39021.0,...,0,0.0,,,2.494884,,,,66.846154,1.750000
66,1,9859017,2366,GROCERY,1,DOMESTIC WINE,PREMIUM 750ML WINES,750 ML,38.0,39021.0,...,0,0.0,,,2.494884,,,,64.505495,2.687500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612595,2500,999270,1425,MEAT-PCKGD,1,BACON,ECONOMY,1 LB,62.0,2427.0,...,0,0.0,,,3.688531,,,,153.945055,14.088608
612596,2500,999104,1194,GROCERY,1,REFRGRATD JUICES/DRNKS,DAIRY CASE 100% PURE JUICE - O,64OZ,130.0,39021.0,...,0,0.0,,,3.688531,7.000000,6.0,3.000000,372.131868,10.956044
612597,2500,832678,586,GROCERY,1,CRACKERS/MISC BKD FD,BUTTER SPRAY CRACKER (RITZ/CLU,16 OZ,200.0,39021.0,...,0,0.0,,,3.688531,2.703684,21.0,1.312500,315.780220,14.522222
612598,2500,838186,1790,GROCERY,1,BAKED SWEET GOODS,SW GDS:DONUTS,18.2 OZ,49.0,39021.0,...,0,0.0,,,3.688531,1.937500,9.0,3.000000,304.285714,8.643678


In [71]:
extra_features_na_list = ['user_item_quantity', 'mean_user_item_quantity_a_week', 'user_mean_sales_value_com_desc', 'user_quantity_com_desc', 'user_mean_quantity_a_week_com_desc']

for el in extra_features_na_list:
    data_train[el].fillna(0, inplace= True)

In [74]:
X_train = data_train.drop('target', axis=1)
y_train = data_train[['target']]

In [75]:
y_train.value_counts()

target
0.0       317243
1.0       295357
dtype: int64

In [76]:
X_train.columns

Index(['user_id', 'item_id', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'manufacturer_freq', 'department_freq', 'commodity_desc_freq',
       'sub_commodity_desc_freq', 'curr_size_of_product_freq',
       'commodity_category', 'age_desc', 'marital_status_code', 'income_desc',
       'homeowner_desc', 'hh_comp_desc', 'household_size_desc',
       'kid_category_desc', 'user_item_quantity',
       'mean_user_item_quantity_a_week', 'user_mean_sales_value',
       'user_mean_sales_value_com_desc', 'user_quantity_com_desc',
       'user_mean_quantity_a_week_com_desc', 'mean_purchases_a_week_com_desc',
       'mean_item_quantity_a_week'],
      dtype='object')

In [77]:
# XGBclf = XGBClassifier(random_state=21, n_estimators=500, max_depth=8, eta=0.01, gamma = 0.1, alpha = 5)

# XGBclf.fit(X_train, y_train)

In [78]:
# y_train_pred = XGBclf.predict(X_train)

# y_train_pred_proba = XGBclf.predict_proba(X_train)[:,1]

In [79]:
# cat_model = catb.CatBoostClassifier(n_estimators = 3000, max_depth = 5, eta = 0.1, silent=True, random_state=21)

# cat_model.fit(X_train, y_train)

In [80]:
# y_train_pred = cat_model.predict(X_train)
# y_train_pred_proba = cat_model.predict_proba(X_train)[:, 1]

In [81]:
X_train[cat_features] = X_train[cat_features].astype('category')

#Обучение классификатора:
# lgbm_model = lgbm.LGBMClassifier(objective='binary',
#                      max_depth=8,
#                      n_estimators=3000,
#                      learning_rate=0.05, random_state = 21,                        
#                      categorical_column=cat_features)


lgbm_model = lgbm.LGBMClassifier(objective='binary',
                    max_depth=10,
                    n_estimators=5000,
                    learning_rate=0.02, 
                    random_state = 21,  
                    boosting_type = 'gbdt',
                    metric = 'auc',
                    num_boost_round = 10000,
                    class_weight ='balanced',
                    n_jobs = 6,     
                    categorical_column=cat_features)


                  
# lgbm_model = lgbm.LGBMClassifier(random_state = 21, categorical_column=cat_features)                


lgbm_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




LGBMClassifier(categorical_column=['manufacturer', 'department',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product',
                                   'household_size_desc', 'kid_category_desc'],
               class_weight='balanced', learning_rate=0.02, max_depth=10,
               metric='auc', n_estimators=5000, n_jobs=6, num_boost_round=10000,
               objective='binary', random_state=21)

In [82]:
y_train_pred = lgbm_model.predict(X_train)
y_train_pred_proba = lgbm_model.predict_proba(X_train)[:, 1]
y_train_pred_proba

array([1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
       2.64813058e-09, 9.19037509e-08, 1.89756506e-07])

In [83]:
# RFclf = RandomForestClassifier(n_estimators = 1000, max_depth = 10, min_samples_leaf = 3, random_state=21)

# RFclf.fit(X_train, y_train)

In [84]:
# y_train_pred = RFclf.predict(X_train)

# y_train_pred_proba = RFclf.predict_proba(X_train)[:,1]

In [85]:
def rerank(user_id, df):
    result = df[df['user_id']==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()
    if result == []:
        result = recommender.get_own_recommendations(user = user_id, N=5)
    return result

In [86]:
df_train = data_train[['user_id', 'item_id']]

In [87]:
df_train['proba_item_purchase'] = y_train_pred_proba

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['proba_item_purchase'] = y_train_pred_proba


In [88]:
df_train

Unnamed: 0,user_id,item_id,proba_item_purchase
0,1,856942,1.000000e+00
1,1,9297615,1.000000e+00
2,1,5577022,1.000000e+00
3,1,877391,1.000000e+00
4,1,9655212,1.000000e+00
...,...,...,...
612595,2500,999270,3.513575e-12
612596,2500,999104,2.281056e-08
612597,2500,832678,2.648131e-09
612598,2500,838186,9.190375e-08


In [89]:
result['candidates_2_lvl'] = result['user_id'].apply(lambda user_id: rerank(user_id, df_train))

In [90]:
result

Unnamed: 0,user_id,actual,candidates,candidates_own_5,candidates_2_lvl
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 9297615, 5577022, 877391, 9655212, 82...","[856942, 9297615, 5577022, 877391, 9655212]","[856942, 888104, 9297615, 10149640, 5577022]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1092937, 1008714, 12132312, 1075979, 998206, ...","[1092937, 1008714, 12132312, 1075979, 998206]","[854405, 854405, 847573, 948953, 1136486]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13003092, 995598, 972416, 13115971, 923600, 1...","[13003092, 995598, 972416, 13115971, 923600]","[1108094, 13003092, 996269, 12384953, 863447]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[998519, 7147142, 894360, 9338009, 896666, 939...","[998519, 7147142, 894360, 9338009, 896666]","[7147142, 9338009, 993838, 12524016, 1075505]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[12808385, 981660, 7410201, 939860, 847374, 64...","[12808385, 981660, 7410201, 939860, 847374]","[10282046, 12172240, 919766, 829722, 7025362]"
...,...,...,...,...,...
2037,2496,[6534178],"[872826, 983665, 12452939, 991546, 1134296, 74...","[872826, 983665, 12452939, 991546, 1134296]","[7441210, 5591939, 12452939, 933913, 5569230]"
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[870515, 1102207, 1117219, 1010950, 1103513, 1...","[870515, 1102207, 1117219, 1010950, 1103513]","[1120361, 1004436, 849202, 1103513, 919766]"
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[1100379, 1022066, 1076580, 931579, 5565356, 9...","[1100379, 1022066, 1076580, 931579, 5565356]","[8119004, 1110031, 933354, 6463949, 10456152]"
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[7168055, 1128395, 6904613, 5570048, 830202, 8...","[7168055, 1128395, 6904613, 5570048, 830202]","[919766, 986021, 882308, 1097458, 6463949]"


Посмотрим, есть ли в валидационных данных пользователи, которых не было среди кандидатов, рекомендованных на первом этапе

In [91]:
result['warm'] = pd.Series(result['user_id'].unique()).isin(data_train['user_id'].unique())

In [92]:
result[result['warm'] == False]

Unnamed: 0,user_id,actual,candidates,candidates_own_5,candidates_2_lvl,warm


### Сделаем предсказания и рассчитаем precision_at_k k=5 для применения одноуровневой модели OWN на валидационных данных для сравнения результатов работы моделей

In [93]:
result['candidates_own'] = result['user_id'].apply(lambda row: recommender.get_own_recommendations(row, N=5))

In [94]:
result['candidates_als'] = result['user_id'].apply(lambda row: recommender.get_als_recommendations(row, N=5))

In [95]:
precision_at_k_own = result.apply(lambda row: precision_at_k(row['candidates_own'], row['actual'], k = 5), axis=1).mean()

In [96]:
ndcg_own = result.apply(lambda row: ndcg_at_k(row['candidates_own'], row['actual'], k = 5), axis=1).mean()

In [97]:
#map own
map_own = result.apply(lambda row: ap_at_k(row['candidates_own'], row['actual'], k = 5), axis=1).mean()

In [98]:
precision_at_k_als = result.apply(lambda row: precision_at_k(row['candidates_als'], row['actual'], k = 5), axis=1).mean()

In [99]:
ndcg_als = result.apply(lambda row: ndcg_at_k(row['candidates_als'], row['actual'], k = 5), axis=1).mean()

In [100]:
#map als
map_als = result.apply(lambda row: ap_at_k(row['candidates_als'], row['actual'], k = 5), axis=1).mean()

In [101]:
precision_at_k_2lvl = result.apply(lambda row: precision_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

In [102]:
ndcg_2lvl = result.apply(lambda row: ndcg_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

In [103]:
#map 2 level train
map_2lvl = result.apply(lambda row: ap_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

In [104]:
result.head(1)

Unnamed: 0,user_id,actual,candidates,candidates_own_5,candidates_2_lvl,warm,candidates_own,candidates_als
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 9297615, 5577022, 877391, 9655212, 82...","[856942, 9297615, 5577022, 877391, 9655212]","[856942, 888104, 9297615, 10149640, 5577022]",True,"[856942, 9297615, 5577022, 877391, 9655212]","[15926844, 920200, 856942, 1082212, 9655212]"


In [105]:
print(f'OWN precision_at_5 {round(precision_at_k_own,2)}, ndcg_at_5 {round(ndcg_own,2)}, map_at_5 {round(map_own,2)}')

OWN precision_at_5 0.16, ndcg_at_5 0.17, map_at_5 0.11


In [106]:
print(f'ALS precision_at_5 {round(precision_at_k_als,2)}, ndcg_at_5 {round(ndcg_als,2)}, map_at_5 {round(map_als,2)}')

ALS precision_at_5 0.12, ndcg_at_5 0.13, map_at_5 0.07


In [107]:
print(f'2_lvl precision_at_5 {round(precision_at_k_2lvl,2)}, ndcg_at_5 {round(ndcg_2lvl,2)}, map_at_5 {round(map_2lvl,2)}')

2_lvl precision_at_5 0.15, ndcg_at_5 0.16, map_at_5 0.1


In [108]:
pd.Series(data_test['user_id'].unique()).isin(data['user_id'].unique()).value_counts()

True     1884
False       1
dtype: int64

In [114]:
result_test = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']
result_test['candidates_2_lvl'] = result_test['user_id'].apply(lambda user_id: rerank(user_id, df_train))
result_test.shape

(1885, 3)

In [115]:
precision_at_k_2lvl_test = result_test.apply(lambda row: precision_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

In [116]:
ndcg_2lvl_test = result_test.apply(lambda row: ndcg_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

In [117]:
map_2lvl_test = result_test.apply(lambda row: ap_at_k(row['candidates_2_lvl'], row['actual'], k = 5), axis=1).mean()

In [118]:
print(f'2_lvl TEST precision_at_5 {round(precision_at_k_2lvl_test,2)}, ndcg_at_5 {round(ndcg_2lvl_test,2)}, map_at_5 {round(map_2lvl_test,2)}')

2_lvl TEST precision_at_5 0.13, ndcg_at_5 0.14, map_at_5 0.08


In [119]:
df = result_test[['user_id', 'candidates_2_lvl']].copy()
df.to_csv('predictions.csv', index=False)