# Курсовой проект


In [1]:
#устанавливаем библиотеку implicit
!pip install implicit --no-use-pep517

Collecting implicit
  Downloading implicit-0.4.8.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.4 MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.8-cp37-cp37m-linux_x86_64.whl size=4606638 sha256=aeaf1c347bad0bca9f25056f061d08b1eace237772d8cb99e65c3ce57d0e1153
  Stored in directory: /root/.cache/pip/wheels/88/e6/34/25e73cccbaf1a961154bb562a5f86123b68fdbf40e306073d6
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.8


In [3]:
#Прикрепляем google disc

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Присоединяем директорию с модулями на гугл диске
import sys
sys.path.insert(0,"/content/drive/My Drive/")

----

# Import libs

In [5]:
# делаем необходимые импорты библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics_2 import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders_6 import MainRecommender

## Read data

In [6]:
# скачиваем датасеты
data = pd.read_csv('/content/drive/MyDrive/retail_train.csv')
item_features = pd.read_csv('/content/drive/MyDrive/product.csv')
user_features = pd.read_csv('/content/drive/MyDrive/hh_demographic.csv')

# тестовый датасет для оценки итоговой метрики
test_data = pd.read_csv('/content/drive/MyDrive/retail_test1.csv')

# Process features dataset

In [7]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [8]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

In [11]:
# Для user-item матрицы я хочу использовать отношение количества купленного товара,
# деленного на "давность" покупки. Для этого создадим новый признак в data
max_week_no = data['week_no'].max() + 1
data['value'] = data['quantity'] / (max_week_no - data['week_no'])

# Split dataset for train, eval, test

In [13]:
# т.к. стоим двухуровневую модель - разбиваем датасет на 3 части
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [36]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [37]:
# функция для визуализации информации о размерности датасета, количестве юзеров и товаров

def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [16]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')
print_stats_data(test_data, 'final_test')

train_matcher
Shape: (2108779, 13) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 13) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 13) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 13) Users: 2042 Items: 24329
final_test
Shape: (88734, 12) Users: 1885 Items: 20497


# Prefilter items

In [38]:
# Воспользуемся функцией prefilter_items, чтобы оставить только топ-3000 самых популярных товаров

n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, take_n_popular=3000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 3001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


# Make cold-start to warm-start

In [39]:
# В условиях задания сказано, что нужно использовать только "теплых юзеров", 
# проведем фильтрацию и уберем из всех датасетов "холодных" юзеров

# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&set(data_val_matcher.user_id.values)&set(data_val_ranker.user_id.values)&set(test_data.user_id.values))

data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]
test_data = test_data[test_data.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')
print_stats_data(test_data,'final_test')

train_matcher
Shape: (1788923, 13) Users: 1663 Items: 2998
val_matcher
Shape: (153229, 13) Users: 1663 Items: 26453
train_ranker
Shape: (153229, 13) Users: 1663 Items: 26453
val_ranker
Shape: (108983, 13) Users: 1663 Items: 23346
final_test
Shape: (83656, 12) Users: 1663 Items: 19981


После фильтрации у нас осталось 1663 юзера, которые есть во всех датасетах

# Init/train recommender

In [40]:
# создадим экземпляр класса MainRecommender (данный класс делали в течение курса)
# в классе я поменял n_factors у als (сделал n_factors=50)
# также изменил значение гиперпараметра values при создании user_item_matrix
# user_item_matrix заполнена данными из data['value']

recommender = MainRecommender(data_train_matcher)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2998 [00:00<?, ?it/s]

In [41]:
# создадим функция для вычисления целевой метрики precision@5

def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
 - 1) Первый вариант - сгенерирую топ-30 кандидиатов через get_als_recommendations
 - 2) Второй вариант- сгенерирую топ-50 кадидиатов через get_own_recommendations
 (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

## Вариант 1 get_als_recommendations

## Подготовка данных для трейна

In [42]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [43]:
# собираем кандитатов с первого этапа (matcher)
# количество рекомендуемых товаров для последующего ранжирования = 30
N_PREDICT=30
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

In [44]:
# пример кандидатов als по 2-м юзерам
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1082185, 1096036, 981760, 896369, 1080414, 90..."
1,2021,"[951590, 981760, 1044078, 1081177, 1082185, 92..."


In [45]:
# изменим формат представления данных по кандидатам
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
df_match_candidates.head()

Unnamed: 0,user_id,item_id
0,2070,1082185
0,2070,1096036
0,2070,981760
0,2070,896369
0,2070,1080414


### Check warm start

In [46]:
# проверим, что кандидаты посчитаны только для "теплых" юзеров
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (49890, 2) Users: 1663 Items: 966


## Подготавливаем фичи для обучения модели

In [49]:
# Заранее создам в трейне 2 новых признака (флага): наличие скидки в магазине / скидка по купону
data_train_ranker['retail_discount_flag'] = np.where(data_train_ranker.retail_disc != 0, 1, 0)
data_train_ranker['coupon_discount_flag'] = np.where(data_train_ranker.coupon_disc != 0, 1, 0)

### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [67]:
df_train_ranker = data_train_ranker.copy()

df_train_ranker['target'] = 1  # тут только покупки 

df_train_ranker = df_match_candidates.merge(df_train_ranker, on=[USER_COL, ITEM_COL], how='left')

df_train_ranker['target'].fillna(0, inplace= True)

df_train_ranker.head(4)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0
1,2070,1096036,,,,,,,,,,,,,,0.0
2,2070,981760,40826480000.0,610.0,2.0,2.08,311.0,-0.24,1300.0,88.0,0.0,0.0,0.25,1.0,0.0,1.0
3,2070,896369,,,,,,,,,,,,,,0.0


In [68]:
# распределение таргетов в трейне 
df_ranker_train.target.value_counts()

0.0    39830
1.0    17909
Name: target, dtype: int64

In [69]:
# присоединяем фичи юзеров и айтемов
df_train_ranker = df_train_ranker.merge(item_features, on='item_id', how='left')
df_train_ranker = df_train_ranker.merge(user_features, on='user_id', how='left')

df_train_ranker.head()

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1096036,,,,,,,,,,,,,,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,981760,40826480000.0,610.0,2.0,2.08,311.0,-0.24,1300.0,88.0,0.0,0.0,0.25,1.0,0.0,1.0,69,GROCERY,Private,EGGS,EGGS - X-LARGE,1 DZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,896369,,,,,,,,,,,,,,0.0,69,GROCERY,Private,PNT BTR/JELLY/JAMS,PEANUT BUTTER,18 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,1080414,41160130000.0,629.0,1.0,0.0,311.0,-0.59,1902.0,91.0,0.0,0.0,0.2,1.0,0.0,1.0,693,DRUG GM,National,CANDY - CHECKLANE,CANDY BARS (SINGLES)(INCLUDING,1.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [70]:
# Посчитаем общее количество покупок, общую сумму покупок и средний чек по каждому юзеру
user_sum_all_amounts = data_train_ranker.groupby('user_id')['sales_value'].sum().reset_index()
user_all_quantity = data_train_ranker.groupby('user_id')['quantity'].count().reset_index()
user_new_features = user_sum_all_amounts.merge(user_all_quantity, on=[USER_COL], how='left')
user_new_features['av_check'] = user_new_features['sales_value'] / user_new_features['quantity']
user_new_features.rename(columns={'sales_value': 'all_sales_sum', 'quantity': 'user_total_quantity'}, inplace=True)

# добавим новые признаки в обучающую выборку
df_train_ranker = df_train_ranker.merge(user_new_features, on='user_id', how='left')
df_train_ranker

df_train_ranker.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,all_sales_sum,user_total_quantity,av_check
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931
1,2070,1096036,,,,,,,,,,,,,,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931


In [71]:
# Посчитаем количество и сумму покупок в разрезе юзеров и commodities
sales_of_cat_per_user = df_train_ranker.groupby(['user_id', 'commodity_desc'])[['sales_value', 'quantity']].sum().reset_index()
sales_of_cat_per_user.rename(columns={'sales_value': 'user_sales_in_category', 'quantity': 'commodity_quantity' }, inplace=True)

# добавим новые признаки в обучающую выборку
df_train_ranker = df_train_ranker.merge(sales_of_cat_per_user, on=['user_id', 'commodity_desc'], how='left')
df_train_ranker.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,all_sales_sum,user_total_quantity,av_check,user_sales_in_category,commodity_quantity
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,1.92,1.0
1,2070,1096036,,,,,,,,,,,,,,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,0.0,0.0


In [72]:
# посчитаем для каждого юзера долю его покупок в каждом commodity, а также посчитаем среднее количество покупок .юзером каждого commodity в неделю
# добавим эти признаки в обучающую выборку
df_train_ranker['share_of_cat_per_user'] = df_train_ranker['user_sales_in_category'] / df_train_ranker['all_sales_sum']
df_train_ranker['commodity_purchases_per_week'] = df_train_ranker['commodity_quantity'] / VAL_MATCHER_WEEKS
df_train_ranker.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,all_sales_sum,user_total_quantity,av_check,user_sales_in_category,commodity_quantity,share_of_cat_per_user,commodity_purchases_per_week
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,1.92,1.0,0.00311,0.166667
1,2070,1096036,,,,,,,,,,,,,,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,0.0,0.0,0.0,0.0


In [73]:
# Посчитаем общее количество покупок по скидкам
discount_purch_per_user = df_train_ranker.groupby(['user_id'])[['retail_discount_flag', 'coupon_discount_flag']].sum().reset_index()
discount_purch_per_user.rename(columns={'retail_discount_flag': 'user_retail_discount_flag', 'coupon_discount_flag': 'user_coupon_discount_flag' }, inplace=True)
discount_purch_per_user['discount_purchases_count'] = discount_purch_per_user['user_retail_discount_flag'] + discount_purch_per_user['user_coupon_discount_flag'] 


# добавим эти признаки в обучающую выборку
df_train_ranker = df_train_ranker.merge(discount_purch_per_user, on=['user_id',], how='left')
df_train_ranker.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,all_sales_sum,user_total_quantity,av_check,user_sales_in_category,commodity_quantity,share_of_cat_per_user,commodity_purchases_per_week,user_retail_discount_flag,user_coupon_discount_flag,discount_purchases_count
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,1.92,1.0,0.00311,0.166667,8.0,0.0,8.0
1,2070,1096036,,,,,,,,,,,,,,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,0.0,0.0,0.0,0.0,8.0,0.0,8.0


In [74]:
# Добавим признак: доля покупок по скидкам
df_train_ranker['discount_purchases_share'] = df_train_ranker['discount_purchases_count'] / df_train_ranker['user_total_quantity']
df_train_ranker.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,all_sales_sum,user_total_quantity,av_check,user_sales_in_category,commodity_quantity,share_of_cat_per_user,commodity_purchases_per_week,user_retail_discount_flag,user_coupon_discount_flag,discount_purchases_count,discount_purchases_share
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,1.92,1.0,0.00311,0.166667,8.0,0.0,8.0,0.039216
1,2070,1096036,,,,,,,,,,,,,,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,0.0,0.0,0.0,0.0,8.0,0.0,8.0,0.039216


In [75]:
#Подготовим обучающую выборку (в т.ч. удалим некоторые признаки) и выделим таргет
X_train = df_train_ranker.drop(['target', 'basket_id', 'store_id', 'curr_size_of_product'], axis=1)
y_train = df_train_ranker[['target']]

In [76]:
# Выделим категориальные признаки
cat_feats = ['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [77]:
# Обучим модель
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=300,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Сделаем предсказания

In [78]:
df_ranker_predict = df_train_ranker.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]
df_ranker_predict.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,all_sales_sum,user_total_quantity,av_check,user_sales_in_category,commodity_quantity,share_of_cat_per_user,commodity_purchases_per_week,user_retail_discount_flag,user_coupon_discount_flag,discount_purchases_count,discount_purchases_share,proba_item_purchase
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,1.92,1.0,0.00311,0.166667,8.0,0.0,8.0,0.039216,0.9999998
1,2070,1096036,,,,,,,,,,,,,,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,0.0,0.0,0.0,0.0,8.0,0.0,8.0,0.039216,9.399507e-08


In [79]:
# Посмотрим предсказания по конкретному юзеру
df_ranker_predict.loc[df_ranker_predict['user_id']==2070].sort_values('proba_item_purchase', ascending=False)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,value,retail_discount_flag,coupon_discount_flag,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,all_sales_sum,user_total_quantity,av_check,user_sales_in_category,commodity_quantity,share_of_cat_per_user,commodity_purchases_per_week,user_retail_discount_flag,user_coupon_discount_flag,discount_purchases_count,discount_purchases_share,proba_item_purchase
0,2070,1082185,41160130000.0,630.0,1.0,1.92,311.0,0.0,14.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,1.92,1.0,0.00311,0.166667,8.0,0.0,8.0,0.039216,0.9999998
7,2070,1055863,41160130000.0,629.0,1.0,0.59,311.0,0.0,1902.0,91.0,0.0,0.0,0.2,0.0,0.0,1.0,693,DRUG GM,National,CANDY - CHECKLANE,CANDY BARS (SINGLES)(INCLUDING,1.55 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,0.59,2.0,0.000956,0.333333,8.0,0.0,8.0,0.039216,0.9999998
19,2070,899624,41160130000.0,630.0,1.0,2.69,311.0,-0.3,14.0,91.0,0.0,0.0,0.2,1.0,0.0,1.0,69,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),10 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,2.69,1.0,0.004358,0.166667,8.0,0.0,8.0,0.039216,0.9999998
13,2070,883404,41160130000.0,630.0,2.0,2.4,311.0,-0.18,14.0,91.0,0.0,0.0,0.4,1.0,0.0,1.0,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,3.99,3.0,0.006464,0.5,8.0,0.0,8.0,0.039216,0.9999998
12,2070,961554,40826480000.0,610.0,2.0,3.64,311.0,-0.34,1300.0,88.0,0.0,0.0,0.25,1.0,0.0,1.0,69,PRODUCE,Private,CARROTS,CARROTS MINI PEELED,1 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,3.64,2.0,0.005897,0.333333,8.0,0.0,8.0,0.039216,0.9999998
11,2070,1085604,41238440000.0,632.0,1.0,1.11,311.0,-0.18,1510.0,91.0,0.0,0.0,0.2,1.0,0.0,1.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,1.11,1.0,0.001798,0.166667,8.0,0.0,8.0,0.039216,0.9999998
18,2070,908531,41160130000.0,630.0,2.0,3.58,311.0,-0.2,14.0,91.0,0.0,0.0,0.4,1.0,0.0,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,CHOCOLATE MILK,,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,3.58,2.0,0.0058,0.333333,8.0,0.0,8.0,0.039216,0.9999998
25,2070,951590,40764910000.0,605.0,1.0,1.59,311.0,-0.7,1900.0,87.0,0.0,0.0,0.111111,1.0,0.0,1.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,3.99,3.0,0.006464,0.5,8.0,0.0,8.0,0.039216,0.9999998
4,2070,1080414,41160130000.0,629.0,1.0,0.0,311.0,-0.59,1902.0,91.0,0.0,0.0,0.2,1.0,0.0,1.0,693,DRUG GM,National,CANDY - CHECKLANE,CANDY BARS (SINGLES)(INCLUDING,1.6 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,0.59,2.0,0.000956,0.333333,8.0,0.0,8.0,0.039216,0.9999998
2,2070,981760,40826480000.0,610.0,2.0,2.08,311.0,-0.24,1300.0,88.0,0.0,0.0,0.25,1.0,0.0,1.0,69,GROCERY,Private,EGGS,EGGS - X-LARGE,1 DZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,204,3.025931,2.08,2.0,0.00337,0.333333,8.0,0.0,8.0,0.039216,0.9999998


**Оценим метрику precision@5**

In [82]:
ACTUAL_COL = 'actual'
TOPK_PRECISION = 5

# списки фактических покупок из тестового датасета для оценки качества модели курсового проекта
result_eval_ranker = test_data.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."


Для сравнения посчитаем метрику только на модели 1-го уровня

In [255]:
"""%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))"""

CPU times: user 6.37 s, sys: 89.6 ms, total: 6.46 s
Wall time: 6.44 s


In [83]:
%%time
result_eval_ranker['als_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

CPU times: user 30.9 s, sys: 24 s, total: 55 s
Wall time: 28.2 s


In [84]:
sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('als_rec', 0.22260974143114584)]

Ранжируем рекомендации с помощью обученной модели 2-го уровня и посчитаем метрику

    

In [85]:
# функция ранжирования с помощью обученной модели
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [86]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [87]:
result_eval_ranker['reranked_own_rec']

0          [1005186, 979707, 840361, 840361, 1005186]
1           [878996, 1037863, 994928, 994928, 994928]
2       [1082185, 1082185, 1126899, 1106523, 1013321]
3        [1005186, 833715, 1082185, 1082185, 1029743]
4        [1082185, 1029743, 995242, 8090521, 6534178]
                            ...                      
1658       [826249, 999858, 899624, 1082185, 1082185]
1659         [845208, 995242, 834484, 859075, 860776]
1660    [1070820, 1070820, 1053690, 1106523, 1106523]
1661     [1070820, 5569327, 826249, 5568378, 5569327]
1662     [6534178, 995242, 1082185, 1082185, 1058997]
Name: reranked_own_rec, Length: 1663, dtype: object

# Оценим метрику после ранжирования моделью 2-го уровня

In [88]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.25580276608538505)
('als_rec', 0.22260974143114584)


После ранжирования получили метрику на датасете для оценки итоговой метрики (test_data) = 0.2558 

Поставленная задача выполнена!