### Двухуровневая модель

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender
from src.metrics import v_precision_at_k, v_recall_at_k

In [2]:
data = pd.read_csv('data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


### Разделение на трейн-вал для двух-уровневой модели
(Пока пойдем по предложенным на вебинаре параметрам)

In [3]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
# Тестовые таргеты для получения метрик на первом уровне
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [5]:
# Модель первого уровня. Там будет ALS (и популярити рекомендации - для случаев отсутствия юзера в трейне)
first_level_model = MainRecommender(data_train_lvl_1, 
                                       n_factors=100,
                                       top_5000=True,
                                       strip_not_popular=False,
                                       strip_outdated=False,
                                       weighting=False,  
                                       K1=100,
                                       B=0.5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [6]:
# Собираем результаты первого уровня. Если юзера не было в трейне но он есть в тесте -
# Он получает N популярных товаров в качестве рекомендаций
# Если у юзера из ALS получается меньше 50 рекомендаций - их дополняют популярные
result_lvl_1['als'] = first_level_model.get_all_recommendations(result_lvl_1['user_id'], N=50)

In [7]:
result_lvl_1.head()

Unnamed: 0,user_id,actual,als
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1132771, 867188, 958046, 1098248, 10149640, 8..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1003421, 1071939, 1020581, 968215, 909714, 87..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[929668, 908531, 1048462, 8090521, 865705, 990..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[871611, 1041796, 996070, 1033220, 911878, 866..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[836281, 909894, 995242, 5591154, 6534178, 897..."


In [8]:
v_precision_at_k(result_lvl_1.als, result_lvl_1.actual, k=50).mean()

0.14163416898792947

In [9]:
v_recall_at_k(result_lvl_1.als, result_lvl_1.actual, k=50).mean()

0.13740893696798437

### Готовим данные на второй уровень

In [42]:
df=pd.DataFrame({'user_id':result_lvl_1.user_id.values.repeat(len(result_lvl_1.als[0])),
                 'item_id':np.concatenate(result_lvl_1.als.values)})
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)

In [43]:
df.head(2)

Unnamed: 0,user_id,item_id
0,1,1132771
1,1,867188


In [44]:
data_train_lvl_2.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0
2107470,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,0.0,0.0
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0


In [45]:
targets_lvl_2['target'].mean()

0.21034244176610775

In [198]:
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
0,1,1132771,0.0,3787,DELI,National,CHEESES,CHEESE: NATURAL BULK,,0.003314,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
1,1,867188,0.0,1046,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ,0.005018,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
2,1,958046,1.0,827,GROCERY,National,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,0.000943,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
3,1,1098248,1.0,2,PRODUCE,National,PEARS,PEARS BARTLETT,,0.007953,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
4,1,10149640,1.0,1011,GROCERY,National,BATH TISSUES,TOILET TISSUE,,0.004682,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721


In [47]:
# Добавляем туда фичи айтемов
item_features = pd.read_csv('data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,1,1132771,0.0,3787.0,DELI,National,CHEESES,CHEESE: NATURAL BULK,
1,1,867188,0.0,1046.0,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ
2,1,958046,1.0,827.0,GROCERY,National,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ
3,1,1098248,1.0,2.0,PRODUCE,National,PEARS,PEARS BARTLETT,
4,1,10149640,1.0,1011.0,GROCERY,National,BATH TISSUES,TOILET TISSUE,


In [48]:
# Добавляем туда эмбеддинги айтемов из ALS
embs = pd.DataFrame(first_level_model.model.item_factors)
embs['item_id'] = embs.index
embs['item_id'] = embs['item_id'].apply(lambda x: first_level_model.id_to_itemid[x])

targets_lvl_2 = targets_lvl_2.merge(embs, on='item_id', how='left')
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,...,90,91,92,93,94,95,96,97,98,99
0,1,1132771,0.0,3787.0,DELI,National,CHEESES,CHEESE: NATURAL BULK,,0.003314,...,0.016595,0.009209,-0.002801,0.009618,0.009802,-0.008644,0.020062,0.006508,-0.00251,0.001342
1,1,867188,0.0,1046.0,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ,0.005018,...,0.027209,0.001818,0.016416,-0.00144,-0.004537,0.01313,-0.019439,0.008619,0.014195,0.010904
2,1,958046,1.0,827.0,GROCERY,National,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,0.000943,...,0.011196,0.010146,0.004122,-0.001052,0.004223,0.003454,-0.002848,0.012128,0.012867,0.006614
3,1,1098248,1.0,2.0,PRODUCE,National,PEARS,PEARS BARTLETT,,0.007953,...,0.005468,0.022737,0.016989,0.011878,0.012684,0.014125,0.007525,0.002706,0.008896,0.005842
4,1,10149640,1.0,1011.0,GROCERY,National,BATH TISSUES,TOILET TISSUE,,0.004682,...,-0.001779,0.009789,0.005005,0.005598,0.009248,-0.000633,-0.007506,0.008811,0.00583,-0.002205


In [49]:
# Далее - ембеддинги юзеров (это будут единственные фичи по юзерам - 
# поскольку hh_demographics - не полный и скорее всего вообще не имеет к нам отношения)

In [50]:
embs = pd.DataFrame(first_level_model.model.user_factors)
embs['user_id'] = embs.index
embs['user_id'] = embs['user_id'].apply(lambda x: first_level_model.id_to_userid[x])

targets_lvl_2 = targets_lvl_2.merge(embs, on='user_id', how='left')
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
0,1,1132771,0.0,3787.0,DELI,National,CHEESES,CHEESE: NATURAL BULK,,0.003314,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
1,1,867188,0.0,1046.0,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ,0.005018,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
2,1,958046,1.0,827.0,GROCERY,National,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,0.000943,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
3,1,1098248,1.0,2.0,PRODUCE,National,PEARS,PEARS BARTLETT,,0.007953,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
4,1,10149640,1.0,1011.0,GROCERY,National,BATH TISSUES,TOILET TISSUE,,0.004682,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721


In [51]:
targets_lvl_2.shape

(117071, 209)

In [52]:
cat_feats = ('manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product')

In [76]:
# Manufacturer - странное поле - выглядит как float но по смыслу это должен быть некий id
targets_lvl_2['manufacturer'] = targets_lvl_2['manufacturer'].fillna(0).astype('int')

In [88]:
targets_lvl_2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
0,1,1132771,0.0,3787,DELI,National,CHEESES,CHEESE: NATURAL BULK,,0.003314,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
1,1,867188,0.0,1046,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ,0.005018,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
2,1,958046,1.0,827,GROCERY,National,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,0.000943,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
3,1,1098248,1.0,2,PRODUCE,National,PEARS,PEARS BARTLETT,,0.007953,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
4,1,10149640,1.0,1011,GROCERY,National,BATH TISSUES,TOILET TISSUE,,0.004682,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721


In [93]:
# Надо было стрипнуть оттуда 999999 - теперь у него наны в фичах
targets_lvl_2 = targets_lvl_2.fillna(0)

In [92]:
targets_lvl_2.describe()

Unnamed: 0,user_id,item_id,target,manufacturer,0_x,1_x,2_x,3_x,4_x,5_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
count,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,...,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0,117071.0
mean,1247.454664,1623079.0,0.210342,659.373269,0.005507,0.006637,0.003527,0.005513,0.003915,0.005446,...,-0.194989,-0.953215,-3.832024,-0.655766,1.961513,-3.488868,1.48028,-0.681189,-2.680624,4.319109
std,719.097502,1989799.0,0.407554,1215.944111,0.008771,0.008681,0.008887,0.008748,0.00866,0.009077,...,2.89877,2.677695,2.770562,2.650566,2.583545,2.587128,2.711413,2.459437,2.630559,2.503356
min,1.0,818980.0,0.0,0.0,-0.026302,-0.01719,-0.021956,-0.023752,-0.022287,-0.022104,...,-12.309825,-16.090813,-15.890063,-13.93902,-9.17216,-14.752202,-13.508883,-13.013363,-12.290492,-8.558477
25%,627.0,903325.0,0.0,69.0,0.000548,0.000699,-0.001916,-0.001078,-0.000604,-0.000763,...,-2.160346,-2.387689,-5.504837,-2.059971,0.615827,-4.851421,0.083159,-1.940815,-4.21422,3.111503
50%,1252.0,995876.0,0.0,69.0,0.006085,0.006231,0.00336,0.006641,0.003226,0.005367,...,-0.633807,-1.003487,-3.584659,-1.031029,1.781677,-3.728982,1.499941,-0.767395,-2.692656,4.167582
75%,1864.0,1082185.0,0.0,759.0,0.010797,0.011308,0.00829,0.011524,0.008885,0.011837,...,1.431793,0.305583,-2.028195,0.663066,3.249728,-2.170223,2.911618,0.549368,-1.310789,5.450657
max,2500.0,15927660.0,1.0,6331.0,0.037347,0.03242,0.035733,0.036285,0.031098,0.032994,...,11.173027,12.790009,7.232758,13.049061,15.003448,7.810928,12.480314,10.189717,8.175714,21.221157


In [94]:
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117071 entries, 0 to 117070
Columns: 209 entries, user_id to 99_y
dtypes: float32(200), float64(1), int64(3), object(5)
memory usage: 98.2+ MB


### Обучаем вторую модель

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
def get_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n-----------------------------\n' + classification_report(y_train_true, y_train_pred))
    print('\nTEST\n-----------------------------\n' + classification_report(y_test_true, y_test_pred))

In [97]:
X = targets_lvl_2.drop('target', axis=1)
y = targets_lvl_2[['target']]

In [98]:
# Учить и смотреть метрики в процессе учебы будем на втором трейне
# Финальный тест побережем на финальную оценку
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=0)

In [99]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats, )
lgb.fit(X_train, y_train)

  return f(**kwargs)


ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: department, brand, commodity_desc, sub_commodity_desc, curr_size_of_product

In [100]:
# Эмм.. На вебинаре кажется говорилось, что lightgbm может нативно категориальные признаки... без one_hot..

In [101]:
# Окей - тогда catboost. Этот точно может
from catboost import CatBoostClassifier

In [200]:
model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.1,
    loss_function='Logloss',
    custom_loss=['AUC', 'Accuracy']
)
model.fit(
    X_train, y_train,
    cat_features=cat_feats,
    eval_set=(X_test, y_test),
    plot=False,
    verbose=True
)

0:	learn: 0.6508038	test: 0.6504680	best: 0.6504680 (0)	total: 105ms	remaining: 5.15s
1:	learn: 0.6145158	test: 0.6140570	best: 0.6140570 (1)	total: 200ms	remaining: 4.81s
2:	learn: 0.5872046	test: 0.5865507	best: 0.5865507 (2)	total: 286ms	remaining: 4.49s
3:	learn: 0.5637098	test: 0.5629327	best: 0.5629327 (3)	total: 409ms	remaining: 4.71s
4:	learn: 0.5441255	test: 0.5432595	best: 0.5432595 (4)	total: 488ms	remaining: 4.39s
5:	learn: 0.5290393	test: 0.5282379	best: 0.5282379 (5)	total: 564ms	remaining: 4.14s
6:	learn: 0.5163725	test: 0.5155875	best: 0.5155875 (6)	total: 640ms	remaining: 3.93s
7:	learn: 0.5059244	test: 0.5050873	best: 0.5050873 (7)	total: 745ms	remaining: 3.91s
8:	learn: 0.4975402	test: 0.4967207	best: 0.4967207 (8)	total: 830ms	remaining: 3.78s
9:	learn: 0.4878118	test: 0.4871720	best: 0.4871720 (9)	total: 931ms	remaining: 3.73s
10:	learn: 0.4803667	test: 0.4798919	best: 0.4798919 (10)	total: 1.01s	remaining: 3.57s
11:	learn: 0.4736366	test: 0.4730705	best: 0.4730705

<catboost.core.CatBoostClassifier at 0x7ffb98710b20>

In [115]:
# Как-то учится. Можно даже продолжать наверное еще.
# А еще красивые catboost графики рисует.. с plot=True, по test-ROC-AUC на них получается 0.79.. test-accuracy 0.816
# Ну - это ведь далеко не конечный результат еще. Надо теперь прогнать на test-е и взять 5 с наибольшей вероятностью
# тогда уже можно будет посмотреть скор

In [117]:
data_val_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0


In [124]:
# Делаем таргеты для теста второго уровня (метрики считать) 
# и получаем рекомендации от фильтрующей модели
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns = ('user_id', 'actual')
result_lvl_2['als'] = first_level_model.get_all_recommendations(result_lvl_2['user_id'], N=50)
result_lvl_2.head(2)

Unnamed: 0,user_id,actual,als
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1132771, 867188, 958046, 1098248, 10149640, 8..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[844179, 859075, 12810393, 940947, 6463658, 10..."


In [197]:
# Делаем пары user_id - item_id
df=pd.DataFrame({'user_id':result_lvl_2.user_id.values.repeat(len(result_lvl_2.als[0])),
                 'item_id':np.concatenate(result_lvl_2.als.values)})

df.head()

Unnamed: 0,user_id,item_id
0,1,1132771
1,1,867188
2,1,958046
3,1,1098248
4,1,10149640


In [141]:
# Теперь сюда надо опять добавить айтем-фичи + айтем-ембеддинги + юзер-ембеддниги
val_lvl_2 = df.copy()

In [142]:
# Добавляем туда фичи айтемов
val_lvl_2 = val_lvl_2.merge(item_features, on='item_id', how='left')
val_lvl_2.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,1,1132771,3787.0,DELI,National,CHEESES,CHEESE: NATURAL BULK,
1,1,867188,1046.0,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ


In [143]:
# Эмбеддинги айтемов из ALS
embs = pd.DataFrame(first_level_model.model.item_factors)
embs['item_id'] = embs.index
embs['item_id'] = embs['item_id'].apply(lambda x: first_level_model.id_to_itemid[x])

val_lvl_2 = val_lvl_2.merge(embs, on='item_id', how='left')
val_lvl_2.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,1,...,90,91,92,93,94,95,96,97,98,99
0,1,1132771,3787.0,DELI,National,CHEESES,CHEESE: NATURAL BULK,,0.003314,-0.003628,...,0.016595,0.009209,-0.002801,0.009618,0.009802,-0.008644,0.020062,0.006508,-0.00251,0.001342
1,1,867188,1046.0,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ,0.005018,-0.004096,...,0.027209,0.001818,0.016416,-0.00144,-0.004537,0.01313,-0.019439,0.008619,0.014195,0.010904


In [144]:
# Юзер эмбеддинги
embs = pd.DataFrame(first_level_model.model.user_factors)
embs['user_id'] = embs.index
embs['user_id'] = embs['user_id'].apply(lambda x: first_level_model.id_to_userid[x])

val_lvl_2 = val_lvl_2.merge(embs, on='user_id', how='left')
val_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0_x,1_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
0,1,1132771,3787.0,DELI,National,CHEESES,CHEESE: NATURAL BULK,,0.003314,-0.003628,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
1,1,867188,1046.0,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ,0.005018,-0.004096,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
2,1,958046,827.0,GROCERY,National,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,0.000943,0.000465,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
3,1,1098248,2.0,PRODUCE,National,PEARS,PEARS BARTLETT,,0.007953,0.003463,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
4,1,10149640,1011.0,GROCERY,National,BATH TISSUES,TOILET TISSUE,,0.004682,0.004622,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721


In [145]:
# И опять manufacturer в int (надо класс для модели второго уровня делать тоже - чтобы не повторять это все заново)
# Ну или в пайплайн заворачивать
val_lvl_2['manufacturer'] = val_lvl_2['manufacturer'].fillna(0).astype('int')

In [149]:
val_lvl_2 = val_lvl_2.fillna(0)
val_lvl_2.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0_x,1_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
0,1,1132771,3787,DELI,National,CHEESES,CHEESE: NATURAL BULK,,0.003314,-0.003628,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
1,1,867188,1046,GROCERY,National,FROZEN PIE/DESSERTS,FRZN WHIPPED TOPPING,8 OZ,0.005018,-0.004096,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
2,1,958046,827,GROCERY,National,FRUIT - SHELF STABLE,PINEAPPLE,20 OZ,0.000943,0.000465,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
3,1,1098248,2,PRODUCE,National,PEARS,PEARS BARTLETT,,0.007953,0.003463,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721
4,1,10149640,1011,GROCERY,National,BATH TISSUES,TOILET TISSUE,,0.004682,0.004622,...,4.921842,3.1956,-0.312043,0.663701,5.147588,-4.404645,-5.020015,-1.007887,-4.002372,2.887721


In [153]:
val_lvl_2.shape

(102100, 208)

### Данные последнего теста готовы - запускаем классифайер

In [150]:
preds = model.predict_proba(val_lvl_2)

In [175]:
# Мерджим вероятности с юзерами-айтемами (для последующей сортировки)
preds_df = pd.DataFrame(preds[:,1:])
preds_df.columns = ('probabilities',)
prob_df = df.join(preds_df)
prob_df.head()

Unnamed: 0,user_id,item_id,probabilities
0,1,1132771,0.304914
1,1,867188,0.280074
2,1,958046,0.470006
3,1,1098248,0.298016
4,1,10149640,0.230087


In [176]:
# Теперь надо айтемы сгруппировать по юзеру сортируя по убыванию probability

In [188]:
sorted_prob = prob_df.sort_values('probabilities', ascending=False).reset_index(drop=True)
test_result = sorted_prob.groupby('user_id')['item_id'].unique().reset_index()
test_result.head()

Unnamed: 0,user_id,item_id
0,1,"[1082185, 1029743, 995242, 958046, 940947, 100..."
1,3,"[1082185, 1106523, 1133018, 940947, 1110244, 1..."
2,6,"[1082185, 1029743, 995242, 1070820, 862349, 98..."
3,7,"[1082185, 1106523, 6534178, 1126899, 995242, 5..."
4,8,"[1082185, 1106523, 1029743, 6534178, 1133018, ..."


In [189]:
# Пришла пора посмотреть - насколько правильно получилось все это сделать...

In [194]:
v_precision_at_k(test_result.item_id, result_lvl_2.actual).mean()

0.2514201762977474

In [195]:
# Фух... так. Ожидалось конечно сразу 0.9 :)) но судя по цифре скора - видимо все работает правильно.
# Если бы я где-то совсем накосячил - было бы что-то типа 0.05 (такой вариант тоже ожидался :)
# А 0.251 - больше чем получалось на own и на ALS и на чем бы то ни было до этого.
# Но - поработать все-же придется )
# Будем считать это бэйзлайном и прув оф концепт для двухуровневой модели 
# И займемся фильтрацией исходных интеракций, фиче-инжинирингом и тюнингом бустинга в конце-концов..