In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
y_cols = ['shop_id', 'item_id', 'date_block_num']

In [5]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [6]:
train_item_ids = sales_train['item_id'].unique()
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [7]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = sales_train[sales_train['shop_id'] == shop]['item_id'].unique()
    test_ids = sales_train[sales_train['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[shop, item, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])
del combinations

In [8]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"y"})

In [9]:
#

In [10]:
training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)
training = training.apply(pd.to_numeric,downcast='unsigned')

training['y'] = training['y'].clip(0,20)
training['y'] = training['y'].astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

In [11]:
def get_mean_encoding(df, group_cols, target):
    cumsum = df.groupby(group_cols)[target].cumsum() - df[target]
    cumcnt = df.groupby(group_cols).cumcount()
    return cumsum/cumcnt

training['item_me'] = pd.to_numeric(get_mean_encoding(training, ['item_id'], 'y'), downcast='float')
training['shop_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id'], 'y'), downcast='float')
training['category_me'] = pd.to_numeric(get_mean_encoding(training, ['item_category_id'], 'y'), downcast='float')
training['shop_category_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_category_id'], 'y'), downcast='float')
training['shop_item_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_id'], 'y'), downcast='float')

training.fillna(0,inplace=True)

In [12]:
cols = ['item_id','date_block_num']
item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'item_block_units'})

training = training.merge(item_block_units, on=cols, how='left').fillna(0)
del item_block_units
gc.collect()
#
cols = ['shop_id','date_block_num']
shop_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_block_units'})

training = training.merge(shop_block_units, on=cols, how='left').fillna(0)
del shop_block_units
gc.collect()
#
cols = ['item_category_id','date_block_num']
cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'cat_block_units'})

training = training.merge(cat_block_units, on=cols, how='left').fillna(0)
del cat_block_units
gc.collect()
#

cols = ['shop_id', 'item_category_id','date_block_num']
shop_cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_cat_block_units'})

training = training.merge(shop_cat_block_units, on=cols, how='left').fillna(0)
del shop_cat_block_units
gc.collect()
#
cols = ['shop_id', 'item_id','date_block_num']
shop_item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_item_block_units'})

training = training.merge(shop_item_block_units, on=cols, how='left').fillna(0)
del shop_item_block_units
gc.collect()

63

In [13]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [14]:
training['item_units'] = training.groupby(['date_block_num'])['item_block_units'].transform(np.sum)
training['item_max_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.max)
training['item_min_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.min)
training['item_minmax_mean'] = training[['item_max_units_block', 'item_min_units_block']].mean(axis=1)

#
training['shop_units'] = training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum)
training['shop_max_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.max)
training['shop_min_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.min)
training['shop_minmax_mean'] = training[['shop_max_units_block', 'shop_min_units_block']].mean(axis=1)

#
training['cat_units'] = training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum)
training['cat_max_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.max)
training['cat_min_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.min)
training['cat_minmax_mean'] = training[['cat_max_units_block', 'cat_min_units_block']].mean(axis=1)
#
training['shop_cat_units'] = training.groupby(['date_block_num'])['shop_cat_block_units'].transform(np.sum)
training['shop_cat_max_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.max)
training['shop_cat_min_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.min)
training['shop_cat_minmax_mean'] = training[['shop_cat_max_units_block', 'shop_cat_min_units_block']].mean(axis=1)
#
training['shop_item_units'] = training.groupby(['date_block_num'])['shop_item_block_units'].transform(np.sum)
training['shop_item_max_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.max)
training['shop_item_min_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.min)
training['shop_item_minmax_mean'] = training[['shop_item_max_units_block', 'shop_item_min_units_block']].mean(axis=1)

In [15]:
training['item_share_of_total_units'] = training['item_units'] * 100 / total_sales
training['category_share_of_total_units'] = training['cat_units'] * 100 / total_sales
training['shop_share_of_units'] = training['shop_units'] * 100 / total_sales
training['shop_item_share_of_total_units'] = training['shop_item_units'] * 100\
                        / total_sales
training['shop_item_share_of_shop_units'] = training['shop_item_units'] * 100\
                        / training['shop_units']

In [16]:
training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['shop_item_units_comp'] = training['item_units'] / training['shop_units']

In [17]:
rolls = [3,6]
cols = ['item_id','date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "item_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    item_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        ['item_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'item_block_units':roll_name_tmp})\
        [['item_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(item_block_units_rolling_temp, on=cols, how='left')
    del item_block_units_rolling_temp
    gc.collect()

    item_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(item_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del item_block_units_rolling
    gc.collect()

3
6


In [18]:
rolls = [3,6]
cols = ['shop_id','date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "shop_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    shop_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        ['shop_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'shop_block_units':roll_name_tmp})\
        [['shop_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(shop_block_units_rolling_temp, on=cols, how='left')
    del shop_block_units_rolling_temp
    gc.collect()

    shop_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(shop_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del shop_block_units_rolling
    gc.collect()

3
6


In [19]:
rolls = [3,6]
cols = ['shop_id','item_id', 'date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "shop_item_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    shop_item_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id','item_id'],as_index=False)\
        ['shop_item_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'shop_item_block_units':roll_name_tmp})\
        [['shop_id','item_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(shop_item_block_units_rolling_temp, on=cols, how='left')
    print("lol")
    del shop_item_block_units_rolling_temp
    gc.collect()
    
    shop_item_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id','item_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(shop_item_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del shop_item_block_units_rolling
    gc.collect()

3
lol
6
lol


In [26]:
training.fillna(0, inplace=True)

In [20]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'y', 'item_category_id',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'item_block_units', 'shop_block_units',
       'cat_block_units', 'shop_cat_block_units', 'shop_item_block_units',
       'item_units', 'item_max_units_block', 'item_min_units_block',
       'item_minmax_mean', 'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_mean', 'cat_units',
       'cat_max_units_block', 'cat_min_units_block', 'cat_minmax_mean',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_mean',
       'shop_item_units', 'shop_item_max_units_block',
       'shop_item_min_units_block', 'shop_item_minmax_mean',
       'item_share_of_total_units', 'category_share_of_total_units',
       'shop_share_of_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'shop_share_item_units_comp',
       'shop_item_units_c

In [196]:
np.min([2,2])

2

In [23]:
training['rolling_composite'] =  training['shop_block_units_rolling3'].clip(1, None) *\
            training['item_block_units_rolling3'].clip(1, None) 
training['me_composite'] =  training['item_me'].clip(1, None)  * training['shop_me'].clip(1, None) 

In [52]:
training.dtypes

item_id                    int64
shop_id                    uint8
date_block_num             uint8
y                           int8
item_category_id           int64
item_me                  float32
shop_me                  float32
category_me              float32
shop_category_me         float32
shop_item_me             float32
item_block_units         float64
shop_block_units         float64
cat_block_units          float64
shop_cat_block_units     float64
shop_item_block_units    float64
dtype: object

In [55]:
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,y,item_category_id,item_me,shop_me,category_me,shop_category_me,shop_item_me,item_block_units,shop_block_units,cat_block_units,shop_cat_block_units,shop_item_block_units,item_units,item_max_units_block,item_min_units_block
650693,2916,49,33,0,23,0.30847,0.217947,0.31367,0.211524,0.333333,0.0,648.0,2203.0,42.0,0.0,2499996.0,71.0,0.0
1680959,6608,19,17,0,25,0.040816,0.434715,0.180928,0.12246,0.0,0.0,2067.0,460.0,17.0,0.0,4156779.0,6.0,0.0
3105180,12080,38,24,0,40,0.155172,0.369399,0.239276,0.121243,0.0,0.0,1441.0,15109.0,76.0,0.0,4698631.0,30.0,0.0
2142666,8401,28,32,0,37,0.26971,0.715043,0.181585,0.367775,0.65,0.0,2979.0,2989.0,158.0,0.0,2528824.0,47.0,0.0
114587,1021,7,23,1,67,0.151515,0.219186,0.195883,0.15978,0.727273,15.0,3318.0,2778.0,49.0,1.0,7399317.0,18.0,0.0
4914915,18159,31,17,0,40,0.111111,0.686598,0.2298,0.635294,0.4,1.0,7701.0,16064.0,1835.0,0.0,4156779.0,4.0,0.0
2916352,11329,31,22,0,43,0.256098,0.74062,0.183567,0.555323,1.4,6.0,9865.0,614.0,183.0,0.0,5112241.0,9.0,0.0
4491899,16628,27,17,0,40,0.372549,0.494315,0.232576,0.246279,0.0,0.0,4282.0,16064.0,390.0,0.0,4156779.0,146.0,0.0
3687409,14174,52,23,0,43,0.072727,0.242092,0.189033,0.097643,0.181818,0.0,2243.0,740.0,11.0,0.0,7399317.0,4.0,0.0
4218749,15811,59,19,1,55,0.306122,0.265957,0.224547,0.150501,0.0,20.0,1244.0,11180.0,125.0,1.0,4474489.0,23.0,5.0


In [314]:
print(np.mean(np.array([31,24,58])))
print(np.mean(np.array([15,14,11])))
print(np.mean(np.array([6,9,53])))

37.666666666666664
13.333333333333334
22.666666666666668


In [324]:
training[(training['item_id'].isin([30,31])) & (training['shop_id'] == 30)]\
        .sort_values(['item_id','date_block_num'])[['item_id','shop_id',\
                                                    'date_block_num','item_block_units',\
                                                    'item_block_units_rolling_3', 'item_block_units_rolling_6']]
                                                    #'item_block_units_rolling_6']]

Unnamed: 0,item_id,shop_id,date_block_num,item_block_units,item_block_units_rolling_3,item_block_units_rolling_6
1144,30,30,12,58.0,,
1145,30,30,13,24.0,,
1146,30,30,14,31.0,41.0,41.0
1147,30,30,15,21.0,37.666667,37.666667
1148,30,30,16,16.0,25.333333,33.5
1149,30,30,17,13.0,22.666667,30.0
1150,30,30,18,13.0,16.666667,27.166667
1151,30,30,19,12.0,14.0,19.666667
1152,30,30,20,11.0,12.666667,17.666667
1153,30,30,21,13.0,12.0,14.333333


In [35]:
len(training[training['item_block_units'] > 0])

3456135

In [37]:
len(transactions_items_blocks)

135451

In [21]:
gc.collect()
val = training[training['date_block_num'] == 33]
print("val length", len(val))

unique_pairs_val = list(set(list(zip(val.shop_id, val.item_id))))
print("number of unique shop/item pairs in val", len(unique_pairs_val))
unique_pairs_val_ignore = unique_pairs_val[0:int(len(unique_pairs_val)/2)]


def tuple2key(t):
    return "%d_%d" % (t[0], t[1])

val_pairs_ignore_dict = {}
for t in unique_pairs_val_ignore:
    val_pairs_ignore_dict[tuple2key(t)] = 1
 
    
training['val_ignore'] = (training['shop_id'].astype(str) + '_' +  training['item_id'].astype(str))\
                                    .apply(lambda x: x in val_pairs_ignore_dict)

val length 270742
number of unique shop/item pairs in val 270742


In [101]:
len(training[training['val_ignore'] == True])

2978162

In [27]:
gc.collect()


x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
y_train = x_train['y']


pos_train_len = len(y_train[y_train != 0])
print("pos_train_len", pos_train_len)

zeros_keep_indices_train = y_train[y_train == 0].sample(int(pos_train_len/6)).index
print("zeros_keep_indices_train", len(zeros_keep_indices_train))
non_zeros_train_indices = y_train[y_train != 0].index
print("non_zeros_train_indices", len(non_zeros_train_indices))

train_indices = np.append(np.array(zeros_keep_indices_train), np.array(non_zeros_train_indices))

y_train = y_train.loc[train_indices]
x_train = x_train.loc[train_indices]




x_val = training[training['date_block_num'] == 33]
y_val = x_val['y']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/6)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]

pos_train_len 444760
zeros_keep_indices_train 74126
non_zeros_train_indices 444760
pos_val_len 31471
zeros_keep_indices_val 5245
non_zeros_val_indices 31471


In [415]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'y', 'item_category_id',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'item_block_units', 'shop_block_units',
       'cat_block_units', 'shop_cat_block_units', 'shop_item_block_units',
       'item_units', 'item_max_units_block', 'item_min_units_block',
       'item_minmax_mean', 'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_mean', 'cat_units',
       'cat_max_units_block', 'cat_min_units_block', 'cat_minmax_mean',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_mean',
       'shop_item_units', 'shop_item_max_units_block',
       'shop_item_min_units_block', 'shop_item_minmax_mean',
       'item_share_of_total_units', 'category_share_of_total_units',
       'shop_share_of_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'item_block_units_rolling_3',
       'item_block_units_

In [34]:



cb_features = [
      
     'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 
       'item_units', 'item_max_units_block', 'item_min_units_block',
       'item_minmax_mean', 'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_mean', 'cat_units',
       'cat_max_units_block', 'cat_min_units_block', 'cat_minmax_mean',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_mean',
       'shop_item_units', 'shop_item_max_units_block',
       'shop_item_min_units_block', 'shop_item_minmax_mean',
       'item_share_of_total_units', 'category_share_of_total_units',
       'shop_share_of_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'shop_share_item_units_comp',
       'shop_item_units_comp', 'item_block_units_rolling_3',
       'item_block_units_rolling_6', 'shop_block_units_rolling_3',
       'shop_block_units_rolling_6', 'shop_item_block_units_rolling_3',
       'shop_item_block_units_rolling_6']

In [38]:
cb_model = CatBoostRegressor(iterations=70000,
                             learning_rate=0.01,
                             eval_metric='RMSE',
                             #thread_count=16,
                             task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                             od_type = "Iter",
                             od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[cb_features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[cb_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

0:	learn: 2.9633788	test: 2.9484797	best: 2.9484797 (0)	total: 5.4ms	remaining: 6m 18s
1:	learn: 2.9444356	test: 2.9292629	best: 2.9292629 (1)	total: 10.4ms	remaining: 6m 2s
2:	learn: 2.9256879	test: 2.9114816	best: 2.9114816 (2)	total: 15.4ms	remaining: 5m 59s
3:	learn: 2.9072047	test: 2.8936267	best: 2.8936267 (3)	total: 20.3ms	remaining: 5m 55s
4:	learn: 2.8890343	test: 2.8751661	best: 2.8751661 (4)	total: 25.3ms	remaining: 5m 54s
5:	learn: 2.8710477	test: 2.8581659	best: 2.8581659 (5)	total: 30.3ms	remaining: 5m 53s
6:	learn: 2.8534118	test: 2.8407296	best: 2.8407296 (6)	total: 35.3ms	remaining: 5m 52s
7:	learn: 2.8358191	test: 2.8239141	best: 2.8239141 (7)	total: 40.3ms	remaining: 5m 52s
8:	learn: 2.8185046	test: 2.8075927	best: 2.8075927 (8)	total: 45.2ms	remaining: 5m 51s
9:	learn: 2.8013940	test: 2.7901660	best: 2.7901660 (9)	total: 52ms	remaining: 6m 4s
10:	learn: 2.7846504	test: 2.7759263	best: 2.7759263 (10)	total: 57.3ms	remaining: 6m 4s
11:	learn: 2.7681016	test: 2.7596454

116:	learn: 1.8495972	test: 1.8239268	best: 1.8239268 (116)	total: 600ms	remaining: 5m 58s
117:	learn: 1.8459877	test: 1.8198872	best: 1.8198872 (117)	total: 604ms	remaining: 5m 57s
118:	learn: 1.8424637	test: 1.8163437	best: 1.8163437 (118)	total: 609ms	remaining: 5m 57s
119:	learn: 1.8388751	test: 1.8121617	best: 1.8121617 (119)	total: 614ms	remaining: 5m 57s
120:	learn: 1.8353870	test: 1.8081359	best: 1.8081359 (120)	total: 618ms	remaining: 5m 57s
121:	learn: 1.8320151	test: 1.8043741	best: 1.8043741 (121)	total: 623ms	remaining: 5m 56s
122:	learn: 1.8286404	test: 1.7998594	best: 1.7998594 (122)	total: 628ms	remaining: 5m 56s
123:	learn: 1.8252930	test: 1.7959322	best: 1.7959322 (123)	total: 632ms	remaining: 5m 56s
124:	learn: 1.8221123	test: 1.7918366	best: 1.7918366 (124)	total: 638ms	remaining: 5m 56s
125:	learn: 1.8188158	test: 1.7877443	best: 1.7877443 (125)	total: 644ms	remaining: 5m 57s
126:	learn: 1.8156361	test: 1.7846013	best: 1.7846013 (126)	total: 649ms	remaining: 5m 57s

235:	learn: 1.6432738	test: 1.5507771	best: 1.5507771 (235)	total: 1.19s	remaining: 5m 52s
236:	learn: 1.6426116	test: 1.5497273	best: 1.5497273 (236)	total: 1.2s	remaining: 5m 52s
237:	learn: 1.6419319	test: 1.5483594	best: 1.5483594 (237)	total: 1.2s	remaining: 5m 52s
238:	learn: 1.6411981	test: 1.5474827	best: 1.5474827 (238)	total: 1.21s	remaining: 5m 52s
239:	learn: 1.6404621	test: 1.5466659	best: 1.5466659 (239)	total: 1.21s	remaining: 5m 52s
240:	learn: 1.6397973	test: 1.5455793	best: 1.5455793 (240)	total: 1.22s	remaining: 5m 52s
241:	learn: 1.6391792	test: 1.5444671	best: 1.5444671 (241)	total: 1.22s	remaining: 5m 53s
242:	learn: 1.6386280	test: 1.5437086	best: 1.5437086 (242)	total: 1.23s	remaining: 5m 53s
243:	learn: 1.6380138	test: 1.5425097	best: 1.5425097 (243)	total: 1.24s	remaining: 5m 53s
244:	learn: 1.6373963	test: 1.5413824	best: 1.5413824 (244)	total: 1.24s	remaining: 5m 53s
245:	learn: 1.6367778	test: 1.5399226	best: 1.5399226 (245)	total: 1.25s	remaining: 5m 53s
2

356:	learn: 1.5946645	test: 1.4662557	best: 1.4662557 (356)	total: 1.78s	remaining: 5m 47s
357:	learn: 1.5944541	test: 1.4661198	best: 1.4661198 (357)	total: 1.78s	remaining: 5m 47s
358:	learn: 1.5941751	test: 1.4657340	best: 1.4657340 (358)	total: 1.79s	remaining: 5m 47s
359:	learn: 1.5939034	test: 1.4649160	best: 1.4649160 (359)	total: 1.79s	remaining: 5m 46s
360:	learn: 1.5937140	test: 1.4647212	best: 1.4647212 (360)	total: 1.8s	remaining: 5m 46s
361:	learn: 1.5934992	test: 1.4644673	best: 1.4644673 (361)	total: 1.8s	remaining: 5m 47s
362:	learn: 1.5932790	test: 1.4640307	best: 1.4640307 (362)	total: 1.81s	remaining: 5m 47s
363:	learn: 1.5930486	test: 1.4637592	best: 1.4637592 (363)	total: 1.81s	remaining: 5m 47s
364:	learn: 1.5927694	test: 1.4631940	best: 1.4631940 (364)	total: 1.82s	remaining: 5m 47s
365:	learn: 1.5925769	test: 1.4626021	best: 1.4626021 (365)	total: 1.82s	remaining: 5m 47s
366:	learn: 1.5923497	test: 1.4624608	best: 1.4624608 (366)	total: 1.83s	remaining: 5m 46s
3

453:	learn: 1.5764922	test: 1.4327740	best: 1.4327740 (453)	total: 2.38s	remaining: 6m 5s
454:	learn: 1.5763856	test: 1.4324956	best: 1.4324956 (454)	total: 2.39s	remaining: 6m 5s
455:	learn: 1.5762022	test: 1.4321675	best: 1.4321675 (455)	total: 2.4s	remaining: 6m 5s
456:	learn: 1.5760479	test: 1.4319683	best: 1.4319683 (456)	total: 2.41s	remaining: 6m 6s
457:	learn: 1.5758639	test: 1.4316769	best: 1.4316769 (457)	total: 2.42s	remaining: 6m 6s
458:	learn: 1.5756866	test: 1.4314914	best: 1.4314914 (458)	total: 2.42s	remaining: 6m 7s
459:	learn: 1.5755192	test: 1.4312035	best: 1.4312035 (459)	total: 2.43s	remaining: 6m 7s
460:	learn: 1.5753900	test: 1.4310259	best: 1.4310259 (460)	total: 2.44s	remaining: 6m 8s
461:	learn: 1.5752841	test: 1.4309993	best: 1.4309993 (461)	total: 2.45s	remaining: 6m 8s
462:	learn: 1.5751196	test: 1.4307828	best: 1.4307828 (462)	total: 2.46s	remaining: 6m 8s
463:	learn: 1.5749899	test: 1.4306860	best: 1.4306860 (463)	total: 2.46s	remaining: 6m 9s
464:	learn:

552:	learn: 1.5630037	test: 1.4145236	best: 1.4145236 (552)	total: 2.97s	remaining: 6m 13s
553:	learn: 1.5628551	test: 1.4141539	best: 1.4141539 (553)	total: 2.98s	remaining: 6m 13s
554:	learn: 1.5626542	test: 1.4139426	best: 1.4139426 (554)	total: 2.98s	remaining: 6m 13s
555:	learn: 1.5625674	test: 1.4135720	best: 1.4135720 (555)	total: 2.99s	remaining: 6m 13s
556:	learn: 1.5624271	test: 1.4135220	best: 1.4135220 (556)	total: 3s	remaining: 6m 13s
557:	learn: 1.5622807	test: 1.4134266	best: 1.4134266 (557)	total: 3s	remaining: 6m 13s
558:	learn: 1.5621664	test: 1.4133616	best: 1.4133616 (558)	total: 3s	remaining: 6m 13s
559:	learn: 1.5620315	test: 1.4132890	best: 1.4132890 (559)	total: 3.01s	remaining: 6m 13s
560:	learn: 1.5618753	test: 1.4126971	best: 1.4126971 (560)	total: 3.01s	remaining: 6m 13s
561:	learn: 1.5617719	test: 1.4126513	best: 1.4126513 (561)	total: 3.02s	remaining: 6m 13s
562:	learn: 1.5617109	test: 1.4126702	best: 1.4126513 (561)	total: 3.02s	remaining: 6m 12s
563:	lea

675:	learn: 1.5500239	test: 1.3972939	best: 1.3972939 (675)	total: 3.56s	remaining: 6m 5s
676:	learn: 1.5499272	test: 1.3973818	best: 1.3972939 (675)	total: 3.57s	remaining: 6m 5s
677:	learn: 1.5498137	test: 1.3973820	best: 1.3972939 (675)	total: 3.58s	remaining: 6m 5s
678:	learn: 1.5497061	test: 1.3972791	best: 1.3972791 (678)	total: 3.58s	remaining: 6m 5s
679:	learn: 1.5495790	test: 1.3971633	best: 1.3971633 (679)	total: 3.59s	remaining: 6m 5s
680:	learn: 1.5494986	test: 1.3971685	best: 1.3971633 (679)	total: 3.59s	remaining: 6m 5s
681:	learn: 1.5494489	test: 1.3970764	best: 1.3970764 (681)	total: 3.6s	remaining: 6m 5s
682:	learn: 1.5493241	test: 1.3966926	best: 1.3966926 (682)	total: 3.6s	remaining: 6m 5s
683:	learn: 1.5492371	test: 1.3966566	best: 1.3966566 (683)	total: 3.6s	remaining: 6m 5s
684:	learn: 1.5491211	test: 1.3964027	best: 1.3964027 (684)	total: 3.61s	remaining: 6m 5s
685:	learn: 1.5490172	test: 1.3963942	best: 1.3963942 (685)	total: 3.62s	remaining: 6m 5s
686:	learn: 1

796:	learn: 1.5399800	test: 1.3848529	best: 1.3848529 (796)	total: 4.16s	remaining: 6m 1s
797:	learn: 1.5398640	test: 1.3843347	best: 1.3843347 (797)	total: 4.16s	remaining: 6m 1s
798:	learn: 1.5398092	test: 1.3843186	best: 1.3843186 (798)	total: 4.17s	remaining: 6m
799:	learn: 1.5397003	test: 1.3842686	best: 1.3842686 (799)	total: 4.17s	remaining: 6m
800:	learn: 1.5396171	test: 1.3843965	best: 1.3842686 (799)	total: 4.18s	remaining: 6m
801:	learn: 1.5395657	test: 1.3844100	best: 1.3842686 (799)	total: 4.18s	remaining: 6m
802:	learn: 1.5395000	test: 1.3842729	best: 1.3842686 (799)	total: 4.19s	remaining: 6m
803:	learn: 1.5394141	test: 1.3842312	best: 1.3842312 (803)	total: 4.19s	remaining: 6m
804:	learn: 1.5392888	test: 1.3840983	best: 1.3840983 (804)	total: 4.2s	remaining: 6m
805:	learn: 1.5391965	test: 1.3838210	best: 1.3838210 (805)	total: 4.2s	remaining: 6m
806:	learn: 1.5390734	test: 1.3839281	best: 1.3838210 (805)	total: 4.21s	remaining: 6m
807:	learn: 1.5390304	test: 1.3838771	b

917:	learn: 1.5316790	test: 1.3769199	best: 1.3769199 (917)	total: 4.75s	remaining: 5m 57s
918:	learn: 1.5316258	test: 1.3762965	best: 1.3762965 (918)	total: 4.75s	remaining: 5m 57s
919:	learn: 1.5315633	test: 1.3762616	best: 1.3762616 (919)	total: 4.76s	remaining: 5m 57s
920:	learn: 1.5315290	test: 1.3762787	best: 1.3762616 (919)	total: 4.76s	remaining: 5m 57s
921:	learn: 1.5314818	test: 1.3762463	best: 1.3762463 (921)	total: 4.77s	remaining: 5m 57s
922:	learn: 1.5314315	test: 1.3760991	best: 1.3760991 (922)	total: 4.77s	remaining: 5m 57s
923:	learn: 1.5313529	test: 1.3759508	best: 1.3759508 (923)	total: 4.78s	remaining: 5m 57s
924:	learn: 1.5313109	test: 1.3759365	best: 1.3759365 (924)	total: 4.78s	remaining: 5m 57s
925:	learn: 1.5312626	test: 1.3759119	best: 1.3759119 (925)	total: 4.79s	remaining: 5m 57s
926:	learn: 1.5312105	test: 1.3758628	best: 1.3758628 (926)	total: 4.79s	remaining: 5m 57s
927:	learn: 1.5311620	test: 1.3758405	best: 1.3758405 (927)	total: 4.8s	remaining: 5m 57s


1037:	learn: 1.5249111	test: 1.3723918	best: 1.3721096 (1024)	total: 5.34s	remaining: 5m 54s
1038:	learn: 1.5248583	test: 1.3722562	best: 1.3721096 (1024)	total: 5.34s	remaining: 5m 54s
1039:	learn: 1.5247951	test: 1.3722528	best: 1.3721096 (1024)	total: 5.35s	remaining: 5m 54s
1040:	learn: 1.5247499	test: 1.3722108	best: 1.3721096 (1024)	total: 5.35s	remaining: 5m 54s
1041:	learn: 1.5247122	test: 1.3721834	best: 1.3721096 (1024)	total: 5.36s	remaining: 5m 54s
1042:	learn: 1.5246629	test: 1.3721589	best: 1.3721096 (1024)	total: 5.36s	remaining: 5m 54s
1043:	learn: 1.5246246	test: 1.3721469	best: 1.3721096 (1024)	total: 5.37s	remaining: 5m 54s
1044:	learn: 1.5245644	test: 1.3719452	best: 1.3719452 (1044)	total: 5.37s	remaining: 5m 54s
1045:	learn: 1.5245100	test: 1.3717791	best: 1.3717791 (1045)	total: 5.38s	remaining: 5m 54s
1046:	learn: 1.5244400	test: 1.3718903	best: 1.3717791 (1045)	total: 5.38s	remaining: 5m 54s
1047:	learn: 1.5244182	test: 1.3718877	best: 1.3717791 (1045)	total: 5

1159:	learn: 1.5189737	test: 1.3655256	best: 1.3655256 (1159)	total: 5.93s	remaining: 5m 52s
1160:	learn: 1.5189505	test: 1.3654235	best: 1.3654235 (1160)	total: 5.94s	remaining: 5m 52s
1161:	learn: 1.5188994	test: 1.3654324	best: 1.3654235 (1160)	total: 5.94s	remaining: 5m 51s
1162:	learn: 1.5188748	test: 1.3655098	best: 1.3654235 (1160)	total: 5.95s	remaining: 5m 51s
1163:	learn: 1.5188287	test: 1.3655302	best: 1.3654235 (1160)	total: 5.95s	remaining: 5m 51s
1164:	learn: 1.5187831	test: 1.3655026	best: 1.3654235 (1160)	total: 5.96s	remaining: 5m 51s
1165:	learn: 1.5187597	test: 1.3654973	best: 1.3654235 (1160)	total: 5.96s	remaining: 5m 51s
1166:	learn: 1.5187026	test: 1.3654179	best: 1.3654179 (1166)	total: 5.96s	remaining: 5m 51s
1167:	learn: 1.5186624	test: 1.3653743	best: 1.3653743 (1167)	total: 5.97s	remaining: 5m 51s
1168:	learn: 1.5185960	test: 1.3651812	best: 1.3651812 (1168)	total: 5.97s	remaining: 5m 51s
1169:	learn: 1.5185705	test: 1.3652107	best: 1.3651812 (1168)	total: 5

<catboost.core.CatBoostRegressor at 0x7ffb2ef7e780>

In [148]:
#training.to_pickle("pickled/training")
#training = pd.read_pickle("pickled/training")

#pickle.dump(cb_model, open( "pickled/cb_model", "wb"), protocol=4)

#cb_model = pickle.load( open( "pickled/cb_model", "rb" ) )

In [39]:
scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[cb_features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

[('shop_item_minmax_mean', 24.912735863254795),
 ('shop_item_me', 23.105267641120804),
 ('shop_item_block_units_rolling_3', 15.504534185815288),
 ('shop_item_max_units_block', 15.13634937690366),
 ('cat_units', 8.838657103989243),
 ('item_me', 6.797299590449437),
 ('item_block_units_rolling_6', 5.705156238466762)]

In [37]:
cb_features = [item[0] for item in scores.items() if item[1] > 2]