In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
y_cols = ['shop_id', 'item_id', 'date_block_num']

In [5]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [6]:
train_item_ids = sales_train['item_id'].unique()
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [7]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = sales_train[sales_train['shop_id'] == shop]['item_id'].unique()
    test_ids = sales_train[sales_train['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[shop, item, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])
del combinations

In [8]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"y"})

In [9]:
#

In [42]:
training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)
training = training.apply(pd.to_numeric,downcast='unsigned')

training['y'] = training['y'].clip(0,20)
training['y'] = training['y'].astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

In [43]:
def get_mean_encoding(df, group_cols, target):
    cumsum = df.groupby(group_cols)[target].cumsum() - df[target]
    cumcnt = df.groupby(group_cols).cumcount()
    return cumsum/cumcnt

training['item_me'] = pd.to_numeric(get_mean_encoding(training, ['item_id'], 'y'), downcast='float')
training['shop_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id'], 'y'), downcast='float')
training['category_me'] = pd.to_numeric(get_mean_encoding(training, ['item_category_id'], 'y'), downcast='float')
training['shop_category_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_category_id'], 'y'), downcast='float')
training['shop_item_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_id'], 'y'), downcast='float')

training.fillna(0,inplace=True)

In [196]:
cols = ['item_id','date_block_num']
item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'item_block_units'})

training = training.merge(item_block_units, on=cols, how='left').fillna(0)
del item_block_units
gc.collect()
#
cols = ['shop_id','date_block_num']
shop_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_block_units'})

training = training.merge(shop_block_units, on=cols, how='left').fillna(0)
del shop_block_units
gc.collect()
#
cols = ['item_category_id','date_block_num']
cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'cat_block_units'})

training = training.merge(cat_block_units, on=cols, how='left').fillna(0)
del cat_block_units
gc.collect()
#

cols = ['shop_id', 'item_category_id','date_block_num']
shop_cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_cat_block_units'})

training = training.merge(shop_cat_block_units, on=cols, how='left').fillna(0)
del shop_cat_block_units
gc.collect()
#
cols = ['shop_id', 'item_id','date_block_num']
shop_item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_item_block_units'})

training = training.merge(shop_item_block_units, on=cols, how='left').fillna(0)
del shop_item_block_units
gc.collect()

84

In [59]:
number_of_items = sales_train['item_id'].nunique()
print("number_of_items:", number_of_items)
number_of_categories = sales_train['item_category_id'].nunique()
print("number_of_categories:", number_of_categories)
number_of_shops = sales_train['shop_id'].nunique()
print("number_of_shops:", number_of_shops)
number_of_days = 365 + 365 - 30 - 31
print("number_of_days:", number_of_days)
number_of_blocks = sales_train['date_block_num'].nunique()
print("number_of_blocks:", number_of_blocks)
total_sales = sales_train['item_cnt_day'].sum()
print("total_sales:", total_sales)
average_price = sales_train['item_price'].mean()
print("average_price:", average_price)

number_of_items: 17054
number_of_categories: 79
number_of_shops: 55
number_of_days: 669
number_of_blocks: 22
total_sales: 2085473.0
average_price: 1015.5023073772021


In [179]:
training['item_units'] = training.groupby(['date_block_num'])['item_block_units'].transform(np.sum)
training['item_max_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.max)
training['item_min_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.min)
training['item_minmax_mean'] = training[['item_max_units_block', 'item_min_units_block']].mean(axis=1)

for q in [0.25,0.50,0.75]:
    name = 'item_minmax_q' + str(q)
    training[name] =  training[['item_min_units_block','item_max_units_block']].quantile(q,axis=1)


#
training['shop_units'] = training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum)
training['shop_max_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.max)
training['shop_min_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.min)
training['shop_minmax_mean'] = training[['shop_max_units_block', 'shop_min_units_block']].mean(axis=1)

#
training['cat_units'] = training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum)
training['cat_max_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.max)
training['cat_min_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.min)
training['cat_minmax_mean'] = training[['cat_max_units_block', 'cat_min_units_block']].mean(axis=1)
#
training['shop_cat_units'] = training.groupby(['date_block_num'])['shop_cat_block_units'].transform(np.sum)
training['shop_cat_max_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.max)
training['shop_cat_min_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.min)
training['shop_cat_minmax_mean'] = training[['shop_cat_max_units_block', 'shop_cat_min_units_block']].mean(axis=1)
#
training['shop_item_units'] = training.groupby(['date_block_num'])['shop_item_block_units'].transform(np.sum)
training['shop_item_max_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.max)
training['shop_item_min_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.min)
training['shop_item_minmax_mean'] = training[['shop_item_max_units_block', 'shop_item_min_units_block']].mean(axis=1)

In [114]:
training['item_share_of_total_units'] = training['item_units'] * 100 / total_sales
training['category_share_of_total_units'] = training['cat_units'] * 100 / total_sales
training['shop_share_of_units'] = training['shop_units'] * 100 / total_sales
training['shop_item_share_of_total_units'] = training['shop_item_units'] * 100\
                        / total_sales
training['shop_item_share_of_shop_units'] = training['shop_item_units'] * 100\
                        / training['shop_units']

In [62]:
training['shop_share_item_units_comp'] = training['item_units'] * training['shop_share_of_units']
training['shop_item_units_comp'] = training['item_units'] / training['shop_units']

In [46]:
rolls = [3,6,12]
cols = ['item_id','date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "item_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    item_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        ['item_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'item_block_units':roll_name_tmp})\
        [['item_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(item_block_units_rolling_temp, on=cols, how='left')
    del item_block_units_rolling_temp
    gc.collect()

    item_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(item_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del item_block_units_rolling
    gc.collect()

3
6
12


In [47]:
rolls = [3,6,12]
cols = ['shop_id','date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "shop_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    shop_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        ['shop_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'shop_block_units':roll_name_tmp})\
        [['shop_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(shop_block_units_rolling_temp, on=cols, how='left')
    del shop_block_units_rolling_temp
    gc.collect()

    shop_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(shop_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del shop_block_units_rolling
    gc.collect()

3
6
12


In [131]:
rolls = [3,6,12]
cols = ['item_category_id', 'date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "cat_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    cat_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_category_id'],as_index=False)\
        ['cat_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'cat_block_units':roll_name_tmp})\
        [['item_category_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(cat_block_units_rolling_temp, on=cols, how='left')
    del cat_block_units_rolling_temp
    gc.collect()
    
    cat_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_category_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(cat_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del cat_block_units_rolling
    gc.collect()

3
lol
6
lol
12
lol


In [132]:
rolls = [3,6,12]
cols = ['shop_id', 'item_category_id', 'date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "shop_cat_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    shop_cat_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_category_id'],as_index=False)\
        ['shop_cat_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'shop_cat_block_units':roll_name_tmp})\
        [['shop_id', 'item_category_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(shop_cat_block_units_rolling_temp, on=cols, how='left')
    del shop_cat_block_units_rolling_temp
    gc.collect()
    
    shop_cat_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_category_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(shop_cat_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)
    del shop_cat_block_units_rolling
    gc.collect()

3
6
12


In [23]:
training.fillna(0, inplace=True)

In [20]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'y', 'item_category_id',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'item_block_units', 'shop_block_units',
       'cat_block_units', 'shop_cat_block_units', 'shop_item_block_units',
       'item_units', 'item_max_units_block', 'item_min_units_block',
       'item_minmax_mean', 'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_mean', 'cat_units',
       'cat_max_units_block', 'cat_min_units_block', 'cat_minmax_mean',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_mean',
       'shop_item_units', 'shop_item_max_units_block',
       'shop_item_min_units_block', 'shop_item_minmax_mean',
       'item_share_of_total_units', 'category_share_of_total_units',
       'shop_share_of_units', 'shop_item_share_of_total_units',
       'shop_item_share_of_shop_units', 'shop_share_item_units_comp',
       'shop_item_units_c

In [196]:
np.min([2,2])

2

In [48]:
lags = [1,2,3,6,12]

cols = ['item_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "item_block_units_lag_" + str(lag)
    item_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        ['item_block_units'].shift(lag)\
        .rename(columns={'item_block_units':lag_name}).reset_index()

    training = training.merge(item_block_units_lag, on=cols, how='left')
    del item_block_units_lag
    gc.collect()

1
2
3
6
12


In [49]:
lags = [1,2,3,6,12]

cols = ['shop_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "shop_block_units_lag_" + str(lag)
    shop_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id'],as_index=False)\
        ['shop_block_units'].shift(lag)\
        .rename(columns={'shop_block_units':lag_name}).reset_index()

    training = training.merge(shop_block_units_lag, on=cols, how='left')
    del shop_block_units_lag
    gc.collect()

1
2
3
6
12


In [64]:
lags = [1,2,3,6,12]

cols = ['shop_id', 'item_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "shop_item_block_units_lag_" + str(lag)
    shop_item_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_id'],as_index=False)\
        ['shop_item_block_units'].shift(lag)\
        .rename(columns={'shop_item_block_units':lag_name}).reset_index()

    training = training.merge(shop_item_block_units_lag, on=cols, how='left')
    del shop_item_block_units_lag
    gc.collect()

1
2
3
6
12


In [133]:
lags = [1,2,3,6,12]

cols = ['shop_id', 'item_category_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "shop_cat_block_units_lag_" + str(lag)
    shop_cat_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['shop_id', 'item_category_id'],as_index=False)\
        ['shop_cat_block_units'].shift(lag)\
        .rename(columns={'shop_cat_block_units':lag_name}).reset_index()

    training = training.merge(shop_cat_block_units_lag, on=cols, how='left')
    del shop_cat_block_units_lag
    gc.collect()

1
2
3
6
12


In [197]:
lags = [1,2,3,6,12]

cols = ['item_category_id', 'date_block_num']


for lag in lags:
    print(lag)
    lag_name = "cat_block_units_lag_" + str(lag)
    cat_block_units_lag = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_category_id'],as_index=False)\
        ['cat_block_units'].shift(lag)\
        .rename(columns={'cat_block_units':lag_name}).reset_index()

    training = training.merge(cat_block_units_lag, on=cols, how='left')
    del cat_block_units_lag
    gc.collect()

1
2
3
6
12


In [183]:
str(int(0.25))

'0'

In [69]:
training['rolling_composite'] =  training['shop_block_units_rolling_3'].clip(1, None) *\
            training['item_block_units_rolling_3'].clip(1, None) 
training['me_composite'] =  training['item_me'].clip(1, None)  * training['shop_me'].clip(1, None) 

In [198]:
training['shop_item_1'] = training['item_block_units_lag_1'] * training['shop_block_units_lag_1']
training['shop_item_2'] = training['item_block_units_lag_2'] * training['shop_block_units_lag_2']
training['shop_item_3'] = training['item_block_units_lag_3'] * training['shop_block_units_lag_3']
training['shop_item_4'] = training['item_block_units_lag_6'] * training['shop_block_units_lag_6']
training['shop_item_5'] = training['item_block_units_lag_12'] * training['shop_block_units_lag_12']

training['shop_cat_1'] = training['cat_block_units_lag_1'] * training['shop_block_units_lag_1']
training['shop_cat_2'] = training['cat_block_units_lag_2'] * training['shop_block_units_lag_2']
training['shop_cat_3'] = training['cat_block_units_lag_3'] * training['shop_block_units_lag_3']
training['shop_cat_4'] = training['cat_block_units_lag_6'] * training['shop_block_units_lag_6']
training['shop_cat_5'] = training['cat_block_units_lag_12'] * training['shop_block_units_lag_12']


In [134]:
training.fillna(0, inplace=True)
for column in training.columns.values:
    if "units" in column and "share" not in column:
        training[column] = pd.to_numeric(training[column].astype(int), downcast='unsigned')
    else:
        training[column] = pd.to_numeric(training[column], downcast='float')


In [135]:
training.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5956324 entries, 0 to 5956323
Data columns (total 81 columns):
item_id                            float32
shop_id                            uint8
date_block_num                     uint8
y                                  int8
item_category_id                   float32
item_me                            float32
shop_me                            float32
category_me                        float32
shop_category_me                   float32
shop_item_me                       float32
item_block_units                   int64
shop_block_units                   int64
cat_block_units                    int64
shop_cat_block_units               int64
shop_item_block_units              int64
item_block_units_rolling_3         int64
item_block_units_rolling_6         int64
item_block_units_rolling_12        int64
shop_block_units_rolling_3         uint16
shop_block_units_rolling_6         uint16
shop_block_units_rolling_12        uint16
item_block

In [20]:
training.dtypes

item_id                             int64
shop_id                             uint8
date_block_num                      uint8
y                                    int8
item_category_id                    int64
item_me                           float32
shop_me                           float32
category_me                       float32
shop_category_me                  float32
shop_item_me                      float32
item_block_units                  float64
shop_block_units                  float64
cat_block_units                   float64
shop_cat_block_units              float64
shop_item_block_units             float64
item_units                        float64
item_max_units_block              float64
item_min_units_block              float64
item_minmax_mean                  float64
shop_units                        float64
shop_max_units_block              float64
shop_min_units_block              float64
shop_minmax_mean                  float64
cat_units                         

In [52]:
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,y,item_category_id,item_me,shop_me,category_me,shop_category_me,shop_item_me,...,item_block_units_lag_1,item_block_units_lag_2,item_block_units_lag_3,item_block_units_lag_6,item_block_units_lag_12,shop_block_units_lag_1,shop_block_units_lag_2,shop_block_units_lag_3,shop_block_units_lag_6,shop_block_units_lag_12
3431788,13345.0,44,20,0,82.0,2.211539,0.201273,2.027027,4.233333,11.5,...,0,0,0,50,0,1082,1055,1184,1788,0
4968060,18393.0,42,32,0,57.0,0.290698,0.443604,0.244254,0.296899,0.25,...,5,4,3,10,0,3832,3578,3618,4133,3740
3077795,11973.0,35,29,0,40.0,0.093023,0.364863,0.240576,0.180015,0.0,...,0,0,2,0,0,1537,1570,1641,3499,1963
5354234,19754.0,10,18,0,37.0,0.217391,0.153341,0.159108,0.075314,0.166667,...,8,12,54,0,0,575,702,646,784,0
4916289,18165.0,35,27,0,37.0,0.098765,0.305935,0.161398,0.17008,0.133333,...,0,2,0,0,1,1641,1550,1993,1808,1775
3515773,13616.0,31,31,1,69.0,0.483101,0.697052,0.217963,0.426396,1.526316,...,14,15,15,39,13,5987,6160,6327,7404,8248
5720734,21385.0,50,20,0,38.0,0.532658,0.268298,0.183076,0.124646,0.75,...,23,19,38,0,0,1603,1602,1722,2934,0
5319460,19641.0,38,26,0,40.0,0.04902,0.297104,0.2255,0.111741,0.071429,...,0,2,1,0,0,1403,1441,2675,1492,1561
4686383,17332.0,41,21,1,40.0,0.100437,0.248932,0.230197,0.140513,0.0,...,0,0,0,0,0,1001,1119,1152,1222,0
775091,3352.0,59,21,0,23.0,0.913183,0.346701,0.403374,0.257798,0.0,...,0,0,0,0,0,1161,1244,1082,1228,0


In [314]:
print(np.mean(np.array([31,24,58])))
print(np.mean(np.array([15,14,11])))
print(np.mean(np.array([6,9,53])))

37.666666666666664
13.333333333333334
22.666666666666668


In [40]:
training[(training['item_id'].isin([30,31])) & (training['shop_id'] == 30)]\
        .sort_values(['item_id','date_block_num'])[['item_id','shop_id',\
                                                    'date_block_num','item_block_units', 'item_block_units_lag_1',\
                                                    'item_block_units_lag_2','item_block_units_lag_3',\
                                                    'item_block_units_lag_6','item_block_units_lag_12'
                                                   ]]
                                                    #'item_block_units_rolling_3', 'item_block_units_rolling_6']]
                                                    #'item_block_units_rolling_6']]

Unnamed: 0,item_id,shop_id,date_block_num,item_block_units,item_block_units_lag_1,item_block_units_lag_2,item_block_units_lag_3,item_block_units_lag_6,item_block_units_lag_12
1144,30.0,30,12,58,0,0,0,0,0
1145,30.0,30,13,24,58,0,0,0,0
1146,30.0,30,14,31,24,58,0,0,0
1147,30.0,30,15,21,31,24,58,0,0
1148,30.0,30,16,16,21,31,24,0,0
1149,30.0,30,17,13,16,21,31,0,0
1150,30.0,30,18,13,13,16,21,58,0
1151,30.0,30,19,12,13,13,16,24,0
1152,30.0,30,20,11,12,13,13,31,0
1153,30.0,30,21,13,11,12,13,21,0


In [35]:
len(training[training['item_block_units'] > 0])

3456135

In [37]:
len(transactions_items_blocks)

135451

In [136]:
gc.collect()
val = training[training['date_block_num'] == 33]
print("val length", len(val))

unique_pairs_val = list(set(list(zip(val.shop_id, val.item_id))))
print("number of unique shop/item pairs in val", len(unique_pairs_val))
unique_pairs_val_ignore = unique_pairs_val[0:int(len(unique_pairs_val)/2)]


def tuple2key(t):
    return "%d_%d" % (t[0], t[1])

val_pairs_ignore_dict = {}
for t in unique_pairs_val_ignore:
    val_pairs_ignore_dict[tuple2key(t)] = 1
 
    
training['val_ignore'] = (training['shop_id'].astype(str) + '_' +  training['item_id'].astype(str))\
                                    .apply(lambda x: x in val_pairs_ignore_dict)

val length 270742
number of unique shop/item pairs in val 270742


In [101]:
len(training[training['val_ignore'] == True])

2978162

In [208]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5).fit(training[cb_features].fillna(0))

In [211]:
training = pca.transform(training[cb_features].fillna(0))

In [212]:
training

array([[ 4.98182322e+10, -4.65810723e+08, -2.75325082e+07,
         1.26088922e+07,  1.42313351e+07],
       [ 3.31271870e+10, -2.49748493e+08,  5.72508936e+07,
        -7.23767122e+07,  3.62384000e+07],
       [ 5.11289034e+10, -4.10739526e+08,  1.19342534e+08,
        -8.88580053e+07, -2.51284993e+07],
       ...,
       [-7.16370277e+10,  2.16420075e+08, -2.03850638e+07,
        -1.58275527e+06, -1.93098311e+07],
       [-7.08046752e+10,  3.82998421e+08, -2.28012866e+07,
        -2.10013664e+07,  1.86338577e+07],
       [-7.35907982e+10,  2.69601857e+08, -2.08469179e+07,
        -9.94234057e+06, -3.02176912e+05]])

In [199]:
gc.collect()

ZEROS_KEEP=3


#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['y']


pos_train_len = len(y_train[y_train != 0])
print("pos_train_len", pos_train_len)

zeros_keep_indices_train = y_train[y_train == 0].sample(int(pos_train_len/ZEROS_KEEP)).index
print("zeros_keep_indices_train", len(zeros_keep_indices_train))
non_zeros_train_indices = y_train[y_train != 0].index
print("non_zeros_train_indices", len(non_zeros_train_indices))

train_indices = np.append(np.array(zeros_keep_indices_train), np.array(non_zeros_train_indices))

y_train = y_train.loc[train_indices]
x_train = x_train.loc[train_indices]




x_val = training[training['date_block_num'] == 33]
y_val = x_val['y']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]

pos_train_len 887869
zeros_keep_indices_train 295956
non_zeros_train_indices 887869
pos_val_len 31471
zeros_keep_indices_val 10490
non_zeros_val_indices 31471


In [200]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'y', 'item_category_id',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'item_block_units_x', 'shop_block_units_x',
       'cat_block_units_x', 'shop_cat_block_units_x',
       'shop_item_block_units_x', 'item_block_units_rolling_3',
       'item_block_units_rolling_6', 'item_block_units_rolling_12',
       'shop_block_units_rolling_3', 'shop_block_units_rolling_6',
       'shop_block_units_rolling_12', 'item_block_units_lag_1',
       'item_block_units_lag_2', 'item_block_units_lag_3',
       'item_block_units_lag_6', 'item_block_units_lag_12',
       'shop_block_units_lag_1', 'shop_block_units_lag_2',
       'shop_block_units_lag_3', 'shop_block_units_lag_6',
       'shop_block_units_lag_12', 'val_ignore', 'item_units',
       'item_max_units_block', 'item_min_units_block', 'item_minmax_mean',
       'shop_units', 'shop_max_units_block', 'shop_min_units_block',
       'shop_minmax_mean', 'cat_units

In [201]:



cb_features = [
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'item_block_units_rolling_3', 'item_block_units_rolling_6',
       'item_block_units_rolling_12', 'shop_block_units_rolling_3',
       'shop_block_units_rolling_6', 'shop_block_units_rolling_12',
       'item_block_units_lag_1', 'item_block_units_lag_2',
       'item_block_units_lag_3', 'item_block_units_lag_6',
       'item_block_units_lag_12', 'shop_block_units_lag_1',
       'shop_block_units_lag_2', 'shop_block_units_lag_3',
       'shop_block_units_lag_6', 'shop_block_units_lag_12', 
       'item_units', 'item_max_units_block', 'item_min_units_block',
        'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'shop_minmax_mean', 'cat_units',
       'cat_max_units_block', 'cat_min_units_block', 'cat_minmax_mean',
       'shop_cat_units', 'shop_cat_max_units_block',
       'shop_cat_min_units_block', 'shop_cat_minmax_mean',

       'item_share_of_total_units', 'category_share_of_total_units',
       'shop_share_of_units',
       'shop_share_item_units_comp',
       'rolling_composite', 'me_composite',
'shop_item_1', 'shop_item_2',
       'shop_item_3', 'shop_item_4', 'shop_item_5'
    , 'cat_block_units_rolling_3',
       'cat_block_units_rolling_6', 'cat_block_units_rolling_12',
       'shop_cat_block_units_rolling_3', 'shop_cat_block_units_rolling_6',
       'shop_cat_block_units_rolling_12', 'shop_cat_block_units_lag_1',
       'shop_cat_block_units_lag_2', 'shop_cat_block_units_lag_3',
       'shop_cat_block_units_lag_6', 'shop_cat_block_units_lag_12',
     'item_minmax_q0.25', 'item_minmax_q0.5',
       'item_minmax_q0.75',
       'cat_block_units_lag_1', 'cat_block_units_lag_2',
       'cat_block_units_lag_3', 'cat_block_units_lag_6',
       'cat_block_units_lag_12', 'shop_cat_1', 'shop_cat_2', 'shop_cat_3',
       'shop_cat_4', 'shop_cat_5'
]
           

In [202]:
cb_model = CatBoostRegressor(iterations=70000,
                             learning_rate=0.01,
                             eval_metric='RMSE',
                             #thread_count=16,
                             task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                             od_type = "Iter",
                             od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[cb_features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[cb_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

0:	learn: 2.7806195	test: 2.7642343	best: 2.7642343 (0)	total: 14.4ms	remaining: 16m 50s
1:	learn: 2.7661218	test: 2.7528441	best: 2.7528441 (1)	total: 28.1ms	remaining: 16m 22s
2:	learn: 2.7519858	test: 2.7418858	best: 2.7418858 (2)	total: 41.5ms	remaining: 16m 8s
3:	learn: 2.7378166	test: 2.7301257	best: 2.7301257 (3)	total: 55.9ms	remaining: 16m 17s
4:	learn: 2.7238259	test: 2.7197359	best: 2.7197359 (4)	total: 69.9ms	remaining: 16m 18s
5:	learn: 2.7100738	test: 2.7092332	best: 2.7092332 (5)	total: 82.7ms	remaining: 16m 4s
6:	learn: 2.6961774	test: 2.6983369	best: 2.6983369 (6)	total: 96.7ms	remaining: 16m 7s
7:	learn: 2.6827089	test: 2.6885161	best: 2.6885161 (7)	total: 110ms	remaining: 16m 2s
8:	learn: 2.6693748	test: 2.6776415	best: 2.6776415 (8)	total: 125ms	remaining: 16m 14s
9:	learn: 2.6561144	test: 2.6667208	best: 2.6667208 (9)	total: 140ms	remaining: 16m 19s
10:	learn: 2.6431522	test: 2.6559526	best: 2.6559526 (10)	total: 153ms	remaining: 16m 14s
11:	learn: 2.6305083	test: 

93:	learn: 2.0146815	test: 2.1702365	best: 2.1702365 (93)	total: 1.25s	remaining: 15m 27s
94:	learn: 2.0109064	test: 2.1676190	best: 2.1676190 (94)	total: 1.26s	remaining: 15m 25s
95:	learn: 2.0069943	test: 2.1646300	best: 2.1646300 (95)	total: 1.27s	remaining: 15m 27s
96:	learn: 2.0033474	test: 2.1614388	best: 2.1614388 (96)	total: 1.28s	remaining: 15m 26s
97:	learn: 1.9994540	test: 2.1585423	best: 2.1585423 (97)	total: 1.3s	remaining: 15m 25s
98:	learn: 1.9958185	test: 2.1556761	best: 2.1556761 (98)	total: 1.31s	remaining: 15m 25s
99:	learn: 1.9924038	test: 2.1531560	best: 2.1531560 (99)	total: 1.32s	remaining: 15m 25s
100:	learn: 1.9887640	test: 2.1501813	best: 2.1501813 (100)	total: 1.33s	remaining: 15m 24s
101:	learn: 1.9851210	test: 2.1472406	best: 2.1472406 (101)	total: 1.35s	remaining: 15m 23s
102:	learn: 1.9814674	test: 2.1444527	best: 2.1444527 (102)	total: 1.36s	remaining: 15m 23s
103:	learn: 1.9780309	test: 2.1415357	best: 2.1415357 (103)	total: 1.37s	remaining: 15m 22s
104

189:	learn: 1.8058924	test: 2.0078015	best: 2.0078015 (189)	total: 2.5s	remaining: 15m 17s
190:	learn: 1.8049269	test: 2.0071063	best: 2.0071063 (190)	total: 2.51s	remaining: 15m 17s
191:	learn: 1.8038999	test: 2.0064014	best: 2.0064014 (191)	total: 2.52s	remaining: 15m 17s
192:	learn: 1.8027923	test: 2.0054326	best: 2.0054326 (192)	total: 2.54s	remaining: 15m 17s
193:	learn: 1.8017396	test: 2.0044723	best: 2.0044723 (193)	total: 2.55s	remaining: 15m 17s
194:	learn: 1.8005733	test: 2.0036858	best: 2.0036858 (194)	total: 2.56s	remaining: 15m 17s
195:	learn: 1.7995345	test: 2.0025642	best: 2.0025642 (195)	total: 2.58s	remaining: 15m 17s
196:	learn: 1.7981928	test: 2.0004482	best: 2.0004482 (196)	total: 2.59s	remaining: 15m 18s
197:	learn: 1.7971473	test: 1.9994262	best: 1.9994262 (197)	total: 2.6s	remaining: 15m 18s
198:	learn: 1.7960397	test: 1.9982483	best: 1.9982483 (198)	total: 2.62s	remaining: 15m 18s
199:	learn: 1.7950516	test: 1.9975402	best: 1.9975402 (199)	total: 2.63s	remaining

279:	learn: 1.7352658	test: 1.9366064	best: 1.9366064 (279)	total: 3.71s	remaining: 15m 23s
280:	learn: 1.7347656	test: 1.9364065	best: 1.9364065 (280)	total: 3.72s	remaining: 15m 23s
281:	learn: 1.7342537	test: 1.9358204	best: 1.9358204 (281)	total: 3.74s	remaining: 15m 23s
282:	learn: 1.7336931	test: 1.9352635	best: 1.9352635 (282)	total: 3.75s	remaining: 15m 24s
283:	learn: 1.7331581	test: 1.9351205	best: 1.9351205 (283)	total: 3.76s	remaining: 15m 24s
284:	learn: 1.7327183	test: 1.9349405	best: 1.9349405 (284)	total: 3.78s	remaining: 15m 23s
285:	learn: 1.7321902	test: 1.9347583	best: 1.9347583 (285)	total: 3.79s	remaining: 15m 24s
286:	learn: 1.7316191	test: 1.9338868	best: 1.9338868 (286)	total: 3.81s	remaining: 15m 24s
287:	learn: 1.7311080	test: 1.9336303	best: 1.9336303 (287)	total: 3.82s	remaining: 15m 24s
288:	learn: 1.7305894	test: 1.9331831	best: 1.9331831 (288)	total: 3.83s	remaining: 15m 24s
289:	learn: 1.7301324	test: 1.9329431	best: 1.9329431 (289)	total: 3.84s	remaini

369:	learn: 1.6991551	test: 1.8973073	best: 1.8973073 (369)	total: 4.92s	remaining: 15m 26s
370:	learn: 1.6988678	test: 1.8968113	best: 1.8968113 (370)	total: 4.94s	remaining: 15m 26s
371:	learn: 1.6985449	test: 1.8963533	best: 1.8963533 (371)	total: 4.95s	remaining: 15m 26s
372:	learn: 1.6982481	test: 1.8960225	best: 1.8960225 (372)	total: 4.96s	remaining: 15m 26s
373:	learn: 1.6979087	test: 1.8958478	best: 1.8958478 (373)	total: 4.98s	remaining: 15m 26s
374:	learn: 1.6976446	test: 1.8952767	best: 1.8952767 (374)	total: 4.99s	remaining: 15m 26s
375:	learn: 1.6973299	test: 1.8952066	best: 1.8952066 (375)	total: 5s	remaining: 15m 26s
376:	learn: 1.6970310	test: 1.8945824	best: 1.8945824 (376)	total: 5.02s	remaining: 15m 26s
377:	learn: 1.6967478	test: 1.8943962	best: 1.8943962 (377)	total: 5.03s	remaining: 15m 26s
378:	learn: 1.6965095	test: 1.8943555	best: 1.8943555 (378)	total: 5.04s	remaining: 15m 26s
379:	learn: 1.6962205	test: 1.8940525	best: 1.8940525 (379)	total: 5.06s	remaining:

459:	learn: 1.6740945	test: 1.8686115	best: 1.8686115 (459)	total: 6.15s	remaining: 15m 29s
460:	learn: 1.6737983	test: 1.8683091	best: 1.8683091 (460)	total: 6.16s	remaining: 15m 29s
461:	learn: 1.6734851	test: 1.8682321	best: 1.8682321 (461)	total: 6.18s	remaining: 15m 29s
462:	learn: 1.6732616	test: 1.8683308	best: 1.8682321 (461)	total: 6.19s	remaining: 15m 29s
463:	learn: 1.6730412	test: 1.8682719	best: 1.8682321 (461)	total: 6.21s	remaining: 15m 29s
464:	learn: 1.6728360	test: 1.8681319	best: 1.8681319 (464)	total: 6.22s	remaining: 15m 29s
465:	learn: 1.6725969	test: 1.8677753	best: 1.8677753 (465)	total: 6.23s	remaining: 15m 29s
466:	learn: 1.6724132	test: 1.8675579	best: 1.8675579 (466)	total: 6.24s	remaining: 15m 29s
467:	learn: 1.6722488	test: 1.8675128	best: 1.8675128 (467)	total: 6.26s	remaining: 15m 29s
468:	learn: 1.6720325	test: 1.8671952	best: 1.8671952 (468)	total: 6.27s	remaining: 15m 29s
469:	learn: 1.6718295	test: 1.8670959	best: 1.8670959 (469)	total: 6.29s	remaini

551:	learn: 1.6550900	test: 1.8492688	best: 1.8492688 (551)	total: 7.38s	remaining: 15m 28s
552:	learn: 1.6548519	test: 1.8489070	best: 1.8489070 (552)	total: 7.39s	remaining: 15m 28s
553:	learn: 1.6546746	test: 1.8485583	best: 1.8485583 (553)	total: 7.41s	remaining: 15m 28s
554:	learn: 1.6545197	test: 1.8482309	best: 1.8482309 (554)	total: 7.42s	remaining: 15m 28s
555:	learn: 1.6543432	test: 1.8481812	best: 1.8481812 (555)	total: 7.43s	remaining: 15m 28s
556:	learn: 1.6541731	test: 1.8480190	best: 1.8480190 (556)	total: 7.45s	remaining: 15m 28s
557:	learn: 1.6539085	test: 1.8481590	best: 1.8480190 (556)	total: 7.46s	remaining: 15m 28s
558:	learn: 1.6536401	test: 1.8481165	best: 1.8480190 (556)	total: 7.47s	remaining: 15m 28s
559:	learn: 1.6535248	test: 1.8479849	best: 1.8479849 (559)	total: 7.49s	remaining: 15m 28s
560:	learn: 1.6533702	test: 1.8479814	best: 1.8479814 (560)	total: 7.5s	remaining: 15m 28s
561:	learn: 1.6532573	test: 1.8478099	best: 1.8478099 (561)	total: 7.51s	remainin

641:	learn: 1.6392648	test: 1.8351879	best: 1.8351879 (641)	total: 8.59s	remaining: 15m 27s
642:	learn: 1.6391203	test: 1.8351433	best: 1.8351433 (642)	total: 8.6s	remaining: 15m 27s
643:	learn: 1.6389784	test: 1.8349652	best: 1.8349652 (643)	total: 8.61s	remaining: 15m 27s
644:	learn: 1.6388446	test: 1.8347972	best: 1.8347972 (644)	total: 8.62s	remaining: 15m 27s
645:	learn: 1.6387115	test: 1.8347687	best: 1.8347687 (645)	total: 8.64s	remaining: 15m 27s
646:	learn: 1.6385817	test: 1.8347392	best: 1.8347392 (646)	total: 8.65s	remaining: 15m 27s
647:	learn: 1.6384681	test: 1.8346860	best: 1.8346860 (647)	total: 8.66s	remaining: 15m 27s
648:	learn: 1.6383383	test: 1.8346363	best: 1.8346363 (648)	total: 8.68s	remaining: 15m 27s
649:	learn: 1.6382414	test: 1.8346137	best: 1.8346137 (649)	total: 8.69s	remaining: 15m 26s
650:	learn: 1.6380535	test: 1.8341955	best: 1.8341955 (650)	total: 8.7s	remaining: 15m 27s
651:	learn: 1.6378905	test: 1.8339636	best: 1.8339636 (651)	total: 8.72s	remaining

735:	learn: 1.6261205	test: 1.8235220	best: 1.8234702 (734)	total: 9.82s	remaining: 15m 23s
736:	learn: 1.6259289	test: 1.8234186	best: 1.8234186 (736)	total: 9.83s	remaining: 15m 24s
737:	learn: 1.6257613	test: 1.8232830	best: 1.8232830 (737)	total: 9.85s	remaining: 15m 24s
738:	learn: 1.6256418	test: 1.8232852	best: 1.8232830 (737)	total: 9.86s	remaining: 15m 24s
739:	learn: 1.6255339	test: 1.8231631	best: 1.8231631 (739)	total: 9.87s	remaining: 15m 23s
740:	learn: 1.6253984	test: 1.8230106	best: 1.8230106 (740)	total: 9.88s	remaining: 15m 23s
741:	learn: 1.6252507	test: 1.8230548	best: 1.8230106 (740)	total: 9.9s	remaining: 15m 23s
742:	learn: 1.6251289	test: 1.8229813	best: 1.8229813 (742)	total: 9.91s	remaining: 15m 23s
743:	learn: 1.6250556	test: 1.8228594	best: 1.8228594 (743)	total: 9.92s	remaining: 15m 23s
744:	learn: 1.6249775	test: 1.8226990	best: 1.8226990 (744)	total: 9.94s	remaining: 15m 23s
745:	learn: 1.6247277	test: 1.8228126	best: 1.8226990 (744)	total: 9.95s	remainin

828:	learn: 1.6142645	test: 1.8120785	best: 1.8120785 (828)	total: 11.1s	remaining: 15m 22s
829:	learn: 1.6141687	test: 1.8120030	best: 1.8120030 (829)	total: 11.1s	remaining: 15m 22s
830:	learn: 1.6139938	test: 1.8118612	best: 1.8118612 (830)	total: 11.1s	remaining: 15m 22s
831:	learn: 1.6138941	test: 1.8117541	best: 1.8117541 (831)	total: 11.1s	remaining: 15m 22s
832:	learn: 1.6138049	test: 1.8116619	best: 1.8116619 (832)	total: 11.1s	remaining: 15m 22s
833:	learn: 1.6136914	test: 1.8117064	best: 1.8116619 (832)	total: 11.1s	remaining: 15m 22s
834:	learn: 1.6135787	test: 1.8111972	best: 1.8111972 (834)	total: 11.1s	remaining: 15m 22s
835:	learn: 1.6134861	test: 1.8110948	best: 1.8110948 (835)	total: 11.1s	remaining: 15m 21s
836:	learn: 1.6133734	test: 1.8110433	best: 1.8110433 (836)	total: 11.2s	remaining: 15m 21s
837:	learn: 1.6133088	test: 1.8110208	best: 1.8110208 (837)	total: 11.2s	remaining: 15m 21s
838:	learn: 1.6131734	test: 1.8108846	best: 1.8108846 (838)	total: 11.2s	remaini

923:	learn: 1.6040747	test: 1.8031620	best: 1.8031620 (923)	total: 12.3s	remaining: 15m 19s
924:	learn: 1.6039068	test: 1.8030008	best: 1.8030008 (924)	total: 12.3s	remaining: 15m 19s
925:	learn: 1.6038119	test: 1.8029160	best: 1.8029160 (925)	total: 12.3s	remaining: 15m 19s
926:	learn: 1.6036479	test: 1.8028578	best: 1.8028578 (926)	total: 12.3s	remaining: 15m 19s
927:	learn: 1.6035388	test: 1.8027296	best: 1.8027296 (927)	total: 12.4s	remaining: 15m 19s
928:	learn: 1.6034489	test: 1.8026366	best: 1.8026366 (928)	total: 12.4s	remaining: 15m 19s
929:	learn: 1.6033145	test: 1.8025737	best: 1.8025737 (929)	total: 12.4s	remaining: 15m 19s
930:	learn: 1.6032523	test: 1.8024018	best: 1.8024018 (930)	total: 12.4s	remaining: 15m 19s
931:	learn: 1.6031574	test: 1.8023131	best: 1.8023131 (931)	total: 12.4s	remaining: 15m 19s
932:	learn: 1.6030814	test: 1.8022854	best: 1.8022854 (932)	total: 12.4s	remaining: 15m 19s
933:	learn: 1.6030036	test: 1.8022171	best: 1.8022171 (933)	total: 12.4s	remaini

1015:	learn: 1.5954527	test: 1.7965967	best: 1.7965903 (1014)	total: 13.5s	remaining: 15m 17s
1016:	learn: 1.5953679	test: 1.7964860	best: 1.7964860 (1016)	total: 13.5s	remaining: 15m 17s
1017:	learn: 1.5952767	test: 1.7959675	best: 1.7959675 (1017)	total: 13.5s	remaining: 15m 17s
1018:	learn: 1.5951798	test: 1.7959612	best: 1.7959612 (1018)	total: 13.5s	remaining: 15m 16s
1019:	learn: 1.5950865	test: 1.7959668	best: 1.7959612 (1018)	total: 13.6s	remaining: 15m 16s
1020:	learn: 1.5950037	test: 1.7955538	best: 1.7955538 (1020)	total: 13.6s	remaining: 15m 16s
1021:	learn: 1.5948948	test: 1.7952301	best: 1.7952301 (1021)	total: 13.6s	remaining: 15m 17s
1022:	learn: 1.5948151	test: 1.7951667	best: 1.7951667 (1022)	total: 13.6s	remaining: 15m 16s
1023:	learn: 1.5947327	test: 1.7951326	best: 1.7951326 (1023)	total: 13.6s	remaining: 15m 16s
1024:	learn: 1.5946596	test: 1.7951141	best: 1.7951141 (1024)	total: 13.6s	remaining: 15m 16s
1025:	learn: 1.5945642	test: 1.7952533	best: 1.7951141 (1024

1111:	learn: 1.5870453	test: 1.7896348	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 14s
1112:	learn: 1.5869603	test: 1.7898071	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 14s
1113:	learn: 1.5868952	test: 1.7897732	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 14s
1114:	learn: 1.5867701	test: 1.7899196	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 13s
1115:	learn: 1.5866673	test: 1.7898871	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 13s
1116:	learn: 1.5865879	test: 1.7898657	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 13s
1117:	learn: 1.5865214	test: 1.7898715	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 13s
1118:	learn: 1.5864611	test: 1.7897892	best: 1.7896348 (1111)	total: 14.8s	remaining: 15m 13s
1119:	learn: 1.5863878	test: 1.7897112	best: 1.7896348 (1111)	total: 14.9s	remaining: 15m 13s
1120:	learn: 1.5862952	test: 1.7897043	best: 1.7896348 (1111)	total: 14.9s	remaining: 15m 13s
1121:	learn: 1.5862204	test: 1.7895228	best: 1.7895228 (1121

1206:	learn: 1.5795057	test: 1.7832161	best: 1.7832161 (1206)	total: 16s	remaining: 15m 11s
1207:	learn: 1.5794337	test: 1.7831347	best: 1.7831347 (1207)	total: 16s	remaining: 15m 11s
1208:	learn: 1.5793872	test: 1.7831169	best: 1.7831169 (1208)	total: 16s	remaining: 15m 11s
1209:	learn: 1.5793185	test: 1.7832597	best: 1.7831169 (1208)	total: 16s	remaining: 15m 11s
1210:	learn: 1.5792394	test: 1.7833372	best: 1.7831169 (1208)	total: 16s	remaining: 15m 11s
1211:	learn: 1.5791468	test: 1.7832436	best: 1.7831169 (1208)	total: 16.1s	remaining: 15m 11s
1212:	learn: 1.5790497	test: 1.7833233	best: 1.7831169 (1208)	total: 16.1s	remaining: 15m 11s
1213:	learn: 1.5789786	test: 1.7831907	best: 1.7831169 (1208)	total: 16.1s	remaining: 15m 11s
1214:	learn: 1.5788804	test: 1.7830877	best: 1.7830877 (1214)	total: 16.1s	remaining: 15m 11s
1215:	learn: 1.5788355	test: 1.7830296	best: 1.7830296 (1215)	total: 16.1s	remaining: 15m 11s
1216:	learn: 1.5787777	test: 1.7830224	best: 1.7830224 (1216)	total: 1

1301:	learn: 1.5724314	test: 1.7774304	best: 1.7774304 (1301)	total: 17.2s	remaining: 15m 8s
1302:	learn: 1.5723497	test: 1.7774081	best: 1.7774081 (1302)	total: 17.2s	remaining: 15m 9s
1303:	learn: 1.5722502	test: 1.7774537	best: 1.7774081 (1302)	total: 17.3s	remaining: 15m 9s
1304:	learn: 1.5721842	test: 1.7774825	best: 1.7774081 (1302)	total: 17.3s	remaining: 15m 9s
1305:	learn: 1.5721160	test: 1.7774807	best: 1.7774081 (1302)	total: 17.3s	remaining: 15m 9s
1306:	learn: 1.5720550	test: 1.7771312	best: 1.7771312 (1306)	total: 17.3s	remaining: 15m 9s
1307:	learn: 1.5719818	test: 1.7771060	best: 1.7771060 (1307)	total: 17.3s	remaining: 15m 8s
1308:	learn: 1.5719294	test: 1.7770899	best: 1.7770899 (1308)	total: 17.3s	remaining: 15m 8s
1309:	learn: 1.5718305	test: 1.7771003	best: 1.7770899 (1308)	total: 17.3s	remaining: 15m 8s
1310:	learn: 1.5717731	test: 1.7770606	best: 1.7770606 (1310)	total: 17.3s	remaining: 15m 8s
1311:	learn: 1.5717041	test: 1.7769554	best: 1.7769554 (1311)	total: 1

1395:	learn: 1.5662121	test: 1.7705202	best: 1.7704285 (1394)	total: 18.4s	remaining: 15m 6s
1396:	learn: 1.5661533	test: 1.7704541	best: 1.7704285 (1394)	total: 18.5s	remaining: 15m 6s
1397:	learn: 1.5660796	test: 1.7704153	best: 1.7704153 (1397)	total: 18.5s	remaining: 15m 6s
1398:	learn: 1.5660245	test: 1.7703741	best: 1.7703741 (1398)	total: 18.5s	remaining: 15m 6s
1399:	learn: 1.5659677	test: 1.7703771	best: 1.7703741 (1398)	total: 18.5s	remaining: 15m 6s
1400:	learn: 1.5659263	test: 1.7703926	best: 1.7703741 (1398)	total: 18.5s	remaining: 15m 6s
1401:	learn: 1.5658789	test: 1.7703896	best: 1.7703741 (1398)	total: 18.5s	remaining: 15m 6s
1402:	learn: 1.5658225	test: 1.7703772	best: 1.7703741 (1398)	total: 18.5s	remaining: 15m 6s
1403:	learn: 1.5657336	test: 1.7703323	best: 1.7703323 (1403)	total: 18.5s	remaining: 15m 6s
1404:	learn: 1.5656832	test: 1.7702829	best: 1.7702829 (1404)	total: 18.6s	remaining: 15m 6s
1405:	learn: 1.5656178	test: 1.7702880	best: 1.7702829 (1404)	total: 1

1490:	learn: 1.5602336	test: 1.7666106	best: 1.7666106 (1490)	total: 19.7s	remaining: 15m 3s
1491:	learn: 1.5601835	test: 1.7665225	best: 1.7665225 (1491)	total: 19.7s	remaining: 15m 3s
1492:	learn: 1.5601005	test: 1.7664618	best: 1.7664618 (1492)	total: 19.7s	remaining: 15m 4s
1493:	learn: 1.5600078	test: 1.7664297	best: 1.7664297 (1493)	total: 19.7s	remaining: 15m 4s
1494:	learn: 1.5599703	test: 1.7664160	best: 1.7664160 (1494)	total: 19.7s	remaining: 15m 4s
1495:	learn: 1.5599245	test: 1.7663933	best: 1.7663933 (1495)	total: 19.7s	remaining: 15m 3s
1496:	learn: 1.5598650	test: 1.7662937	best: 1.7662937 (1496)	total: 19.8s	remaining: 15m 3s
1497:	learn: 1.5598089	test: 1.7661845	best: 1.7661845 (1497)	total: 19.8s	remaining: 15m 3s
1498:	learn: 1.5597564	test: 1.7661625	best: 1.7661625 (1498)	total: 19.8s	remaining: 15m 3s
1499:	learn: 1.5596923	test: 1.7663178	best: 1.7661625 (1498)	total: 19.8s	remaining: 15m 3s
1500:	learn: 1.5596052	test: 1.7660584	best: 1.7660584 (1500)	total: 1

1585:	learn: 1.5548847	test: 1.7632375	best: 1.7631794 (1583)	total: 20.9s	remaining: 15m 1s
1586:	learn: 1.5548092	test: 1.7632075	best: 1.7631794 (1583)	total: 20.9s	remaining: 15m 1s
1587:	learn: 1.5547589	test: 1.7631955	best: 1.7631794 (1583)	total: 20.9s	remaining: 15m 1s
1588:	learn: 1.5547054	test: 1.7631480	best: 1.7631480 (1588)	total: 20.9s	remaining: 15m 1s
1589:	learn: 1.5546670	test: 1.7631444	best: 1.7631444 (1589)	total: 21s	remaining: 15m 1s
1590:	learn: 1.5546106	test: 1.7631252	best: 1.7631252 (1590)	total: 21s	remaining: 15m 1s
1591:	learn: 1.5545583	test: 1.7631126	best: 1.7631126 (1591)	total: 21s	remaining: 15m 1s
1592:	learn: 1.5545344	test: 1.7630590	best: 1.7630590 (1592)	total: 21s	remaining: 15m 1s
1593:	learn: 1.5544675	test: 1.7630206	best: 1.7630206 (1593)	total: 21s	remaining: 15m 1s
1594:	learn: 1.5543820	test: 1.7629643	best: 1.7629643 (1594)	total: 21s	remaining: 15m 1s
1595:	learn: 1.5543271	test: 1.7629333	best: 1.7629333 (1595)	total: 21s	remaining

<catboost.core.CatBoostRegressor at 0x7f3f9f0a39e8>

In [148]:
#training.to_pickle("pickled/training")
#training = pd.read_pickle("pickled/training")

#pickle.dump(cb_model, open( "pickled/cb_model", "wb"), protocol=4)

#cb_model = pickle.load( open( "pickled/cb_model", "rb" ) )

In [203]:
scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[cb_features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

[('shop_item_1', 16.26980570729902),
 ('me_composite', 7.427424333538543),
 ('item_block_units_rolling_12', 6.985210172853927),
 ('item_minmax_q0.75', 6.348910023169344),
 ('item_me', 5.190917105655866),
 ('item_minmax_q0.5', 5.175815212938345),
 ('item_minmax_q0.25', 5.157502521027924),
 ('item_block_units_lag_2', 5.100091791169378),
 ('shop_category_me', 5.019527247589687),
 ('item_max_units_block', 4.003114853666277),
 ('item_block_units_lag_1', 2.8880113737401656),
 ('shop_max_units_block', 2.8500245390605543),
 ('shop_cat_minmax_mean', 2.807307122959253),
 ('shop_cat_max_units_block', 1.986530397115915),
 ('shop_item_2', 1.914076944266622),
 ('cat_units', 1.8060422807664238),
 ('shop_minmax_mean', 1.1326694068991385),
 ('category_me', 1.0064765948910008),
 ('cat_block_units_lag_1', 0.9413502227721215),
 ('shop_units', 0.9313217905826735),
 ('shop_cat_units', 0.9077859826432725),
 ('item_block_units_lag_12', 0.8726903536785704),
 ('shop_cat_block_units_lag_1', 0.8619152665591897),


In [187]:
cb_features = [item[0] for item in scores.items() if item[1] > 4]