In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import xgboost as xgb
import pickle as pickle


from catboost import CatBoostRegressor
import dask.dataframe as dd
from sklearn.model_selection import KFold

In [43]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [44]:
y_cols = ['shop_id', 'item_id', 'date_block_num']

In [45]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'] != 2013]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [46]:
train_item_ids = sales_train['item_id'].unique()
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [47]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = sales_train[sales_train['shop_id'] == shop]['item_id'].unique()
    test_ids = sales_train[sales_train['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[shop, item, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])
del combinations

In [48]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"y"})

In [None]:
#

In [320]:
training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)
training = training.apply(pd.to_numeric,downcast='unsigned')

training['y'] = training['y'].clip(0,20)
training['y'] = training['y'].astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

In [325]:
def get_mean_encoding(df, group_cols, target):
    cumsum = df.groupby(group_cols)[target].cumsum() - df[target]
    cumcnt = df.groupby(group_cols).cumcount()
    return cumsum/cumcnt

training['item_me'] = pd.to_numeric(get_mean_encoding(training, ['item_id'], 'y'), downcast='float')
training['shop_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id'], 'y'), downcast='float')
training['category_me'] = pd.to_numeric(get_mean_encoding(training, ['item_category_id'], 'y'), downcast='float')
training['shop_category_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_category_id'], 'y'), downcast='float')
training['shop_item_me'] = pd.to_numeric(get_mean_encoding(training, ['shop_id', 'item_id'], 'y'), downcast='float')

training.fillna(0,inplace=True)

In [321]:
cols = ['item_id','date_block_num']
item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'item_block_units'})

training = training.merge(item_block_units, on=cols, how='left').fillna(0)
del item_block_units
gc.collect()
#
cols = ['shop_id','date_block_num']
shop_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_block_units'})

training = training.merge(shop_block_units, on=cols, how='left').fillna(0)
del shop_block_units
gc.collect()
#
cols = ['item_category_id','date_block_num']
cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'cat_block_units'})

training = training.merge(cat_block_units, on=cols, how='left').fillna(0)
del cat_block_units
gc.collect()
#

cols = ['shop_id', 'item_category_id','date_block_num']
shop_cat_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_cat_block_units'})

training = training.merge(shop_cat_block_units, on=cols, how='left').fillna(0)
del shop_cat_block_units
gc.collect()
#
cols = ['shop_id', 'item_id','date_block_num']
shop_item_block_units = sales_train.groupby(cols,as_index=False)['item_cnt_day'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_day':'shop_item_block_units'})

training = training.merge(shop_item_block_units, on=cols, how='left').fillna(0)
del shop_item_block_units
gc.collect()

63

In [317]:
"A" + "_"

'A_'

In [337]:
training['item_units'] = training.groupby(['date_block_num'])['item_block_units'].transform(np.sum)
training['item_max_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.max)
training['item_min_units_block'] = training.groupby(['item_id'])['item_block_units']\
        .transform(np.min)
training['item_minmax_mean'] = training[['item_max_units_block', 'item_min_units_block']].mean(axis=1)

#
training['shop_units'] = training.groupby(['date_block_num'])['shop_block_units'].transform(np.sum)
training['shop_max_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.max)
training['shop_min_units_block'] = training.groupby(['shop_id'])['shop_block_units']\
        .transform(np.min)
training['shop_minmax_mean'] = training[['shop_max_units_block', 'shop_min_units_block']].mean(axis=1)

#
training['cat_units'] = training.groupby(['date_block_num'])['cat_block_units'].transform(np.sum)
training['cat_max_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.max)
training['cat_min_units_block'] = training.groupby(['item_category_id'])['cat_block_units']\
        .transform(np.min)
training['cat_minmax_mean'] = training[['cat_max_units_block', 'cat_min_units_block']].mean(axis=1)
#
training['shop_cat_units'] = training.groupby(['date_block_num'])['shop_cat_block_units'].transform(np.sum)
training['shop_cat_max_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.max)
training['shop_cat_min_units_block'] = training.groupby(['shop_id', 'item_category_id'])['shop_cat_block_units']\
        .transform(np.min)
training['shop_cat_minmax_mean'] = training[['shop_cat_max_units_block', 'shop_cat_min_units_block']].mean(axis=1)
#
training['shop_item_units'] = training.groupby(['date_block_num'])['shop_item_block_units'].transform(np.sum)
training['shop_item_max_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.max)
training['shop_item_min_units_block'] = training.groupby(['shop_id', 'item_id'])['shop_item_block_units']\
        .transform(np.min)
training['shop_item_minmax_mean'] = training[['shop_item_max_units_block', 'shop_item_min_units_block']].mean(axis=1)

In [322]:
rolls = [3,6]
cols = ['item_id','date_block_num']

for roll in rolls:
    print(roll)
    roll_name = "item_block_units_rolling_" + str(roll)
    roll_name_tmp = roll_name + "_tmp"
    
    item_block_units_rolling_temp = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        ['item_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'item_block_units':roll_name_tmp})\
        [['item_id','date_block_num',roll_name_tmp]]
    
    training = training.merge(item_block_units_rolling_temp, on=cols, how='left')

    item_block_units_rolling = training\
        .drop_duplicates(cols)\
        .sort_values(cols)\
        .set_index(cols)\
        .groupby(['item_id'],as_index=False)\
        [roll_name_tmp].shift(1)\
        .rename(columns={roll_name_tmp:roll_name}).reset_index()

    training = training.merge(item_block_units_rolling, on=cols, how='left')
    training.drop(columns=[roll_name_tmp], inplace=True)

3
6


In [323]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'y', 'item_category_id',
       'item_block_units', 'shop_block_units', 'cat_block_units',
       'shop_cat_block_units', 'shop_item_block_units',
       'item_block_units_rolling_3', 'item_block_units_rolling_6'],
      dtype=object)

In [234]:
training[(training['item_id'].isin([30,31])) & (training['shop_id'] == 30)]\
        .sort_values(['item_id','date_block_num'])\
        .groupby(['item_id'])\
        ['item_block_units_rolling_3'].shift(1)
        #.reset_index()
        #.rename(columns={'item_block_units':'item_block_units_rolling1'})

1144          NaN
1145     0.000000
1146    41.000000
1147    37.666667
1148    25.333333
1149    22.666667
1150    16.666667
1151    14.000000
1152    12.666667
1153    12.000000
1154    12.000000
1155    12.333333
1156    14.000000
1157    14.333333
1158    14.666667
1159    11.000000
1160     7.666667
1161     4.666667
1162     4.333333
1163     4.333333
1164     4.666667
1165     4.333333
2068          NaN
2069     0.000000
2070    14.500000
2071    13.333333
2072    15.333333
2073    18.000000
2074    18.666667
2075    16.000000
2076    13.000000
2077    14.666667
2078    13.000000
2079    12.333333
2080    12.333333
2081    18.000000
2082    18.666667
2083    16.666667
2084    12.666667
2085    10.000000
2086     9.333333
2087     7.000000
2088    23.000000
2089    22.666667
Name: item_block_units_rolling_3, dtype: float64

In [196]:
np.min([2,2])

2

In [199]:
rolls = [3]

for roll in rolls:
    roll_name = "item_block_units_rolling_%d" % (roll)
    item_block_units_rolling = training\
        .sort_values(['item_id','date_block_num'])\
        .drop_duplicates(['item_id','date_block_num'])\
        .groupby(['item_id'])\
        ['item_block_units'].rolling(roll,min_periods=2).mean().reset_index()\
        .rename(columns={'item_block_units':roll_name})

    item_block_units_rolling['date_block_num'] = item_block_units_rolling['level_1'].map(training['date_block_num'])

    training = training.merge(item_block_units_rolling, on=['item_id', 'date_block_num'], how='left').fillna(0)

In [None]:
transactions_items['item_units'] = pd.to_numeric(transactions_items.groupby(['item_id'])['item_cnt_day'].transform(np.sum), downcast='unsigned') 


In [20]:
#SHOP

gc.collect()
transactions_shops_blocks = transactions.copy()

transactions_shops_blocks['shop_block_units'] = pd.to_numeric(transactions_shops_blocks\
                .groupby(['shop_id','date_block_num'])['item_cnt_day'].transform(np.sum), downcast='unsigned')    

transactions_shops_blocks.drop_duplicates(['shop_id','date_block_num'],inplace=True)

In [21]:
shops_blocks = transactions_shops_blocks[['shop_id', 'date_block_num', 'shop_block_units']]
training = training.merge(shops_blocks, on=['shop_id', 'date_block_num'], how='left').fillna(0)

In [22]:
shop_block_units_rolling3 = training\
        .sort_values(['shop_id','date_block_num'])\
        .drop_duplicates(['shop_id','date_block_num'])\
        .groupby(['shop_id'])\
        ['shop_block_units'].rolling(3,min_periods=2).mean().reset_index()\
        .rename(columns={'shop_block_units':'shop_block_units_rolling3'})

shop_block_units_rolling3['date_block_num'] = shop_block_units_rolling3['level_1'].map(training['date_block_num'])



training = training.merge(shop_block_units_rolling3, on=['shop_id', 'date_block_num'], how='left').fillna(0)

In [23]:
training['rolling_composite'] =  training['shop_block_units_rolling3'].clip(1, None) *\
            training['item_block_units_rolling3'].clip(1, None) 
training['me_composite'] =  training['item_me'].clip(1, None)  * training['shop_me'].clip(1, None) 

In [52]:
training.dtypes

item_id                    int64
shop_id                    uint8
date_block_num             uint8
y                           int8
item_category_id           int64
item_me                  float32
shop_me                  float32
category_me              float32
shop_category_me         float32
shop_item_me             float32
item_block_units         float64
shop_block_units         float64
cat_block_units          float64
shop_cat_block_units     float64
shop_item_block_units    float64
dtype: object

In [55]:
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,y,item_category_id,item_me,shop_me,category_me,shop_category_me,shop_item_me,item_block_units,shop_block_units,cat_block_units,shop_cat_block_units,shop_item_block_units,item_units,item_max_units_block,item_min_units_block
650693,2916,49,33,0,23,0.30847,0.217947,0.31367,0.211524,0.333333,0.0,648.0,2203.0,42.0,0.0,2499996.0,71.0,0.0
1680959,6608,19,17,0,25,0.040816,0.434715,0.180928,0.12246,0.0,0.0,2067.0,460.0,17.0,0.0,4156779.0,6.0,0.0
3105180,12080,38,24,0,40,0.155172,0.369399,0.239276,0.121243,0.0,0.0,1441.0,15109.0,76.0,0.0,4698631.0,30.0,0.0
2142666,8401,28,32,0,37,0.26971,0.715043,0.181585,0.367775,0.65,0.0,2979.0,2989.0,158.0,0.0,2528824.0,47.0,0.0
114587,1021,7,23,1,67,0.151515,0.219186,0.195883,0.15978,0.727273,15.0,3318.0,2778.0,49.0,1.0,7399317.0,18.0,0.0
4914915,18159,31,17,0,40,0.111111,0.686598,0.2298,0.635294,0.4,1.0,7701.0,16064.0,1835.0,0.0,4156779.0,4.0,0.0
2916352,11329,31,22,0,43,0.256098,0.74062,0.183567,0.555323,1.4,6.0,9865.0,614.0,183.0,0.0,5112241.0,9.0,0.0
4491899,16628,27,17,0,40,0.372549,0.494315,0.232576,0.246279,0.0,0.0,4282.0,16064.0,390.0,0.0,4156779.0,146.0,0.0
3687409,14174,52,23,0,43,0.072727,0.242092,0.189033,0.097643,0.181818,0.0,2243.0,740.0,11.0,0.0,7399317.0,4.0,0.0
4218749,15811,59,19,1,55,0.306122,0.265957,0.224547,0.150501,0.0,20.0,1244.0,11180.0,125.0,1.0,4474489.0,23.0,5.0


In [314]:
print(np.mean(np.array([31,24,58])))
print(np.mean(np.array([15,14,11])))
print(np.mean(np.array([6,9,53])))

37.666666666666664
13.333333333333334
22.666666666666668


In [324]:
training[(training['item_id'].isin([30,31])) & (training['shop_id'] == 30)]\
        .sort_values(['item_id','date_block_num'])[['item_id','shop_id',\
                                                    'date_block_num','item_block_units',\
                                                    'item_block_units_rolling_3', 'item_block_units_rolling_6']]
                                                    #'item_block_units_rolling_6']]

Unnamed: 0,item_id,shop_id,date_block_num,item_block_units,item_block_units_rolling_3,item_block_units_rolling_6
1144,30,30,12,58.0,,
1145,30,30,13,24.0,,
1146,30,30,14,31.0,41.0,41.0
1147,30,30,15,21.0,37.666667,37.666667
1148,30,30,16,16.0,25.333333,33.5
1149,30,30,17,13.0,22.666667,30.0
1150,30,30,18,13.0,16.666667,27.166667
1151,30,30,19,12.0,14.0,19.666667
1152,30,30,20,11.0,12.666667,17.666667
1153,30,30,21,13.0,12.0,14.333333


In [210]:
training[(training['item_id'].isin([30])) & (training['shop_id'].isin([30,31]))]\
        .sort_values(['shop_id','date_block_num'])[['item_id','shop_id',\
                                                    'date_block_num','shop_block_units', 'shop_block_units_rolling3']]

KeyError: "['shop_block_units_rolling3'] not in index"

In [57]:
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,y,item_category_id,item_me,shop_me,category_me,shop_category_me,shop_item_me,...,shop_min_units_block,cat_units,cat_max_units_block,cat_min_units_block,shop_cat_units,shop_cat_max_units_block,shop_cat_min_units_block,shop_item_units,shop_item_max_units_block,shop_item_min_units_block
2438370,9792,57,12,0,40,0.525253,0.505052,0.232457,0.516698,0.0,...,2266.0,2427754000.0,22065.0,6779.0,66561739.0,1526.0,575.0,116899.0,20.0,0.0
484977,2442,19,21,0,23,0.378549,0.393025,0.330852,0.242215,0.444444,...,1231.0,1779475000.0,10032.0,2203.0,50263784.0,126.0,19.0,107422.0,2.0,0.0
386045,2063,4,23,0,19,0.090909,0.255141,0.435678,0.256445,0.0,...,732.0,2577978000.0,10210.0,2443.0,71475937.0,128.0,30.0,168755.0,1.0,0.0
3869211,14777,44,17,0,37,0.329685,0.197865,0.160618,0.098138,0.2,...,619.0,1955719000.0,7704.0,2989.0,53504497.0,154.0,21.0,97429.0,1.0,0.0
395689,2097,57,31,0,55,0.052632,0.436393,0.237479,0.267337,0.052632,...,2266.0,1124313000.0,13786.0,4913.0,30642348.0,458.0,204.0,66079.0,1.0,0.0
636025,2897,24,17,0,25,0.178138,0.395124,0.265322,0.189907,0.4,...,882.0,1955719000.0,1148.0,256.0,53504497.0,30.0,0.0,97429.0,1.0,0.0
4999417,18479,14,17,0,55,0.365217,0.250162,0.223391,0.163211,0.0,...,933.0,1955719000.0,13786.0,4913.0,53504497.0,197.0,72.0,97429.0,3.0,0.0
4549447,16889,35,13,0,37,0.179138,0.314128,0.162025,0.169568,0.0,...,1227.0,2124627000.0,7704.0,2989.0,57746089.0,206.0,79.0,109687.0,2.0,0.0
3212446,12526,58,18,0,55,0.112022,0.431453,0.22199,0.227788,0.0,...,1319.0,1840117000.0,13786.0,4913.0,49967875.0,280.0,106.0,91280.0,2.0,0.0
4498579,16653,16,31,0,40,0.047619,0.29129,0.232545,0.164553,0.0,...,963.0,1124313000.0,22065.0,6779.0,30642348.0,291.0,92.0,66079.0,1.0,0.0


In [35]:
len(training[training['item_block_units'] > 0])

3456135

In [37]:
len(transactions_items_blocks)

135451

In [326]:
gc.collect()
val = training[training['date_block_num'] == 33]
print("val length", len(val))

unique_pairs_val = list(set(list(zip(val.shop_id, val.item_id))))
print("number of unique shop/item pairs in val", len(unique_pairs_val))
unique_pairs_val_ignore = unique_pairs_val[0:int(len(unique_pairs_val)/2)]


def tuple2key(t):
    return "%d_%d" % (t[0], t[1])

val_pairs_ignore_dict = {}
for t in unique_pairs_val_ignore:
    val_pairs_ignore_dict[tuple2key(t)] = 1
 
    
training['val_ignore'] = (training['shop_id'].astype(str) + '_' +  training['item_id'].astype(str))\
                                    .apply(lambda x: x in val_pairs_ignore_dict)

val length 270742
number of unique shop/item pairs in val 270742


In [101]:
len(training[training['val_ignore'] == True])

2978162

In [330]:
gc.collect()


x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
y_train = x_train['y']


pos_train_len = len(y_train[y_train != 0])
print("pos_train_len", pos_train_len)

zeros_keep_indices_train = y_train[y_train == 0].sample(int(pos_train_len/6)).index
print("zeros_keep_indices_train", len(zeros_keep_indices_train))
non_zeros_train_indices = y_train[y_train != 0].index
print("non_zeros_train_indices", len(non_zeros_train_indices))

train_indices = np.append(np.array(zeros_keep_indices_train), np.array(non_zeros_train_indices))

y_train = y_train.loc[train_indices]
x_train = x_train.loc[train_indices]




x_val = training[training['date_block_num'] == 33]
y_val = x_val['y']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/6)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]

pos_train_len 444760
zeros_keep_indices_train 74126
non_zeros_train_indices 444760
pos_val_len 31471
zeros_keep_indices_val 5245
non_zeros_val_indices 31471


In [338]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'y', 'item_category_id',
       'item_block_units', 'shop_block_units', 'cat_block_units',
       'shop_cat_block_units', 'shop_item_block_units',
       'item_block_units_rolling_3', 'item_block_units_rolling_6',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'val_ignore', 'item_units', 'item_max_units_block',
       'item_min_units_block', 'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'cat_units', 'cat_max_units_block',
       'cat_min_units_block', 'shop_cat_units',
       'shop_cat_max_units_block', 'shop_cat_min_units_block',
       'shop_item_units', 'shop_item_max_units_block',
       'shop_item_min_units_block', 'item_minmax_mean',
       'shop_minmax_mean', 'cat_minmax_mean', 'shop_cat_minmax_mean',
       'shop_item_minmax_mean'], dtype=object)

In [339]:



cb_features = [
      
       'item_block_units_rolling_3', 'item_block_units_rolling_6',
       'item_me', 'shop_me', 'category_me', 'shop_category_me',
       'shop_item_me', 'val_ignore', 'item_units', 'item_max_units_block',
       'item_min_units_block', 'shop_units', 'shop_max_units_block',
       'shop_min_units_block', 'cat_units', 'cat_max_units_block',
       'cat_min_units_block', 'shop_cat_units',
       'shop_cat_max_units_block', 'shop_cat_min_units_block',
       'shop_item_units', 'shop_item_max_units_block',
       'shop_item_min_units_block', 'item_minmax_mean',
       'shop_minmax_mean', 'cat_minmax_mean', 'shop_cat_minmax_mean',
       'shop_item_minmax_mean']

In [340]:
cb_model = CatBoostRegressor(iterations=70000,
                             learning_rate=0.01,
                             eval_metric='RMSE',
                             #thread_count=16,
                             task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                             od_type = "Iter",
                             od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 42)

#drops = ['subcategory','area']
#x_train = x_train.drop(columns=drops)
#x_val = x_val.drop(columns=drops)


cb_model.fit(x_train[cb_features], y_train, #cat_features=categorical_features_indices,
             eval_set=(x_val[cb_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

KeyError: "['shop_cat_minmax_mean', 'cat_minmax_mean', 'item_minmax_mean', 'shop_minmax_mean', 'shop_item_minmax_mean'] not in index"

In [148]:
#training.to_pickle("pickled/training")
#training = pd.read_pickle("pickled/training")

#pickle.dump(cb_model, open( "pickled/cb_model", "wb"), protocol=4)

#cb_model = pickle.load( open( "pickled/cb_model", "rb" ) )

In [334]:
scores = {}
for i,score in enumerate(cb_model.get_feature_importance()):
    scores[cb_features[i]] = score

sorted(scores.items(), key=lambda x: x[1])[::-1]

[('shop_item_max_units_block', 40.51541386390396),
 ('shop_item_me', 20.890591604097956),
 ('item_block_units_rolling_3', 8.933698183770673),
 ('item_me', 4.40036058514097),
 ('cat_units', 4.126716095246712),
 ('item_block_units_rolling_6', 3.9753375763099084),
 ('item_max_units_block', 3.061873989775672),
 ('shop_cat_units', 1.820664201573415),
 ('item_units', 1.5502902260647602),
 ('cat_min_units_block', 1.5154749016185132),
 ('category_me', 1.2986131474458102),
 ('item_min_units_block', 1.2125954479378467),
 ('cat_max_units_block', 1.2029905016853601),
 ('shop_item_units', 1.0071657736570225),
 ('shop_category_me', 0.9147134217504023),
 ('shop_units', 0.897065899174137),
 ('shop_max_units_block', 0.8109548175450396),
 ('shop_item_min_units_block', 0.5725151822152927),
 ('shop_me', 0.5195883330081601),
 ('shop_cat_max_units_block', 0.38419176069971794),
 ('shop_cat_min_units_block', 0.28215267761023854),
 ('shop_min_units_block', 0.1070318097684197),
 ('val_ignore', 0.0)]

In [335]:
cb_features = [item[0] for item in scores.items() if item[1] > 1]