In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
import pickle
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [33]:
transactions    = pd.read_csv('sales_train.csv.gz')
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
test            = pd.read_csv('test.csv.gz')

In [34]:
INDEX_COLS = ['item_id', 'shop_id', 'date_block_num']

In [35]:
transactions = pd.merge(transactions, items, on='item_id', how='left')
transactions = transactions.drop('item_name', axis=1)
transactions = transactions[transactions['date_block_num'] > 12]
transactions.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
1366911,01.02.2014,13,27,15242,699.0,1.0,63
1366912,25.02.2014,13,27,15200,299.0,1.0,69
1366913,19.02.2014,13,27,15279,799.0,1.0,63
1366914,26.02.2014,13,27,15202,299.0,1.0,69
1366915,01.02.2014,13,27,14888,549.0,1.0,55


In [36]:
train_item_ids = transactions['item_id'].unique()
train_shop_ids = transactions['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = transactions['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [53]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = transactions[transactions['shop_id'] == shop]['item_id'].unique()
    test_ids = test[test['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[item, shop, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)

In [69]:
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['item_id', 'shop_id', 'date_block_num'])

In [70]:
len(all_combos)

7722666

In [71]:
all_combos = pd.merge(all_combos, items[['item_id', 'item_category_id']], on='item_id', how='left')

In [72]:
len(all_combos)

7722666

In [73]:
all_combos = pd.merge(all_combos, transactions[['item_price', 'item_id', 'shop_id']].drop_duplicates(['item_id', 'shop_id']), \
                      on=['item_id', 'shop_id'], how='left')

In [74]:
len(all_combos)

7722666

In [75]:
all_combos.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_category_id,item_price
0,0,54,13,40,58.0
1,0,54,14,40,58.0
2,0,54,15,40,58.0
3,0,54,16,40,58.0
4,0,54,17,40,58.0


In [76]:
transactions['y'] = transactions.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day'].transform(np.sum).clip(0,20)

In [77]:
gc.collect()

transactions['item_cnt_sum'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['item_cnt_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['item_cnt_min'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['item_cnt_max'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['item_price_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_price'].transform(np.mean)


transactions['shop_cnt_sum'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_cnt_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['shop_cnt_min'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['shop_cnt_max'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['shop_price_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_price'].transform(np.mean)

transactions['category_cnt_sum'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['category_cnt_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['category_cnt_min'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['category_cnt_max'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['category_price_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_price'].transform(np.mean)


transactions['shop_category_cnt_sum'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_category_cnt_mean'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['shop_category_cnt_min'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.min)
transactions['shop_category_cnt_max'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.max)
transactions['shop_category_price_mean'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_price'].transform(np.mean)

In [None]:
#shop

In [78]:
total_sales = transactions['item_cnt_day'].sum()
transactions['shop_share_of_all_sales'] = transactions.groupby(['shop_id'])['item_cnt_day'].transform(np.sum) * 100 / total_sales

transactions['block_total'] = transactions.groupby(['date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_share_of_block_sales'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)\
                        * 100 / transactions['block_total']
transactions.drop('block_total', inplace=True, axis=1)

In [79]:
total_gross = (transactions['item_price'] * transactions['item_cnt_day']).sum()

transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['shop_share_of_total_gross'] = transactions.groupby(['shop_id'])['gross'].transform(np.sum) * 100 / total_gross
transactions.drop('gross', inplace=True, axis=1)

In [80]:
transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['block_gross'] = transactions.groupby(['date_block_num'])['gross'].transform(np.sum)

transactions['shop_share_of_block_gross'] = transactions.groupby(['shop_id', 'date_block_num'])['gross'].transform(np.sum)\
                        * 100 / transactions['block_gross']
transactions.drop('block_gross', inplace=True, axis=1)

In [81]:
#category

In [82]:
total_sales = transactions['item_cnt_day'].sum()
transactions['category_share_of_all_sales'] = transactions.groupby(['item_category_id'])['item_cnt_day'].transform(np.sum) * 100 / total_sales

transactions['block_total'] = transactions.groupby(['date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['category_share_of_block_sales'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)\
                        * 100 / transactions['block_total']
transactions.drop('block_total', inplace=True, axis=1)

In [83]:
total_gross = (transactions['item_price'] * transactions['item_cnt_day']).sum()

transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['category_share_of_total_gross'] = transactions.groupby(['item_category_id'])['gross'].transform(np.sum) * 100 / total_gross
transactions.drop('gross', inplace=True, axis=1)

In [101]:
total_gross = (transactions['item_price'] * transactions['item_cnt_day']).sum()

transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['category_share_of_block_gross'] = transactions.groupby(['item_category_id', 'date_block_num'])['gross'].transform(np.sum) * 100 / total_gross
transactions.drop('gross', inplace=True, axis=1)

In [85]:
#item

In [86]:
total_sales = transactions['item_cnt_day'].sum()
transactions['item_share_of_all_sales'] = transactions.groupby(['item_id'])['item_cnt_day'].transform(np.sum) * 100 / total_sales

transactions['block_total'] = transactions.groupby(['date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['item_share_of_block_sales'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)\
                        * 100 / transactions['block_total']
transactions.drop('block_total', inplace=True, axis=1)

In [87]:
total_gross = (transactions['item_price'] * transactions['item_cnt_day']).sum()

transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['item_share_of_total_gross'] = transactions.groupby(['item_id'])['gross'].transform(np.sum) * 100 / total_gross
transactions.drop('gross', inplace=True, axis=1)

In [102]:
total_gross = (transactions['item_price'] * transactions['item_cnt_day']).sum()

transactions['gross'] = transactions['item_price'] * transactions['item_cnt_day']
transactions['item_share_of_block_gross'] = transactions.groupby(['item_id', 'date_block_num'])['gross'].transform(np.sum) * 100 / total_gross
transactions.drop('gross', inplace=True, axis=1)

In [89]:
transactions['shop_share_of_total_gross'].unique().sum()

100.00000000000006

In [90]:
len(all_combos)

7722666

In [91]:
gc.collect()
transactions.drop_duplicates(['item_id', 'shop_id', 'date_block_num'],inplace=True)
training = pd.merge(all_combos,transactions,on=['item_id', 'shop_id', 'date_block_num'],how='left')


In [92]:
training.drop('item_category_id_y', inplace=True, axis=1)
training.drop('item_price_y', inplace=True, axis=1)
training.rename(columns={'item_category_id_x': 'item_category_id'}, inplace=True)
training.rename(columns={'item_price_x': 'item_price'}, inplace=True)
training.fillna(0, inplace=True)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_category_id,item_price,date,item_cnt_day,y,item_cnt_sum,item_cnt_mean,...,shop_share_of_all_sales,shop_share_of_block_sales,shop_share_of_total_gross,shop_share_of_block_gross,category_share_of_all_sales,category_share_of_block_sales,category_share_of_total_gross,item_share_of_all_sales,item_share_of_block_sales,item_share_of_total_gross
5834611,16527,50,26,40,399.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3536758,10676,25,14,67,750.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6079789,17292,56,29,40,149.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3489943,10549,25,29,40,149.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6642346,18944,59,17,40,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364056,4188,48,14,76,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7338507,20928,47,28,72,1229.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1705818,5156,19,22,67,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6866397,19621,15,19,40,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6885263,19659,22,27,40,399.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
transactions.columns

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day', 'item_category_id', 'y', 'item_cnt_sum',
       'item_cnt_mean', 'item_cnt_min', 'item_cnt_max', 'item_price_mean',
       'shop_cnt_sum', 'shop_cnt_mean', 'shop_cnt_min', 'shop_cnt_max',
       'shop_price_mean', 'category_cnt_sum', 'category_cnt_mean',
       'category_cnt_min', 'category_cnt_max', 'category_price_mean',
       'shop_category_cnt_sum', 'shop_category_cnt_mean',
       'shop_category_cnt_min', 'shop_category_cnt_max',
       'shop_category_price_mean', 'shop_share_of_all_sales',
       'shop_share_of_block_sales', 'shop_share_of_total_gross',
       'shop_share_of_block_gross', 'category_share_of_all_sales',
       'category_share_of_block_sales', 'category_share_of_total_gross',
       'item_share_of_all_sales', 'item_share_of_block_sales',
       'item_share_of_total_gross', 'category_share_of_block_gross',
       'item_share_of_block_gross'],
      dtype='object')

In [94]:
training = training.sample(frac=1)

In [95]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

training_int = training.select_dtypes(include=['int'])
converted_int = training_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(training_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([training_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

58.92 MB
58.92 MB


Unnamed: 0,before,after


In [96]:
training_float = training.select_dtypes(include=['float'])
converted_float = training_float.apply(pd.to_numeric,downcast='float')

print(mem_usage(training_float))
print(mem_usage(converted_float))

compare_floats = pd.concat([training_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

2003.26 MB
1031.09 MB


Unnamed: 0,before,after
float32,,33.0
float64,33.0,


In [97]:
optimized_training = training.copy()

optimized_training[converted_int.columns] = converted_int
optimized_training[converted_float.columns] = converted_float

print(mem_usage(training))
print(mem_usage(optimized_training))

2503.58 MB
1531.42 MB


In [98]:
training = optimized_training
del optimized_training
gc.collect()

0

In [104]:
lag_columns = ['item_cnt_sum', 'item_cnt_mean', 'item_cnt_min', 'item_cnt_max',
       'item_price_mean', 'shop_cnt_sum', 'shop_cnt_mean', 'shop_cnt_min',
       'shop_cnt_max', 'shop_price_mean', 'category_cnt_sum',
       'category_cnt_mean', 'category_cnt_min', 'category_cnt_max',
       'category_price_mean', 'shop_category_cnt_sum',
       'shop_category_cnt_mean', 'shop_category_cnt_min',
       'shop_category_cnt_max', 'shop_category_price_mean',
      'shop_share_of_block_sales',
       'shop_share_of_block_gross',
       'category_share_of_block_sales','category_share_of_block_gross',
       'item_share_of_block_sales', 'item_share_of_block_gross']


lags = [1,3,6,12]

In [105]:
lag_columns

['item_cnt_sum',
 'item_cnt_mean',
 'item_cnt_min',
 'item_cnt_max',
 'item_price_mean',
 'shop_cnt_sum',
 'shop_cnt_mean',
 'shop_cnt_min',
 'shop_cnt_max',
 'shop_price_mean',
 'category_cnt_sum',
 'category_cnt_mean',
 'category_cnt_min',
 'category_cnt_max',
 'category_price_mean',
 'shop_category_cnt_sum',
 'shop_category_cnt_mean',
 'shop_category_cnt_min',
 'shop_category_cnt_max',
 'shop_category_price_mean',
 'shop_share_of_block_sales',
 'shop_share_of_block_gross',
 'category_share_of_block_sales',
 'category_share_of_block_gross',
 'item_share_of_block_sales',
 'item_share_of_block_gross']

In [106]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

merge_columns = ['lagged_block','item_id','shop_id']

for lag in lags:
    print(lag)
    lagged = transactions.copy()
    lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
    training['lagged_block'] = training['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(columns=lag_mapping,inplace=True)
    training = pd.merge(training,lagged[lagged_names+merge_columns],on=merge_columns,how='left')
    del lagged
    gc.collect()

1
3
6
12


In [25]:
len(training)

7722666

In [None]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training[(training['item_id'] == 30) & (training['shop_id'] == 30)].sort_values(by='date_block_num')

In [70]:
training.columns

Index(['item_id', 'shop_id', 'date_block_num', 'item_category_id', 'date',
       'item_price', 'item_cnt_day', 'y', 'item_cnt_sum', 'item_cnt_mean',
       'item_price_mean', 'shop_cnt_sum', 'shop_cnt_mean', 'shop_price_mean',
       'category_cnt_sum', 'category_cnt_mean', 'category_price_mean',
       'shop_category_cnt_sum', 'shop_category_cnt_mean',
       'shop_category_price_mean', 'item_cnt_sum_lag_1', 'item_cnt_mean_lag_1',
       'item_price_mean_lag_1', 'shop_cnt_sum_lag_1', 'shop_cnt_mean_lag_1',
       'shop_price_mean_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'category_price_mean_lag_1',
       'shop_category_cnt_sum_lag_1', 'shop_category_cnt_mean_lag_1',
       'shop_category_price_mean_lag_1', 'item_cnt_sum_lag_2',
       'item_cnt_mean_lag_2', 'item_price_mean_lag_2', 'shop_cnt_sum_lag_2',
       'shop_cnt_mean_lag_2', 'shop_price_mean_lag_2',
       'category_cnt_sum_lag_2', 'category_cnt_mean_lag_2',
       'category_price_mean_lag_2', 'sho

In [27]:
import pickle as pickle

pickle.dump(training, open( "training", "wb"), protocol=4)

#training = pickle.load( open( "training", "rb" ) )

In [107]:
training.drop('lagged_block', inplace=True, axis=1)

In [108]:
lgbm_features = training.columns[17:]
lgbm_features = np.append( ['item_id', 'shop_id', 'item_category_id', 'date_block_num'], lgbm_features)
lgbm_features

array(['item_id', 'shop_id', 'item_category_id', 'date_block_num',
       'shop_price_mean', 'category_cnt_sum', 'category_cnt_mean',
       'category_cnt_min', 'category_cnt_max', 'category_price_mean',
       'shop_category_cnt_sum', 'shop_category_cnt_mean',
       'shop_category_cnt_min', 'shop_category_cnt_max',
       'shop_category_price_mean', 'shop_share_of_all_sales',
       'shop_share_of_block_sales', 'shop_share_of_total_gross',
       'shop_share_of_block_gross', 'category_share_of_all_sales',
       'category_share_of_block_sales', 'category_share_of_total_gross',
       'item_share_of_all_sales', 'item_share_of_block_sales',
       'item_share_of_total_gross', 'item_cnt_sum_lag_1',
       'item_cnt_mean_lag_1', 'item_cnt_min_lag_1', 'item_cnt_max_lag_1',
       'item_price_mean_lag_1', 'shop_cnt_sum_lag_1',
       'shop_cnt_mean_lag_1', 'shop_cnt_min_lag_1', 'shop_cnt_max_lag_1',
       'shop_price_mean_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1',

In [54]:
lgbm_features = ['item_id', 'shop_category_cnt_mean', 'date_block_num',
       'shop_category_cnt_sum', 'shop_id', 'item_cnt_mean_lag_1',
       'item_category_id', 'shop_category_price_mean',
       'item_cnt_sum_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'item_price_mean_lag_1']

In [111]:
categorical_features_indices = [0,1,2,3]

In [109]:
x_train = training[training['date_block_num'] < 32]

non_zeros_train = x_train[x_train['y'] != 0]
zeros_train = x_train[x_train['y'] == 0]
zeros_keep_train = zeros_train.sample(int(len(non_zeros_train)*0.1))
x_train = non_zeros_train.append(zeros_keep_train)

y_train = x_train['y']


val = training[training['date_block_num'].isin([32,33])]
non_zeros_val = val[val['y'] != 0]
zeros_val = val[val['y'] == 0]
zeros_keep_val = zeros_val.sample(int(len(non_zeros_val)*0.5))
x_val = non_zeros_val.append(zeros_keep_val)
y_val = x_val['y']


In [113]:
cb_model = CatBoostRegressor(iterations=10000,
                             learning_rate=0.01,
                             eval_metric='RMSE',
                             #thread_count=16,
                             task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                            od_type = "Iter",
                            od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 23)


cb_model.fit(x_train[lgbm_features], y_train, cat_features=categorical_features_indices,
             eval_set=(x_val[lgbm_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

0:	learn: 1.1914751	test: 1.1429253	best: 1.1429253 (0)	total: 92.1ms	remaining: 15m 21s
1:	learn: 1.1814102	test: 1.1343865	best: 1.1343865 (1)	total: 175ms	remaining: 14m 34s
2:	learn: 1.1714636	test: 1.1260118	best: 1.1260118 (2)	total: 257ms	remaining: 14m 15s
3:	learn: 1.1615817	test: 1.1176478	best: 1.1176478 (3)	total: 337ms	remaining: 14m 2s
4:	learn: 1.1518627	test: 1.1094627	best: 1.1094627 (4)	total: 416ms	remaining: 13m 51s
5:	learn: 1.1422372	test: 1.1012502	best: 1.1012502 (5)	total: 495ms	remaining: 13m 44s
6:	learn: 1.1327234	test: 1.0928449	best: 1.0928449 (6)	total: 587ms	remaining: 13m 57s
7:	learn: 1.1233119	test: 1.0849518	best: 1.0849518 (7)	total: 665ms	remaining: 13m 50s
8:	learn: 1.1140187	test: 1.0771701	best: 1.0771701 (8)	total: 743ms	remaining: 13m 45s
9:	learn: 1.1048302	test: 1.0694134	best: 1.0694134 (9)	total: 826ms	remaining: 13m 44s
10:	learn: 1.0957580	test: 1.0619398	best: 1.0619398 (10)	total: 907ms	remaining: 13m 43s
11:	learn: 1.0867426	test: 1.0

93:	learn: 0.6234891	test: 0.6695896	best: 0.6695896 (93)	total: 7.72s	remaining: 13m 34s
94:	learn: 0.6203556	test: 0.6670523	best: 0.6670523 (94)	total: 7.81s	remaining: 13m 33s
95:	learn: 0.6172587	test: 0.6644676	best: 0.6644676 (95)	total: 7.89s	remaining: 13m 33s
96:	learn: 0.6142031	test: 0.6620158	best: 0.6620158 (96)	total: 7.97s	remaining: 13m 33s
97:	learn: 0.6112082	test: 0.6596881	best: 0.6596881 (97)	total: 8.05s	remaining: 13m 33s
98:	learn: 0.6082413	test: 0.6572655	best: 0.6572655 (98)	total: 8.13s	remaining: 13m 33s
99:	learn: 0.6053015	test: 0.6546807	best: 0.6546807 (99)	total: 8.22s	remaining: 13m 34s
100:	learn: 0.6024693	test: 0.6525200	best: 0.6525200 (100)	total: 8.3s	remaining: 13m 33s
101:	learn: 0.5996227	test: 0.6500201	best: 0.6500201 (101)	total: 8.39s	remaining: 13m 33s
102:	learn: 0.5968378	test: 0.6476151	best: 0.6476151 (102)	total: 8.47s	remaining: 13m 33s
103:	learn: 0.5940721	test: 0.6452536	best: 0.6452536 (103)	total: 8.55s	remaining: 13m 33s
104

183:	learn: 0.4667242	test: 0.5319165	best: 0.5319165 (183)	total: 15.2s	remaining: 13m 33s
184:	learn: 0.4658998	test: 0.5312769	best: 0.5312769 (184)	total: 15.3s	remaining: 13m 33s
185:	learn: 0.4651195	test: 0.5306423	best: 0.5306423 (185)	total: 15.4s	remaining: 13m 33s
186:	learn: 0.4643260	test: 0.5300059	best: 0.5300059 (186)	total: 15.5s	remaining: 13m 33s
187:	learn: 0.4635721	test: 0.5294001	best: 0.5294001 (187)	total: 15.6s	remaining: 13m 32s
188:	learn: 0.4628555	test: 0.5286300	best: 0.5286300 (188)	total: 15.7s	remaining: 13m 33s
189:	learn: 0.4621391	test: 0.5280651	best: 0.5280651 (189)	total: 15.7s	remaining: 13m 32s
190:	learn: 0.4614044	test: 0.5274570	best: 0.5274570 (190)	total: 15.8s	remaining: 13m 32s
191:	learn: 0.4606890	test: 0.5269576	best: 0.5269576 (191)	total: 15.9s	remaining: 13m 32s
192:	learn: 0.4599499	test: 0.5264857	best: 0.5264857 (192)	total: 16s	remaining: 13m 32s
193:	learn: 0.4592056	test: 0.5258562	best: 0.5258562 (193)	total: 16.1s	remaining

273:	learn: 0.4245675	test: 0.5105158	best: 0.5074040 (258)	total: 23s	remaining: 13m 37s
274:	learn: 0.4243307	test: 0.5104112	best: 0.5074040 (258)	total: 23.1s	remaining: 13m 37s
275:	learn: 0.4240905	test: 0.5102867	best: 0.5074040 (258)	total: 23.2s	remaining: 13m 38s
276:	learn: 0.4238440	test: 0.5144338	best: 0.5074040 (258)	total: 23.3s	remaining: 13m 38s
277:	learn: 0.4235502	test: 0.5143523	best: 0.5074040 (258)	total: 23.4s	remaining: 13m 38s
278:	learn: 0.4233322	test: 0.5143294	best: 0.5074040 (258)	total: 23.5s	remaining: 13m 38s
279:	learn: 0.4230953	test: 0.5142775	best: 0.5074040 (258)	total: 23.6s	remaining: 13m 38s
280:	learn: 0.4228091	test: 0.5141030	best: 0.5074040 (258)	total: 23.7s	remaining: 13m 38s
281:	learn: 0.4225762	test: 0.5140390	best: 0.5074040 (258)	total: 23.7s	remaining: 13m 38s
282:	learn: 0.4223586	test: 0.5140072	best: 0.5074040 (258)	total: 23.8s	remaining: 13m 38s
283:	learn: 0.4221170	test: 0.5145610	best: 0.5074040 (258)	total: 23.9s	remaining

<catboost.core.CatBoostRegressor at 0x1a3c4011cc0>

In [114]:
lgbm_features[np.argsort(cb_model.get_feature_importance())[::-1]]

array(['item_share_of_block_sales', 'shop_category_cnt_max',
       'item_share_of_all_sales', 'shop_category_cnt_mean',
       'item_share_of_total_gross', 'category_share_of_all_sales',
       'category_share_of_block_sales', 'category_share_of_total_gross',
       'shop_share_of_total_gross', 'shop_share_of_block_gross',
       'category_cnt_sum', 'shop_category_price_mean',
       'category_cnt_mean', 'shop_share_of_block_sales',
       'category_cnt_max', 'shop_price_mean',
       'category_share_of_block_gross_lag_1', 'item_id',
       'item_category_id', 'shop_category_cnt_min',
       'shop_share_of_all_sales', 'category_cnt_max_lag_1',
       'date_block_num', 'category_share_of_block_sales_lag_1',
       'item_cnt_mean_lag_1', 'shop_category_price_mean_lag_1',
       'item_share_of_block_sales_lag_1', 'item_cnt_max_lag_1',
       'shop_category_cnt_sum', 'item_share_of_block_gross_lag_1',
       'category_price_mean_lag_1', 'category_price_mean',
       'shop_cnt_sum_lag_1', 

In [47]:
len(training.item_category_id.unique())

79

In [None]:
print('Plotting feature importances...')
ax = lgbm.plot_importance(model_lgb, max_num_features=10)
plt.show()

In [None]:
best_features_indices = np.argsort(model_lgb.feature_importance())[::-1][0:5]
lgbm_features[best_features_indices]

In [115]:
test_w_cat_ids = test.set_index('item_id').join(items.set_index('item_id'))
test_w_cat_ids['date_block_num'] = 34

In [116]:
lag_columns

['item_cnt_sum',
 'item_cnt_mean',
 'item_cnt_min',
 'item_cnt_max',
 'item_price_mean',
 'shop_cnt_sum',
 'shop_cnt_mean',
 'shop_cnt_min',
 'shop_cnt_max',
 'shop_price_mean',
 'category_cnt_sum',
 'category_cnt_mean',
 'category_cnt_min',
 'category_cnt_max',
 'category_price_mean',
 'shop_category_cnt_sum',
 'shop_category_cnt_mean',
 'shop_category_cnt_min',
 'shop_category_cnt_max',
 'shop_category_price_mean',
 'shop_share_of_block_sales',
 'shop_share_of_block_gross',
 'category_share_of_block_sales',
 'category_share_of_block_gross',
 'item_share_of_block_sales',
 'item_share_of_block_gross']

In [117]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

merge_columns = ['lagged_block','item_id','shop_id']

for lag in lags:
    print(lag)
    lagged = transactions.copy()
    lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
    test_w_cat_ids['lagged_block'] = test_w_cat_ids['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(columns=lag_mapping,inplace=True)
    test_w_cat_ids = pd.merge(test_w_cat_ids,lagged[lagged_names+merge_columns],on=merge_columns,how='left')
    del lagged
    gc.collect()

1
3
6
12


In [120]:
test_w_cat_ids.columns

Index(['ID', 'item_id', 'shop_id', 'item_name', 'item_category_id',
       'date_block_num', 'lagged_block', 'item_cnt_sum_lag_1',
       'item_cnt_mean_lag_1', 'item_cnt_min_lag_1',
       ...
       'shop_category_cnt_mean_lag_12', 'shop_category_cnt_min_lag_12',
       'shop_category_cnt_max_lag_12', 'shop_category_price_mean_lag_12',
       'shop_share_of_block_sales_lag_12', 'shop_share_of_block_gross_lag_12',
       'category_share_of_block_sales_lag_12',
       'category_share_of_block_gross_lag_12',
       'item_share_of_block_sales_lag_12', 'item_share_of_block_gross_lag_12'],
      dtype='object', length=111)

In [None]:
len(test_w_cat_ids)

In [None]:
preds = model_lgb.predict(test_w_cat_ids[lgbm_features])
preds.clip(0,20,out=preds)

In [119]:
test_w_cat_ids[lgbm_features]

KeyError: "['shop_price_mean' 'category_cnt_sum' 'category_cnt_mean'\n 'category_cnt_min' 'category_cnt_max' 'category_price_mean'\n 'shop_category_cnt_sum' 'shop_category_cnt_mean' 'shop_category_cnt_min'\n 'shop_category_cnt_max' 'shop_category_price_mean'\n 'shop_share_of_all_sales' 'shop_share_of_block_sales'\n 'shop_share_of_total_gross' 'shop_share_of_block_gross'\n 'category_share_of_all_sales' 'category_share_of_block_sales'\n 'category_share_of_total_gross' 'item_share_of_all_sales'\n 'item_share_of_block_sales' 'item_share_of_total_gross'] not in index"

In [118]:
preds = cb_model.predict(test_w_cat_ids[lgbm_features])
preds.clip(0,20,out=preds)

KeyError: "['shop_price_mean' 'category_cnt_sum' 'category_cnt_mean'\n 'category_cnt_min' 'category_cnt_max' 'category_price_mean'\n 'shop_category_cnt_sum' 'shop_category_cnt_mean' 'shop_category_cnt_min'\n 'shop_category_cnt_max' 'shop_category_price_mean'\n 'shop_share_of_all_sales' 'shop_share_of_block_sales'\n 'shop_share_of_total_gross' 'shop_share_of_block_gross'\n 'category_share_of_all_sales' 'category_share_of_block_sales'\n 'category_share_of_total_gross' 'item_share_of_all_sales'\n 'item_share_of_block_sales' 'item_share_of_total_gross'] not in index"

In [60]:
np.sum(preds)

96340.428045824

In [57]:
from sklearn import preprocessing
x_train_scaled = x_train[lgbm_features].copy()
x_train_scaled.fillna(0,inplace=True)
x_val_scaled = x_val[lgbm_features].copy()
x_val_scaled.fillna(0,inplace=True)


x_train_scaled = preprocessing.scale(x_train_scaled)
x_val_scaled = preprocessing.scale(x_val_scaled)

In [60]:
lr_model = LinearRegression().fit(x_train_scaled, y_train)
lr_val_preds = lr_model.predict(x_val_scaled).clip(0,20)
rmse = sqrt(mean_squared_error(y_val, lr_val_preds))
print("rmse: ", rmse)

rmse:  2.205736313122704


In [63]:
rf_model = RandomForestRegressor(random_state=0, n_jobs = 8, verbose=1).fit(x_train_scaled, y_train)

[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:   29.8s remaining:   19.9s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:   37.4s finished


In [64]:
rf_val_preds = rf_model.predict(x_val_scaled).clip(0,20)
rmse = sqrt(mean_squared_error(y_val, rf_val_preds))
print("rmse: ", rmse)

rmse:  3.0751675439681896


[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.1s finished


In [61]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = preds.astype(int)
#submission['item_cnt_month'] = ensemble_preds.astype(int)


submission.to_csv('submission.csv', index=False)