In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [81]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
import pickle
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [124]:
transactions    = pd.read_csv('sales_train.csv.gz')
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
test            = pd.read_csv('test.csv.gz')

In [83]:
INDEX_COLS = ['item_id', 'shop_id', 'date_block_num']

In [125]:
transactions = pd.merge(transactions, items, on='item_id', how='left')
transactions = transactions.drop('item_name', axis=1)
transactions = transactions[transactions['date_block_num'] > 12]
transactions.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
1366911,01.02.2014,13,27,15242,699.0,1.0,63
1366912,25.02.2014,13,27,15200,299.0,1.0,69
1366913,19.02.2014,13,27,15279,799.0,1.0,63
1366914,26.02.2014,13,27,15202,299.0,1.0,69
1366915,01.02.2014,13,27,14888,549.0,1.0,55


In [74]:
len(transactions)

868080

In [85]:
train_item_ids = transactions['item_id'].unique()
train_shop_ids = transactions['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = transactions['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [86]:
combinations = []
for shop in all_shop_ids:
    #get all article ids ever associated to this shop
    train_ids = transactions[transactions['shop_id'] == shop]['item_id'].unique()
    test_ids = test[test['shop_id'] == shop]['item_id'].unique()
    all_shop = np.unique(np.append(train_ids, test_ids))
    all_shop_combo = [[item, shop, block] for item in all_shop for block in train_blocks]
    for combo in all_shop_combo:
        combinations.append(combo)

In [None]:
len(combinations)

In [87]:
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['item_id', 'shop_id', 'date_block_num'])

In [None]:
len(all_combos)

In [89]:
all_combos.head()

Unnamed: 0,item_id,shop_id,date_block_num,item_category_id
0,0,54,13,40
1,0,54,14,40
2,0,54,15,40
3,0,54,16,40
4,0,54,17,40


In [88]:
all_combos = pd.merge(all_combos, items[['item_id', 'item_category_id']], on='item_id', how='left')

In [126]:
transactions['y'] = transactions.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day'].transform(np.sum).clip(0,25)

In [127]:
gc.collect()

transactions['item_cnt_sum'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['item_cnt_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['item_price_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_price'].transform(np.mean)


transactions['shop_cnt_sum'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_cnt_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['shop_price_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_price'].transform(np.mean)

transactions['category_cnt_sum'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['category_cnt_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['category_price_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_price'].transform(np.mean)


transactions['shop_category_cnt_sum'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_category_cnt_mean'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['shop_category_price_mean'] = transactions.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_price'].transform(np.mean)

In [92]:
len(transactions)

1568938

In [128]:
transactions = transactions.drop_duplicates(INDEX_COLS)

In [94]:
len(transactions)

868080

In [129]:
training = pd.merge(all_combos,transactions,on=['item_id', 'shop_id', 'date_block_num'],how='left')

In [130]:
training.drop('item_category_id_y', inplace=True, axis=1)

In [131]:
training.rename(columns={'item_category_id_x': 'item_category_id'}, inplace=True)

In [98]:
training[['item_id', 'shop_id', 'item_category_id', 'date_block_num']].sample(100)

Unnamed: 0,item_id,shop_id,item_category_id,date_block_num
2854026,8632,54,40,13
6053457,17217,41,40,31
1456225,4416,21,56,14
6548244,18666,52,55,16
6780181,19331,4,49,29
6729862,19156,6,40,26
4100198,12383,42,55,24
6020302,17109,26,40,14
6285360,17957,39,38,31
986466,3219,6,76,25


In [132]:
training['y'] = training['y'].fillna(0)

In [133]:
training = training.sample(frac=1)

In [134]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

training_int = training.select_dtypes(include=['int'])
converted_int = training_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(training_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([training_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

58.92 MB
58.92 MB


Unnamed: 0,before,after


In [135]:
training_float = training.select_dtypes(include=['float'])
converted_float = training_float.apply(pd.to_numeric,downcast='float')

print(mem_usage(training_float))
print(mem_usage(converted_float))

compare_floats = pd.concat([training_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

942.71 MB
500.81 MB


Unnamed: 0,before,after
float32,,15.0
float64,15.0,


In [136]:
optimized_training = training.copy()

optimized_training[converted_int.columns] = converted_int
optimized_training[converted_float.columns] = converted_float

print(mem_usage(training))
print(mem_usage(optimized_training))

1443.04 MB
1001.14 MB


In [137]:
training = optimized_training
del optimized_training
gc.collect()

63

In [138]:
lag_columns = training.columns[8:]
lags = [1,2,3,6,12]

In [139]:
lag_columns

Index(['item_cnt_sum', 'item_cnt_mean', 'item_price_mean', 'shop_cnt_sum',
       'shop_cnt_mean', 'shop_price_mean', 'category_cnt_sum',
       'category_cnt_mean', 'category_price_mean', 'shop_category_cnt_sum',
       'shop_category_cnt_mean', 'shop_category_price_mean'],
      dtype='object')

In [140]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

merge_columns = ['lagged_block','item_id','shop_id']

for lag in lags:
    print(lag)
    lagged = transactions.copy()
    lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
    training['lagged_block'] = training['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(columns=lag_mapping,inplace=True)
    training = pd.merge(training,lagged[lagged_names+merge_columns],on=merge_columns,how='left')
    del lagged
    gc.collect()

1
2
3
6
12


In [110]:
len(training)

7722666

In [None]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training[(training['item_id'] == 30) & (training['shop_id'] == 30)].sort_values(by='date_block_num')

In [141]:
training.columns

Index(['item_id', 'shop_id', 'date_block_num', 'item_category_id', 'date',
       'item_price', 'item_cnt_day', 'y', 'item_cnt_sum', 'item_cnt_mean',
       'item_price_mean', 'shop_cnt_sum', 'shop_cnt_mean', 'shop_price_mean',
       'category_cnt_sum', 'category_cnt_mean', 'category_price_mean',
       'shop_category_cnt_sum', 'shop_category_cnt_mean',
       'shop_category_price_mean', 'lagged_block', 'item_cnt_sum_lag_1',
       'item_cnt_mean_lag_1', 'item_price_mean_lag_1', 'shop_cnt_sum_lag_1',
       'shop_cnt_mean_lag_1', 'shop_price_mean_lag_1',
       'category_cnt_sum_lag_1', 'category_cnt_mean_lag_1',
       'category_price_mean_lag_1', 'shop_category_cnt_sum_lag_1',
       'shop_category_cnt_mean_lag_1', 'shop_category_price_mean_lag_1',
       'item_cnt_sum_lag_2', 'item_cnt_mean_lag_2', 'item_price_mean_lag_2',
       'shop_cnt_sum_lag_2', 'shop_cnt_mean_lag_2', 'shop_price_mean_lag_2',
       'category_cnt_sum_lag_2', 'category_cnt_mean_lag_2',
       'category_price_m

In [111]:
training.drop(columns=['y_lag_1', 'y_lag_2', 'y_lag_3', 'y_lag_6', 'y_lag_12', 'lagged_block'],inplace=True)

KeyError: "labels ['y_lag_1' 'y_lag_2' 'y_lag_3' 'y_lag_6' 'y_lag_12'] not contained in axis"

In [70]:
import pickle as pickle

#pickle.dump(training, open( "training", "wb"), protocol=4)

training = pickle.load( open( "training", "rb" ) )

In [142]:
lgbm_features = training.columns[17:]
lgbm_features = np.append( ['item_id', 'shop_id', 'item_category_id', 'date_block_num'], lgbm_features)
lgbm_features

array(['item_id', 'shop_id', 'item_category_id', 'date_block_num',
       'shop_category_cnt_sum', 'shop_category_cnt_mean',
       'shop_category_price_mean', 'lagged_block', 'item_cnt_sum_lag_1',
       'item_cnt_mean_lag_1', 'item_price_mean_lag_1',
       'shop_cnt_sum_lag_1', 'shop_cnt_mean_lag_1',
       'shop_price_mean_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'category_price_mean_lag_1',
       'shop_category_cnt_sum_lag_1', 'shop_category_cnt_mean_lag_1',
       'shop_category_price_mean_lag_1', 'item_cnt_sum_lag_2',
       'item_cnt_mean_lag_2', 'item_price_mean_lag_2',
       'shop_cnt_sum_lag_2', 'shop_cnt_mean_lag_2',
       'shop_price_mean_lag_2', 'category_cnt_sum_lag_2',
       'category_cnt_mean_lag_2', 'category_price_mean_lag_2',
       'shop_category_cnt_sum_lag_2', 'shop_category_cnt_mean_lag_2',
       'shop_category_price_mean_lag_2', 'item_cnt_sum_lag_3',
       'item_cnt_mean_lag_3', 'item_price_mean_lag_3',
       'shop_cnt_sum_lag_

In [145]:
lgbm_features = ['item_id', 'shop_category_cnt_mean', 'date_block_num',
       'shop_category_cnt_sum', 'shop_id', 'item_cnt_mean_lag_1',
       'item_category_id', 'shop_category_price_mean',
       'item_cnt_sum_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'item_price_mean_lag_1']

In [146]:
categorical_features_indices = [0,2,4,6]

In [114]:
training[['item_id', 'shop_id', 'item_category_id', 'date_block_num']].sample(100)

Unnamed: 0,item_id,shop_id,item_category_id,date_block_num
5907364,2864,52,25,23
5120738,21616,30,37,14
7615415,10448,3,38,30
7690527,17416,45,40,16
5924586,1833,48,23,18
4453168,3153,6,75,23
7020559,11111,54,40,32
1412185,21683,15,40,19
6123558,10393,58,37,30
3279571,1023,50,67,14


In [115]:
x_train = training[training['date_block_num'] < 32]

non_zeros_train = x_train[x_train['y'] != 0]
zeros_train = x_train[x_train['y'] == 0]
zeros_keep_train = zeros_train.sample(int(len(non_zeros_train)*0.1))
x_train = non_zeros_train.append(zeros_keep_train)

y_train = x_train['y']


val = training[training['date_block_num'].isin([32,33])]
non_zeros_val = val[val['y'] != 0]
zeros_val = val[val['y'] == 0]
zeros_keep_val = zeros_val.sample(int(len(non_zeros_val)*0.5))
x_val = non_zeros_val.append(zeros_keep_val)
y_val = x_val['y']


In [46]:
len(x_val)

91692

In [47]:
x_train.columns

Index(['item_id', 'shop_id', 'date_block_num', 'item_category_id', 'date',
       'item_price', 'item_cnt_day', 'y', 'item_cnt_sum', 'item_cnt_mean',
       'item_price_mean', 'shop_cnt_sum', 'shop_cnt_mean', 'shop_price_mean',
       'category_cnt_sum', 'category_cnt_mean', 'category_price_mean',
       'lagged_block', 'item_cnt_sum_lag_1', 'item_cnt_mean_lag_1',
       'item_price_mean_lag_1', 'shop_cnt_sum_lag_1', 'shop_cnt_mean_lag_1',
       'shop_price_mean_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'category_price_mean_lag_1',
       'item_cnt_sum_lag_2', 'item_cnt_mean_lag_2', 'item_price_mean_lag_2',
       'shop_cnt_sum_lag_2', 'shop_cnt_mean_lag_2', 'shop_price_mean_lag_2',
       'category_cnt_sum_lag_2', 'category_cnt_mean_lag_2',
       'category_price_mean_lag_2', 'item_cnt_sum_lag_3',
       'item_cnt_mean_lag_3', 'item_price_mean_lag_3', 'shop_cnt_sum_lag_3',
       'shop_cnt_mean_lag_3', 'shop_price_mean_lag_3',
       'category_cnt_sum_lag_3'

In [36]:
gc.collect()
lgtrain = lgbm.Dataset(x_train[lgbm_features], label=y_train)
lgval = lgbm.Dataset(x_val[lgbm_features], label=y_val)



#[0.00542047893814942, 29, 24, 0.39949465609514856, 1, 0.67943500, 10]
params = {
        "num_threads": 16,
        "verbosity": -1,
        #"zero_as_missing": "true",
        "boosting":'gbdt',
        "objective" : "regression",
        "metric" : "rmse",
        "seed": 42,
        #"max_bin": 10,#default 255
        #"num_leaves": 10, #default 31
        #"bagging_fraction": 0.3,
        #"bagging_freq": 1,
        #"min_data_in_leaf": 50000,
        #"feature_fraction": 0.5,
        #"min_gain_to_split": 1,
       # "lambda_l1": 100,
        #"lambda_l2": 100,
        #"max_depth": 3, #default -1
        #"min_gain_to_split": 10,
        "learning_rate" : 0.02,
        #"histogram_pool_size": 1000,
        "categorical_column": categorical_features_indices
}


evals_result = {}
model_lgb = lgbm.train(params, lgtrain, 1000, 
                      valid_sets=[lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=100, 
                      evals_result=evals_result)



Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 0.999412
Early stopping, best iteration is:
[38]	valid_0's rmse: 0.991213


In [None]:
len()

In [149]:
cb_model = CatBoostRegressor(iterations=10000,
                             learning_rate=0.01,
                             eval_metric='RMSE',
                             #thread_count=16,
                             task_type = "GPU",
                             use_best_model=True,
                             #l2_leaf_reg = 1000,
                            od_type = "Iter",
                            od_wait = 30,
                             #random_strength = 10,
                             #bagging_temperature = 1,
                             #one_hot_max_size = 2,
                             random_seed = 23)


cb_model.fit(x_train[lgbm_features], y_train, cat_features=categorical_features_indices,
             eval_set=(x_val[lgbm_features],y_val),
             #cat_features=categorical_features_pos,         
             verbose=True)

0:	learn: 3.2481106	test: 2.8058071	best: 2.8058071 (0)	total: 93ms	remaining: 15m 29s
1:	learn: 3.2277678	test: 2.7964333	best: 2.7964333 (1)	total: 169ms	remaining: 14m 3s
2:	learn: 3.2077632	test: 2.7871403	best: 2.7871403 (2)	total: 243ms	remaining: 13m 30s
3:	learn: 3.1878572	test: 2.7778930	best: 2.7778930 (3)	total: 324ms	remaining: 13m 30s
4:	learn: 3.1683114	test: 2.7682231	best: 2.7682231 (4)	total: 416ms	remaining: 13m 52s
5:	learn: 3.1489560	test: 2.7593891	best: 2.7593891 (5)	total: 494ms	remaining: 13m 42s
6:	learn: 3.1299186	test: 2.7513238	best: 2.7513238 (6)	total: 573ms	remaining: 13m 37s
7:	learn: 3.1111465	test: 2.7420594	best: 2.7420594 (7)	total: 663ms	remaining: 13m 48s
8:	learn: 3.0926222	test: 2.7329208	best: 2.7329208 (8)	total: 756ms	remaining: 13m 58s
9:	learn: 3.0742942	test: 2.7246562	best: 2.7246562 (9)	total: 845ms	remaining: 14m 3s
10:	learn: 3.0562927	test: 2.7164404	best: 2.7164404 (10)	total: 919ms	remaining: 13m 54s
11:	learn: 3.0384432	test: 2.7089

93:	learn: 2.1498226	test: 2.2678562	best: 2.2678562 (93)	total: 8.73s	remaining: 15m 20s
94:	learn: 2.1435819	test: 2.2649019	best: 2.2649019 (94)	total: 8.82s	remaining: 15m 19s
95:	learn: 2.1373974	test: 2.2618402	best: 2.2618402 (95)	total: 8.9s	remaining: 15m 17s
96:	learn: 2.1316500	test: 2.2586022	best: 2.2586022 (96)	total: 9s	remaining: 15m 18s
97:	learn: 2.1259143	test: 2.2547635	best: 2.2547635 (97)	total: 9.1s	remaining: 15m 19s
98:	learn: 2.1201440	test: 2.2520676	best: 2.2520676 (98)	total: 9.17s	remaining: 15m 17s
99:	learn: 2.1144418	test: 2.2494269	best: 2.2494269 (99)	total: 9.28s	remaining: 15m 18s
100:	learn: 2.1084173	test: 2.2476955	best: 2.2476955 (100)	total: 9.37s	remaining: 15m 18s
101:	learn: 2.1028981	test: 2.2451199	best: 2.2451199 (101)	total: 9.47s	remaining: 15m 18s
102:	learn: 2.0970899	test: 2.2426705	best: 2.2426705 (102)	total: 9.58s	remaining: 15m 20s
103:	learn: 2.0915445	test: 2.2400330	best: 2.2400330 (103)	total: 9.67s	remaining: 15m 20s
104:	le

183:	learn: 1.8232280	test: 2.0850770	best: 2.0850770 (183)	total: 18.4s	remaining: 16m 23s
184:	learn: 1.8212994	test: 2.0837352	best: 2.0837352 (184)	total: 18.5s	remaining: 16m 23s
185:	learn: 1.8193379	test: 2.0830082	best: 2.0830082 (185)	total: 18.7s	remaining: 16m 24s
186:	learn: 1.8173339	test: 2.0819133	best: 2.0819133 (186)	total: 18.8s	remaining: 16m 24s
187:	learn: 1.8154232	test: 2.0811936	best: 2.0811936 (187)	total: 18.9s	remaining: 16m 25s
188:	learn: 1.8135500	test: 2.0800181	best: 2.0800181 (188)	total: 19s	remaining: 16m 25s
189:	learn: 1.8116899	test: 2.0792922	best: 2.0792922 (189)	total: 19.1s	remaining: 16m 26s
190:	learn: 1.8099221	test: 2.0778087	best: 2.0778087 (190)	total: 19.2s	remaining: 16m 27s
191:	learn: 1.8081450	test: 2.0771585	best: 2.0771585 (191)	total: 19.3s	remaining: 16m 27s
192:	learn: 1.8065379	test: 2.0758244	best: 2.0758244 (192)	total: 19.5s	remaining: 16m 28s
193:	learn: 1.8047103	test: 2.0750494	best: 2.0750494 (193)	total: 19.6s	remaining

273:	learn: 1.7096723	test: 1.9990913	best: 1.9990913 (273)	total: 28.5s	remaining: 16m 52s
274:	learn: 1.7088798	test: 1.9982113	best: 1.9982113 (274)	total: 28.6s	remaining: 16m 52s
275:	learn: 1.7078201	test: 1.9978760	best: 1.9978760 (275)	total: 28.7s	remaining: 16m 52s
276:	learn: 1.7071870	test: 1.9973968	best: 1.9973968 (276)	total: 28.8s	remaining: 16m 52s
277:	learn: 1.7064031	test: 1.9966771	best: 1.9966771 (277)	total: 29s	remaining: 16m 53s
278:	learn: 1.7054418	test: 1.9954159	best: 1.9954159 (278)	total: 29.1s	remaining: 16m 53s
279:	learn: 1.7049131	test: 1.9946576	best: 1.9946576 (279)	total: 29.2s	remaining: 16m 53s
280:	learn: 1.7042873	test: 1.9939478	best: 1.9939478 (280)	total: 29.3s	remaining: 16m 54s
281:	learn: 1.7033027	test: 1.9933355	best: 1.9933355 (281)	total: 29.4s	remaining: 16m 54s
282:	learn: 1.7023901	test: 1.9926100	best: 1.9926100 (282)	total: 29.6s	remaining: 16m 54s
283:	learn: 1.7018273	test: 1.9919995	best: 1.9919995 (283)	total: 29.7s	remaining

365:	learn: 1.6538635	test: 1.9523778	best: 1.9523778 (365)	total: 38s	remaining: 16m 39s
366:	learn: 1.6534132	test: 1.9521640	best: 1.9521640 (366)	total: 38s	remaining: 16m 38s
367:	learn: 1.6530597	test: 1.9520985	best: 1.9520985 (367)	total: 38.1s	remaining: 16m 37s
368:	learn: 1.6525108	test: 1.9518017	best: 1.9518017 (368)	total: 38.2s	remaining: 16m 37s
369:	learn: 1.6520574	test: 1.9513461	best: 1.9513461 (369)	total: 38.3s	remaining: 16m 37s
370:	learn: 1.6517069	test: 1.9511384	best: 1.9511384 (370)	total: 38.4s	remaining: 16m 36s
371:	learn: 1.6512229	test: 1.9508945	best: 1.9508945 (371)	total: 38.5s	remaining: 16m 37s
372:	learn: 1.6506909	test: 1.9505920	best: 1.9505920 (372)	total: 38.6s	remaining: 16m 37s
373:	learn: 1.6501593	test: 1.9502696	best: 1.9502696 (373)	total: 38.7s	remaining: 16m 37s
374:	learn: 1.6496827	test: 1.9499702	best: 1.9499702 (374)	total: 38.8s	remaining: 16m 37s
375:	learn: 1.6493414	test: 1.9498454	best: 1.9498454 (375)	total: 38.9s	remaining: 

456:	learn: 1.6191053	test: 1.9326949	best: 1.9326949 (456)	total: 46.6s	remaining: 16m 12s
457:	learn: 1.6188285	test: 1.9324524	best: 1.9324524 (457)	total: 46.7s	remaining: 16m 12s
458:	learn: 1.6185716	test: 1.9323336	best: 1.9323336 (458)	total: 46.7s	remaining: 16m 11s
459:	learn: 1.6183285	test: 1.9323323	best: 1.9323323 (459)	total: 46.8s	remaining: 16m 11s
460:	learn: 1.6179004	test: 1.9322483	best: 1.9322483 (460)	total: 46.9s	remaining: 16m 11s
461:	learn: 1.6176281	test: 1.9321501	best: 1.9321501 (461)	total: 47s	remaining: 16m 10s
462:	learn: 1.6172685	test: 1.9319028	best: 1.9319028 (462)	total: 47.1s	remaining: 16m 10s
463:	learn: 1.6168921	test: 1.9318996	best: 1.9318996 (463)	total: 47.2s	remaining: 16m 10s
464:	learn: 1.6166732	test: 1.9318509	best: 1.9318509 (464)	total: 47.3s	remaining: 16m 9s
465:	learn: 1.6162068	test: 1.9318176	best: 1.9318176 (465)	total: 47.4s	remaining: 16m 9s
466:	learn: 1.6159374	test: 1.9316803	best: 1.9316803 (466)	total: 47.5s	remaining: 

547:	learn: 1.5913764	test: 1.9229030	best: 1.9229030 (547)	total: 54.6s	remaining: 15m 41s
548:	learn: 1.5911249	test: 1.9227306	best: 1.9227306 (548)	total: 54.7s	remaining: 15m 41s
549:	learn: 1.5908408	test: 1.9225169	best: 1.9225169 (549)	total: 54.8s	remaining: 15m 41s
550:	learn: 1.5906241	test: 1.9224932	best: 1.9224932 (550)	total: 54.9s	remaining: 15m 42s
551:	learn: 1.5904562	test: 1.9224415	best: 1.9224415 (551)	total: 55s	remaining: 15m 41s
552:	learn: 1.5902749	test: 1.9222988	best: 1.9222988 (552)	total: 55.1s	remaining: 15m 42s
553:	learn: 1.5900977	test: 1.9217470	best: 1.9217470 (553)	total: 55.2s	remaining: 15m 41s
554:	learn: 1.5897416	test: 1.9219237	best: 1.9217470 (553)	total: 55.3s	remaining: 15m 41s
555:	learn: 1.5893473	test: 1.9220465	best: 1.9217470 (553)	total: 55.4s	remaining: 15m 41s
556:	learn: 1.5889716	test: 1.9220581	best: 1.9217470 (553)	total: 55.5s	remaining: 15m 41s
557:	learn: 1.5885540	test: 1.9219828	best: 1.9217470 (553)	total: 55.6s	remaining

637:	learn: 1.5690643	test: 1.9121865	best: 1.9121865 (637)	total: 1m 2s	remaining: 15m 22s
638:	learn: 1.5688118	test: 1.9122181	best: 1.9121865 (637)	total: 1m 2s	remaining: 15m 21s
639:	learn: 1.5686495	test: 1.9121886	best: 1.9121865 (637)	total: 1m 3s	remaining: 15m 21s
640:	learn: 1.5685036	test: 1.9118749	best: 1.9118749 (640)	total: 1m 3s	remaining: 15m 21s
641:	learn: 1.5682864	test: 1.9119934	best: 1.9118749 (640)	total: 1m 3s	remaining: 15m 21s
642:	learn: 1.5679505	test: 1.9120471	best: 1.9118749 (640)	total: 1m 3s	remaining: 15m 20s
643:	learn: 1.5678014	test: 1.9119872	best: 1.9118749 (640)	total: 1m 3s	remaining: 15m 20s
644:	learn: 1.5676815	test: 1.9119018	best: 1.9118749 (640)	total: 1m 3s	remaining: 15m 20s
645:	learn: 1.5675359	test: 1.9118463	best: 1.9118463 (645)	total: 1m 3s	remaining: 15m 20s
646:	learn: 1.5673432	test: 1.9122782	best: 1.9118463 (645)	total: 1m 3s	remaining: 15m 20s
647:	learn: 1.5671082	test: 1.9122543	best: 1.9118463 (645)	total: 1m 3s	remaini

729:	learn: 1.5493279	test: 1.9089936	best: 1.9089782 (720)	total: 1m 11s	remaining: 15m 6s
730:	learn: 1.5490806	test: 1.9089914	best: 1.9089782 (720)	total: 1m 11s	remaining: 15m 5s
731:	learn: 1.5489807	test: 1.9087952	best: 1.9087952 (731)	total: 1m 11s	remaining: 15m 5s
732:	learn: 1.5488293	test: 1.9088102	best: 1.9087952 (731)	total: 1m 11s	remaining: 15m 5s
733:	learn: 1.5486395	test: 1.9089018	best: 1.9087952 (731)	total: 1m 11s	remaining: 15m 5s
734:	learn: 1.5484637	test: 1.9089185	best: 1.9087952 (731)	total: 1m 11s	remaining: 15m 4s
735:	learn: 1.5483272	test: 1.9088955	best: 1.9087952 (731)	total: 1m 11s	remaining: 15m 4s
736:	learn: 1.5482156	test: 1.9088406	best: 1.9087952 (731)	total: 1m 11s	remaining: 15m 4s
737:	learn: 1.5480928	test: 1.9087953	best: 1.9087952 (731)	total: 1m 12s	remaining: 15m 4s
738:	learn: 1.5480026	test: 1.9087177	best: 1.9087177 (738)	total: 1m 12s	remaining: 15m 5s
739:	learn: 1.5477440	test: 1.9085890	best: 1.9085890 (739)	total: 1m 12s	remain

<catboost.core.CatBoostRegressor at 0x239edb47be0>

In [144]:
lgbm_features[np.argsort(cb_model.get_feature_importance())[::-1]]

array(['item_id', 'shop_category_cnt_mean', 'date_block_num',
       'shop_category_cnt_sum', 'shop_id', 'item_cnt_mean_lag_1',
       'item_category_id', 'shop_category_price_mean',
       'item_cnt_sum_lag_1', 'category_cnt_sum_lag_1',
       'category_cnt_mean_lag_1', 'item_price_mean_lag_1',
       'category_cnt_sum_lag_3', 'category_cnt_sum_lag_6',
       'category_price_mean_lag_1', 'category_cnt_sum_lag_2',
       'shop_category_cnt_sum_lag_1', 'shop_cnt_sum_lag_6',
       'shop_category_cnt_mean_lag_1', 'lagged_block',
       'category_cnt_mean_lag_6', 'shop_cnt_sum_lag_3',
       'item_cnt_sum_lag_2', 'shop_cnt_sum_lag_1',
       'category_cnt_mean_lag_2', 'item_cnt_sum_lag_3',
       'category_cnt_mean_lag_3', 'shop_category_cnt_mean_lag_3',
       'shop_category_price_mean_lag_1', 'item_cnt_mean_lag_3',
       'shop_cnt_sum_lag_2', 'item_price_mean_lag_2',
       'item_cnt_mean_lag_2', 'shop_cnt_mean_lag_12',
       'shop_category_price_mean_lag_3', 'item_price_mean_lag_3',


In [47]:
len(training.item_category_id.unique())

79

In [None]:
print('Plotting feature importances...')
ax = lgbm.plot_importance(model_lgb, max_num_features=10)
plt.show()

In [None]:
best_features_indices = np.argsort(model_lgb.feature_importance())[::-1][0:5]
lgbm_features[best_features_indices]

In [150]:
test_w_cat_ids = test.set_index('item_id').join(items.set_index('item_id'))
test_w_cat_ids['date_block_num'] = 34

In [156]:
lag_columns

Index(['item_cnt_sum', 'item_cnt_mean', 'item_price_mean', 'shop_cnt_sum',
       'shop_cnt_mean', 'shop_price_mean', 'category_cnt_sum',
       'category_cnt_mean', 'category_price_mean', 'shop_category_cnt_sum',
       'shop_category_cnt_mean', 'shop_category_price_mean'],
      dtype='object')

In [151]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

merge_columns = ['lagged_block','item_id','shop_id']

for lag in lags:
    print(lag)
    lagged = transactions.copy()
    lagged.rename(columns={'date_block_num':'lagged_block'},inplace=True)
    test_w_cat_ids['lagged_block'] = test_w_cat_ids['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(columns=lag_mapping,inplace=True)
    test_w_cat_ids = pd.merge(test_w_cat_ids,lagged[lagged_names+merge_columns],on=merge_columns,how='left')
    del lagged
    gc.collect()

1
2
3
6
12


In [155]:
test_w_cat_ids.columns

Index(['ID', 'item_id', 'shop_id', 'item_name', 'item_category_id',
       'date_block_num', 'lagged_block', 'item_cnt_sum_lag_1',
       'item_cnt_mean_lag_1', 'item_price_mean_lag_1', 'shop_cnt_sum_lag_1',
       'shop_cnt_mean_lag_1', 'shop_price_mean_lag_1',
       'category_cnt_sum_lag_1', 'category_cnt_mean_lag_1',
       'category_price_mean_lag_1', 'shop_category_cnt_sum_lag_1',
       'shop_category_cnt_mean_lag_1', 'shop_category_price_mean_lag_1',
       'item_cnt_sum_lag_2', 'item_cnt_mean_lag_2', 'item_price_mean_lag_2',
       'shop_cnt_sum_lag_2', 'shop_cnt_mean_lag_2', 'shop_price_mean_lag_2',
       'category_cnt_sum_lag_2', 'category_cnt_mean_lag_2',
       'category_price_mean_lag_2', 'shop_category_cnt_sum_lag_2',
       'shop_category_cnt_mean_lag_2', 'shop_category_price_mean_lag_2',
       'item_cnt_sum_lag_3', 'item_cnt_mean_lag_3', 'item_price_mean_lag_3',
       'shop_cnt_sum_lag_3', 'shop_cnt_mean_lag_3', 'shop_price_mean_lag_3',
       'category_cnt_sum_lag_

In [None]:
len(test_w_cat_ids)

In [None]:
preds = model_lgb.predict(test_w_cat_ids[lgbm_features])
preds.clip(0,20,out=preds)

In [154]:
preds = cb_model.predict(test_w_cat_ids[lgbm_features])
preds.clip(0,20,out=preds)

KeyError: "['shop_category_cnt_mean' 'shop_category_cnt_sum'\n 'shop_category_price_mean'] not in index"

In [60]:
np.sum(preds)

96340.428045824

In [67]:
from sklearn import preprocessing
x_train_scaled = x_train[lgbm_features].copy()
x_train_scaled.fillna(0,inplace=True)
x_val_scaled = x_val[lgbm_features].copy()
x_val_scaled.fillna(0,inplace=True)


x_train_scaled = preprocessing.scale(x_train_scaled)
x_val_scaled = preprocessing.scale(x_val_scaled)

In [71]:
lr_model = LinearRegression().fit(x_train_scaled, y_train)
lr_val_preds = lr_model.predict(x_val_scaled).clip(0,20)
rmse = sqrt(mean_squared_error(y_val, lr_val_preds))
print("rmse: ", rmse)

rmse:  1.0198653021875652


In [61]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = preds.astype(int)
#submission['item_cnt_month'] = ensemble_preds.astype(int)


submission.to_csv('submission.csv', index=False)