# Introduction

This notebook summarizes the final project of the course. The following versions of packages are used in this work.

numpy 1.18.4

pandas 1.0.3

scipy 1.4.1

sklearn 0.22.2.post1

lightgbm 2.2.3

In [None]:
import pandas as pd
import numpy as np
import os
import gc
import matplotlib.pyplot as plt
%matplotlib inline 
from itertools import product
import sklearn.model_selection

import sklearn
import scipy.sparse 

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import pickle

# Report the versions of the core libraries so the run can be reproduced.
# LightGBM is imported under the alias `lgb`; the bare name `lightgbm` is
# never bound in this notebook, so it must not be referenced here.
for p in [np, pd, scipy, sklearn, lgb]:
    print (p.__name__, p.__version__)

numpy 1.18.4
pandas 1.0.3
scipy 1.4.1
sklearn 0.22.2.post1
lightgbm 2.2.3


Read the files.

In [None]:
# Read every competition table from the working directory in one pass;
# the tuple on the left lists the frames in the same order as the paths.
(transactions, items, item_categories,
 shops, test, submission) = [
    pd.read_csv(csv_path)
    for csv_path in (
        'sales_train.csv.zip',
        'items.csv.zip',
        'item_categories.csv',
        'shops.csv',
        'test.csv.zip',
        'sample_submission.csv.zip',
    )
]

# EDA

This section looks at some of the features available on the competition.

# Feature generation

This code snippet downcasts the data types

In [None]:
# Raise pandas' display limits so that up to 600 rows and 50 columns are
# shown before the printed output is truncated.
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

def downcast_dtypes(df):
    '''
        Halve the width of the 64-bit numeric columns of `df`.

        Every `float64` column becomes `float32` and every `int64` column
        becomes `int32`; columns of any other dtype are left alone.
        The frame is modified in place and the same object is returned.
    '''

    # Collect the names of the columns that are still 64 bits wide
    wide_floats = []
    wide_ints = []
    for name in df:
        col_dtype = df[name].dtype
        if col_dtype == "float64":
            wide_floats.append(name)
        if col_dtype == "int64":
            wide_ints.append(name)

    # Overwrite those columns with their 32-bit versions
    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

Generating some features (the monthly total sales for each item, total sales of each store, and total sales for each item in each store)

In [None]:
# Key columns that identify one (shop, item, month) row of the grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For each month, take the cartesian product of the shops and the items
# that appear in that month's transactions
grid = []
for block_num in transactions['date_block_num'].unique():
    in_block = transactions['date_block_num'] == block_num
    cur_shops = transactions.loc[in_block, 'shop_id'].unique()
    cur_items = transactions.loc[in_block, 'item_id'].unique()
    month_rows = list(product(cur_shops, cur_items, [block_num]))
    grid.append(np.array(month_rows, dtype='int32'))

# Stack the monthly grids into a single dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Left-join three sums of item_cnt_day onto the grid, in this order:
#   trg  - per shop, item and month
#   s_mn - per shop and month
#   i_mn - per item and month
# Grid rows without a matching aggregate are filled with 0.
all_data = grid
for group_keys, feature_name in (
    (index_cols, 'trg'),
    (['shop_id', 'date_block_num'], 's_mn'),
    (['item_id', 'date_block_num'], 'i_mn'),
):
    gb = transactions.groupby(group_keys, as_index=False).item_cnt_day.sum()
    gb = gb.rename(columns={'item_cnt_day': feature_name})
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory, then release the helpers
all_data = downcast_dtypes(all_data)
del grid, gb, in_block, month_rows, group_keys, feature_name
gc.collect();
print(all_data.head())
print(all_data.tail())

   shop_id  item_id  date_block_num  trg    s_mn  i_mn
0       59    22154               0  1.0  2017.0  18.0
1       59     2552               0  0.0  2017.0   0.0
2       59     2554               0  0.0  2017.0   1.0
3       59     2555               0  0.0  2017.0   2.0
4       59     2564               0  0.0  2017.0   5.0
          shop_id  item_id  date_block_num  trg    s_mn  i_mn
10913845       21     7635              33  0.0  1912.0   1.0
10913846       21     7638              33  0.0  1912.0   1.0
10913847       21     7640              33  0.0  1912.0   1.0
10913848       21     7632              33  0.0  1912.0   1.0
10913849       21     7440              33  0.0  1912.0   1.0


Taking the lags (1, 2, 3, 4, 5, 6 and 12 months before) of the above-mentioned features (target, shop_target, item_target).

In [None]:
# Columns to lag: every column that is not part of the grid key
cols_to_rename = list(all_data.columns.difference(index_cols))
print(cols_to_rename)
shift_range = [1, 2, 3, 4, 5, 6, 12]

for month_shift in shift_range:
    train_shift = all_data[index_cols + cols_to_rename].copy()

    # Moving a row's month forward by `month_shift` makes the merge below
    # attach the values observed `month_shift` months earlier
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    # Rename e.g. 'trg' -> 'trg_l_3'; the key columns keep their names
    lag_names = {col: '{}_l_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lag_names)

    # Rows with no observation at that lag are filled with 0
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]

# List of all lagged features. Match the full '_l_<shift>' suffix rather than
# the last character only, so multi-digit shifts such as 12 are recognised on
# their own merits and unrelated columns ending in a digit are not picked up.
lag_suffixes = tuple('_l_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

# We will drop these at fitting stage: every column that is neither a lag nor
# a key, plus the month number. Built in column order so the list is
# deterministic across runs.
to_drop_cols = [
    col for col in all_data.columns
    if col not in fit_cols and col not in index_cols
] + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

['i_mn', 's_mn', 'trg']


## Adding mean encoding for category values
In this section we add the mean encoding for items based on item categories. K-fold mean encoding is used to generate these values. The mean encoding of each item category for each month is generated. Similar to the previous features, lagged mean encoding for 1 month, 2 months, 3 months, 6 months and 12 months is added to the features.

In [None]:
# Out-of-fold mean encoding of `item_category_id`: each row is encoded with
# the category's mean target computed on the other four folds.
Folds=sklearn.model_selection.KFold(5,shuffle=False)

for train_index, test_index in Folds.split(all_data):
    X_tr, X_val = all_data.iloc[train_index], all_data.iloc[test_index]
    fold_means = X_tr.groupby('item_category_id')['trg'].mean()
    all_data.loc[all_data.index[test_index],'item_cat_target_enc'] = X_val['item_category_id'].map(fold_means)

# Categories missing from a fold's training part map to NaN; fill them with a
# fixed prior. Assign the result back instead of calling fillna(inplace=True)
# on the column selection, which may act on a temporary copy.
# NOTE(review): 0.3343 looks like a precomputed global target mean -- confirm
# it is the right value for this dataset.
all_data['item_cat_target_enc'] = all_data['item_cat_target_enc'].fillna(0.3343)

# Collapse the out-of-fold values to one number per category and map it back
item_id_target_mean = all_data.groupby('item_category_id').item_cat_target_enc.mean()
all_data['item_cat_target_enc'] = all_data['item_category_id'].map(item_id_target_mean)

# Correlation between the encoded feature and the target
encoded_feature = all_data['item_cat_target_enc'].values
corr = np.corrcoef(all_data['trg'].values, encoded_feature)[0][1]
print(corr)
all_data = downcast_dtypes(all_data)

0.4078919456341599


## Separating the validation and training sets
Here, the data from the final month on the training set is used as the validation set.

In [None]:
# The mean-encoding column is not used as a model input, so remove it
all_data = all_data.drop(columns=['item_cat_target_enc'])

# Months before 33 form the training set; month 33 is held out for validation
is_training_month = all_data['date_block_num'] < 33
is_validation_month = all_data['date_block_num'] == 33

X_train = all_data.loc[is_training_month].drop(to_drop_cols, axis=1)
X_test = all_data.loc[is_validation_month].drop(to_drop_cols, axis=1)

y_train = all_data.loc[is_training_month, 'trg'].values
y_test = all_data.loc[is_validation_month, 'trg'].values

## Generating the same features for the final data

Here, setting up the final features for the unseen data.

In [None]:
# The rows to predict are assigned month number 34
test['date_block_num'] = 34

shift_range = [1, 2, 3, 4, 5, 6, 12]

# Attach the same lagged aggregates that were built for the training frame
for month_shift in shift_range:
    lagged = all_data[index_cols + cols_to_rename].copy()

    lagged['date_block_num'] = lagged['date_block_num'] + month_shift

    # e.g. 'trg' -> 'trg_l_3'; the key columns keep their names
    lagged = lagged.rename(
        columns={col: '{}_l_{}'.format(col, month_shift) for col in cols_to_rename}
    )

    test = pd.merge(test, lagged, on=index_cols, how='left').fillna(0)

del lagged

# Category for each item
item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()

test = pd.merge(test, item_category_mapping, how='left', on='item_id')
test = downcast_dtypes(test)

# Drop the two columns that are not model inputs
test_final = test.drop(['date_block_num', 'ID'], axis=1)
print(test_final.shape)

print(X_train.shape)
print(X_test.shape)

(214200, 24)
(6186922, 24)
(238172, 24)


In [None]:
# Check that the final test frame and the training frame expose the same columns
print(test_final.columns)
print(X_train.columns)

# Full training set (training months followed by the validation month) used to
# refit the models before predicting the submission.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
X = pd.concat([X_train, X_test], ignore_index=True)
y = np.concatenate([y_train, y_test])
print(to_drop_cols)

Index(['shop_id', 'item_id', 'i_mn_l_1', 's_mn_l_1', 'trg_l_1', 'i_mn_l_2',
       's_mn_l_2', 'trg_l_2', 'i_mn_l_3', 's_mn_l_3', 'trg_l_3', 'i_mn_l_4',
       's_mn_l_4', 'trg_l_4', 'i_mn_l_5', 's_mn_l_5', 'trg_l_5', 'i_mn_l_6',
       's_mn_l_6', 'trg_l_6', 'i_mn_l_12', 's_mn_l_12', 'trg_l_12',
       'item_category_id'],
      dtype='object')
Index(['shop_id', 'item_id', 'i_mn_l_1', 's_mn_l_1', 'trg_l_1', 'i_mn_l_2',
       's_mn_l_2', 'trg_l_2', 'i_mn_l_3', 's_mn_l_3', 'trg_l_3', 'i_mn_l_4',
       's_mn_l_4', 'trg_l_4', 'i_mn_l_5', 's_mn_l_5', 'trg_l_5', 'i_mn_l_6',
       's_mn_l_6', 'trg_l_6', 'i_mn_l_12', 's_mn_l_12', 'trg_l_12',
       'item_category_id'],
      dtype='object')
['trg', 's_mn', 'i_mn', 'date_block_num']


In [None]:
# Write both feature frames to CSV (without the index) for later reuse
for frame, csv_name in ((test_final, 'test_final.csv'), (all_data, 'all_data.csv')):
    frame.to_csv(csv_name, index=False)

# Single Models

## Training the Linear Regression Model

In [None]:


# Baseline: ordinary least squares on the unscaled features
lr = LinearRegression()
lr.fit(X_train.values, y_train)
pred_lr = lr.predict(X_test.values)

print('Test R-squared for linreg is %f' % r2_score(y_test, pred_lr))
# The square root of the MSE is taken, so the reported value is the RMSE
print('Test RMSE for linreg is %f' % mean_squared_error(y_test, pred_lr)**0.5)

Test R-squared for linreg is 0.249171
Test MSE for linreg is 4.629506


In [None]:


# Linear regression on standardised features.
# The scaler is fitted on the training data only and then applied to the
# validation data, so both are expressed in the same units.
sc_X = StandardScaler()
X_lr = sc_X.fit_transform(X_train)
X_t_lr = sc_X.transform(X_test)

lr = LinearRegression()
lr.fit(X_lr, y_train)
pred_lr = lr.predict(X_t_lr)

print('Test R-squared for linreg is %f' % r2_score(y_test, pred_lr))
# The square root of the MSE is taken, so the reported value is the RMSE
print('Test RMSE for linreg is %f' % mean_squared_error(y_test, pred_lr)**0.5)

Test R-squared for linreg is 0.274102
Test MSE for linreg is 4.551994


In [None]:
# Refit on training + validation months, then predict the submission rows.
# The scaler is fitted on the training features and only *applied* to the
# test features; refitting it on the test set would scale the two differently.
lr.fit(sc_X.fit_transform(X), y)
final_pred_lr = lr.predict(sc_X.transform(test_final))

# Predictions are clipped to the [0, 20] range before being written out
submission['item_cnt_month']=final_pred_lr.clip(0,20)
print(submission.head())
submission.to_csv('LR_single.csv',index=False)

   ID  item_cnt_month
0   0        0.680842
1   1        0.145837
2   2        1.081187
3   3        0.341886
4   4        0.161811


The leaderboard score with this solution: 1.05954

## Training the LGB Model

In [None]:
# LightGBM hyper-parameters shared by the validation and submission runs
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

# Train for 100 boosting rounds on the training months and score month 33
model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_test)

print('Test R-squared for LightGBM is %f' % r2_score(y_test, pred_lgb))
# The square root of the MSE is taken, so the reported value is the RMSE
print('Test RMSE for LightGBM is %f' % mean_squared_error(y_test, pred_lgb)**0.5)

Test R-squared for LightGBM is 0.270185
Test MSE for LightGBM is 20.832469


In [None]:
# Refit LightGBM on training + validation months for 200 boosting rounds
full_train_set = lgb.Dataset(X, label=y)
model = lgb.train(lgb_params, full_train_set, 200)

# Predict the submission rows and clip to the [0, 20] range
final_pred_lgb = model.predict(test_final)
submission['item_cnt_month'] = np.clip(final_pred_lgb, 0, 20)
print(submission.head())
submission.to_csv('LGB_single.csv', index=False)

   ID  item_cnt_month
0   0        0.391065
1   1        0.296546
2   2        0.862037
3   3        0.209695
4   4        0.065971


The leaderboard score with this solution: 0.97665

## Training the Neural Network

In [None]:

# Multi-layer perceptron on standardised features
sc_X = StandardScaler()
X_nn = sc_X.fit_transform(X_train)

mlp = MLPRegressor(hidden_layer_sizes=(32,16), activation='relu', solver='adam', max_iter=50,early_stopping=True,verbose=1)
mlp.fit(X_nn,y_train)

# Apply the scaler fitted on the training data; refitting it on the
# validation data would feed the network inputs on a different scale
pred_NN = mlp.predict(sc_X.transform(X_test))

print('Test R-squared for NN is %f' % r2_score(y_test, pred_NN))
print('Test MSE for NN is %f' % mean_squared_error(y_test, pred_NN))

Iteration 1, loss = 3.59986367
Validation score: 0.456206
Iteration 2, loss = 3.40334688
Validation score: 0.345852
Iteration 3, loss = 3.40445183
Validation score: 0.451515
Iteration 4, loss = 3.26825400
Validation score: 0.466035
Iteration 5, loss = 3.22748782
Validation score: 0.482459
Iteration 6, loss = 3.18635940
Validation score: 0.492130
Iteration 7, loss = 3.08349038
Validation score: 0.489577
Iteration 8, loss = 3.09543755
Validation score: 0.371736
Iteration 9, loss = 3.02243007
Validation score: 0.315471
Iteration 10, loss = 3.02623559
Validation score: 0.497471
Iteration 11, loss = 2.96499008
Validation score: 0.457873
Iteration 12, loss = 3.00854589
Validation score: 0.504584
Iteration 13, loss = 2.92191502
Validation score: 0.497234
Iteration 14, loss = 2.88283156
Validation score: 0.488629
Iteration 15, loss = 2.84664435
Validation score: 0.516002
Iteration 16, loss = 2.82917486
Validation score: 0.547496
Iteration 17, loss = 2.87043376
Validation score: 0.501640
Iterat

In [None]:
# Refit the network on training + validation months
mlp.fit(sc_X.fit_transform(X),y)

# Scale the submission rows with the statistics learned from the training
# features instead of refitting the scaler on the test set
final_pred_NN = mlp.predict(sc_X.transform(test_final))

# Predictions are clipped to the [0, 20] range before being written out
submission['item_cnt_month']=final_pred_NN.clip(0,20)
print(submission.head())
submission.to_csv('NN_single.csv',index=False)

(6425094, 24)
[ 4.  3. 14. ...  0.  0.  0.]
Iteration 1, loss = 3.94180767
Validation score: 0.631982
Iteration 2, loss = 3.85987834
Validation score: 0.643249
Iteration 3, loss = 3.77343951
Validation score: 0.617618
Iteration 4, loss = 3.70162644
Validation score: 0.616193
Iteration 5, loss = 3.65802670
Validation score: 0.587009
Iteration 6, loss = 3.66178399
Validation score: 0.619863
Iteration 7, loss = 3.64508192
Validation score: 0.644419
Iteration 8, loss = 3.60187745
Validation score: 0.630723
Iteration 9, loss = 3.56069926
Validation score: 0.618781
Iteration 10, loss = 3.51997294
Validation score: 0.609351
Iteration 11, loss = 3.49343935
Validation score: 0.576080
Iteration 12, loss = 3.47338765
Validation score: 0.615565
Iteration 13, loss = 3.46855596
Validation score: 0.622208
Iteration 14, loss = 3.39340554
Validation score: 0.637519
Iteration 15, loss = 3.38454742
Validation score: 0.591961
Iteration 16, loss = 3.31814647
Validation score: 0.635963
Iteration 17, loss = 

The leaderboard score with this solution: 1.00741




## Training a Support Vector Regression Model





In [None]:

# Support vector regression on standardised features
sc_X = StandardScaler()
X_svr = sc_X.fit_transform(X_train)

# NOTE(review): max_iter=20 caps the solver at 20 iterations, which is very
# likely far too few for it to converge -- a probable cause of the poor score.
svr_rbf = SVR(kernel='rbf',C=100,verbose=1,max_iter=20)
svr_rbf.fit(X_svr,y_train)

# Predict the validation month with the scaler fitted on the training data.
# (A previous extra predict on the *unscaled* training frame was discarded
# immediately and has been removed.)
pred_SVR = svr_rbf.predict(sc_X.transform(X_test))

print('Test R-squared for SVR is %f' % r2_score(y_test, pred_SVR))
print('Test MSE for SVR is %f' % mean_squared_error(y_test, pred_SVR))

[LibSVM]



Test R-squared for SVR is -4072.593107
Test MSE for SVR is 116280.196783


For some reason, SVR did not work as expected.

## Trying a kNN Regressor

In [None]:
from sklearn import neighbors

# k-nearest-neighbours regression on standardised features.
# The scaler is fitted on the training data and only applied to the
# validation data, so distances are measured in one common feature space.
sc_X = StandardScaler()
X_knn = sc_X.fit_transform(X_train)
X_t_knn = sc_X.transform(X_test)

knn = neighbors.KNeighborsRegressor(n_neighbors=1, weights='distance')
knn.fit(X_knn, y_train)
pred_knn = knn.predict(X_t_knn)

print('Test R-squared for kNN is %f' % r2_score(y_test, pred_knn))
# The square root of the MSE is taken, so the reported value is the RMSE
print('Test RMSE for kNN is %f' % mean_squared_error(y_test, pred_knn)**0.5)


This code did not work.

## XGBoost Model

In [None]:


# Gradient-boosted trees with XGBoost
model = XGBRegressor(max_depth=8,n_estimators=1000,min_child_weight=100,colsample_bytree=0.8,subsample=0.8,eta=0.3,seed=42)

# Month 33 is passed as the second evaluation set, so training stops once its
# RMSE has not improved for 10 rounds
model.fit(X_train,y_train,eval_metric="rmse",eval_set=[(X_train, y_train),(X_test, y_test)],verbose=True,early_stopping_rounds = 10)
pred_xgb=model.predict(X_test)

# These scores belong to XGBoost (the labels previously said kNN), and the
# second value is the RMSE because the square root of the MSE is taken
print('Test R-squared for XGBoost is %f' % r2_score(y_test, pred_xgb))
print('Test RMSE for XGBoost is %f' % mean_squared_error(y_test, pred_xgb)**0.5)



[0]	validation_0-rmse:3.42169	validation_1-rmse:5.28187
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:3.32599	validation_1-rmse:5.2089
[2]	validation_0-rmse:3.23701	validation_1-rmse:5.15404
[3]	validation_0-rmse:3.14569	validation_1-rmse:5.09839
[4]	validation_0-rmse:3.07279	validation_1-rmse:5.05398
[5]	validation_0-rmse:3.00882	validation_1-rmse:5.0147
[6]	validation_0-rmse:2.95496	validation_1-rmse:4.98146
[7]	validation_0-rmse:2.91499	validation_1-rmse:4.95232
[8]	validation_0-rmse:2.87663	validation_1-rmse:4.92764
[9]	validation_0-rmse:2.8423	validation_1-rmse:4.90464
[10]	validation_0-rmse:2.81432	validation_1-rmse:4.87252
[11]	validation_0-rmse:2.77922	validation_1-rmse:4.85242
[12]	validation_0-rmse:2.7585	validation_1-rmse:4.8276
[13]	validation_0-rmse:2.73772	validation_1-rmse:4.81556
[14]	validation_0-rmse:2.71564	validation_1-rmse:4.79774
[

KeyboardInterrupt: ignored

In [None]:
# Predict the submission rows with the XGBoost model fitted in the previous
# cell; the model is not refitted on training + validation months here.
final_pred_xgb = model.predict(test_final)

# Clip to the [0, 20] range and write the submission file
submission['item_cnt_month'] = np.clip(final_pred_xgb, 0, 20)
print(submission.head())
submission.to_csv('XGB_single2.csv', index=False)

   ID  item_cnt_month
0   0        0.482179
1   1        0.262573
2   2        0.968165
3   3        0.281573
4   4        3.835277


The leaderboard score with this solution: 0.97607

This is the best score obtained from a single model.





# Ensembling the results with scheme (f) from the course

Currently I have tried several approaches and their LB scores are mentioned as follows.



1.   Linear Regression - 1.05954
2.   Light GBM - 0.97665
3.   Neural Network - 1.00741
4.   Support Vector Regression - DID NOT WORK. HAVE TO DEBUG.
5.   K-Nearest Neighbour Regression- DID NOT WORK. HAVE TO DEBUG.
6.   XGBoost - 0.97607

In this section, I'm going to use 1, 2, 3, and 6 to generate the meta features.

To validate on the second level features, I'm using the scheme f) provided in the course. Here, I'm generating meta features for date block nums 27, 28, 29, 30, 31, 32 to test the models on date block num 33. (This is similar to the scheme used for ensembling assignment). Then, meta features from 27 to 33 are used to train the final model and predict the values for date block num 34 with its own meta data. 



In [None]:
#print(all_data.tail())
# test_final.head()
# Show the columns that are removed from `all_data` before fitting
print(to_drop_cols)

['trg', 's_mn', 'i_mn', 'date_block_num']


## Generating meta features for the level-2 training set (date blocks 27 to 32)


This takes a considerably long time. Thus, the results are saved in a separate file.

In [None]:



sc_X = StandardScaler()

# Months whose out-of-sample predictions become the level-2 training features
level2_blocks = [27, 28, 29, 30, 31, 32]

dates = all_data['date_block_num']
dates_train = dates[dates <  33]
# Index the NumPy target array with a plain boolean array, not a pandas Series
y_train_level2 = y_train[dates_train.isin(level2_blocks).values]

# Level-2 feature matrix, one column per first-level model (LR, LightGBM, NN)
X_train_level2 = np.zeros([y_train_level2.shape[0], 3])

# `count` is the first row of X_train_level2 that has not been filled yet
count=0
lgb_params = {'feature_fraction': 0.75,'metric': 'rmse','nthread':1,'min_data_in_leaf': 2**7,'bagging_fraction': 0.75,
              'learning_rate': 0.03,'objective': 'mse','bagging_seed': 2**7,'num_leaves': 2**7,'bagging_freq':1,'verbose':0}
for cur_block_num in level2_blocks:

    print(cur_block_num)
    # Train on every month before the current one, predict the current month
    X_train_temp = all_data.loc[dates <  cur_block_num].drop(to_drop_cols, axis=1)
    X_test_temp =  all_data.loc[dates == cur_block_num].drop(to_drop_cols, axis=1)

    y_train_temp = all_data.loc[dates <  cur_block_num, 'trg'].values
    y_test_temp =  all_data.loc[dates == cur_block_num, 'trg'].values

    # Standardise once per block: fit on the training months only and apply
    # the same statistics to the held-out month (no refit on held-out data)
    X_train_scaled = sc_X.fit_transform(X_train_temp.values)
    X_test_scaled = sc_X.transform(X_test_temp.values)

    # Linear model
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train_temp)
    pred_lr_temp = lr.predict(X_test_scaled)

    # LightGBM model (trees do not need scaled inputs)
    model = lgb.train(lgb_params, lgb.Dataset(X_train_temp, label=y_train_temp), 100)
    pred_lgb_temp = model.predict(X_test_temp)

    # Neural network
    mlp = MLPRegressor(hidden_layer_sizes=(32,16), activation='relu', solver='adam', max_iter=50,early_stopping=True,verbose=1)
    mlp.fit(X_train_scaled,y_train_temp)
    pred_NN_temp = mlp.predict(X_test_scaled)

    # Write this month's predictions into the next free rows in one
    # vectorised assignment instead of a per-element Python loop
    n_block_rows = len(pred_lr_temp)
    X_train_level2[count:count + n_block_rows] = np.c_[pred_lr_temp, pred_lgb_temp, pred_NN_temp]
    count += n_block_rows

    
    

27
Iteration 1, loss = 2.76155994
Validation score: 0.647420
Iteration 2, loss = 2.63155724
Validation score: 0.681180
Iteration 3, loss = 2.49766541
Validation score: 0.691660
Iteration 4, loss = 2.44502316
Validation score: 0.667130
Iteration 5, loss = 2.37714337
Validation score: 0.658975
Iteration 6, loss = 2.34515500
Validation score: 0.706489
Iteration 7, loss = 2.33536410
Validation score: 0.647151
Iteration 8, loss = 2.28131417
Validation score: 0.668704
Iteration 9, loss = 2.28791176
Validation score: 0.691058
Iteration 10, loss = 2.24448202
Validation score: 0.710607
Iteration 11, loss = 2.22934749
Validation score: 0.683495
Iteration 12, loss = 2.19284138
Validation score: 0.686751
Iteration 13, loss = 2.19399282
Validation score: 0.654837
Iteration 14, loss = 2.22715472
Validation score: 0.720622
Iteration 15, loss = 2.20465877
Validation score: 0.722899
Iteration 16, loss = 2.16534573
Validation score: 0.685484
Iteration 17, loss = 2.19258144
Validation score: 0.711609
Ite

## Generating the meta features for the test set (date block num=33)

In [None]:
 # Save the level-2 training features for future use
np.save('X_train_level2', X_train_level2)

# Now generate the meta features for the validation month (date block 33):
# train on every earlier month and predict month 33
print(all_data.head())
X_train_temp = all_data.loc[dates <  33].drop(to_drop_cols, axis=1)
X_test_temp =  all_data.loc[dates == 33].drop(to_drop_cols, axis=1)

y_train_temp = all_data.loc[dates <  33, 'trg'].values
y_test_temp =  all_data.loc[dates == 33, 'trg'].values

# Standardise once: fit on the training months only and apply the same
# statistics to month 33 (no refit on the held-out data)
X_train_scaled = sc_X.fit_transform(X_train_temp.values)
X_test_scaled = sc_X.transform(X_test_temp.values)

# Linear model
lr = LinearRegression()
lr.fit(X_train_scaled, y_train_temp)
pred_lr_temp = lr.predict(X_test_scaled)

# LightGBM model (trees do not need scaled inputs)
model = lgb.train(lgb_params, lgb.Dataset(X_train_temp, label=y_train_temp), 100)
pred_lgb_temp = model.predict(X_test_temp)

# Neural network
mlp = MLPRegressor(hidden_layer_sizes=(32,16), activation='relu', solver='adam', max_iter=50,early_stopping=True,verbose=1)
mlp.fit(X_train_scaled,y_train_temp)
pred_NN_temp = mlp.predict(X_test_scaled)

# One column per first-level model, in the same order as X_train_level2
X_test_level2 = np.c_[pred_lr_temp, pred_lgb_temp,pred_NN_temp]
np.save('X_test_level2', X_test_level2)

   shop_id  item_id  date_block_num   trg    s_mn   i_mn  i_mn_l_1  s_mn_l_1  \
0       54    10297              12   4.0  8198.0   23.0      42.0   10055.0   
1       54    10296              12   3.0  8198.0   17.0      24.0   10055.0   
2       54    10298              12  14.0  8198.0  182.0     369.0   10055.0   
3       54    10300              12   3.0  8198.0   26.0      54.0   10055.0   
4       54    10284              12   1.0  8198.0    3.0       4.0   10055.0   

   trg_l_1  i_mn_l_2  s_mn_l_2  trg_l_2  i_mn_l_3  s_mn_l_3  trg_l_3  \
0      3.0       2.0    7978.0      0.0       0.0       0.0      0.0   
1      0.0       0.0       0.0      0.0       0.0       0.0      0.0   
2     21.0    1309.0    7978.0    119.0     144.0    6676.0      7.0   
3      1.0     361.0    7978.0     31.0      53.0    6676.0      0.0   
4      0.0       3.0    7978.0      0.0       5.0    6676.0      0.0   

   i_mn_l_4  s_mn_l_4  trg_l_4  i_mn_l_5  s_mn_l_5  trg_l_5  i_mn_l_6  \
0       0.0  

In [None]:
 # Now generate the meta features for the final test set (date block 34)
# Train on every available month. The targets are read from the same frame as
# the features, so rows and labels are aligned by construction.
X = all_data.drop(to_drop_cols, axis=1)
y = all_data['trg'].values

# Standardise once: fit on the training features and apply the same
# statistics to the submission rows (no refit on the test set)
X_scaled = sc_X.fit_transform(X.values)
test_final_scaled = sc_X.transform(test_final.values)

# Linear model
lr = LinearRegression()
lr.fit(X_scaled, y)
pred_lr_final = lr.predict(test_final_scaled)

# LightGBM model (trees do not need scaled inputs)
model = lgb.train(lgb_params, lgb.Dataset(X, label=y), 100)
pred_lgb_final = model.predict(test_final)

# Neural network
mlp = MLPRegressor(hidden_layer_sizes=(32,16), activation='relu', solver='adam', max_iter=50,early_stopping=True,verbose=1)
mlp.fit(X_scaled,y)
pred_NN_final = mlp.predict(test_final_scaled)

# One column per first-level model, in the same order as X_train_level2
X_final_level2 = np.c_[pred_lr_final, pred_lgb_final,pred_NN_final]
np.save('X_final_level2', X_final_level2)

Iteration 1, loss = 3.68496770
Validation score: 0.429310
Iteration 2, loss = 3.58268557
Validation score: 0.500573
Iteration 3, loss = 3.44873211
Validation score: 0.498720
Iteration 4, loss = 3.45736261
Validation score: 0.476497
Iteration 5, loss = 3.38793787
Validation score: 0.507301
Iteration 6, loss = 3.33495824
Validation score: 0.514307
Iteration 7, loss = 3.34408860
Validation score: 0.460664
Iteration 8, loss = 3.32143402
Validation score: 0.517834
Iteration 9, loss = 3.27583305
Validation score: 0.539496
Iteration 10, loss = 3.27266152
Validation score: 0.508022
Iteration 11, loss = 3.26323881
Validation score: 0.558047
Iteration 12, loss = 3.21169960
Validation score: 0.572773
Iteration 13, loss = 3.25751741
Validation score: 0.580670
Iteration 14, loss = 3.14918857
Validation score: 0.546941
Iteration 15, loss = 3.15818940
Validation score: 0.603670
Iteration 16, loss = 3.07718826
Validation score: 0.572489
Iteration 17, loss = 3.09238727
Validation score: 0.555669
Iterat

The above code generates the meta features using LR, LGB, and NN. Since these take a long time to run, they are saved as npy files. The following code snippets do the same for XGBoost.

## Generating XGBoost meta features


XGBoost meta features are generated in this section. This section takes a lot of time to run. Thus, after each iteration, the obtained results are saved as numpy arrays.

In [None]:



dates = all_data['date_block_num']
dates_train = dates[dates <  33]

# Month to predict. Run this cell once per value: 27..32 give the level-2
# training features, 33 gives the validation features.
cur_block_num=33
print(cur_block_num)

# Train on every month before the current one, predict the current month
X_train_temp = all_data.loc[dates <  cur_block_num].drop(to_drop_cols, axis=1)
X_test_temp =  all_data.loc[dates == cur_block_num].drop(to_drop_cols, axis=1)

y_train_temp = all_data.loc[dates <  cur_block_num, 'trg'].values
y_test_temp =  all_data.loc[dates == cur_block_num, 'trg'].values

model = XGBRegressor(max_depth=6,n_estimators=500,min_child_weight=40,colsample_bytree=0.8,subsample=0.8,eta=0.3,seed=42)

# Fit on the months *before* cur_block_num. Fitting on X_train / y_train
# (every month below 33) would let the model see the month it is about to
# predict whenever cur_block_num < 33, leaking the target into the meta feature.
model.fit(X_train_temp,y_train_temp,eval_metric="rmse",eval_set=[(X_train_temp, y_train_temp),(X_test_temp, y_test_temp)],verbose=True,early_stopping_rounds = 6)
pred_xgb=model.predict(X_test_temp)

# Persist the fitted model; the context manager closes the file handle
with open("xgb_final.pickle.dat", "wb") as model_file:
    pickle.dump(model, model_file)

# One prediction file per month, e.g. pred_xgb33.npy for cur_block_num == 33
np.save('pred_xgb{}'.format(cur_block_num), pred_xgb)

# Predictions of the same model for the submission rows
pred_xgb=model.predict(test_final)
np.save('pred_xgb_final', pred_xgb)
    
    

33
[0]	validation_0-rmse:3.41209	validation_1-rmse:5.26514
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 6 rounds.
[1]	validation_0-rmse:3.30479	validation_1-rmse:5.19146
[2]	validation_0-rmse:3.18394	validation_1-rmse:5.11153
[3]	validation_0-rmse:3.09125	validation_1-rmse:5.04677
[4]	validation_0-rmse:3.01616	validation_1-rmse:4.99817
[5]	validation_0-rmse:2.94858	validation_1-rmse:4.95324
[6]	validation_0-rmse:2.88748	validation_1-rmse:4.9097
[7]	validation_0-rmse:2.83947	validation_1-rmse:4.87392
[8]	validation_0-rmse:2.79589	validation_1-rmse:4.84474
[9]	validation_0-rmse:2.76073	validation_1-rmse:4.81122
[10]	validation_0-rmse:2.73453	validation_1-rmse:4.79371
[11]	validation_0-rmse:2.70615	validation_1-rmse:4.77634
[12]	validation_0-rmse:2.68004	validation_1-rmse:4.75257
[13]	validation_0-rmse:2.6592	validation_1-rmse:4.74102
[14]	validation_0-rmse:2.63592	validation_1-rmse:4.72

In [None]:
# Show the feature columns of the most recent per-block training frame
print(X_train_temp.columns)

Since the above code takes more than 1 hour to run (for each date block num), the meta features from each date block num were saved. Now we load these files and build the array of XGBoost results, and at the same time add these values to the X_test_level2, X_train_level2 and X_final_level2 arrays.

In [None]:
# Number of level-2 columns produced by the LR / LightGBM / NN cells
N_BASE_META_FEATURES = 3

# XGBoost predictions for months 27..32, concatenated in month order so the
# rows line up with X_train_level2
XG_pred = np.concatenate(
    [np.load('pred_xgb{}.npy'.format(block_num)) for block_num in range(27, 33)]
)
XG_33 = np.load('pred_xgb33.npy')
XG_final = np.load('pred_xgb_final.npy')

X_test_level2=np.load('X_test_level2.npy')
X_train_level2=np.load('X_train_level2.npy')
X_final_level2=np.load('X_final_level2.npy')

print(XG_pred.shape)
print(X_train_level2.shape)

print(XG_33.shape)
print(X_test_level2.shape)

print(XG_final.shape)
print(X_final_level2.shape)

# Append the XGBoost column only when it is not there yet. The arrays are
# saved back under the names they were loaded from, so without this guard
# every re-run of the cell would add one more copy of the column.
if X_train_level2.shape[1] == N_BASE_META_FEATURES:
    X_train_level2 = np.c_[X_train_level2,XG_pred]
if X_test_level2.shape[1] == N_BASE_META_FEATURES:
    X_test_level2 = np.c_[X_test_level2,XG_33]
if X_final_level2.shape[1] == N_BASE_META_FEATURES:
    X_final_level2 = np.c_[X_final_level2,XG_final]

print(XG_pred.shape)
print(X_train_level2.shape)

print(XG_33.shape)
print(X_test_level2.shape)

print(XG_final.shape)
print(X_final_level2.shape)

np.save('X_train_level2', X_train_level2)
np.save('X_test_level2', X_test_level2)
np.save('X_final_level2', X_final_level2)


(1376192,)
(1376192, 3)
(238172,)
(238172, 3)
(214200,)
(214200, 3)
(1376192,)
(1376192, 4)
(238172,)
(238172, 4)
(214200,)
(214200, 4)


## Getting the final answer

### Simple linear model

In [None]:
# Targets of the months whose predictions form the level-2 training set
y_test_temp = all_data.loc[(dates >= 27) & (dates <= 32), 'trg'].values

# Fit the linear stacker on the level-2 features
lr.fit(X_train_level2, y_test_temp)

# Score it on its own training months ...
train_preds = lr.predict(X_train_level2)
r2_train_stacking = r2_score(y_test_temp, train_preds)

# ... and on the held-out month 33
test_preds = lr.predict(X_test_level2)
r2_test_stacking = r2_score(y_test, test_preds)

print('Train R-squared for stacking is %f' % r2_train_stacking)
print('Test  R-squared for stacking is %f' % r2_test_stacking)


Train R-squared for stacking is 0.287446
Test  R-squared for stacking is 0.318761


In [None]:
# Targets for months 27..33, i.e. level-2 training plus validation months
y_test_temp = all_data.loc[(dates >= 27) & (dates <= 33), 'trg'].values

# Stack the level-2 training rows on top of the validation rows
X_final_train = np.concatenate((X_train_level2, X_test_level2), axis=0)
print(X_final_train.shape)

# Refit the linear stacker on all of them and predict the submission rows
lr.fit(X_final_train, y_test_temp)
final_preds = lr.predict(X_final_level2)

# Clip to the [0, 20] range and write the submission file
submission['item_cnt_month'] = np.clip(final_preds, 0, 20)
print(submission.head())
submission.to_csv('Linear_Ensemble_3.csv', index=False)



(1614364, 3)
   ID  item_cnt_month
0   0        0.432216
1   1        0.092727
2   2        0.960536
3   3        0.286704
4   4        1.959579


### Simple neural network as the second-level model

In [None]:
# Level-2 training targets: 'trg' for date blocks 27..32, fitted against
# X_train_level2.
y_test_temp = all_data.loc[(27 <= dates) & (dates <= 32), 'trg'].values

# Small two-hidden-layer network used as the stacking model.
# random_state is fixed because MLPRegressor randomises its weight
# initialisation, batch shuffling and the early-stopping hold-out split;
# without a seed every run of this cell reports different scores and writes a
# different submission file.
mlp = MLPRegressor(
    hidden_layer_sizes=(8, 8),
    activation='relu',
    solver='adam',
    max_iter=50,
    early_stopping=True,
    verbose=1,
    random_state=0,
)
mlp.fit(X_train_level2, y_test_temp)

# Score the stacker on the level-2 training rows and on the validation rows.
train_preds = mlp.predict(X_train_level2)
r2_train_stacking = r2_score(y_test_temp, train_preds)

test_preds = mlp.predict(X_test_level2)
r2_test_stacking = r2_score(y_test, test_preds)

print('Train R-squared for stacking is %f' % r2_train_stacking)
print('Test  R-squared for stacking is %f' % r2_test_stacking)

# Refit on training + validation rows (date blocks 27..33) before predicting
# the competition test set. fit() starts from scratch because warm_start is
# left at its default of False.
y_test_temp = all_data.loc[(27 <= dates) & (dates <= 33), 'trg'].values

X_final_train = np.append(X_train_level2, X_test_level2, axis=0)
print(X_final_train.shape)
mlp.fit(X_final_train, y_test_temp)
final_preds = mlp.predict(X_final_level2)

# Clip predictions to [0, 20] and write the submission.
submission['item_cnt_month'] = final_preds.clip(0, 20)
print(submission.head())
submission.to_csv('NN_Ensemble_4.csv', index=False)





Iteration 1, loss = 6.41141396
Validation score: 0.199550
Iteration 2, loss = 5.99319951
Validation score: 0.202615
Iteration 3, loss = 6.06294467
Validation score: 0.211581
Iteration 4, loss = 6.04203789
Validation score: 0.212820
Iteration 5, loss = 5.98131149
Validation score: 0.202402
Iteration 6, loss = 6.01462970
Validation score: 0.204761
Iteration 7, loss = 5.96320448
Validation score: 0.213334
Iteration 8, loss = 6.00061646
Validation score: 0.210754
Iteration 9, loss = 5.97454970
Validation score: 0.211910
Iteration 10, loss = 6.01229372
Validation score: 0.212061
Iteration 11, loss = 5.90426363
Validation score: 0.209004
Iteration 12, loss = 5.89008943
Validation score: 0.227163
Iteration 13, loss = 5.96282778
Validation score: 0.224626
Iteration 14, loss = 5.94274803
Validation score: 0.216839
Iteration 15, loss = 5.92948549
Validation score: 0.218362
Iteration 16, loss = 5.93854717
Validation score: 0.208707
Iteration 17, loss = 5.92248458
Validation score: 0.224546
Iterat



In [None]:
# Reload the saved test-set meta features and report their shape.
X_final_level2 = np.load('X_final_level2.npy')
print(X_final_level2.shape)

# Submit meta-feature column 2 on its own (clipped to [0, 20]) so a single
# first-level model can be scored without any stacking.
submission['item_cnt_month'] = np.clip(X_final_level2[:, 2], 0, 20)
print(submission.head())
submission.to_csv('Linear_Ensemble_test3.csv', index=False)



(214200, 3)
   ID  item_cnt_month
0   0        0.347915
1   1        3.510933
2   2        0.621385
3   3        0.137700
4   4        0.000000


In [None]:
# Show meta-feature columns 2, 1 and 0 (in that order) for a quick visual check.
for column_index in (2, 1, 0):
    print(X_final_level2[:, column_index])

[ 0.34791472  3.51093324  0.62138483 ...  0.14671031 -0.05780269
 -0.15993974]
[ 0.62197945  0.09947484  1.07775103 ... -0.02097045 -0.02184285
 -0.02184285]
[-0.3057409   0.40513046  0.13444466 ...  0.32719403  0.01596999
 -0.06328963]


# Quick Execution of Results



This section loads the saved arrays to quickly reproduce the ensembling results.

## Single XGBoost Model


First, we load the saved XGBoost model. The LB score for this approach is 0.980943.

In [None]:
# Read the saved feature tables from CSV and show the test-set columns and
# first rows.
all_data = pd.read_csv('all_data.csv')
test_final = pd.read_csv('test_final.csv')
print(test_final.columns)
print(test_final.head())

# Load the saved XGBoost model. The context manager closes the file handle as
# soon as unpickling finishes; the previous bare `open(...)` never closed it.
# NOTE(security): pickle can execute arbitrary code while loading, so only
# unpickle files that come from a trusted source.
with open("xgb_final.pickle.dat", "rb") as model_file:
    xgb_model = pickle.load(model_file)

final_pred_xgb = xgb_model.predict(test_final)

# Clip predictions to [0, 20] and write the single-model submission.
submission['item_cnt_month'] = final_pred_xgb.clip(0, 20)
print(submission.head())
submission.to_csv('XGB_single_submission.csv', index=False)

Index(['shop_id', 'item_id', 'i_mn_l_1', 's_mn_l_1', 'trg_l_1', 'i_mn_l_2',
       's_mn_l_2', 'trg_l_2', 'i_mn_l_3', 's_mn_l_3', 'trg_l_3', 'i_mn_l_4',
       's_mn_l_4', 'trg_l_4', 'i_mn_l_5', 's_mn_l_5', 'trg_l_5', 'i_mn_l_6',
       's_mn_l_6', 'trg_l_6', 'i_mn_l_12', 's_mn_l_12', 'trg_l_12',
       'item_category_id'],
      dtype='object')
   shop_id  item_id  i_mn_l_1  s_mn_l_1  trg_l_1  i_mn_l_2  s_mn_l_2  trg_l_2  \
0        5     5037      25.0    1052.0      0.0     110.0    1092.0      1.0   
1        5     5320       0.0       0.0      0.0       0.0       0.0      0.0   
2        5     5233      42.0    1052.0      1.0      80.0    1092.0      3.0   
3        5     5232      28.0    1052.0      0.0      48.0    1092.0      0.0   
4        5     5268       0.0       0.0      0.0       0.0       0.0      0.0   

   i_mn_l_3  s_mn_l_3  trg_l_3  i_mn_l_4  s_mn_l_4  trg_l_4  i_mn_l_5  \
0     119.0    1294.0      3.0      54.0     991.0      1.0     105.0   
1       0.0       0

## Ensembling by loading saved results



First, the saved second-level features are loaded. Here `X_final_level2` holds the meta features for the test data set, and `X_test_level2` holds the meta features for the validation data set.

In [None]:
# Reload the saved level-2 (meta-feature) matrices from disk.
X_train_level2 = np.load('X_train_level2.npy')
X_test_level2 = np.load('X_test_level2.npy')
X_final_level2 = np.load('X_final_level2.npy')

# Month index of every row in all_data; used below to select target windows.
dates = all_data['date_block_num']


We currently have four meta features. In this section we search for the subset of meta features that gives the best result on the validation data set.

### Simple linear ensembling

In [None]:
# Level-2 training targets: 'trg' for date blocks 27..32, fitted against
# X_train_level2.
y_test_temp = all_data.loc[(27 <= dates) & (dates <= 32), 'trg'].values

# Candidate subsets of meta-feature columns to compare on the validation set.
possible = [[1, 2, 3, 0], [1, 2, 3], [1, 2, 0], [1, 3, 0], [2, 3, 0],
            [1, 2], [1, 3], [1, 0], [3, 2], [0, 2], [3, 0]]

# Start from -inf / None rather than 0 / 0: R-squared can be negative, and with
# the old start a run where no subset scored above zero left `bestComb` as the
# integer 0, which selects a 1-D column and makes the final fit fail.
maxR2 = -np.inf
bestComb = None
for ty in possible:
    print(f'Testing for {ty}')
    X_train_2 = X_train_level2[:, ty]
    X_test_2 = X_test_level2[:, ty]
    lr = LinearRegression()
    lr.fit(X_train_2, y_test_temp)

    train_preds = lr.predict(X_train_2)
    r2_train_stacking = r2_score(y_test_temp, train_preds)

    test_preds = lr.predict(X_test_2)
    r2_test_stacking = r2_score(y_test, test_preds)

    # Keep the subset with the highest validation R-squared seen so far.
    if r2_test_stacking > maxR2:
        maxR2 = r2_test_stacking
        bestComb = ty
    print('Train R-squared for stacking is %f' % r2_train_stacking)
    print('Test  R-squared for stacking is %f' % r2_test_stacking)

# Fail loudly instead of indexing with None when nothing could be selected
# (empty candidate list, or every validation score was NaN).
if bestComb is None:
    raise ValueError('No meta-feature combination could be selected')

print(f'Best combination is {bestComb}')

# Refit on training + validation rows (date blocks 27..33) using only the
# winning columns. A fresh estimator is used so the final model does not
# depend on whichever subset happened to be evaluated last in the loop.
y_test_temp = all_data.loc[(27 <= dates) & (dates <= 33), 'trg'].values

X_final_train = np.append(X_train_level2, X_test_level2, axis=0)

lr = LinearRegression()
lr.fit(X_final_train[:, bestComb], y_test_temp)

final_preds = lr.predict(X_final_level2[:, bestComb])

# Clip predictions to [0, 20] and write the submission.
submission['item_cnt_month'] = final_preds.clip(0, 20)
print(submission.head())
submission.to_csv('Linear_Ensemble_best.csv', index=False)




Testing for [1, 2, 3, 0]
Train R-squared for stacking is 0.688059
Test  R-squared for stacking is 0.269071
Testing for [1, 2, 3]
Train R-squared for stacking is 0.687913
Test  R-squared for stacking is 0.266699
Testing for [1, 2, 0]
Train R-squared for stacking is 0.287446
Test  R-squared for stacking is 0.318761
Testing for [1, 3, 0]
Train R-squared for stacking is 0.687584
Test  R-squared for stacking is 0.275300
Testing for [2, 3, 0]
Train R-squared for stacking is 0.687095
Test  R-squared for stacking is 0.266608
Testing for [1, 2]
Train R-squared for stacking is 0.287362
Test  R-squared for stacking is 0.318074
Testing for [1, 3]
Train R-squared for stacking is 0.686770
Test  R-squared for stacking is 0.273831
Testing for [1, 0]
Train R-squared for stacking is 0.270163
Test  R-squared for stacking is 0.285288
Testing for [3, 2]
Train R-squared for stacking is 0.687060
Test  R-squared for stacking is 0.267695
Testing for [0, 2]
Train R-squared for stacking is 0.278803
Test  R-squar

The best meta features are 0, 1, and 2 (corresponding to LR, LGB, and NN). The LB score for this combination is 0.97483.

### Neural network ensembling

In [None]:
# Level-2 training targets: 'trg' for date blocks 27..32, fitted against
# X_train_level2.
y_test_temp = all_data.loc[(27 <= dates) & (dates <= 32), 'trg'].values

# Candidate subsets of meta-feature columns to compare on the validation set.
possible = [[1, 2, 3, 0], [1, 2, 3], [1, 2, 0], [1, 3, 0], [2, 3, 0],
            [1, 2], [1, 3], [1, 0], [3, 2], [0, 2], [3, 0]]

# Start from -inf / None rather than 0 / 0: R-squared can be negative, and with
# the old start a run where no subset scored above zero left `bestComb` as the
# integer 0, which selects a 1-D column and makes the final fit fail.
maxR2 = -np.inf
bestComb = None
for ty in possible:
    print(f'Testing for {ty}')
    X_train_l2 = X_train_level2[:, ty]
    X_test_l2 = X_test_level2[:, ty]
    # random_state is fixed so every subset is trained from the same seed and
    # the comparison (and therefore the selected subset) is repeatable.
    # The unused LinearRegression() that used to be created here was dropped.
    mlp = MLPRegressor(
        hidden_layer_sizes=(8, 8),
        activation='relu',
        solver='adam',
        max_iter=80,
        early_stopping=True,
        verbose=0,
        random_state=0,
    )
    mlp.fit(X_train_l2, y_test_temp)

    train_preds = mlp.predict(X_train_l2)
    r2_train_stacking = r2_score(y_test_temp, train_preds)

    test_preds = mlp.predict(X_test_l2)
    r2_test_stacking = r2_score(y_test, test_preds)

    # Keep the subset with the highest validation R-squared seen so far.
    if r2_test_stacking > maxR2:
        maxR2 = r2_test_stacking
        bestComb = ty
    print('Train R-squared for stacking is %f' % r2_train_stacking)
    print('Test  R-squared for stacking is %f' % r2_test_stacking)

# Fail loudly instead of indexing with None when nothing could be selected
# (empty candidate list, or every validation score was NaN).
if bestComb is None:
    raise ValueError('No meta-feature combination could be selected')

print(f'Best combination is {bestComb}')

# Refit on training + validation rows (date blocks 27..33) using only the
# winning columns. `mlp` keeps the hyper-parameters set in the loop, and fit()
# re-initialises the weights because warm_start is left at its default of False.
y_test_temp = all_data.loc[(27 <= dates) & (dates <= 33), 'trg'].values

X_final_train = np.append(X_train_level2, X_test_level2, axis=0)

mlp.fit(X_final_train[:, bestComb], y_test_temp)
final_preds = mlp.predict(X_final_level2[:, bestComb])

# Clip predictions to [0, 20] and write the submission.
submission['item_cnt_month'] = final_preds.clip(0, 20)
print(submission.head())
submission.to_csv('Neural_Network_Ensemble_best.csv', index=False)



Testing for [1, 2, 3, 0]
Train R-squared for stacking is 0.686944
Test  R-squared for stacking is 0.279066
Testing for [1, 2, 3]




Train R-squared for stacking is 0.703744
Test  R-squared for stacking is 0.287465
Testing for [1, 2, 0]
Train R-squared for stacking is 0.290370
Test  R-squared for stacking is 0.334793
Testing for [1, 3, 0]
Train R-squared for stacking is 0.690174
Test  R-squared for stacking is 0.272060
Testing for [2, 3, 0]
Train R-squared for stacking is 0.688621
Test  R-squared for stacking is 0.259912
Testing for [1, 2]
Train R-squared for stacking is 0.280925
Test  R-squared for stacking is 0.274429
Testing for [1, 3]
Train R-squared for stacking is 0.694263
Test  R-squared for stacking is 0.278571
Testing for [1, 0]
Train R-squared for stacking is 0.276053
Test  R-squared for stacking is 0.280213
Testing for [3, 2]
Train R-squared for stacking is 0.688814
Test  R-squared for stacking is 0.278086
Testing for [0, 2]
Train R-squared for stacking is 0.274222
Test  R-squared for stacking is 0.282374
Testing for [3, 0]
Train R-squared for stacking is 0.682088
Test  R-squared for stacking is 0.266350


The LB score for this result is 0.94125.