In [58]:
import numpy as np
import pandas as pd
import itertools
from sklearn.linear_model import LinearRegression
import pickle

In [93]:
def create_test_data(prediction, dict_cat):
  trtv = pd.concat([dict_cat['train'], dict_cat['val']], axis = 0)
  test_ids = dict_cat['test']['item_id'].isin(trtv['item_id']) & dict_cat['test']['shop_id'].isin(trtv['shop_id'])
  prediction[~test_ids] = 0
  return(prediction)

In [95]:
def create_cv_sets2(months, data):
    X_train = data.loc[~data['date_block_num'].isin(months+[34]),:].drop('cnt_shop_item', axis=1)
    X_val = data.loc[data['date_block_num'].isin(months),:]
    X_val, y_val = create_test(X_val)
    X_test = data.loc[data['date_block_num'] == 34,:].drop('cnt_shop_item',axis=1)
    y_train = data.loc[~data['date_block_num'].isin(months+[34]),'cnt_shop_item']
    return(dict({'train': X_train, 'val': X_val, 'test': X_test, 'train_y': y_train, 'val_y': y_val}))

In [97]:
def create_test(test):
    test = test.loc[(test['shop_id'].isin(sales_test['shop_id']))&(test['item_id'].isin(sales_test['item_id'])),:].copy()
    test_pred = test['cnt_shop_item'].copy()
    test.drop('cnt_shop_item', axis = 1, inplace = True)
    return(test, test_pred)

In [69]:
sales_test = pd.read_csv(r'test.csv')

In [128]:
path = 'Submission Time Series/Stacking/'
models = ['lr','rf', 'xgboost', 'emb_nn', 'catboost']
stack_train = []
stack_test = []
for model in models:
    data_train = pd.read_csv(path+model+'_train_level2.csv', header = None)
    data_test = pd.read_csv(path+model+'_test_level2.csv', header = None)
    stack_train.append(data_train)
    stack_test.append(data_test)
stack_train = np.column_stack(stack_train)
stack_test = np.column_stack(stack_test)

In [59]:
with open(r"full_data_2.pkl", "rb") as input_file:
    full_data = pickle.load(input_file)

In [98]:
dict_cat = create_cv_sets2([9,21,33], full_data)

In [61]:
stack_months = np.arange(28,34)
stack_y = full_data.loc[full_data['date_block_num'].isin(stack_months),'cnt_shop_item']

In [142]:
stack_test

array([[ 0.58789062,  0.56529513,  0.5542345 , ...,  0.13734591,
        -0.50944463, -0.64679054],
       [ 0.        ,  0.23633458,  0.12270784, ..., -0.0635494 ,
        -0.08257721, -0.01902781],
       [ 0.57226562,  0.6387954 ,  0.40862715, ..., -0.23421097,
        -0.65940277, -0.4251918 ],
       ...,
       [ 0.        ,  0.06885622,  0.        , ..., -0.15011179,
         0.        ,  0.15011179],
       [ 0.        ,  0.07455094,  0.        , ..., -0.15158904,
         0.        ,  0.15158904],
       [ 0.        ,  0.28891951,  0.26694965, ...,  0.08030677,
         0.15293559,  0.07262882]])

In [63]:
stack_y.shape

(192351,)

In [129]:
stack_train.shape

(192351, 5)

In [130]:
stack_test.shape

(214200, 5)

Calculate pairwise differences

In [131]:
train_pair = []
test_pair = []
for i, j in itertools.combinations(range(stack_train.shape[1]), 2):
    diff_train = stack_train[:,i]-stack_train[:,j]
    diff_test = stack_test[:,i]-stack_test[:,j]
    train_pair.append(diff_train)
    test_pair.append(diff_test)
train_pair = np.column_stack(train_pair)
test_pair = np.column_stack(test_pair)

In [132]:
train_pair.shape

(192351, 10)

In [133]:
test_pair.shape

(214200, 10)

In [134]:
stack_train = np.concatenate((stack_train, train_pair), axis=1)
stack_test = np.concatenate((stack_test, test_pair), axis=1)

In [135]:
lr = LinearRegression(normalize=True, n_jobs=-1)
lr.fit(stack_train, stack_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)

In [136]:
lr.score(stack_train, stack_y)

0.6308994673425566

In [138]:
stack_pred_lr = lr.predict(stack_test)

In [66]:
def create_submission_file(pred, name):
    ID = np.arange(0, sales_test.shape[0]) 
    new_df = pd.DataFrame({'ID': ID, 'item_cnt_month': pred})
    new_df.to_csv('Submission Time Series/'+name+'.csv',index = False)

In [101]:
create_submission_file(create_test_data(stack_pred_lr, dict_cat), 'stack_lr')

Adding the lag feature

In [106]:
prev_month_train = full_data.loc[full_data['date_block_num'].isin(stack_months), 'cnt_shop_item_lag_1']
prev_month_test = full_data.loc[full_data['date_block_num'].isin([34]), 'cnt_shop_item_lag_1']

In [143]:
stack_train2 = np.column_stack((stack_train, prev_month_train))
stack_test2 = np.column_stack((stack_test, prev_month_test))

In [144]:
lr = LinearRegression(normalize=True, n_jobs=-1)
lr.fit(stack_train2, stack_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)

In [145]:
lr.score(stack_train2, stack_y)

0.6308997952296302