Because of memory limits on this platform, I had to split the Final Project into two parts:

Part 1 
* EDA
* Features 
* Training models for validation

Part 2 (This kernel)
* Training models for predictions
* Generating submission file

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input and print the full path of every file found.
# NOTE: `dirname` and `filename` stay defined after the loop and hold the last
# values visited.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load data of competition

In [None]:
# Locate the competition folder by looking for the training CSV, instead of
# relying on whichever directory an earlier os.walk loop happened to visit last.
data_dir = next(
    (folder for folder, _, names in os.walk('/kaggle/input') if 'sales_train.csv' in names),
    None,
)
if data_dir is None:
    raise FileNotFoundError("sales_train.csv not found under /kaggle/input")

# All competition files are expected to sit in the same folder
sales        = pd.read_csv(os.path.join(data_dir, 'sales_train.csv'))
items        = pd.read_csv(os.path.join(data_dir, 'items.csv'))
item_cat     = pd.read_csv(os.path.join(data_dir, 'item_categories.csv'))
shops        = pd.read_csv(os.path.join(data_dir, 'shops.csv'))
test         = pd.read_csv(os.path.join(data_dir, 'test.csv'))
submission   = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))


Load Python libraries

In [None]:
import sklearn
import scipy
import seaborn
import gc
import matplotlib.pyplot as plt
%matplotlib inline 

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

from tqdm import notebook
from math import sqrt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

from itertools import product
import joblib

List Versions used

In [None]:
# Print name and version of each key library so the run can be reproduced
for p in (np, pd, scipy, sklearn, seaborn, lgb):
    print(p.__name__, p.__version__)

Function to downcast 64-bit types to 32-bit (taken from luliu31415926 on GitHub)

In [None]:
def downcast_dtypes(df):
    '''
        Shrink the 64-bit numeric columns of `df` to 32 bits, in place.

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left untouched.
        Returns the same dataframe object that was passed in.
    '''

    # Snapshot the dtypes once so both column lists describe the frame as received
    col_dtypes = list(df.dtypes.items())
    float_cols = [name for name, dtype in col_dtypes if dtype == "float64"]
    int_cols = [name for name, dtype in col_dtypes if dtype == "int64"]

    # Overwrite the selected columns with their narrower versions
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int32)

    return df

Function to calc RMSE

In [None]:
def get_rmse(actual, predicted):
    '''
        Root mean squared error between observed and predicted values.

        Input:
                actual, predicted: series object type
        Output:
                root mean squared error: float
    '''

    # RMSE is the square root of the mean squared error
    return sqrt(mean_squared_error(actual, predicted))

**** PREDICTION STAGE ******

I am going to append the test data to the training data so that all feature preprocessing and transformations are applied to both at once. Afterwards I will split the training data from the test data again, train the models on all available training data, and then predict on the test data.

In [None]:
# Build a pseudo "sales" frame for the month to predict and stack it under the
# real sales, so that feature engineering runs over train + test in one pass.
sales_test = sales.copy()

# .copy() makes test_grid an independent frame, so the column assignments below
# do not write into a slice of `test` (avoids SettingWithCopyWarning)
test_grid = test[["shop_id", "item_id"]].copy()
test_grid["date_block_num"] = 34
test_grid["month"] = 11
# Placeholders: the month to predict has no dates, prices or daily counts
test_grid["date"] = test_grid["item_price"] = test_grid["item_cnt_day"] = 0
# Align to the column layout of `sales`; columns not present there are dropped
test_grid = test_grid.reindex(columns=sales.columns.values)
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame
sales_test = pd.concat([sales_test, test_grid])

The whole sequence is the same as in the TRAINING VALIDATION STAGE (Part 1), so I have omitted the explanation.

In [None]:
# Create "grid" with columns
# Key columns identifying one row of the monthly grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales_test['date_block_num'].unique():
    # Shops and items seen in this month, in order of first appearance
    cur_shops = sales_test.loc[sales_test['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales_test.loc[sales_test['date_block_num'] == block_num, 'item_id'].unique()
    # Cartesian product shop x item x [month], stored as an int32 array
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Groupby data to get shop-item-month aggregates
gb = sales_test.groupby(index_cols,as_index=False).item_cnt_day.sum()
gb.columns = index_cols + ["target"]

# Join it to the grid
# (left join keeps every grid row; combinations without sales get target 0)
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates

# Earlier variant that aggregated only the count:
#gb = sales_test.groupby(['shop_id', 'date_block_num'],as_index=False).item_cnt_day.sum()
#gb.columns = ["shop_id", "date_block_num", "target_shop"]

# Per shop and month: total units sold and the highest item price
gb = sales_test.groupby(['shop_id', 'date_block_num'],as_index=False).agg({"item_cnt_day":"sum","item_price":"max"})
gb.columns = ["shop_id", "date_block_num", "target_shop", "max_price"]

all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales_test.groupby(['item_id', 'date_block_num'],as_index=False).item_cnt_day.sum()
gb.columns = ["item_id", "date_block_num", "target_item"]

all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
# Release the intermediate frames before the next memory-heavy step
del grid, gb 
gc.collect();

In [None]:
# Columns to lag: every column of all_data that is not part of the grid key
cols_to_rename = list(all_data.columns.difference(index_cols))

# Lags, in months, to add as features
shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in notebook.tqdm(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()

    # Moving the month forward lines month t's values up with month t + shift
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lag_names)

    # Rows with no history that far back get 0 for the lagged values
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# All months are kept (no `date_block_num >= 12` cut-off), so the models are
# trained on the whole data available.

# List of all lagged features. Match the full `_lag_<n>` suffix rather than the
# last character only, so multi-digit lags (12, or a future 10) are selected
# because of their own suffix and unrelated columns ending in a digit are not.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

# We will drop these at fitting stage: everything that is neither a lag feature
# nor a key column, plus the month index itself
to_drop_cols = list(set(all_data.columns) - (set(fit_cols) | set(index_cols))) + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

Seniority

In [None]:
# Seniority features: number of months between the first month in which a
# shop / item appears in sales_test and the month of the current row.
shop_first_block = sales_test.groupby("shop_id")["date_block_num"].min()
all_data["shop_1st"] = all_data["date_block_num"] - all_data["shop_id"].map(shop_first_block)

item_first_block = sales_test.groupby("item_id")["date_block_num"].min()
all_data["item_1st"] = all_data["date_block_num"] - all_data["item_id"].map(item_first_block)

# Distribution of shop seniority (shown as the cell output)
all_data["shop_1st"].value_counts()

Mean Encoding

# Expanding-mean target encoding: each row is encoded with the mean target of
# the earlier rows (in the current row order) that share the same key value.
for key, enc_col in [('item_id', 'item_target_enc'),
                     ('shop_id', 'shop_target_enc'),
                     ('item_category_id', 'category_target_enc')]:
    # Sum of the target over previous rows of the group (current row excluded)
    cumsum = all_data.groupby(key)['target'].cumsum() - all_data['target']
    # Number of previous rows of the group
    cumcnt = all_data.groupby(key).cumcount()
    # The first row of each group is 0/0 = NaN and falls back to a constant
    # (0.3343 is presumably the global target mean - TODO confirm).
    # The result is assigned back rather than filled with inplace=True on a
    # column selection, which does not update the frame under pandas
    # copy-on-write.
    all_data[enc_col] = (cumsum / cumcnt).fillna(0.3343)

# The raw identifiers are replaced by their encodings
all_data = all_data.drop(["item_id", "shop_id", "item_category_id"], axis=1)

all_data = downcast_dtypes(all_data)
del cumsum, cumcnt
gc.collect();

Month

In [None]:
# Month-of-year bucket derived from the running month index
all_data["month"] = (all_data["date_block_num"] + 1) % 12

# Mean Encoding for month: mean target of the earlier rows in the same bucket
cumsum = all_data.groupby('month')['target'].cumsum() - all_data['target']
cumcnt = all_data.groupby('month').cumcount()
# The first row of each bucket is 0/0 = NaN and falls back to a constant.
# The result is assigned back rather than filled with inplace=True on a column
# selection, which does not update the frame under pandas copy-on-write.
all_data['month_enc'] = (cumsum / cumcnt).fillna(0.3343)

# Only the encoded value is kept as a feature
all_data = all_data.drop(["month"], axis=1)

all_data = downcast_dtypes(all_data)
del cumsum, cumcnt
gc.collect();

all_data.head(5)

Split

In [None]:
# Month index of every row; reused below to separate training rows from test rows
dates = all_data['date_block_num']

# The highest month index present is the block we want predictions for
last_block = dates.max()

print('Test `date_block_num` is %d' % (last_block,))

In [None]:
# Row selectors shared by the dates, the features and the targets
is_train = dates < last_block
is_test = dates == last_block

dates_train = dates[is_train]
dates_test = dates[is_test]

# Features: everything except the columns flagged to be dropped at fitting time
X_train = all_data.loc[is_train].drop(to_drop_cols, axis=1)
X_test = all_data.loc[is_test].drop(to_drop_cols, axis=1)

# Targets are clipped to the [0, 20] range
y_train = all_data.loc[is_train, 'target'].values.clip(0,20)
y_test = all_data.loc[is_test, 'target'].values.clip(0,20)

print(X_train.head())

Just to ensure that the row order of the test file and of the X_test set match for prediction.

In [None]:
# X_test no longer carries shop_id / item_id (they are dropped after mean
# encoding), so a frame rebuilt from X_test would hold only NaN in those
# columns and could never compare equal to `test`. Instead, rebuild the
# shop/item order that the monthly grid produces for the test block (product of
# shops and items, each in order of first appearance) and compare that to `test`.
# NOTE(review): assumes the only rows of the test month in sales_test are the
# appended test rows - confirm.
compare_cols = ["ID", "shop_id", "item_id"]
X_test_compare = pd.DataFrame(
    list(product(test["shop_id"].unique(), test["item_id"].unique())),
    columns=["shop_id", "item_id"],
)
X_test_compare.insert(0, "ID", X_test_compare.index)

print(test.columns, test.shape, test.head())
print(X_test_compare.columns, X_test_compare.shape, X_test_compare.head())

# Shapes are checked first so the element-wise comparison is always well defined
same_length = len(X_test) == len(test) == len(X_test_compare)
same_order = same_length and bool(
    (test[compare_cols].values == X_test_compare[compare_cols].values).all()
)
print("Are the same", same_order)

First Level

SGD Regressor

In [None]:
# First-level model 1: standardise the features, then fit an SGD linear
# regressor with the epsilon-insensitive loss
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(loss="epsilon_insensitive"),
)
reg.fit(X_train.values, y_train)

# Predictions are clipped to the same [0, 20] range as the target
pred_reg = np.clip(reg.predict(X_test.values), 0, 20)

In [None]:
# R^2 of the SGD pipeline on the rows it was trained on (predictions clipped to
# [0, 20]); this is a fit check, not a validation score
print("r2 train: ", r2_score(y_train, reg.predict(X_train.values).clip(0,20)))

In [None]:
# Persist the fitted SGD pipeline in the current working directory
filename = 'pred_reg_model_l1.sav'
joblib.dump(reg, filename)

LightGBM

In [None]:
# First-level model 2: gradient-boosted trees.
# These parameters are reused for every fold of the second-level training loop.
lgb_params = dict(
    feature_fraction=0.75,
    metric='rmse',
    nthread=1,
    min_data_in_leaf=128,
    bagging_fraction=0.75,
    learning_rate=0.03,
    objective='mse',
    bagging_seed=128,
    num_leaves=128,
    bagging_freq=1,
    verbose=0,
)

# 100 boosting rounds on the full training set
lgb1 = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), num_boost_round=100)

# Predictions are clipped to the same [0, 20] range as the target
pred_lgb = np.clip(lgb1.predict(X_test), 0, 20)

In [None]:
# R^2 of the LightGBM model on the rows it was trained on (predictions clipped
# to [0, 20]); this is a fit check, not a validation score
print("r2 train: ", r2_score(y_train, lgb1.predict(X_train).clip(0,20)))

In [None]:
# Persist the fitted LightGBM booster in the current working directory
filename = 'pred_lgb003_model_l1.sav'
joblib.dump(lgb1, filename)

Level 2

In [None]:
# Second-level test features: one column per first-level model
# (column 0 = SGD pipeline, column 1 = LightGBM)
X_test_level2 = np.column_stack((pred_reg, pred_lgb))

print(X_test_level2.shape)

Kfold range

In [None]:
# Months that get out-of-fold predictions: from block 10 up to, but not
# including, the block to predict
months_f = np.array(list(range(10, last_block)))

# Training rows that fall inside those months
in_level2 = dates_train.isin(months_f)
dates_train_level2 = dates_train[in_level2]

# That is how we get target for the 2nd level dataset
y_train_level2 = y_train[in_level2]

print('shape of y_train_level2: {}'.format(y_train_level2.shape))

Kfold Validation

In [None]:
# Second-level training matrix: one row per training sample in the months of
# `months_f`, one column per first-level model (same column order as
# X_test_level2: 0 = SGD pipeline, 1 = LightGBM). Initialised with zeros.
X_train_level2 = np.zeros([y_train_level2.shape[0], 2])

# Fill `X_train_level2` month by month: for each month, train both first-level
# models on all earlier months and store their predictions for that month.
for cur_block_num in notebook.tqdm(months_f):

    print(cur_block_num, end='')

    # Train on every month strictly before the current one, predict the current one
    X_train_block = all_data.loc[dates < cur_block_num].drop(to_drop_cols, axis=1)
    X_test_block = all_data.loc[dates == cur_block_num].drop(to_drop_cols, axis=1)

    y_train_block = all_data.loc[dates < cur_block_num, 'target'].values.clip(0,20)

    print(':  X_train_block.shape={}'.format(X_train_block.shape), end='')
    print(',  X_test_block.shape={}'.format(X_test_block.shape), end='')
    print(',   Total Size={}'.format(X_train_block.shape[0] + X_test_block.shape[0]), end='')
    print()

    # Rows of the level-2 matrix that belong to the current month
    block_rows = (dates_train_level2 == cur_block_num).values

    # Use a fresh pipeline for every fold so the first-level `reg` model fitted
    # on the full training set is not overwritten, and pass plain arrays to both
    # fit and predict so the two calls see the same input type.
    fold_reg = make_pipeline(StandardScaler(), SGDRegressor(loss="epsilon_insensitive"))
    fold_reg.fit(X_train_block.values, y_train_block)
    X_train_level2[block_rows, 0] = fold_reg.predict(X_test_block.values).clip(0,20)

    model = lgb.train(lgb_params, lgb.Dataset(X_train_block, label=y_train_block), 100)
    X_train_level2[block_rows, 1] = model.predict(X_test_block).clip(0,20)
    
    

Train

In [None]:
# Second-level model: linear blend of the two first-level predictions
lr = LinearRegression().fit(X_train_level2, y_train_level2)

# Raw blend weights, then the same weights rescaled to sum to 1
blend_weights = lr.coef_
print('Coefficient:            {}'.format(blend_weights))
print('Normalized Coefficient: {}'.format(blend_weights / blend_weights.sum()))

Predict

In [None]:
# Stack the two first-level test predictions as columns, blend them with the
# second-level model and clip the result to [0, 20]
test_preds_stacking_lr = np.clip(
    lr.predict(np.transpose(np.vstack((pred_reg, pred_lgb)))),
    0,
    20,
)
test_preds_stacking_lr.shape

In [None]:
# R^2 of the blended model on its own training rows (predictions clipped to
# [0, 20]); this is a fit check, not a validation score
print("r2 train: ", r2_score(y_train_level2, lr.predict(X_train_level2).clip(0,20)))

In [None]:
# Persist the fitted second-level linear model in the current working directory
filename = 'pred_stack_model_l2.sav'
joblib.dump(lr, filename)

** GENERATING SUBMISSION FILE

In [None]:
# Start from the sample submission and overwrite the target column with the
# stacked predictions.
# NOTE(review): assumes the predictions are in the same row order as the sample
# submission - confirm.
submissionstackinglr = submission.copy()
submissionstackinglr["item_cnt_month"] = test_preds_stacking_lr

# Sanity check: the values should lie inside the [0, 20] clipping range
print(submissionstackinglr.item_cnt_month.min(), submissionstackinglr.item_cnt_month.max())
print(submissionstackinglr.head())

Generate the submission file to download and submit for assessment.

In [None]:
# Write the submission as CSV without the dataframe index
submissionstackinglr.to_csv('cu_full_1_5_12_reg_lgb_stacked_lr_v4.csv', index=False)

** END OF MODEL ***