Because of memory limits on this platform, I had to split the Final Project into 2 parts:

Part 1 (This kernel)
* EDA
* Features 
* Training models for validation

Part 2
* Training models for predictions
* Generating submission file

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
# NOTE: `dirname` stays bound to the last directory visited once the loop ends.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locate the folder that actually holds the competition files instead of relying on
# whichever directory the listing loop happened to visit last. If no folder containing
# 'sales_train.csv' is found, fall back to that leftover `dirname` (previous behaviour).
data_dir = next(
    (folder for folder, _, files in os.walk('/kaggle/input') if 'sales_train.csv' in files),
    dirname,
)

# Load every CSV of the competition from that folder.
sales        = pd.read_csv(os.path.join(data_dir, 'sales_train.csv'))
items        = pd.read_csv(os.path.join(data_dir, 'items.csv'))
item_cat     = pd.read_csv(os.path.join(data_dir, 'item_categories.csv'))
shops        = pd.read_csv(os.path.join(data_dir, 'shops.csv'))
test         = pd.read_csv(os.path.join(data_dir, 'test.csv'))
submission   = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))


Load Python libraries

In [None]:
import sklearn
import scipy
import seaborn
import gc
import matplotlib.pyplot as plt
%matplotlib inline 

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

from tqdm import notebook
from math import sqrt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

from itertools import product
import joblib

List Versions used

In [None]:
# Report the name and version of each main library used in this kernel.
for lib in (np, pd, scipy, sklearn, seaborn, lgb):
    print(lib.__name__, lib.__version__)

Function to downcast column types from 64 to 32 bits - taken from luliu31415926 on GitHub

In [None]:
def downcast_dtypes(df):
    '''
        Halve the width of the 64-bit numeric columns of a dataframe.

                `float64` columns become `float32`
                `int64`   columns become `int32`

        The dataframe is modified in place and also returned, so the call
        can be used either as a statement or in an assignment.
    '''

    # Wide dtype -> narrow replacement, processed in this order (floats, then ints)
    narrower = {"float64": np.float32, "int64": np.int32}

    for wide, narrow in narrower.items():
        # Columns currently stored with the wide dtype
        wide_cols = [name for name in df if df[name].dtype == wide]
        df[wide_cols] = df[wide_cols].astype(narrow)

    return df

Function to calc RMSE

In [None]:
def get_rmse(actual, predicted):
    '''
        Root mean squared error between observed and predicted values.

        Input:
                actual:    observed target values
                predicted: predictions for the same observations
        Output:
                root mean squared error: float
    '''

    # RMSE is the square root of sklearn's mean squared error
    return sqrt(mean_squared_error(actual, predicted))

**EDA STAGE**

In [None]:
import warnings
# Suppress the warnings whose message matches numpy's "size changed" notices.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# Use seaborn's "whitegrid" theme for the plots.
seaborn.set_style("whitegrid")

Function to plot data - taken from Cloistered Monkey

In [None]:
def make_figure_and_axis(x_label, y_label, title, figsize=(10, 8)):
    """Create a labelled matplotlib figure and return it with its axis.

    Args:
     x_label (str): text for the x-axis label
     y_label (str): text for the y-axis label
     title (str): text for the plot title
     figsize: tuple of width, height
    Returns:
     tuple: figure, axis
    """
    figure = plt.figure(figsize=figsize)
    axis = figure.gca()
    # Apply the three labels in a single call
    axis.set(xlabel=x_label, ylabel=y_label, title=title)
    return figure, axis

Basic analysis of the data - complementing what is already posted on the Kaggle Data page.

In [None]:
# Transaction datasets "sales" and "test"

# Display floats with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format
# Summary statistics of the training transactions
print("Sales\n", sales.describe(), "\n")

# Summary statistics of the test set
print("Test\n",test.describe())

Looking for outliers

In [None]:
# Frequency of every price, then of the two extreme prices examined here (-1 and 307980)
print(sales.item_price.value_counts())
print(sales[sales["item_price"] == -1].item_price.value_counts())
print(sales[sales["item_price"] == 307980].item_price.value_counts())
# Largest item_id among the rows carrying each extreme price
item1 = sales[sales["item_price"] == -1].item_id.max()
item2 = sales[sales["item_price"] == 307980].item_id.max()

# Price distribution of each of those two items
print("Item: ", item1, "\n", sales[sales["item_id"] == item1].item_price.value_counts())
print("Item: ", item2, "\n", sales[sales["item_id"] == item2].item_price.value_counts())

Look for item_id with price outlier in test set, to determine if it is possible to drop

In [None]:
# Number of test rows that reference the item carrying the 307980 price
print(test[test["item_id"] == item2].item_id.count())

Confirm outliers with a plot

In [None]:
# Plot every price as a dot against its row index so extreme values stand out
fig=plt.plot(sales.item_price, ".")

Actions: replace the price with the item's mean in the first case, and drop the single row of the special item so that it does not distort the model

In [None]:
# Show the rows carrying the invalid -1 price before repairing them.
invalid_price = sales["item_price"] == -1
print(sales.loc[invalid_price, ["item_id", "item_price"]])

# Replace the invalid price with the mean of the item's *valid* prices. The -1 rows are
# excluded from the mean so the invalid value does not bias its own replacement.
valid_item1_prices = sales.loc[(sales["item_id"] == item1) & ~invalid_price, "item_price"]
sales.loc[invalid_price, ["item_price"]] = valid_item1_prices.mean()

# Sanity check: no row should carry the -1 price any more (prints an empty frame).
print(sales.loc[sales["item_price"] == -1, ["item_id"]])

In [None]:
# Drop the row(s) priced at 307980, then re-check the price statistics
sales = sales[sales.item_price != 307980]
print(sales["item_price"].describe())

In [None]:
# Plot every daily quantity as a dot against its row index to look for outliers
fig=plt.plot(sales.item_cnt_day, ".")

I don't take any action in this case

In [None]:
# Histogram of transactions per month block with one bin per block. Assuming blocks are
# numbered consecutively from 0, max() + 1 bins are needed; max() alone gives one bin
# too few and merges the last two months into a single bar.
plt.hist(sales.date_block_num, bins=sales.date_block_num.max() + 1)

Exploring the shop_id distribution between sales and test data

In [None]:
# Histogram of training transactions per shop, using as many bins as distinct shop ids
plt.hist(sales.shop_id, len(sales.shop_id.unique()))

In [None]:
# Histogram of test rows per shop, using as many bins as distinct shop ids.
# NOTE(review): bins are spread evenly over the min-max id range, so if the test shop ids
# are not consecutive a bar may not correspond to exactly one shop — confirm before
# drawing conclusions from the shape.
plt.hist(test.shop_id, len(test.shop_id.unique()))

This distribution shows that the test data was synthetic; hardly any validation set extracted from the sales data will approximate it.

Prepare data for specific analysis based on competition's target

In [None]:
# Working copy of the transactions, extended with revenue and calendar parts
sales_test = sales.copy()
sales_test["item_revenue"] = sales_test.item_price*sales_test.item_cnt_day

# Date parts sliced from the raw date string (assumes a dd.mm.yyyy layout — TODO confirm)
sales_test["day"] = sales_test.date.str[0:2]
sales_test["month"] = sales_test.date.str[3:5]
sales_test["year"] = sales_test.date.str[6:10]

index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, build all combinations of the shops and items seen in that month
grid = []
for block_num in sales_test['date_block_num'].unique():
    in_block = sales_test['date_block_num'] == block_num
    cur_shops = sales_test.loc[in_block, 'shop_id'].unique()
    cur_items = sales_test.loc[in_block, 'item_id'].unique()
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Groupby data to get shop-item-month aggregates
gb = sales_test.groupby(index_cols,as_index=False).item_cnt_day.sum()
gb.columns = index_cols + ["month_qty"]

# Join it to the grid; combinations without sales get a quantity of 0
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Calendar month (1..12) and year derived from the block number, with block 0 taken as
# January 2013. The previous `(block + 1) % 12` / `(block + 1) // 12` formulas turned
# every December into month 0 of the *following* year.
all_data["month"] = all_data["date_block_num"] % 12 + 1
all_data["year"] = 2013 + all_data["date_block_num"] // 12

# Attach the category of every item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb
gc.collect();

Look for trends of the target variable over time

In [None]:
# Count plot of the number of grid rows in each month block
grid = seaborn.catplot(x="date_block_num", data=all_data, kind="count")

See item sold per month

In [None]:
# Per-block totals of every column, with the block number restored as a regular column
summed = all_data.groupby("date_block_num").sum().reset_index()

# Line plot of the total quantity sold in each month block
grid = seaborn.relplot(x="date_block_num", y="month_qty", data=summed, kind="line")

In [None]:
# "year-month" label with a zero-padded month, so that sorting the labels as text
# (which groupby does) is also chronological: "2013-02" sorts before "2013-10",
# whereas the unpadded "2013-2" sorts after it.
all_data["Date"] = all_data.year.astype(str) + "-" + all_data.month.astype(str).str.zfill(2)

# Total quantity per label; only the plotted column is aggregated
month_summed = all_data.groupby("Date", as_index=False)["month_qty"].sum()

# The two months with the highest total quantity
top_two = month_summed.sort_values("month_qty", ascending=False)[:2]

grid = seaborn.relplot(x="Date", y="month_qty", data=month_summed, kind="line")


In [None]:
# Show the two months with the highest total quantity
print(top_two[["Date", "month_qty"]])

Shop analysis

In [None]:
# Total quantity per shop, sorted ascending. Only `month_qty` is aggregated: summing the
# whole frame would also "sum" the string `Date` column, which recent pandas versions
# perform as a costly string concatenation.
group = all_data.groupby("shop_id", as_index=False)["month_qty"].sum().sort_values("month_qty")

grid = seaborn.relplot(x="shop_id", y="month_qty", data=group)

Item category analysis

In [None]:
# Total quantity per item category. Only `month_qty` is aggregated: summing the whole
# frame would also "sum" the string `Date` column, which recent pandas versions perform
# as a costly string concatenation.
group = all_data.groupby("item_category_id", as_index=False)["month_qty"].sum()

grid = seaborn.relplot(x="item_category_id", y="month_qty", data=group)

Most frequently sold category

In [None]:
# Total quantity per (month label, category); only the quantity is aggregated, since
# sums of id columns carry no meaning
category_group = all_data.groupby(["Date", "item_category_id"], as_index=False)["month_qty"].sum()

# Row with the highest monthly quantity. `idxmax` returns an index *label*, so the row is
# fetched with label-based `.loc` rather than positional `.iloc`.
biggest = category_group.loc[category_group["month_qty"].idxmax()]

# Full monthly history of the category that produced that maximum
biggest_category = category_group[category_group.item_category_id == biggest.item_category_id]

grid = seaborn.relplot(x="Date", y="month_qty", data=biggest_category, kind="line")

In [None]:
# Show the (month, category) row with the highest quantity
print("Biggest category\n", biggest)

Once again, this confirms that sales decrease over time

*** TRAINING STAGE ***

Here I used the code from the "Ensembling implementation" programming assignment of the course to generate the grid matrix of features.  I made some adjustments to run the code and added item_price as a new feature.

In [None]:
# Fresh working copy of the (cleaned) transactions
sales_test = sales.copy()
# Key columns that identify one row of the grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales_test['date_block_num'].unique():
    cur_shops = sales_test.loc[sales_test['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales_test.loc[sales_test['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Groupby data to get shop-item-month aggregates (the prediction target)
gb = sales_test.groupby(index_cols,as_index=False).item_cnt_day.sum()
gb.columns = index_cols + ["target"]

# Join it to the grid; combinations without sales get a target of 0
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates

# Earlier variant, kept for reference: quantity only, without the price aggregate
#gb = sales_test.groupby(['shop_id', 'date_block_num'],as_index=False).item_cnt_day.sum()
#gb.columns = ["shop_id", "date_block_num", "target_shop"]

# Per shop and month: total quantity sold and highest item price
gb = sales_test.groupby(['shop_id', 'date_block_num'],as_index=False).agg({"item_cnt_day":"sum","item_price":"max"})
gb.columns = ["shop_id", "date_block_num", "target_shop", "max_price"]

all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales_test.groupby(['item_id', 'date_block_num'],as_index=False).item_cnt_day.sum()
gb.columns = ["item_id", "date_block_num", "target_item"]

all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb 
gc.collect();

Now we are going to add the most important features: lags for target, item, shop and price -- adapted from the "Ensembling implementation" programming assignment.  I manually tested different lag ranges, and in the end the best metrics were obtained with lags 1 to 5 plus 12.

In [None]:
# List of columns that we will use to create lags
cols_to_rename = list(all_data.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in notebook.tqdm(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from previous month block
#all_data = all_data[all_data['date_block_num'] >= 12] 

# I am going to use whole data available for testing

# List of all lagged features
fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 

#This line is needed with 2 ´date_block_num´  -- was used to fix error when you choose only 2 items witnin range
#fit_cols += [col for col in all_data.columns if col[-2] in [str(item) for item in shift_range]]

# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')

all_data = downcast_dtypes(all_data)
gc.collect();

Here we are going to add new features related to the first time each item_id and shop_id appears in the training set. I then subtract that first block from date_block_num in each row to measure seniority in the training set.

In [None]:
# First block in which each shop has a transaction
shop_first_block = sales_test.groupby("shop_id").date_block_num.min()
all_data["shop_1st"] = all_data["shop_id"].map(shop_first_block)
# Number of blocks elapsed since the shop's first transaction
all_data["shop_1st"] = all_data["date_block_num"] - all_data["shop_1st"]

# First block in which each item has a transaction
item_first_block = sales_test.groupby("item_id").date_block_num.min()
all_data["item_1st"] = all_data["item_id"].map(item_first_block)
# Number of blocks elapsed since the item's first transaction
all_data["item_1st"] = all_data["date_block_num"] - all_data["item_1st"]

# Distribution of the shop seniority values
all_data.shop_1st.value_counts()

Adding Mean Encoding for shop_id, item_id and item_category_id.  

In this case I tested an expanding mean scheme for regularization purposes, but the model metrics were not favorable, so it was omitted from the final version.

# Expanding-mean encoding of the target for item_id, shop_id and item_category_id.
# For every row, the encoding is the mean target of the *previous* rows of the same group.
# The first row of a group has no history (0 / 0 -> NaN) and receives 0.3343
# (presumably the global mean of the target — TODO confirm).
# The result of `fillna` is assigned back explicitly: calling
# `all_data[col].fillna(..., inplace=True)` acts on a temporary object and is not
# guaranteed to update `all_data` in recent pandas versions.
for key_col, enc_col in [('item_id', 'item_target_enc'),
                         ('shop_id', 'shop_target_enc'),
                         ('item_category_id', 'category_target_enc')]:
    cumsum = all_data.groupby(key_col)['target'].cumsum() - all_data['target']
    cumcnt = all_data.groupby(key_col).cumcount()
    all_data[enc_col] = (cumsum / cumcnt).fillna(0.3343)

# The raw ids are replaced by their encodings
all_data = all_data.drop(["item_id", "shop_id", "item_category_id"], axis=1)

all_data = downcast_dtypes(all_data)
del cumsum, cumcnt
gc.collect();

Now I am going to add the month to the grid, based on date_block_num, so that this new feature captures the seasonality represented by the month of the year - this seasonality of sales was seen in the histogram of the data.

In [None]:
# Month-of-year feature derived from the block number.
# NOTE: with this formula blocks 11, 23, ... are encoded as 0 rather than 12.
all_data["month"]=(all_data["date_block_num"]+1)%12
all_data.head(5)

Then we should encode the month, the same as before, because this column behaves like a categorical feature.  In the end it was omitted because of the model metrics results.

# Expanding-mean encoding of the target for month: for every row, the mean target of the
# previous rows with the same month. Rows without history (0 / 0 -> NaN) receive 0.3343
# (presumably the global mean of the target — TODO confirm).
cumsum = all_data.groupby('month')['target'].cumsum() - all_data['target']
cumcnt = all_data.groupby('month').cumcount()
# Assign the filled series back explicitly: `all_data[col].fillna(..., inplace=True)` acts
# on a temporary object and is not guaranteed to update `all_data` in recent pandas.
all_data['month_enc'] = (cumsum / cumcnt).fillna(0.3343)

# The raw month is replaced by its encoding
all_data = all_data.drop(["month"], axis=1)

all_data = downcast_dtypes(all_data)
del cumsum, cumcnt
gc.collect();

Split the transaction dataframe into train and test. Since this is a regression over time and predictions are made by month, we reserve the last block/month as the validation set (test set).

In [None]:
# Block number of every grid row; used below as the mask source for all splits
dates = all_data['date_block_num']

last_block = dates.max()   #This will be our validation set

print('Test `date_block_num` is %d' % last_block)

Now we split the data as explained, but we must delete from the feature matrices all columns related to the target we want to predict.

In [None]:
# Block numbers of the training rows (before the last block) and of the validation rows
dates_train = dates[dates <  last_block]
dates_test  = dates[dates == last_block]

# Feature matrices: rows selected by block, without the columns listed in `to_drop_cols`
X_train = all_data.loc[dates <  last_block].drop(to_drop_cols, axis=1)
X_test =  all_data.loc[dates == last_block].drop(to_drop_cols, axis=1)

# Targets, clipped to the [0, 20] range
y_train = all_data.loc[dates <  last_block, 'target'].values.clip(0,20)
y_test =  all_data.loc[dates == last_block, 'target'].values.clip(0,20)

print(X_train.head())

APPLYING MODELS OF THE 1st LEVEL OF ENSEMBLING

**Linear Regression**

In [None]:
# Level-1 model: ordinary linear regression fitted on the training blocks
lr = LinearRegression()
lr.fit(X_train.values, y_train)
# Validation predictions, clipped to the same [0, 20] range as the target
pred_lr = lr.predict(X_test.values).clip(0,20)

Calculate RMSE and R2 for the Linear Regression model. These are used to manually select the best feature set until the best R2 is obtained.

In [None]:
# Validation RMSE, plus R2 on the training and validation sets (predictions clipped to [0, 20])
rmse_lr = get_rmse(y_test, pred_lr)
print("rmse: ", rmse_lr)
print("r2 train: ", r2_score(y_train, lr.predict(X_train.values).clip(0,20)))
print("r2 test: ", r2_score(y_test, pred_lr))

Serialize the model

In [None]:
# Persist the fitted linear regression to the working directory with joblib
filename = 'lr_model_l1.sav'
joblib.dump(lr, filename)

SGD Regressor

In [None]:
# Level-1 model: standardised features followed by an SGD regressor with the
# epsilon-insensitive loss
# NOTE(review): no random_state is set, so results may vary between runs — confirm
reg = make_pipeline(StandardScaler(), SGDRegressor(loss="epsilon_insensitive"))
reg.fit(X_train.values, y_train)
# Validation predictions, clipped to the same [0, 20] range as the target
pred_reg = reg.predict(X_test.values).clip(0,20)

Calc RMSE for model SGD Regressor

In [None]:
# Validation RMSE, plus R2 on the training and validation sets (predictions clipped to [0, 20])
rmse_reg = get_rmse(y_test, pred_reg)
print("rmse: ", rmse_reg)
print("r2 train: ", r2_score(y_train, reg.predict(X_train.values).clip(0,20)))
print("r2 test: ", r2_score(y_test, pred_reg))

Serialize

In [None]:
# Persist the fitted scaler + SGD pipeline to the working directory with joblib
filename = 'reg_model_l1.sav'
joblib.dump(reg, filename)

I tested all loss types and parameters of SGDRegressor in order to obtain better performance than linear regression.
I also tried other models, such as SVR from svm and Logistic Regression, but none could improve on linear regression.

LightGBM Gradient Boosting with decision trees.

In [None]:
# LightGBM training parameters, also reused for the per-block models of the 2nd level
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

# Level-1 model: 100 boosting rounds on the training blocks
lgb1 = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)    
# Validation predictions, clipped to the same [0, 20] range as the target
pred_lgb = lgb1.predict(X_test).clip(0,20)

Calculate RMSE and R2 for the LightGBM model.  These metrics were the key to optimizing the learning rate and the number of iterations; I tested over 20 combinations to obtain better performance.

In [None]:
# Validation RMSE, plus R2 on the training and validation sets (predictions clipped to [0, 20])
rmse_lgb = get_rmse(y_test, pred_lgb)
print("rmse: ", rmse_lgb)
print("r2 train: ", r2_score(y_train, lgb1.predict(X_train.values).clip(0,20)))
print("r2 test: ", r2_score(y_test, pred_lgb))

Serialize

In [None]:
# Persist the trained LightGBM booster to the working directory with joblib
filename = 'lgb003_model_l1.sav'
joblib.dump(lgb1, filename)

Plot feature importance

In [None]:
# Pair each training column name with the importance reported by the booster
varimp = lgb1.feature_importance()
names = X_train.columns.values
var_array = pd.DataFrame(list(zip(names, varimp)))
var_array.columns=["names", "varimp"]
# Bar chart with one bar per feature
var_array.plot(kind="bar", x="names", y="varimp")

Here you can observe the feature importance of the model: the IDs are the most important, then the season feature (month of the year), then the lag 1 group and the seniority measures for shop and item.

BEGINS 2nd LEVEL FOR ENSEMBLING

The level-2 test features are taken directly from the predictions of two level-1 models

In [None]:
# Level-2 test features: column 0 = SGD pipeline predictions, column 1 = LightGBM predictions
X_test_level2 = np.c_[pred_reg, pred_lgb]

print(X_test_level2.shape)

I am going to use the "KFold scheme in time series" to validate the 2nd level model of the stacking procedure.

In [None]:
# Blocks 11 .. last_block - 1: the months for which level-2 meta-features are generated
months_f=np.array([i for i in range(11,last_block)])
# Block numbers of the training rows that fall in those months
dates_train_level2 = dates_train[dates_train.isin(months_f)]

# That is how we get target for the 2nd level dataset
y_train_level2 = y_train[dates_train.isin(months_f)]

print('shape of y_train_level2: {}'.format(y_train_level2.shape))

Again, I used the routine from the course's programming assignment, with some adjustments to the parameters and code to make it usable here.

In [None]:
# And here we create 2nd level feature matrix, init it with zeros first
# 2nd level feature matrix, initialised with zeros:
# column 0 = SGD pipeline predictions, column 1 = LightGBM predictions
# (same column order as `X_test_level2`)
X_train_level2 = np.zeros([y_train_level2.shape[0], 2])

# Fill `X_train_level2` with meta-features: for every month in `months_f`, train both
# level-1 models on all earlier blocks and predict that month.
for cur_block_num in notebook.tqdm(months_f):

    print(cur_block_num, end='')

    # Rows strictly before the current block train the models; the block itself is predicted
    X_train_block = all_data.loc[dates < cur_block_num].drop(to_drop_cols, axis=1)
    X_test_block = all_data.loc[dates == cur_block_num].drop(to_drop_cols, axis=1)

    y_train_block = all_data.loc[dates <  cur_block_num, 'target'].values.clip(0,20)

    print(':  X_train_block.shape={}'.format(X_train_block.shape), end='')
    print(',  X_test_block.shape={}'.format(X_test_block.shape), end='')
    print(',   Total Size={}'.format(X_train_block.shape[0] + X_test_block.shape[0]), end='')
    print()

    # Rows of the level-2 matrix that belong to the current block
    level2_rows = (dates_train_level2 == cur_block_num).values

    # Fit on plain arrays, like the level-1 fit, so that fit and predict receive the same
    # kind of input (fitting on a DataFrame and predicting on an array makes scikit-learn
    # warn about missing feature names on every iteration).
    reg.fit(X_train_block.values, y_train_block)
    X_train_level2[level2_rows, 0] = reg.predict(X_test_block.values).clip(0,20)

    model = lgb.train(lgb_params, lgb.Dataset(X_train_block, label=y_train_block), 100)
    X_train_level2[level2_rows, 1] = model.predict(X_test_block).clip(0,20)
    
    

Train 2nd level model, Linear regression in this case

In [None]:
# Re-fit the `lr` estimator, this time as the level-2 model on the two meta-features
lr.fit(X_train_level2, y_train_level2)

# Weight given to each meta-feature, raw and rescaled so that the weights sum to 1
print('Coefficient:            {}'.format(lr.coef_))
print('Normalized Coefficient: {}'.format(lr.coef_ / lr.coef_.sum()))

In [None]:
# Predict the validation month with the level-2 model. The input must carry the same
# meta-features, in the same order, as the matrix the model was trained on: SGD pipeline
# predictions first, LightGBM predictions second — which is exactly `X_test_level2`.
# (Feeding `pred_lr` as first column, as before, mixes in a model the meta-learner
# never saw.)
test_preds_stacking_lr = lr.predict(X_test_level2).clip(0,20)
test_preds_stacking_lr.shape

Calculate RMSE and R2 for this stacked model - level 2

In [None]:
# Validation RMSE of the stacked model, plus R2 on the level-2 training set and on the
# validation set (predictions clipped to [0, 20])
rmse_lr_stack = get_rmse(y_test, test_preds_stacking_lr)
print("rmse: ", rmse_lr_stack)
print("r2 train: ", r2_score(y_train_level2, lr.predict(X_train_level2).clip(0,20)))
print("r2 test: ", r2_score(y_test, test_preds_stacking_lr))

Serialize

In [None]:
# Persist the fitted level-2 linear regression to the working directory with joblib
filename = 'stack_model_lr_l2.sav'
joblib.dump(lr, filename)

** END OF VALIDATION MODEL ***

Continue with "Final Project CUribe.co Part 2/2" kernel 

https://www.kaggle.com/curibe10/final-project-curibe-co-part-2-2