This notebook is the final project for the Coursera course *"How to Win a Data Science Competition: Learn from Top Kagglers"*.  

**Summary**:  
I used XGBoost to train the model. For ensembling, I combined XGBoost with ElasticNet. Parameters for ElasticNet have been tuned using TimeSeriesSplit and GridSearchCV. The most important feature is the previous item sales for a given shop_id/item_id pair. As a development tool, I used a Kaggle Jupyter notebook without acceleration.
  
**Pipeline:**
- [Importing Required Libraries](#import)
- [Data Loading](#data_loading)
- [Exploratory Data Analysis and Outliers Removal](#data_cleaning)
- [Data Pre-Processing](#data_preprocess)
- [Feature Engineering](#data_engineering)
- [Model Training](#model_training)  
    - [Scores](#scoring)
    - [Features Plot](#features_plot)
- [First Submission](#first_submission)
- [Ensembling](#model_ensemble)
    - [GridSearchCV for Linear Regression](#gridsearch_elasticnet)
    - [Building Test & Train Meta Features](#meta_features)
- [Second Submission](#second_submission)

<a id='import'></a>
# Importing Required Libraries

In [None]:
import numpy as np 
import pandas as pd
import time
import lightgbm as lgb
from itertools import product
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import sklearn.model_selection as skt
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from xgboost import plot_importance
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from numpy import savetxt
from numpy import loadtxt
from sklearn.linear_model import Ridge
import seaborn as sns
import gc
import os
import pickle
from tqdm import tqdm_notebook
from sklearn.model_selection import PredefinedSplit
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV


%matplotlib inline

#List every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

#Raise the pandas display limits (rows / columns shown)
pd.set_option('display.max_rows', 1100)
pd.set_option('display.max_columns', 100)

<a id='data_loading'></a>
# Data Loading

In [None]:
DATA_FOLDER = '/kaggle/input/competitive-data-science-predict-future-sales/'


def _read_competition_csv(file_name):
    """Load one CSV file from the competition data folder."""
    return pd.read_csv(os.path.join(DATA_FOLDER, file_name))


train = _read_competition_csv('sales_train.csv')
items = _read_competition_csv('items.csv')
item_categories = _read_competition_csv('item_categories.csv')
shops = _read_competition_csv('shops.csv')
sample_sub = _read_competition_csv('sample_submission.csv')
test = _read_competition_csv('test.csv')
#Translated version of the shops data, read from a separate dataset;
#it is used further down to derive the city of each shop
cities = pd.read_csv('../input/shops-en/shops-en.csv')

<a id='data_cleaning'></a>
# Exploratory Data Analysis and Outliers Removal

In [None]:
#EDA - box plots used to spot outliers
#Daily item counts, x axis restricted to [-100, 3000]
plt.figure(figsize=(10, 4))
plt.xlim(-100, 3000)
sns.boxplot(x=train['item_cnt_day'])

#Item prices, x axis from the lowest price up to 110% of the highest one
price_low = train['item_price'].min()
price_high = train['item_price'].max() * 1.1
plt.figure(figsize=(10, 4))
plt.xlim(price_low, price_high)
sns.boxplot(x=train['item_price'])

In [None]:
#We remove outliers and fix negative prices
#.copy() makes the filtered frame independent of the original, so the .loc
#assignment below does not trigger a chained-assignment warning
train = train[(train.item_price < 100000) & (train.item_cnt_day < 1200)].copy()
#Negative prices are replaced by the median price of item 2973.
#The median is taken over the valid (non-negative) prices only, so the bad
#value does not take part in computing its own replacement
valid_prices = train.loc[(train['item_id'] == 2973) & (train['item_price'] >= 0), 'item_price']
median = np.median(valid_prices)
train.loc[train['item_price'] < 0, 'item_price'] = median

<a id='data_preprocess'></a>
# Data Pre-Processing

In [None]:
#We pre-process the shops data to extract city information
#The city is taken to be the second space-separated token of shop_name
#NOTE(review): x[1] raises IndexError for a shop_name without a space - confirm none exist
cities['city'] = cities['shop_name'].str.split(" ").map(lambda x: x[1])
#Manual adjustment for some rows to fix city names
#NOTE(review): iloc addresses rows 20/42/43 and column position 2 - presumably the new
#'city' column, which holds only if shops-en.csv has exactly two columns - confirm
cities.iloc[20,2] = "Moscow"
cities.iloc[42,2] = "St. Petersburg"
cities.iloc[43,2] = "St. Petersburg"

In [None]:
def optimize_memory(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Columns whose dtype is int8/int16/int32/int64 go through ``pd.to_numeric``
    with ``downcast='integer'``; float32/float64 columns with
    ``downcast='float'``. Every other column is left untouched. The size after
    downcasting and the percentage saved are printed.

    Returns the same (mutated) DataFrame.
    """
    bytes_per_mb = 1024 ** 2
    size_before = df.memory_usage().sum() / bytes_per_mb

    #Decide the downcast target of every numeric column first, then apply it
    downcast_for = {}
    for col in df:
        col_dtype = df[col].dtype
        if col_dtype in ['int8', 'int16', 'int32', 'int64']:
            downcast_for[col] = 'integer'
        elif col_dtype in ['float32', 'float64']:
            downcast_for[col] = 'float'
    for col, target in downcast_for.items():
        df[col] = pd.to_numeric(df[col], downcast=target)

    size_after = df.memory_usage().sum() / bytes_per_mb
    saved_pct = 100 * (size_before - size_after) / size_before
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(size_after, saved_pct))
    return df

In [None]:
#Data preparation and pre-processing
#Build one row per (month, shop, item) combination: for each month, every shop
#with sales in that month is paired with every item sold in that month
index_cols = ['date_block_num', 'shop_id', 'item_id']
months = train['date_block_num'].unique()
grid = []
for mth in months:
    month_sales = train[train['date_block_num'] == mth]
    shop_ids = month_sales.shop_id.unique()
    item_ids = month_sales.item_id.unique()
    #int16 keeps the intermediate arrays small; assumes every id fits in int16 - TODO confirm
    grid.append(np.array(list(product([mth], shop_ids, item_ids)), dtype='int16'))

grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

#Monthly sales per (month, shop, item); combinations without any sale get 0
gb = (train.groupby(index_cols, as_index=False)['item_cnt_day'].sum()
           .rename(columns={'item_cnt_day': 'item_cnt_month'}))
df_sales = grid.merge(gb, how='left', on=index_cols).fillna(0)

#Mean price of each item over the whole training data
gb = (train.groupby('item_id', as_index=False)['item_price'].mean()
           .rename(columns={'item_price': 'avg_item_price'}))
df_sales = df_sales.merge(gb, how='left', on='item_id').fillna(0)

#Clip target values to [0, 20] and order the rows by month, shop, item
df_sales['item_cnt_month'] = df_sales['item_cnt_month'].clip(0, 20)
df_sales = df_sales.sort_values(index_cols)

#Free memory
del grid, gb, month_sales
gc.collect()

In [None]:
#The test set gets the next date block number (34)
test['date_block_num'] = 34
#Append the test rows to the training grid and drop the submission ID.
#Columns that test does not carry come out of the concat as NaN and are zero-filled
#NOTE(review): presumably this also zero-fills avg_item_price on the test rows - confirm
#that a price of 0 is intended there
df_sales = (
    pd.concat([df_sales, test], ignore_index=True)
    .drop(columns='ID')
    .fillna(0)
)

In [None]:
#Attach the item information (item_name is dropped right away)
df_sales = df_sales.merge(items, how='left', on='item_id').drop(columns='item_name')
#Attach the city of each shop (shop_name is dropped right away)
df_sales = df_sales.merge(cities, how='left', on='shop_id').drop(columns='shop_name')
#Replace the city name by an integer code
city_encoder = LabelEncoder()
df_sales['city_code'] = city_encoder.fit_transform(df_sales['city'])
df_sales = df_sales.drop(columns='city')

In [None]:
#Calendar features: month number (date block modulo 12), then days and holidays
#in that month. Both lookup tables are indexed by the month number 0-11
df_sales['month'] = df_sales['date_block_num'].mod(12)
days = pd.Series([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
holidays = pd.Series([8, 1, 1, 0, 2, 1, 0, 0, 0, 0, 1, 0])
df_sales['days'] = df_sales['month'].map(days)
df_sales['holidays'] = df_sales['month'].map(holidays)

<a id='data_engineering'></a>
# Feature Engineering

In [None]:

def shift_feature(df, lags, feature, drop=True):
    """Add lagged copies of ``feature`` to ``df``, one column per entry of ``lags``.

    Each new column is named ``<feature>_<lag>`` and holds ``feature`` shifted
    by ``lag`` rows within its (shop_id, item_id) group, so the lag counts rows
    of the pair in their current order. Positions without an earlier row are NaN.

    When ``drop`` is true the un-lagged ``feature`` column is removed afterwards.
    ``df`` is modified in place and also returned.
    """
    for lag in lags:
        lagged_name = '{}_{}'.format(feature, lag)
        per_pair = df.groupby(['shop_id', 'item_id'])[feature]
        df[lagged_name] = per_pair.shift(periods=lag)
    if drop:
        df.drop(columns=[feature], inplace=True)
    return df

#Lags are taken by shifting rows within each shop_id/item_id pair, so a lag of 1 is the pair's
#previous available month: a value is produced even when the immediately preceding month
#is not available for that pair, which has been rewarding in terms of score
def add_feature(df, grp_cols, feature):
    """Return ``df`` with a new column ``feature``: the mean of item_cnt_month per group.

    Groups are defined by ``grp_cols``; the group mean is joined back onto every
    row with a left merge on those columns. ``df`` itself is not modified.
    """
    group_means = (
        df.groupby(grp_cols)['item_cnt_month']
        .mean()
        .rename(feature)
        .reset_index()
    )
    return df.merge(group_means, on=grp_cols, how='left')

In [None]:
#Mean Encoding Part 1 (the encoding is split in parts to free some memory in between)
#Lags of the target itself; the target column is kept
df_sales = shift_feature(df_sales, [1, 2, 3], 'item_cnt_month', False)

#Mean target per group, then lagged; only the lagged copies stay in the frame
_part1_encodings = [
    (['date_block_num', 'item_id'], 'date_item_avg_cnt'),
    (['date_block_num', 'shop_id'], 'date_shop_avg_cnt'),
    (['date_block_num', 'item_category_id'], 'date_cat_avg_cnt'),
    (['date_block_num'], 'date_avg_item_cnt'),
]
for _grp_cols, _feature in _part1_encodings:
    df_sales = add_feature(df_sales, _grp_cols, _feature)
    df_sales = shift_feature(df_sales, [1, 2, 3], _feature)

#Clean-up: missing lags become 0, then dtypes are downcast
df_sales.fillna(0, inplace=True)
df_sales = optimize_memory(df_sales)
gc.collect()

In [None]:
#Mean Encoding Part 2
#Mean target per month number (and per month number + key), then lagged;
#each entry is (grouping columns, feature name, lags)
_part2_encodings = [
    (['month'], 'month_target', [1, 2, 3, 12]),
    (['month', 'item_id'], 'month_item_target', [1, 2, 3]),
    (['month', 'shop_id'], 'month_shop_target', [1, 2, 3]),
    (['month', 'item_category_id'], 'month_cat_target', [1, 2, 3, 12]),
]
for _grp_cols, _feature, _lags in _part2_encodings:
    df_sales = add_feature(df_sales, _grp_cols, _feature)
    df_sales = shift_feature(df_sales, _lags, _feature)

#Clean-up: missing lags become 0, then dtypes are downcast
df_sales.fillna(0, inplace=True)
df_sales = optimize_memory(df_sales)
gc.collect()

In [None]:
#Mean Encoding Part 3
#Average sales per day of the month (monthly count / days in month), then lagged
df_sales['date_item_day'] = df_sales['item_cnt_month'] / df_sales['days']
df_sales = shift_feature(df_sales,[1,2,3,12],'date_item_day')

#Mean target per (month, item, city), then lagged
df_sales = add_feature(df_sales,['date_block_num','item_id','city_code'],'date_item_city_target')
df_sales = shift_feature(df_sales,[1,2,3,12],'date_item_city_target')

#We measure the difference between item price vs average item price
#avg_pair_price is the mean of avg_item_price over all rows of a (shop, item) pair;
#delta_item_price is its relative difference to avg_item_price
#NOTE(review): avg_item_price is merged per item_id earlier in the notebook and zero-filled
#on the appended test rows, so presumably delta_item_price divides by zero on those rows
#before it is lagged and dropped - confirm this is intended
gb = df_sales.groupby(['shop_id','item_id'],as_index = False).agg({ 'avg_item_price':'mean'})
gb.rename(columns ={'avg_item_price':'avg_pair_price'},inplace = True)
df_sales = pd.merge(df_sales,gb,how='left',on=['shop_id','item_id']).fillna(0)
df_sales['delta_item_price'] = (df_sales['avg_pair_price'] - df_sales['avg_item_price'])/df_sales['avg_item_price']
#Only the lagged copies of the three price columns are kept (drop defaults to True)
df_sales = shift_feature(df_sales,[1,2,3,6],'avg_item_price')
df_sales = shift_feature(df_sales,[1,2,3,6],'avg_pair_price')
df_sales = shift_feature(df_sales,[1,2,3,6],'delta_item_price')

#Clean-up: missing lags become 0, then dtypes are downcast
df_sales.fillna(0,inplace = True)
df_sales = optimize_memory(df_sales)
del gb
gc.collect()

In [None]:
#Discard the first year (date blocks 0-11); lags of up to 12 rows were built above
_bytes_per_mb = 1024 ** 2
start_mem = df_sales.memory_usage().sum() / _bytes_per_mb

df_sales = df_sales.loc[df_sales['date_block_num'] > 11]

end_mem = df_sales.memory_usage().sum() / _bytes_per_mb
saved_pct = 100 * (start_mem - end_mem) / start_mem
print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
gc.collect()

In [None]:
#Save the engineered data set to the current directory so that a later run can
#reload it instead of repeating the preprocessing
df_sales.to_pickle('df_sales.pkl')

<a id='model_training'></a>
# Model Training

In [None]:
#Uncomment to load the saved data and avoid re-processing it

#df_sales = pd.read_pickle('../input/predit-sales/df_sales.pkl')

In [None]:
#Time-based split: blocks before 33 train the model, block 33 validates it
#A grid search on XGBoost would take very long, so a fixed validation month is used
#together with the built-in evaluation of XGBoost to determine the best iteration
_block = df_sales['date_block_num']
_in_train, _in_val, _in_test = _block < 33, _block == 33, _block == 34

X_train = df_sales[_in_train].drop(columns='item_cnt_month')
y_train = df_sales.loc[_in_train, 'item_cnt_month']
X_val = df_sales[_in_val].drop(columns='item_cnt_month')
y_val = df_sales.loc[_in_val, 'item_cnt_month']
#Test set (block 34)
X_test = df_sales[_in_test].drop(columns='item_cnt_month')
gc.collect()

In [None]:
#Model training and fitting
#NOTE(review): the name `xgb` rebinds the `import xgboost as xgb` alias from the imports
#cell to this estimator; the cells below use `xgb` as the fitted model
ts = time.time()

xgb_params = {
    'max_depth': 8,
    'n_estimators': 1000,
    'min_child_weight': 300,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eta': 0.3,
    'seed': 42,
}
xgb = XGBRegressor(**xgb_params)

#RMSE is reported on both sets; early stopping is set to 10 rounds
#NOTE(review): recent xgboost releases expect eval_metric / early_stopping_rounds on the
#constructor rather than on fit() - confirm against the installed version
eval_sets = [(X_train, y_train), (X_val, y_val)]
xgb.fit(
    X_train,
    y_train,
    eval_set=eval_sets,
    eval_metric="rmse",
    early_stopping_rounds=10,
    verbose=True,
)

#Elapsed training time in seconds (shown as the cell output)
time.time() - ts

In [None]:
#Saving the model
#The with-block closes the file handle as soon as the dump is complete
with open("xgb.dat", "wb") as model_file:
    pickle.dump(xgb, model_file)

#Uncomment to load pre-saved model
#NOTE: only unpickle files you trust - pickle can run arbitrary code while loading
#with open("../input/predit-sales/xgbmodel.dat", "rb") as model_file:
#    xgb = pickle.load(model_file)

<a id='scoring'></a>
# Scores

In [None]:
#Prediction and score results for XGBoost model
#Predictions are clipped to [0, 20]
y_pred = xgb.predict(X_val).clip(0,20)
y_pred_tr = xgb.predict(X_train).clip(0,20)
#RMSE is computed as sqrt(MSE) rather than mean_squared_error(..., squared=False):
#the `squared` argument is not available in every scikit-learn release
rmse_tr = np.sqrt(mean_squared_error(y_train, y_pred_tr))
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE Validation: %.5f" % rmse_val)
print("RMSE Training: %.5f" % rmse_tr)

<a id='features_plot'></a>
# Features Plot

In [None]:
#We plot the most important features of the model
def plot_features(booster, figsize):
    """Draw the feature-importance chart of ``booster`` on a new figure of size ``figsize``.

    Returns whatever ``plot_importance`` returns.
    """
    _, axis = plt.subplots(figsize=figsize)
    importance_plot = plot_importance(booster=booster, ax=axis)
    return importance_plot

plot_features(xgb, (10,14))

<a id='first_submission'></a>
# First Submission

In [None]:
#First submission
#Predictions for block 34, clipped to [0, 20]
y_test = xgb.predict(X_test).clip(0,20)
#The ID column of test is used instead of its row index, so the ids stay correct
#even when the two differ.
#NOTE(review): this relies on X_test keeping the row order of `test` - confirm
submission = pd.DataFrame({
    "ID": test['ID'].values,
    "item_cnt_month": y_test
})
submission.to_csv('submission_1.csv', index=False)

<a id='model_ensemble'></a>
# Ensembling

In [None]:
#The validation month is now folded back in: the full training set is every block
#before 34, which gives the models more data to learn from
_labelled = df_sales['date_block_num'] < 34
X_train = df_sales[_labelled].drop(columns='item_cnt_month')
y_train = df_sales.loc[_labelled, 'item_cnt_month']

<a id='gridsearch_elasticnet'></a>
# GridSearchCV for Linear Regression

In [None]:
#GridSearchCV with a time-series split for ElasticNet, run before creating the
#meta features for ensembling
#The grid holds a single candidate (max_iter=1500)
param_search = {"max_iter": [1500]}
elanet = ElasticNet()
#10 time-ordered splits over the rows of X_train
splitter = TimeSeriesSplit(n_splits=10)
tscv = splitter.split(X_train)
gsearch = GridSearchCV(
    elanet,
    param_grid=param_search,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=2,
    verbose=3,
)
gsearch.fit(X_train, y_train)
gc.collect()

<a id='meta_features'></a>
# Building Test and Train Meta Features

In [None]:
#Months whose predictions form the level-2 training set
_level2_months = [28, 29, 30, 31, 32, 33]

#Date block of every training row, and the part of it that falls in the level-2 months
dates_train = X_train['date_block_num']
_is_level2 = dates_train.isin(_level2_months)
dates_train_level2 = dates_train[_is_level2]

#Test meta features: one column per first-level model (linear model, XGBoost),
#each clipped to [0, 20]
pred_lr = gsearch.predict(X_test).clip(0,20)
pred_xgb = xgb.predict(X_test).clip(0,20)
X_test_level2 = np.column_stack([pred_lr, pred_xgb])

#Train meta features: targets of the level-2 months and a zero-initialised
#(n_rows, 2) matrix that is filled later
y_train_level2 = y_train[_is_level2]
X_train_level2 = np.zeros((y_train_level2.shape[0], 2))

In [None]:
del gsearch,pred_lr,pred_xgb
gc.collect()

ts = time.time()
lr = ElasticNet(max_iter = 1500)

#Train meta features: for each level-2 month, both models are fitted on all earlier
#months and predict that month. Column 0 holds the linear model, column 1 XGBoost
for cur_block_num in [28,29,30,31,32,33]:

    print(cur_block_num)
    X_train_part = X_train[dates_train < cur_block_num]
    y_train_part = y_train[dates_train < cur_block_num]
    X_test_part  = X_train[dates_train == cur_block_num]
    y_test_part  = y_train[dates_train == cur_block_num]
    print('Linear Regression Model')
    lr.fit(X_train_part,y_train_part)
    #Clipped to [0, 20] like the test meta features, so the blend weight is tuned
    #on the same prediction range it is later applied to
    pred_lr = lr.predict(X_test_part).clip(0,20)
    print('Fitting XGBoost Model')
    #NOTE(review): the predicted month is also part of eval_set while early stopping is
    #enabled, so its targets may influence the number of boosting rounds - confirm intended
    xgb.fit(X_train_part,y_train_part,eval_metric="rmse", eval_set=[(X_train_part, y_train_part), (X_test_part, y_test_part)], 
            verbose=True, early_stopping_rounds = 10)
    pred_xgb = xgb.predict(X_test_part).clip(0,20)
    X_train_level2[dates_train_level2 == cur_block_num,0] = pred_lr
    X_train_level2[dates_train_level2 == cur_block_num,1] = pred_xgb

#Elapsed time in seconds (shown as the cell output)
time.time() - ts

In [None]:
#Scatter plot of the two train meta features against each other:
#column 0 (linear model predictions) on x, column 1 (XGBoost predictions) on y
plt.scatter(X_train_level2[:,0],X_train_level2[:,1],marker='o');

In [None]:
#Save train meta features to avoid repeated processing
#The with-block closes the file handle as soon as the dump is complete
with open("X_train_level2.dat", "wb") as meta_file:
    pickle.dump(X_train_level2, meta_file)

In [None]:
#We find the best alpha parameter for pred = alpha * lr + (1-alpha)*xgb
#5000 evenly spaced candidates in [0, 1]; the one with the lowest RMSE on the
#train meta features is kept
alphas_to_try = np.linspace(0, 1, 5000)
#RMSE is computed as sqrt(MSE) rather than mean_squared_error(..., squared=False):
#the `squared` argument is not available in every scikit-learn release
rmse_scores = np.array([
    np.sqrt(mean_squared_error(y_train_level2, np.dot(X_train_level2, [alpha, 1 - alpha])))
    for alpha in alphas_to_try
])
best_alpha = alphas_to_try[rmse_scores.argmin()]
rmse_train_simple_mix = rmse_scores.min()
print('Best alpha: %f; Corresponding rmse score on train: %f' % (best_alpha, rmse_train_simple_mix))

<a id='second_submission'></a>
# Second Submission

In [None]:
#Second submission: blend of the two test meta features with the selected alpha,
#clipped to [0, 20]
y_test = np.dot(X_test_level2,[best_alpha,1-best_alpha]).clip(0,20)
#The ID column of test is used instead of its row index, so the ids stay correct
#even when the two differ.
#NOTE(review): this relies on X_test_level2 keeping the row order of `test` - confirm
submission = pd.DataFrame({
    "ID": test['ID'].values,
    "item_cnt_month": y_test
})
submission.to_csv('submission_2.csv', index=False)