In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler,Normalizer,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from scipy import stats
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as lgb2
import optuna.integration.xgboost as xgb2
from keras.models import Sequential
from keras.layers import Dense,Dropout,Input,AlphaDropout
from keras.regularizers import Regularizer,l2 
from collections import OrderedDict
from itertools import product
import re
#import hvplot.pandas
%matplotlib inline
import plotly.offline as pyo
pyo.init_notebook_mode()

### Objective

**We are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set.**

In [None]:
salesDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
itemsDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
itemsCategoriesDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
shopsDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
testDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

In [None]:
salesDF.head()

In [None]:
# Cleaning shops data
# Якутск Орджоникидзе, 56
salesDF.loc[salesDF.shop_id == 0, 'shop_id'] = 57
testDF.loc[testDF.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
salesDF.loc[salesDF.shop_id == 1, 'shop_id'] = 58
testDF.loc[testDF.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
salesDF.loc[salesDF.shop_id == 10, 'shop_id'] = 11
testDF.loc[testDF.shop_id == 10, 'shop_id'] = 11

Here on analysing the shop names, we can find that the shop names encompass the city name and the category. 

For eg: **shop id 2 - Адыгея ТЦ "Мега" - Adygea mega shopping mall(English translation)**

Here Adygea is the name of the city in Russia

In [None]:
shopsDF['city']=shopsDF['shop_name'].apply(lambda x: x.split(" ")[0])
shopsDF['category']=shopsDF['shop_name'].apply(lambda x: x.split(" ")[1])

##Each SHOP_ID' in this list belongs to separate category and we use 'other' to encompass them.
for i in [0,6,9,20,21,22,46,55,57,10,11]:
    shopsDF.loc[shopsDF['shop_id']==i,'category']='other'
    
    
shopsDF.loc[shopsDF['city']=='!Якутск','city']='Якутск'

# le=LabelEncoder()
# shopsDF['city']=le.fit_transform(shopsDF['city'])
# shopsDF['category']=le.fit_transform(shopsDF['category'])

When we explore item category names we have:

For eg : **item_category_id 0 - PC  Гарнитуры/Наушники - PC Headset/Headphones**

**Hence we can separate this feature as 2 columns as main category and sub category**

In [None]:
##Splits item_category name into main category and sub category
categories=[]
sub_categories=[]

for i in itemsCategoriesDF['item_category_name'].unique():
    if len(list(i.split(" - ")))==1:
        categories.append(i.split(" - ")[0])
        sub_categories.append('other')
    else:
        categories.append(i.split(" - ")[0])
        sub_categories.append(i.split(" - ")[1])
        
itemsCategoriesDF['main_category']=categories
itemsCategoriesDF['sub_category']=sub_categories

# le=LabelEncoder()
# itemsCategoriesDF['main_category']=le.fit_transform(itemsCategoriesDF['main_category'])
# itemsCategoriesDF['sub_category']=le.fit_transform(itemsCategoriesDF['sub_category'])

Let the items categories dataframe after splitting features be merged with the items dataframe

In [None]:
itemsNewDF=pd.merge(itemsDF,itemsCategoriesDF[['item_category_id','main_category','sub_category']],on='item_category_id',how='inner')

In [None]:
salesDF=pd.merge(salesDF,itemsNewDF[['item_id','item_category_id','main_category','sub_category']],on='item_id',how='inner')
salesDF=pd.merge(salesDF,shopsDF[['shop_id','city','category']],on='shop_id',how='inner')

**Lets remove the outliers**

In [None]:
fig=plt.figure(figsize=(10,5))
gs=fig.add_gridspec(1,2)
ax1=fig.add_subplot(gs[0,0])
img=sns.boxplot(x=salesDF['item_cnt_day'],ax=ax1)
ax2=fig.add_subplot(gs[0,1])
img=sns.boxplot(x=salesDF['item_price'],ax=ax2)

In [None]:
pd.cut(salesDF['item_price'],10).value_counts()

In [None]:
salesDF.drop(salesDF[(salesDF['item_cnt_day']>200)&(salesDF['item_cnt_day']<0)].index,axis=0,inplace=True)
salesDF.drop(salesDF[salesDF['item_price']>30797].index,axis=0,inplace=True)

In [None]:
##Splits date into day,month,year format
salesDF['DateModified']=pd.to_datetime(salesDF['date'],format='%d.%m.%Y')
salesDF['year']=salesDF['DateModified'].dt.year
salesDF['month']=salesDF['DateModified'].dt.month
salesDF['date']=salesDF['DateModified'].dt.day

# Extensive EDA

In [None]:
shopNamesDict={shopsDF['shop_id'][i]:shopsDF['shop_name'][i] for i in range(len(shopsDF))}
shopWiseSalesDF=salesDF.groupby('shop_id').agg({'item_cnt_day':'sum','item_price':'sum'})
shopWiseSalesDF=shopWiseSalesDF.sort_values(by='item_price',ascending=False)
shopWiseSalesDF.reset_index(inplace=True)
shopWiseSalesDF['item_price']=shopWiseSalesDF['item_price'].astype(float)/1000000   
shopWiseSalesDF=shopWiseSalesDF.replace({'shop_id':shopNamesDict})
fig=plt.figure(figsize=(15,5))
img=sns.barplot(data=shopWiseSalesDF[:10],y='shop_id',x='item_price',palette='rainbow')
img.set(xticklabels=[])
img.tick_params(bottom=False,left=False)
img.set_ylabel('Shop Names',fontweight='bold',fontfamily='sans-serif',fontsize=15)
img.set_xlabel('Total price of products sold across all years',fontweight='bold',fontfamily='sans-serif',fontsize=15)
img.text(5,-1,'Top 10 shops with highest sales(in terms of price)',fontfamily='sans-serif',fontweight='bold',fontsize=20)
for i in range(10):
    img.text(1,i,str(round(shopWiseSalesDF['item_price'][i],0))+' Million',fontfamily='sans-serif',fontsize=12,fontweight='medium')
sns.despine(left=True,bottom=True,right=True,top=True)

In [None]:
itemWiseSalesDF=salesDF.groupby(['item_id'],as_index=False).agg({'item_price':'sum'}).sort_values(by='item_price',ascending=False).reset_index()
itemWiseSalesDF.drop('index',axis=1,inplace=True)
#itemWiseSalesDF['item_price']=itemWiseSalesDF['item_price'].astype('float')/1000000
itemWiseSalesDF=pd.merge(itemWiseSalesDF[:10],itemsDF[['item_id','item_name']],on='item_id',how='inner')

labels=['Sony PS4 500GB','Sony PS4 Kit 500GB','GTA V PS3','GTA V XBOX360','PS Store Top-Up Wallet']

fig=px.bar(itemWiseSalesDF[:5],y='item_name',x='item_price',orientation='h')
fig.update_layout(plot_bgcolor='#fff',
                  title='Top 5 Highest selling products',
                  yaxis=dict(showline=True,linecolor='black',tickvals=[0,1,2,3,4],ticktext=labels))
fig.show()

In [None]:
## Total items sold in every month across the 3 years
salesYearDF=salesDF.groupby(['year','month']).agg({'item_cnt_day':'sum'})
salesYearDF.reset_index(inplace=True)
months = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug',
          9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
salesYearDF=salesYearDF.replace({'month':months})
plt.figure(figsize=(15,5))
sns.set_context("notebook")
sns.set_style('white')
img=sns.lineplot(data=salesYearDF,x='month',y='item_cnt_day',hue='year')
img.set_xlabel('Month',fontsize=15,fontweight='bold',color='black',fontfamily='sans-serif')
img.set_ylabel('Items sold per month',fontsize=15,fontweight='bold',color='black',fontfamily='sans-serif')
sns.despine(right=True,top=True)

> **In both 2013 and 2014, there has been significant increase in sales during the months of November and December. This may be attributed due to Christmas season**

In [None]:
fig=px.bar(salesDF.groupby('city',as_index=False).agg({'item_cnt_day':'sum'}).sort_values('item_cnt_day',ascending=False),
           x='city',
           y='item_cnt_day',
          title='City wise sales')
fig.update_layout(plot_bgcolor='#fff')
fig.show()

The below function groups the dataframe based on city and identifies the top 5 categories in each city and represented in terms of percentage

In [None]:
cityWiseSalesDF=salesDF.groupby(['city','main_category']).agg({'item_cnt_day':'sum'}).sort_values('item_cnt_day',ascending=False).reset_index()
cityWiseSalesDF=cityWiseSalesDF.groupby('city').head(5)
cityWiseSalesDF['percentage']=cityWiseSalesDF.groupby('city')['item_cnt_day'].transform(lambda x:round(x*100/x.sum(),2))
fig=px.bar(cityWiseSalesDF,x='city',y='percentage',color='main_category',title='Top 5 categories sold in each city')
fig.update_layout(margin=dict(l=0,r=0,b=0),
                 showlegend=False)
fig.show()

**Even though Moscow is the city with highest sales some categories like cinema bluray cd's and dvd's are comparatively sold more in Yakutsk city**

> **Delivery of goods(Доставка товара) as a category is found only in the city of Интернет-магазин and accounts for 30% of sales.**
> 
> **PC games and movies accountfor the majority of sale in almost all cities**

In [None]:
salesPartitionDict={2013:{'Items Sold Before November':0,'Items Sold After November':0},2014:{'Items Sold Before November':0,'Items Sold After November':0}}

for year in [2013,2014]:
    salesPartitionDict[year]['Items Sold After November']=salesYearDF[(salesYearDF['month'].isin(['Nov','Dec'])) & (salesYearDF['year']==year)]['item_cnt_day'].mean()
    salesPartitionDict[year]['Items Sold Before November']=salesYearDF[(-salesYearDF['month'].isin(['Nov','Dec'])) & (salesYearDF['year']==year)]['item_cnt_day'].mean()
    
salesPartitionDF=pd.DataFrame(salesPartitionDict).transpose()
salesPartitionDF.reset_index(inplace=True)
salesPartitionDF.rename(columns={'index':'year'},inplace=True)

data=[go.Bar(x=salesPartitionDF['year'],y=salesPartitionDF[i],name=i)for i in salesPartitionDF.columns[1:]]
layout=go.Layout(title='Items sold for every year before and after November')
fig=go.Figure(data=data,layout=layout)
fig.update_layout(
    xaxis=dict(
        showline=True,
        showgrid=False,
        linecolor='rgb(204, 204, 204)',
        linewidth=3,
        tickmode='linear',
        dtick=1
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=True,
        showticklabels=True,
        linecolor='rgb(204, 204, 204)',
        linewidth=3,
    ),
    plot_bgcolor='white'
)
fig.show()

In [None]:
dateBlockNumMonth={0:'Jan 2013',1:'Feb 2013',2:'Mar 2013',3:'Apr 2013',4:'May 2013',5:'Jun 2013',6:'Jul 2013',7:'Aug 2013',8:'Sep 2013',9:'Oct 2013',10:'Nov 2013',11:'Dec 2013',
                  12:'Jan 2014',13:'Feb 2014',14:'Mar 2014',15:'Apr 2014',16:'May 2014',17:'Jun 2014',18:'Jul 2014',19:'Aug 2014',20:'Sep 2014',21:'Oct 2014',22:'Nov 2014',23:'Dec 2014',
                  24:'Jan 2015',25:'Feb 2015',26:'Mar 2015',27:'Apr 2015',28:'May 2015',29:'Jun 2015',30:'Jul 2015',31:'Aug 2015',32:'Sep 2015',33:'Oct 2015'}
a=salesDF.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_day':'sum'})
monthlyHighestDict=OrderedDict()
n=0
for i in range(34):
    b=a[a['date_block_num']==i]
    b=b.sort_values(by='item_cnt_day',ascending=False)[:5]
    b.reset_index(inplace=True)
    for j in range(5):
        #item_name=itemsDF[itemsDF['item_id']==b.loc[j]['item_id']]['item_name'].values[0]
        monthlyHighestDict[n]={'month':dateBlockNumMonth[i],'item_name':str(b.loc[j]['item_id']),'count':b.loc[j]['item_cnt_day']}
        n+=1

In [None]:
fig=plt.figure(figsize=(20,20))
gs=fig.add_gridspec(5,7)
monthlyHighestDF=pd.DataFrame(monthlyHighestDict).transpose()
n=0
for i in range(5):
    for j in range(7):
        if n==34:
            break
        df=monthlyHighestDF[monthlyHighestDF['month']==dateBlockNumMonth[n]]
        ax=fig.add_subplot(gs[i,j])
        img=sns.barplot(data=df,y='item_name',x='count',palette='spring',orient='h',ax=ax)
        ax.set_title(dateBlockNumMonth[n])
        img.set(xlabel=None,ylabel=None,xticks=[])
        for s in ['top','right','left']:
            ax.spines[s].set_visible(False)
        ax.tick_params(left=False)
        n=n+1
    
plt.tight_layout()

> **The mostly sold products for all months is Branded package T-shirt 1C White,Diablo III(Game), Battlefield IV, Grand Theft Auto V (GTA V), FIFA 14, Sims 4. Most of them are games for PC,XBOX and PS3.**
> 
> **For eg: Battlefield 4 has been one of the top sold products in Nov 2013,Dec 2013, Jan 2014 and Feb 2014. The release date of Battlefield 4 is 29th October 2013.**
> 
> **Similarly GTA V was released on 17th September 2013 and it was top sold products in Sep,Oct,Dec 2013 and Jan 2014. We would be wondering what happened in Nov 2013 and it was ocupied by Battlefield 4. Nov 2013 also saw higher sales of Assassin's creed 4 and Call of Duty Ghosts. But its sales faded in subsequent months.**

In [None]:
plt.figure(figsize=(15,6))
categoriesSalesDF=salesDF.groupby(['item_category_id'],as_index=False).agg({'item_cnt_day':'sum'})
categoriesSalesDF.reset_index(inplace=True)
categoriesSalesDF=categoriesSalesDF.sort_values(by='item_cnt_day',ascending=False)
categoriesSalesDF=pd.merge(categoriesSalesDF,itemsCategoriesDF[['item_category_name','item_category_id']],on='item_category_id',how='inner')
fig=px.bar(categoriesSalesDF[:5],x='item_category_name',y='item_cnt_day',title='Most sold categories')
fig.update_layout(plot_bgcolor='#fff',xaxis=dict(showline=True,linecolor='rgb(204, 204, 204)',linewidth=3),
                  yaxis=dict(showticklabels=False))
fig.show()

**Games CD's, Movies CD's, Music CD's are the most sold categories**

In [None]:
shopItemSalesDF=salesDF.groupby(['shop_id','item_id'],as_index=False).agg({'item_cnt_day':'sum'})
shopItemSalesDF.rename(columns={'item_cnt_day':'item_cnt'},inplace=True)
shopItemSalesDF['item_cnt']=shopItemSalesDF['item_cnt'].astype(np.int16)
shopItemSalesDF.sort_values(by='item_cnt',ascending=False,inplace=True)

In [None]:
shopTopItem=OrderedDict()
n=0
for shop in shopItemSalesDF['shop_id'].unique():
    shop_id=shop
    item_id=shopItemSalesDF[shopItemSalesDF['shop_id']==shop][:1]['item_id'].values[0]
    item_cnt=shopItemSalesDF[shopItemSalesDF['shop_id']==shop][:1]['item_cnt'].values[0]
    shopTopItem[n]={'shop_id':shop,'item_id':item_id,'item_cnt':item_cnt}
    n+=1
    
shopTopItemDF=pd.DataFrame(shopTopItem).transpose()

li=[]
for i in range(len(shopTopItemDF)):
    if shopTopItemDF.loc[i]['item_id']==20949:
        li.append('item_20949')
    else:
        li.append('others')
shopTopItemDF['category']=li

In [None]:
fig=px.bar(shopTopItemDF,x='shop_id',y='item_cnt',color='category', # if values in column category = 'item_20949' and 'others'
    color_discrete_map={
        'item_20949': 'yellow',
        'others': 'violet'
    })
fig.update_layout(plot_bgcolor='#fff',xaxis={'showline':True,'linewidth':1,'linecolor':'black'},yaxis={'showticklabels':False})
fig.show()

**For almost all shops T-Shirts is the most sold category**

In [None]:
yearDayDF=salesDF.groupby(['year','month'],as_index=False).agg({'item_cnt_day':'sum'})
fig=px.bar(yearDayDF,x='month',y='item_cnt_day',facet_row='year',title='Total sales for every month across years')
fig.show()

> **2015 has lower sales number compared with 2013 and 2014**

In [None]:
yearMonthDayDF=salesDF.groupby(['year','month','date','shop_id'],as_index=False).agg({'item_cnt_day':'sum','DateModified':'min'})

shopStats=OrderedDict()
n=0
for i in yearMonthDayDF['shop_id'].unique():
    mad=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].mad()
    mean=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].mean()
    max=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].max()
    shopStats[n]={'shop_id':i,'mean':mean,'max':max,'mad':mad}
    n+=1
    
shopStatsDF=pd.DataFrame(shopStats).transpose()
shopStatsDF['shop_id']=shopStatsDF['shop_id'].astype('str')
data=[go.Bar(x=shopStatsDF['shop_id'],y=shopStatsDF[i],name=i)for i in ['mean','mad']]
layout=go.Layout(title='Mean sales and Deviation in sales for every shop in the list',
                 margin=dict(l=0,r=0,b=0,t=30),
                 width=1500)
fig=go.Figure(data,layout)
fig.show()

> **Shops with higher sales are predominantly found in Moscow. Atrium Mall in Moscow(Shop id=25) has a good mean sales and lesser mean absolute deviation compared with others.**

# Categorical Encoding

We have certain columns which should be encoded like cities,shops and categories. If we apply one hot encoding then there will be numeral columns. If we apply lable encoding the information may not be correctly represented. 

So **let's group the cities and categories based on their sales. For eg Moscow city sells the most number of products. Other cities lag behind. Let's create two categories as moscow and other_cities. Now it will be easy to encode.**

In [None]:
dicCity=pd.cut(salesDF.groupby('city').agg({'item_cnt_day':'sum'})['item_cnt_day'],2,labels=['other_cities','moscow'])
shopsDF['city'].replace(dicCity,inplace=True)
salesDF['city'].replace(dicCity,inplace=True)

In [None]:
dicMainCategory=pd.cut(salesDF.groupby('main_category').agg({'item_cnt_day':'sum'})['item_cnt_day'],3,labels=['low','medium','high'])
itemsNewDF['main_category'].replace(dicMainCategory,inplace=True)
salesDF['main_category'].replace(dicMainCategory,inplace=True)

In [None]:
dicSubCategory=pd.cut(salesDF.groupby('sub_category').agg({'item_cnt_day':'sum'})['item_cnt_day'],2,labels=['medium','high'])
itemsNewDF['sub_category'].replace(dicSubCategory,inplace=True)
salesDF['sub_category'].replace(dicSubCategory,inplace=True)

In [None]:
dicCategory=pd.cut(salesDF.groupby('category').agg({'item_cnt_day':'sum'})['item_cnt_day'],3,labels=['low','medium','high'])
shopsDF['category'].replace(dicCategory,inplace=True)
salesDF['category'].replace(dicCategory,inplace=True)

In [None]:
for col in salesDF.select_dtypes('object').columns:
    le=LabelEncoder()
    salesDF[col]=le.fit_transform(salesDF[col])
    if col in itemsNewDF.columns:
        itemsNewDF[col]=le.fit_transform(itemsNewDF[col])
    elif col in shopsDF.columns:
        shopsDF[col]=le.fit_transform(shopsDF[col])

In [None]:
salesDF.head()

# Feature Engineering

Now we want to make sales forecast for a particular item to be sold in a particular shop for the month of 2015 November. So we are creating a dataframe with all possible combinations of month,shop_id and item_id.

In [None]:
cols=['date_block_num', 'shop_id', 'item_id'],
matrix=[]
for i in range(34):
    dup=salesDF[salesDF['date_block_num']==i]
    matrix.append(np.array(list(product([i],dup['shop_id'].unique(),dup['item_id'].unique())), dtype = np.int16))
    
matrix=pd.DataFrame(np.vstack(matrix))
matrix.rename(columns={0:'date_block_num',1:'shop_id',2:'item_id'},inplace=True)

matrix["date_block_num"] = matrix["date_block_num"].astype(np.int8)
matrix["shop_id"] = matrix["shop_id"].astype(np.int8)
matrix["item_id"] = matrix["item_id"].astype(np.int16)

matrix.sort_values(by=['date_block_num', 'shop_id', 'item_id'],inplace=True)
matrix.reset_index(inplace=True)
matrix.drop('index',axis=1,inplace=True)

After creating the dataframe with month,shop_id and item_id we will merge this dataframe with the monthly sales of a item sold in a particular shop calculated from the sales dataframe

In [None]:
group=salesDF.groupby(["date_block_num", "shop_id", "item_id"],as_index=False).agg({'item_cnt_day':'sum'})
group.rename(columns={'item_cnt_day':'item_cnt_month'},inplace=True)
matrix=pd.merge(matrix,group,on=['date_block_num', 'shop_id', 'item_id'],how='left')
matrix['item_cnt_month']=matrix['item_cnt_month'].fillna(0).astype(np.float16)
matrix['item_cnt_month']=matrix['item_cnt_month'].clip(0,20)

In [None]:
group=salesDF.groupby(['date_block_num','shop_id','item_id'],as_index=False).agg({'item_price':'sum'})
group.rename(columns={'item_price':'item_price_month'},inplace=True)
matrix=pd.merge(matrix,group,on=['date_block_num','shop_id','item_id'],how='left')
matrix['item_price_month']=matrix['item_price_month'].fillna(0).astype(np.float16)

In [None]:
matrix.head()

Lets concat the test dataframe to our matrix dataframe. November 2015 is indicated by date block num 34

In [None]:
testDF["date_block_num"] = 34
testDF["date_block_num"] = testDF["date_block_num"].astype(np.int8)
testDF["shop_id"] = testDF.shop_id.astype(np.int8)
testDF["item_id"] = testDF.item_id.astype(np.int16)

In [None]:
matrix=pd.concat([matrix,testDF.drop('ID',axis=1)],ignore_index=True,sort=False,keys=['date_block_num', 'shop_id', 'item_id'])
matrix.fillna(0,inplace=True)

matrix=pd.merge(matrix,itemsNewDF[['item_id','item_category_id','main_category','sub_category']],on='item_id',how='left')
matrix=pd.merge(matrix,shopsDF[['shop_id','city','category']],on='shop_id',how='left')

In [None]:
matrix.head()

In [None]:
import gc
del [salesDF,itemsDF,itemsCategoriesDF,itemsNewDF,shopsDF]
gc.collect()

Now we will create the **lag columns**

**item_cnt_month_lag_1** indicates the **no of items sold in that particular shop in the previous month** and similary

**item_cnt_month_lag_12** indicates the **no of items sold in that particular shop 12 months before**

In [None]:
df=matrix
for col in ['item_cnt_month']:
    tmp = df[["date_block_num", "shop_id","item_id",col ]]
    for i in [1,2,3,12]:
        shifted = tmp.copy()
        shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_"+str(i)]
        shifted.date_block_num = shifted.date_block_num + i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')

matrix=df

In [None]:
group=matrix.groupby(['date_block_num'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'avg_item_cnt_month'})
matrix = pd.merge(matrix, group, on = ["date_block_num"], how = "left")
matrix['avg_item_cnt_month']=matrix['avg_item_cnt_month'].astype(np.float16)

df=matrix[['date_block_num','shop_id','item_id','avg_item_cnt_month']]
shifted=df.copy()
shifted.columns=['date_block_num','shop_id','item_id','prev_month_avg_item_cnt']
shifted['date_block_num']=shifted['date_block_num']+1

matrix=pd.merge(matrix,shifted,on=['date_block_num','shop_id','item_id'],how='left')
matrix.drop('avg_item_cnt_month',axis=1,inplace=True)

In [None]:
group=matrix.groupby(['date_block_num','shop_id'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'shop_avg_cnt_month'})
group['shop_avg_cnt_month']=group['shop_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','shop_id'],how='left')

df=matrix

tmp=df[['date_block_num', 'shop_id','item_id','shop_avg_cnt_month']]

for i in [1,2,3,12]:
    shifted=tmp.copy()
    shifted.columns=['date_block_num','shop_id','item_id','shop_avg_cnt_month_'+'_lag_'+str(i)]
    shifted['date_block_num']=shifted['date_block_num']+i
    df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('shop_avg_cnt_month',axis=1,inplace=True)

In [None]:
group=matrix.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_avg_cnt_month'})
group['item_avg_cnt_month']=group['item_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','item_id'],how='left')

df=matrix

tmp=df[['date_block_num', 'shop_id','item_id','item_avg_cnt_month']]
for i in [1,2,3,12]:
    shifted=tmp.copy()
    shifted.columns=['date_block_num','shop_id','item_id','item_avg_cnt_month_'+'_lag_'+str(i)]
    shifted['date_block_num']=shifted['date_block_num']+i
    df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('item_avg_cnt_month',axis=1,inplace=True)

In [None]:
import gc
del [group,tmp,shifted,df,dup]
gc.collect()

In [None]:
group=matrix.groupby(['date_block_num','main_category'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_category_avg_cnt_month'})
group['item_category_avg_cnt_month']=group['item_category_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','main_category'],how='left')

df=matrix
for col in ['item_category_avg_cnt_month']:
    tmp=df[['date_block_num','shop_id','item_id',col]]
    for i in [1,2,12]:
        shifted=tmp.copy()
        shifted.columns=['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num']=shifted['date_block_num']+i
        df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('item_category_avg_cnt_month',axis=1,inplace=True)

In [None]:
group=matrix.groupby(['date_block_num','item_id','main_category','sub_category'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_categories_avg_cnt_month'})
group['item_categories_avg_cnt_month']=group['item_categories_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','item_id','main_category','sub_category'],how='left')

df=matrix
for col in ['item_categories_avg_cnt_month']:
    tmp=df[['date_block_num','shop_id','item_id',col]]
    for i in [1,2,3,12]:
        shifted=tmp.copy()
        shifted.columns=['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num']=shifted['date_block_num']+i
        df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('item_categories_avg_cnt_month',axis=1,inplace=True)

In [None]:
del [group,tmp,shifted,df]
gc.collect()

Since mostly the lag count for the first three months would be null, lets remove the information pertaining to those months from the dataframe.

# Preparing the data for modelling

In [None]:
matrix=matrix[matrix['date_block_num']>3]

In [None]:
##Filling nan's with zero
for col in matrix.columns:
    if ('lag' in col) & (matrix[col].isnull().any()):
        matrix[col].fillna(0,inplace=True)
        
matrix['prev_month_avg_item_cnt'].fillna(0,inplace=True)

After building the model and finding feature importance this 'item_price_month' was the most important feature. But this was causing overfitting problems and hence this column has been removed.

In [None]:
matrix.drop('item_price_month',axis=1,inplace=True)

In [None]:
x_train = matrix[matrix['date_block_num'] < 33].drop(['item_cnt_month'], axis=1)
y_train = matrix[matrix['date_block_num'] < 33]['item_cnt_month']
x_cv = matrix[(matrix['date_block_num'] == 33)].drop(['item_cnt_month'], axis=1)
y_cv = matrix[(matrix['date_block_num'] == 33)]['item_cnt_month']
x_test = matrix[matrix['date_block_num'] == 34].drop(['item_cnt_month'], axis=1)

Since we have a large dataframe lets randomly select 30,000 samples from each month for building our baseline model.

In [None]:
df2=matrix
df2=df2.drop(df2.index[0:],axis=0) ## We are duplicating the matrix and removing the rows and appending samples for each month
for i in x_train['date_block_num'].unique():
    sampleDF=matrix[matrix['date_block_num']==i].sample(n=30000,replace=False)
    df2=df2.append(sampleDF,ignore_index=True)
    
df2=df2.sample(frac=1)
df2.reset_index(inplace=True)
df2.drop('index',axis=1,inplace=True)
x_train_baseline=df2.drop('item_cnt_month',axis=1)
y_train_baseline=df2['item_cnt_month']

# LGBM

Initially we fit a simple model to our resampled data and make predictions. 

Then we calculate the feature importance and consider the top 20 features for making predictions in our tuned model.

In [None]:
lgbmBaseline=lgb.LGBMRegressor(objective='regression',
                              boosting_type='gbdt',
                              n_estimators=500)
lgbmBaseline.fit(x_train_baseline,y_train_baseline)
lgbmBaselinePred=lgbmBaseline.predict(x_cv)
print(mean_squared_error(lgbmBaselinePred,y_cv))
#predLgbmTestBaseline=lgbmBaseline.predict(x_test)

In [None]:
featureImportanceLgbmDF=pd.DataFrame(sorted(zip(lgbmBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresLgbm=list(featureImportanceLgbmDF[featureImportanceLgbmDF['values']>featureImportanceLgbmDF['values'].median()]['features'].values) ##selecting features above mean lgbm feature score
sns.barplot(y='features',x='values',data=featureImportanceLgbmDF[-20:])
#featuresLgbm=featureImportanceLgbmDF['features'][-15:].values

In [None]:
x_train_lgbm=x_train[featuresLgbm]
x_cv_lgbm=x_cv[featuresLgbm]
x_test_lgbm=x_test[featuresLgbm]

In [None]:
#x_train_lgbm,x_test_lgbm,y_train_lgbm,y_test_lgbm=train_test_split(x_train,y_train,test_size=0.3)
def optimize(trial):
    params={
        'objective':'regression',
        'num_leaves':trial.suggest_int('num_leaves',2,256),
        'feature_fraction':trial.suggest_uniform('feature_fraction',0.4,1.0),
        'boosting_type':'gbdt',
        'reg_alpha':trial.suggest_uniform('alpha',1,10),
        'reg_lambda':trial.suggest_uniform('lambda',1,10),
        'learning_rate':trial.suggest_uniform('learning_rate',0.001,1),
        'subsample':trial.suggest_uniform('subsample',0.5,1),
        'n_estimators':trial.suggest_int('n_estimators',200,1000),
        'min_data_in_leaf':trial.suggest_int('min_data_in_leaf',20,200)
    }
    model=lgb.LGBMRegressor(**params)
    model.fit(x_train_lgbm,y_train,eval_set=[(x_cv_lgbm,y_cv)],early_stopping_rounds=30)
    pred=model.predict(x_cv_lgbm)
    rmse=mean_squared_error(pred,y_cv)
    return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print('Best Trail: ',study.best_trial.params)

In [None]:
lgbrEstimator=lgb.LGBMRegressor(**study.best_trial.params)
lgbrEstimator.fit(x_train_lgbm,y_train)
predLgbmCV=lgbrEstimator.predict(x_cv_lgbm)
print(mean_squared_error(predLgbmCV,y_cv))
predLgbTest=lgbrEstimator.predict(x_test_lgbm)
predLgbTest=predLgbTest.clip(0,20)

# Model Explainability

**SHAP values (SHapley Additive exPlanations) is used to explain the predictions made by your model.**

**SHAP shows the contribution or the importance of each feature on the prediction of the model, it does not evaluate the quality of the prediction itself.**

In [None]:
import shap

explainer(data) ---> Returns data,base_values,shap_values

data - The original data

base_values - The expected value of the target or the mean target value from the data

shap_values - SHAP values for each sample

explainer.shap_values ---> Returns only the shap values


In [None]:
shap_values=shap.TreeExplainer(lgbrEstimator).shap_values(x_cv_lgbm)
shap.summary_plot(shap_values,x_cv_lgbm,plot_type='bar')

From the above plot it evident that **item_avg_cnt_month_lag,item_cnt_month_lag,shop_av_cnt_month_lag** features have better predicting power compared with other variables. This prediction is also meaningful because previous month's sales is a good indicator for sales in forecoming months. 

Another point to be noted is **sales one month before(i.e last month) is more detrimental compared with other variables**

The below plot is similar to a summary bar plot but this is more interpretable.

In [None]:
shap.summary_plot(shap_values,x_cv_lgbm,feature_names=x_cv_lgbm.columns)

> **The first 3 features indicates that higher items sold in the previous month means more items would be sold in the coming months.**

> **The plot indicates that lower the item id better would be the average sales. We can confirm this conclusion from pdp plots.**

In [None]:
from pdpbox import info_plots
fig=info_plots.target_plot(df=matrix,feature='item_id',feature_name='item_id',target='item_cnt_month')
fig[0]

> **Lower item id's have better average item count per month confirming our inference from shap plot.**

In [None]:
info_plots.actual_plot(lgbrEstimator,x_cv_lgbm,feature='item_id',feature_name='item_id');

The pattern of predictions is maintained i.e compared with other item id's 2708-4838 item id's have more items sold. But the predicted count varies significantly from actual count.

In [None]:
salesA=salesDF[(salesDF['item_id']>2708)&(salesDF['item_id']<7917)]['item_cnt_day'].sum()
salesB=salesDF[(salesDF['item_id']<2708)|(salesDF['item_id']>7917)]['item_cnt_day'].sum()
salesValues=[salesA,salesB]
salesNames=['item ids 2708-7917','other item ids']
fig=px.pie(names=salesNames,values=salesValues,title='Item ids 2708-7917 account for 33% of sales')
fig.update_layout(margin=dict(l=0,r=0,b=0),showlegend=False)
fig.update_traces(textinfo='label+percent')
fig.show()

In [None]:
ax1=info_plots.target_plot(df=df2[(df2['date_block_num']>=10) & (df2['date_block_num']<19)],feature='date_block_num',feature_name='month',target='item_cnt_month')
ax2=info_plots.target_plot(df=df2[(df2['date_block_num']>=19) & (df2['date_block_num']<28)],feature='date_block_num',feature_name='month',target='item_cnt_month')
ax2=info_plots.target_plot(df=df2[df2['date_block_num']>=28],feature='date_block_num',feature_name='month',target='item_cnt_month')

# XGBOOST

In [None]:
xgbBaseline=xgb.XGBRegressor(objective='reg:squaredlogerror',
                            eval_metric='rmse',
                            booster='gbtree',
                            n_estimators=200)
xgbBaseline.fit(x_train_baseline,y_train_baseline)
predXgbBaseline=xgbBaseline.predict(x_cv)
print(mean_squared_error(predXgbBaseline,y_cv))

In [None]:
featureImportanceXgbDF=pd.DataFrame(sorted(zip(xgbBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresXgb=featureImportanceXgbDF[featureImportanceXgbDF['values']>=featureImportanceXgbDF['values'].median()]['features']
sns.barplot(x='values',y='features',data=featureImportanceXgbDF[-15:])

In [None]:
x_train_xgb=x_train[featuresXgb]
x_cv_xgb=x_cv[featuresXgb]
x_test_xgb=x_test[featuresXgb]

In [None]:
def optimize(trial):
    params={
        'objective':'reg:squaredlogerror',
        'eval_metric':'rmse',
        'booster':'gbtree',
        'alpha':trial.suggest_loguniform('alpha',1,20),
        'lambda':trial.suggest_loguniform('lambda',1,20),
        'max_depth':trial.suggest_int('max_depth',1,10),
        'learning_rate':trial.suggest_uniform('learning_rate',0.001,1),
        'subsample':trial.suggest_uniform('subsample',0.5,1),
        'n_estimators':trial.suggest_int('n_estimators',10,300)
    }
    model=xgb.XGBRegressor(**params)
    model.fit(x_train_xgb,y_train,eval_set=[(x_cv_xgb,y_cv)],early_stopping_rounds=30)
    pred=model.predict(x_cv_xgb)
    rmse=mean_squared_error(pred,y_cv)
    return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print(study.best_trial.params)

In [None]:
xgbBestParams=study.best_trial.params
xgbBestParams.update({'objective':'reg:squaredlogerror',
           'eval_metric':'rmse',
           'booster':'gbtree'})

In [None]:
#xgbReg=xgb.XGBRegressor(**study.best_trial.params)
xgbReg=xgb.XGBRegressor(**xgbBestParams)
xgbReg.fit(x_train_xgb,y_train)
predXgbCV=xgbReg.predict(x_cv_xgb)
print(mean_squared_error(predXgbCV,y_cv))
predXgbTest=xgbReg.predict(x_test_xgb)
predXgbTest=predXgbTest.clip(0,20)

### CATBOOST

In [None]:
catboostBaseline=CatBoostRegressor(eval_metric='RMSE',
                            loss_function='RMSE',
                            iterations=500)
catboostBaseline.fit(x_train_baseline,y_train_baseline.astype('float'),early_stopping_rounds=50,verbose=50)
predCatBoostBaseline=xgbBaseline.predict(x_cv)
print(mean_squared_error(predCatBoostBaseline,y_cv))

In [None]:
featureImportanceCatBoostDF=pd.DataFrame(sorted(zip(catboostBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresCatBoost=featureImportanceCatBoostDF[featureImportanceCatBoostDF['values']>=featureImportanceCatBoostDF['values'].median()]['features']
sns.barplot(x='values',y='features',data=featureImportanceCatBoostDF[-15:])

In [None]:
x_train_cat=x_train[featuresCatBoost]
x_cv_cat=x_cv[featuresCatBoost]
x_test_cat=x_test[featuresCatBoost]

In [None]:
def optimize(trial):
    params={
        'depth': trial.suggest_int('depth', 4, 8),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',10,100),
        'eval_metric':'RMSE',
        'loss_function':'RMSE',
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg',5,20),
        'colsample_bylevel':trial.suggest_uniform('colsample_bylevel',0.5,1),
        'learning_rate':trial.suggest_uniform('learning_rate',0.0001,1),
        'subsample':trial.suggest_uniform('subsample',0.5,1),
        'iterations':trial.suggest_int('iterations',100,1000)
    }
    model=CatBoostRegressor(**params)
    model.fit(x_train_cat,y_train.astype('float'),eval_set=[(x_cv_cat,y_cv.astype('float'))],early_stopping_rounds=30,verbose=100)
    pred=model.predict(x_cv_cat)
    rmse=mean_squared_error(pred,y_cv.astype('float'))
    return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print(study.best_trial.params)

In [None]:
catBestParams=study.best_trial.params
catBestParams.update({
           'eval_metric':'RMSE',
           'loss_function':'RMSE'})

In [None]:
catBoostReg=CatBoostRegressor(**catBestParams)
catBoostReg.fit(x_train_cat,y_train.astype('float'),eval_set=[(x_cv_cat,y_cv.astype('float'))],early_stopping_rounds=50,verbose=50)
predCatCV=catBoostReg.predict(x_cv_cat)
print('MEAN SQUARED ERROR OF VALIDATION SET FOR CATBOOST REGRESSOR ',mean_squared_error(predCatCV,y_cv))
predCatTest=catBoostReg.predict(x_test)
predCatTest=predCatTest.clip(0,20)

In [None]:
#predLgb=predLgb.clip(0,20)
predFinal1=[]
for i in predLgbTest:
    predFinal1.append(np.math.floor(i))
    
# predXgbr=predXgbr.clip(0,20)
predFinal2=[]
for i in predXgbTest:
    predFinal2.append(np.math.floor(i))
    
predFinal3=[]
for i in predCatTest:
    predFinal3.append(np.math.floor(i))

# Averaging the predictions

In [None]:
predictions=[]
for i in range(len(predFinal1)):
    val=0.3*predFinal1[i]+0.3*predFinal2[i]+0.4*predFinal3[i]
    predictions.append(int(val))

In [None]:
submissionDF=pd.DataFrame(testDF['ID'],columns=['ID'])
submissionDF['item_cnt_month']=predictions
submissionDF.to_csv('result.csv',index=False)

**The RMSE for the testing set is 0.92**