# Future Sales Predictions
Let's first import the necessary libraries.

In [None]:
import datetime as dt
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import re
import gc
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import sklearn.preprocessing as prep 
import sklearn.ensemble as ens
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV
from catboost import *

Then we import the data

In [None]:
# Input folders (Kaggle dataset mounts).
MyPath='../input/future-sales-prediction/'
CompPath='../input/competitive-data-science-predict-future-sales/'
TRPath='../input/translated/'

# Competition tables; shop names are read from the separately translated file.
RawCategories=pd.read_csv(f'{CompPath}item_categories.csv')
RawItems=pd.read_csv(f'{CompPath}items.csv')
RawShops=pd.read_csv(f'{TRPath}shopsTR.csv')
RawSalesTrain=pd.read_csv(f'{CompPath}sales_train.csv')
# Parse the dd.mm.yyyy text date into a real datetime column.
RawSalesTrain['Date']=pd.to_datetime(RawSalesTrain['date'],format='%d.%m.%Y')

# Test set: keep a flat copy and a (shop_id, item_id)-indexed copy.
SalesTest1=pd.read_csv(f'{CompPath}test.csv')
SalesTest=SalesTest1.set_index(['shop_id','item_id'])
print(f"Expected Number of Predictions:{SalesTest.size}")

# Sorted shop ids that appear in the test set.
ShopIds=np.sort(SalesTest1.shop_id.unique())
print(f"Expected Shops for prediction({len(ShopIds)} shops):{ShopIds}")

# 1.EDA and Data Cleaning

In [None]:
def LowerTypes(df,coltypes):
    """Cast selected columns of ``df`` to the requested dtypes.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    coltypes : mapping of column name -> dtype; names that are not columns
        of ``df`` are ignored.

    Returns
    -------
    The same ``df`` object, to allow ``x = LowerTypes(x, ...)`` usage.
    """
    for name, target_dtype in coltypes.items():
        if name not in df.columns:
            continue
        df[name]=df[name].astype(target_dtype)
    return df

def delifexists(var):
    """Remove the module-level variable called ``var`` if it exists.

    Parameters
    ----------
    var : str
        Name of the global variable to drop.

    ``del var`` would only unbind the local parameter, so the name is
    removed from ``globals()`` directly. Unknown names are ignored.
    """
    globals().pop(var, None)
        
def delifexistslist(varlist):
    """Remove every module-level variable named in ``varlist`` and collect garbage.

    Parameters
    ----------
    varlist : iterable of str
        Names of global variables to drop; names that are not defined are
        skipped silently.

    ``del var`` inside the loop would only unbind the loop variable, so the
    names are removed from ``globals()`` directly before ``gc.collect()``.
    """
    namespace = globals()
    for var in varlist:
        namespace.pop(var, None)
    gc.collect()

We are expected to predict for the below month

In [None]:
# The month to predict is the one right after the last date in the training data.
LastTrainDate=RawSalesTrain.Date.max()
# Roll over to January of the next year when the last training month is December.
M=LastTrainDate.month%12+1
Y=LastTrainDate.year+(1 if LastTrainDate.month==12 else 0)
# Zero-pad the month so the period is always YYYYMM.
TargetPeriod=f"{Y}{M:02d}"
TargetMonthId=RawSalesTrain.date_block_num.max()+1
print("Month "+ str(M)+" of "+ str(Y)+",Period:"+TargetPeriod+",date_block_num:"+str(TargetMonthId))

Lets look at outliers and duplicates

In [None]:
# Side-by-side box plots to eyeball outliers in the two numeric sales columns.
fig, ax = plt.subplots(1,2)
#plt.figure(figsize=(6,2))
# Left panel: daily item counts.
sns.boxplot(x=RawSalesTrain.item_cnt_day,ax=ax[0])
# Right panel: item prices.
sns.boxplot(x=RawSalesTrain.item_price,ax=ax[1])

Remove outliers, returns and zero-count rows (item_cnt_day <= 0), non-positive prices, and the original text `date` column.

In [None]:
# Keep rows with 0 < item_cnt_day < 1000 and 0 < item_price < 100000.
SalesTrain=RawSalesTrain[(RawSalesTrain.item_cnt_day>0) & (RawSalesTrain.item_cnt_day<1000)& (RawSalesTrain.item_price>0) & (RawSalesTrain.item_price<100000)]
# The text 'date' column is redundant now that the parsed 'Date' column exists.
SalesTrain=SalesTrain.drop(columns=['date'])

### Shops

Now we can separate features out of the shop name field.

In [None]:
# Remove the literal '!' character before slicing (regex=False: plain substring replace).
RawShops.shop_name_en=RawShops.shop_name_en.str.replace('!','',regex=False).str.strip()
# Split the lower-cased name into at most three parts: city, shop type, and the remainder.
# `n` is passed by keyword because newer pandas releases no longer accept it positionally.
RawShops[['City','Type','NL']]=RawShops.shop_name_en.str.lower().str.split(' ',n=2,expand=True)
# Shops whose first token is not a city name.
RawShops.loc[RawShops.shop_id==9,'City']=''
RawShops.loc[RawShops.shop_id==12,'City']=''
RawShops.loc[RawShops.shop_id==55,'City']=''
# Manual type corrections, selected by row index.
# NOTE(review): these lists are matched against the DataFrame index, while the lines
# above match on shop_id — presumably the two coincide in shopsTR.csv; confirm.
ShopIdx=[0,6,10,11,22,57]
RawShops.loc[RawShops.index.isin(ShopIdx),'Type']='shop'
MallIdx=[2,4,13,29,30,32,39,40,46]
RawShops.loc[RawShops.index.isin(MallIdx),'Type']='mall'
OnlineIdx=[12,55]
RawShops.loc[RawShops.index.isin(OnlineIdx),'Type']='online'
RawShops.loc[RawShops.index.isin(OnlineIdx),'City']='online'
# Normalise city spellings so they match the population table keys.
RawShops.loc[RawShops.City=='rostovnadonu','City']='rostov-on-don'
RawShops.loc[RawShops.City=='chekhov','City']='moscow'
# Keep only the shops that appear in the test set.
RawShops1=RawShops.loc[ RawShops.shop_id.isin(ShopIds),['shop_id','City','Type']]

Note that although shop ids 0, 1 and 10 look like duplicates of other shops, they are not part of the prediction set and can safely be ignored.

In [None]:
#Populations taken from Google searches
# Values are in thousands of inhabitants (they are divided by 1000 below).
# NOTE(review): 'moscow':1192 looks an order of magnitude low next to 'spb':4991 — confirm the source figure.
PopulationsK={
            'adygea':440,
            'balashikha':229,
            'volzhsky':330,
            'vologda':304,
            'voronezh':998,
            'zhukovsky':107,
            'online':0,
            'kazan':1170,
            'kaluga':329,
            'kolomna':145,
            'krasnoyarsk':1007,
            'kursk':426,
            'moscow':1192,
            'n.novgorod':1257,
            'novosibirsk':1511,
            'omsk':1159,
            'rostov-on-don':1100,
            'spb':4991,
            'samara':1170,
            'sergiev':109,
            'surgut':321,
            'tomsk':544,
            'tyumen':622,
            'ufa':1075,
            'yakutsk':282,
            'yaroslavl':597
            }
#Convert to Dataframe with index
Pop=pd.DataFrame({'City':list(PopulationsK.keys()),'Population':list(PopulationsK.values())}).set_index('City')
# Rescale thousands -> millions and store as float32.
Pop.Population=(Pop.Population.astype('float32')/1000).astype(np.float32)
# Left join on City: a shop whose City is not a key of PopulationsK gets NaN Population.
Shops1=RawShops1.set_index('City').join(Pop).reset_index().set_index('shop_id')
#le1 = prep.LabelEncoder()
#le2 = prep.LabelEncoder()
#le1.fit(Shops1.City)
# Integer-encode the city and type strings; the fitted encoders are not kept.
Shops1['CityEn']=prep.LabelEncoder().fit_transform(Shops1.City).astype(np.int8)
#le2.fit(Shops1.Type)
Shops1['TypeEn']=prep.LabelEncoder().fit_transform(Shops1.Type).astype(np.int8)
# Final shop feature table, indexed by shop_id: Population, CityEn, TypeEn.
Shops=Shops1.drop(columns=['City','Type'])
Shops.info()

Lets analyze test data

In [None]:
#Lets calculate number of items and shops
# Shops here are only those present in the test set (Shops was filtered by ShopIds).
Number_Of_Shops=Shops.index.size
# Items are all rows of the item catalogue.
Number_Of_Items=RawItems.item_id.size
Total_Shop_Item_Combinations=Number_Of_Items*Number_Of_Shops
# One test ID per requested (shop_id, item_id) pair.
Number_Of_TestIDs=SalesTest.ID.size
print('Test IDs to submit='+str(Number_Of_TestIDs)+",Combinations="+str(Total_Shop_Item_Combinations))
# Not all shop/item combinations are requested!!!

### Items

Let's clean the item data (thanks to Mykyta Minenko, since I do not know Russian and could not translate the names).

In [None]:
def name_correction(x):
    """Normalise an item name to a lower-case, space-separated token string.

    The name is lower-cased, truncated at the first '[' and then at the first
    '(', every run of characters outside Latin letters, digits and the
    Cyrillic range А-Яа-я is replaced by one space, and surrounding
    whitespace is removed.
    """
    # Text before the first '[' and before the first '('.
    head = x.lower().partition('[')[0].partition('(')[0]
    # Collapse every run of non-alphanumeric characters into one space.
    cleaned = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', head)
    return cleaned.replace('  ', ' ').strip()

# Split each raw item name once at the first '[' and once at the first '('.
# The pieces are read with .str.get() instead of unpacking the .str accessor,
# which recent pandas releases no longer support; missing pieces become NaN.
BracketParts = RawItems.item_name.str.split("[", n=1)
ParenParts = RawItems.item_name.str.split("(", n=1)
RawItems["name1"] = ParenParts.str.get(0)     # text before the first '('
RawItems["name2"] = BracketParts.str.get(1)   # text after the first '['
RawItems["name3"] = ParenParts.str.get(1)     # text after the first '('

# Replace runs of non-alphanumeric characters by a space. regex=True is explicit
# because the default of str.replace differs between pandas versions.
RawItems["name2"] = RawItems.name2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
RawItems["name3"] = RawItems.name3.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
# "0" marks items that had no bracket / parenthesis part.
RawItems = RawItems.fillna("0")

RawItems["item_name"] = RawItems["item_name"].apply(lambda x: name_correction(x))
# Drop the last character of name2 (presumably the space left by the closing ']').
RawItems.name2 = RawItems.name2.apply( lambda x: x[:-1] if x !="0" else "0")
# Temporary 'type' column: first word of name2, or the first 8 characters for xbox entries.
RawItems["type"] = RawItems.name2.apply(lambda x: x[0:8] if x.split(" ")[0] == "xbox" else x.split(" ")[0] )
RawItems.loc[(RawItems.type == "x360") | (RawItems.type == "xbox360") | (RawItems.type == "xbox 360") ,"type"] = "xbox 360"
RawItems.loc[ RawItems.type == "", "type"] = "mac"
RawItems.type = RawItems.type.apply( lambda x: x.replace(" ", "") )
# The comparisons below also catch look-alike spellings that use Cyrillic letters.
RawItems.loc[ (RawItems.type == 'pc' )| (RawItems.type == 'pс') | (RawItems.type == "pc"), "type" ] = "pc"
RawItems.loc[ RawItems.type == 'рs3' , "type"] = "ps3"
# Types with fewer than 40 items are collected in to_drop; name2 values equal to one
# of them are replaced by "etc".
GroupedItems = RawItems.groupby(["type"]).agg({"item_id": "count"}).reset_index()
to_drop = []
for cat in GroupedItems.type.unique():
    if GroupedItems.loc[(GroupedItems.type == cat), "item_id"].values[0] <40:
        to_drop.append(cat)
RawItems.name2 = RawItems.name2.apply( lambda x: "etc" if (x in to_drop) else x )
RawItems = RawItems.drop(["type"], axis = 1)
# Integer-encode the two derived text features.
RawItems.name2 = prep.LabelEncoder().fit_transform(RawItems.name2)
RawItems.name3 = prep.LabelEncoder().fit_transform(RawItems.name3)
Items=RawItems.drop(['item_name','name1'],axis=1)
# name2 / name3 are label codes whose count can exceed the int8 range (max 127),
# which would wrap around silently; int16 holds one code per catalogue row.
coltypes={'item_id':np.int16,'item_category_id':np.int8,'name2':np.int16,'name3':np.int16}
Items=LowerTypes(Items,coltypes)
Items.info()

### Item Category

In [None]:
# Use the first whitespace-separated word of the category name as a coarser group.
RawCategories['item_category_name_2']=RawCategories["item_category_name"].apply(lambda x: x.split()[0])
# Replace that word by an integer label code.
RawCategories["item_category_name_2"]=prep.LabelEncoder().fit_transform(RawCategories["item_category_name_2"]).astype(np.int8)
# The original text name is no longer needed.
Categories=RawCategories.drop(columns=['item_category_name'])
Categories.info()

### Aggregate to months

Lets group sales based on shop_id and item_id

In [None]:
#Group by month, shop_id, item_id
# Monthly total count and mean price per (month, shop, item).
SalesPerPeriodPerItem1=SalesTrain.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day':'sum','item_price':'mean'}).reset_index().rename(columns={'item_cnt_day':'item_cnt_month','item_price':'avg_shopitem_price'})
# Number of (shop, item) pairs that actually had sales in month 33.
ShopItemCombinationsFor33=SalesPerPeriodPerItem1[SalesPerPeriodPerItem1.date_block_num==33].shape[0]
print('Train data for month 33 is '+str(ShopItemCombinationsFor33)+", where for month 34 we predict for "+str(Number_Of_TestIDs))

We need to expand the training data to cover every shop-item pair of the test set for every month, padding missing values with zeros.

In [None]:
%%time
# Build a dense table: one row per test (shop_id, item_id) pair for each of the
# 34 training months, with 0 where the pair had no sales in that month.
SalesTemp=SalesTest.drop(columns=['ID']).reset_index()
MonthlyFrames=[]
for i in range(34):
    # Stamp every test pair with the current month and index it for the join.
    SalesTemp['date_block_num']=i
    SalesTemp2=SalesTemp.set_index(['date_block_num','shop_id','item_id'])
    SalesPPX=SalesPerPeriodPerItem1[SalesPerPeriodPerItem1.date_block_num==i].set_index(['date_block_num','shop_id','item_id'])
    # Left join keeps every test pair; pairs without sales are filled with 0.
    MonthlyFrames.append(SalesTemp2.join(SalesPPX).reset_index().fillna(0))
# Concatenate once instead of growing the frame inside the loop (DataFrame.append
# is quadratic here and is not available in recent pandas releases).
SalesTable=pd.concat(MonthlyFrames)
SalesTable.info()

In [None]:
#Group by month, item_id
# Monthly totals and mean price per item across all shops.
SalesPerPeriodPerItem2=SalesTrain.groupby(['date_block_num','item_id']).agg({'item_cnt_day':'sum','item_price':'mean'}).reset_index().rename(columns={'item_cnt_day':'item_cnt_month','item_price':'avg_item_price'})
# Attach avg_item_price to the dense table; months where the item was not sold anywhere get 0.
SalesPerPeriodPerItem3=SalesTable.set_index(['date_block_num','item_id']).join(SalesPerPeriodPerItem2.set_index(['date_block_num','item_id']).drop(columns=['item_cnt_month'])).fillna(0)

In [None]:
#Calculate price difference for each item in each shop to mean item price
# Relative difference: (shop price - overall item price) / overall item price.
# Rows where avg_item_price is 0 (filled above) divide by zero here.
SalesPerPeriodPerItem3['price_diff']=(SalesPerPeriodPerItem3['avg_shopitem_price']-SalesPerPeriodPerItem3['avg_item_price'])/SalesPerPeriodPerItem3['avg_item_price']
# Keep only the relative difference and restore the join keys as columns.
SalesPerPeriodPerItem4=SalesPerPeriodPerItem3.drop(columns=['avg_shopitem_price','avg_item_price']).reset_index()

SalesPerPeriodPerItem4.info()

Let's extend the training data with placeholder rows for the test month so that it accommodates our test data.

In [None]:
# Placeholder rows for the month to predict (date_block_num 34): target and
# price_diff are unknown and set to 0.
X_target1=SalesTest.reset_index().drop(columns=['ID'])
X_target1['date_block_num']=34
X_target1['item_cnt_month']=0
X_target1['price_diff']=0
# pd.concat replaces DataFrame.append, which recent pandas releases no longer provide.
# The fillna(0) also replaces NaN price_diff values produced by 0/0 divisions earlier.
SalesPerPeriodPerItem=pd.concat([SalesPerPeriodPerItem4,X_target1]).fillna(0)
coltypes={'date_block_num':np.int8,'shop_id':np.int8,'item_id':np.int16,'price_diff':np.float32,'item_cnt_month':np.float32}
SalesPerPeriodPerItem=LowerTypes(SalesPerPeriodPerItem,coltypes)
#if (SalesPerPeriodPerItem[SalesPerPeriodPerItem.date_block_num==34].shape[0])>214200:
#    print('!!!ERROR, row number larger than 214200')
# Intermediate frames are no longer needed.
delifexistslist(['SalesPerPeriodPerItem1','SalesPerPeriodPerItem2','SalesPerPeriodPerItem3','SalesPerPeriodPerItem4'])

Let's see how the busiest shops are doing.

In [None]:
# Rank shops by their mean monthly item count and keep the last three of the ordering.
# NOTE(review): np.argsort yields positions within the grouped result rather than
# shop_id labels, so the isin() filter below may be selecting shops by position —
# confirm; `.nlargest(3).index` would return the ids directly.
MostBusyShopIds=np.argsort(SalesPerPeriodPerItem.groupby('shop_id').item_cnt_month.mean(),axis=None)[-3:]
BusyShopActivity=SalesPerPeriodPerItem[SalesPerPeriodPerItem.shop_id.isin(MostBusyShopIds)]
plt.figure(figsize=(14,4))
# One line per selected shop: monthly item count over time.
sns.lineplot(data=BusyShopActivity,x='date_block_num',y='item_cnt_month',hue='shop_id')
#There is a decreasing trend in sales of shops
#There is a seasonality to the data with peaks happening on december (due to christmas).

Lets do a seasonal decompose

In [None]:
#https://stackoverflow.com/questions/45184055/how-to-plot-multiple-seasonal-decompose-plots-in-one-figure
def plotseasonal(res, axes, title):
    """Draw the four components of a decomposition result on four stacked axes.

    ``res`` must expose ``observed``, ``trend``, ``seasonal`` and ``resid``
    attributes with a ``plot`` method; ``axes`` must be indexable with at
    least four axes. ``title`` is set on the first axis.
    """
    panels = (
        ('observed', 'Observed'),
        ('trend', 'Trend'),
        ('seasonal', 'Seasonal'),
        ('resid', 'Residual'),
    )
    for position, (component, label) in enumerate(panels):
        target = axes[position]
        getattr(res, component).plot(ax=target, legend=False)
        target.set_ylabel(label)
    axes[0].set_title(title)
    
# 4 rows (observed / trend / seasonal / residual) x 2 columns (one per shop).
fig, axes1 = plt.subplots(ncols=2, nrows=4, sharex=True, figsize=(12,5))    
# Monthly counts of the two shops, indexed by month number.
ShopActivity19=BusyShopActivity.loc[BusyShopActivity.shop_id==19,['date_block_num','item_cnt_month']].set_index('date_block_num')
ShopActivity18=BusyShopActivity.loc[BusyShopActivity.shop_id==18,['date_block_num','item_cnt_month']].set_index('date_block_num')
# Additive decomposition with a 12-step seasonal period.
result19 = seasonal_decompose(ShopActivity19, model='additive', period=12)
result18 = seasonal_decompose(ShopActivity18, model='additive', period=12)
plotseasonal(result19,axes1[:,0], title = 'Sales decomposition for Store 19')
# The right-hand column shows shop 18 (the title previously said "Store 8").
plotseasonal(result18,axes1[:,1], title = 'Sales decomposition for Store 18')
plt.tight_layout()

# 2.Feature Extraction

We add City and Type features from shops

In [None]:
# Attach item features (item_category_id, name2, name3) by item_id.
SalesPerPeriodPerItem1=SalesPerPeriodPerItem.set_index('item_id').join(Items.set_index('item_id')).reset_index()
# Attach the category group code by item_category_id.
SalesPerPeriodPerItem2=SalesPerPeriodPerItem1.set_index('item_category_id').join(Categories.set_index('item_category_id')).reset_index()
# Attach shop features (Population, CityEn, TypeEn); Shops is already indexed by shop_id.
EnrichedSales1=SalesPerPeriodPerItem2.set_index('shop_id').join(Shops).reset_index()
EnrichedSales1.info()

In [None]:
# add month day, week day and weekend count

def last_day_of_month(date_block_num):
    """Return the number of days in the month identified by ``date_block_num``.

    Block 0 is January 2013; each following block is the next calendar month.
    """
    years_after_2013, month_index = divmod(date_block_num, 12)
    anchor = dt.datetime(year=2013+years_after_2013, month=month_index+1, day=28)
    # The 28th plus four days always falls in the following month ...
    spill = anchor + dt.timedelta(days=4)
    # ... and stepping back by its day-of-month lands on the last day of the original month.
    month_end = spill - dt.timedelta(days=spill.day)
    return month_end.day

def count_holidays(date_block_num):
    """Return the hard-coded holiday count for the calendar month of ``date_block_num``.

    The calendar month is ``1 + date_block_num % 12``; months missing from
    the table count as 0.
    """
    holidays_by_month = {1: 1, 2: 1, 3: 1, 5: 2, 6: 1, 11: 1, 12: 2}
    calendar_month = 1 + date_block_num % 12
    return holidays_by_month.get(calendar_month, 0)
    
def count_weekdays(date_block_num):
    """Return the number of Monday-Friday days in the month of ``date_block_num``.

    Block 0 is January 2013. The value is coerced with ``int()`` so that
    integral floats and NumPy integers are accepted; previously a malformed
    date string was only printed and ``None`` was returned, which broke the
    integer cast applied to the result.

    Returns
    -------
    The value of ``np.busday_count`` between the first day of the month and
    the first day of the following month.
    """
    block = int(date_block_num)
    year = 2013 + block // 12
    month = 1 + block % 12
    # December rolls over to January of the next year.
    if month == 12:
        next_year, next_month = year + 1, 1
    else:
        next_year, next_month = year, month + 1
    # 'YYYY-MM' strings denote the first day of the month; :02d gives the zero padding.
    return np.busday_count(f'{year}-{month:02d}', f'{next_year}-{next_month:02d}')

In [None]:
%%time
# Calendar features derived from the month number, one helper per feature.
MonthIds=EnrichedSales1.date_block_num
EnrichedSales1['days_in_month']=MonthIds.apply(last_day_of_month).astype(np.int8)
EnrichedSales1['holidays']=MonthIds.apply(count_holidays).astype(np.int8)
EnrichedSales1['weekdays']=MonthIds.apply(count_weekdays).astype(np.int8)
EnrichedSales1.info()

Let's look at moving averages to separate trend and seasonality.

In [None]:
# 12-step rolling statistics of the shop-19 series.
RollingMean19=ShopActivity19.item_cnt_month.rolling(window=12).mean()
RollingStd19=ShopActivity19.item_cnt_month.rolling(window=12).std()
# Rolling z-score: (value - rolling mean) / rolling std. The parentheses matter:
# without them only the mean was divided by the std before the subtraction.
ShopActivity19['zdata']=(ShopActivity19.item_cnt_month-RollingMean19)/RollingStd19
fig, ax = plt.subplots(2, figsize=(12,6))
# Top panel: raw series with its rolling mean and std.
ax[0].plot(ShopActivity19.index,ShopActivity19.item_cnt_month,label='Raw Data')
ax[0].plot(RollingMean19,label='Rolling Mean')
ax[0].plot(RollingStd19,label='Rolling Std')
ax[0].legend()
# Bottom panel: the standardised series with its own rolling mean and std.
ax[1].plot(ShopActivity19.index,ShopActivity19.zdata,label='Untrended Data')
ax[1].plot(ShopActivity19.zdata.rolling(window=12).mean(),label='Rolling Mean')
ax[1].plot(ShopActivity19.zdata.rolling(window=12).std(),label='Rolling Std')
ax[1].legend()
plt.tight_layout()

In [None]:
# Autocorrelation (top) and partial autocorrelation (bottom) of the zdata series, 10 lags.
fig, ax = plt.subplots(2, figsize=(12,6))
# NOTE(review): the return values overwrite the entries of `ax`; presumably only the
# drawing side effect is wanted — confirm.
ax[0] = plot_acf(ShopActivity19.zdata.dropna(), ax=ax[0], lags=10)
ax[1] = plot_pacf(ShopActivity19.zdata.dropna(), ax=ax[1], lags=10)

Add item introduction

In [None]:
%%time
# "Months since first appearance" features at three levels: (item, shop),
# (item, city) and item. Each block computes the minimum date_block_num per
# group, joins it back and converts it to an age (current month - first month).
#
# The frames are indexed with set_index() directly. The earlier
# reset_index().set_index() pattern pushed the row-number index into the data as
# 'index' and 'level_0' columns, which then travelled on as model features.
#
# NOTE(review): the frame appears to hold a row for every month for each test pair
# (zero-padded), so these minima may all be 0 — confirm whether the first *sale*
# month was intended.
FirstIntroShop=EnrichedSales1.groupby(['item_id','shop_id']).date_block_num.min().reset_index().rename(columns={'date_block_num':'item_first_intro_shop'})
EnrichedSales2=EnrichedSales1.set_index(['item_id','shop_id']).join(FirstIntroShop.set_index(['item_id','shop_id'])).reset_index()
EnrichedSales2['item_first_intro_shop']=EnrichedSales2['date_block_num']-EnrichedSales2['item_first_intro_shop']

FirstIntroCity=EnrichedSales2.groupby(['item_id','CityEn']).date_block_num.min().reset_index().rename(columns={'date_block_num':'item_first_intro_city'})
EnrichedSales3=EnrichedSales2.set_index(['item_id','CityEn']).join(FirstIntroCity.set_index(['item_id','CityEn'])).reset_index()
EnrichedSales3['item_first_intro_city']=EnrichedSales3['date_block_num']-EnrichedSales3['item_first_intro_city']

FirstIntro=EnrichedSales3.groupby(['item_id']).date_block_num.min().reset_index().rename(columns={'date_block_num':'item_first_intro'})
EnrichedSales4=EnrichedSales3.set_index(['item_id']).join(FirstIntro.set_index(['item_id'])).reset_index()
EnrichedSales4['item_first_intro']=EnrichedSales4['date_block_num']-EnrichedSales4['item_first_intro']

# Differences between the shop-level age and the city-level / overall age.
EnrichedSales4['item_first_intro_shopCity']=EnrichedSales4['item_first_intro_shop']-EnrichedSales4['item_first_intro_city']
EnrichedSales4['item_first_intro_shopGen']=EnrichedSales4['item_first_intro_shop']-EnrichedSales4['item_first_intro']

EnrichedSales4.info()

In [None]:
#Clean memory
# Continue from a copy so the intermediate frames of the previous cell can be released.
EnrichedSales1=EnrichedSales4.copy()
# The frames are named EnrichedSales2..4; the list previously named non-existent
# 'Enriched2'..'Enriched4', so nothing matched.
delifexistslist(['EnrichedSales2','EnrichedSales3','EnrichedSales4','FirstIntroShop','FirstIntroCity','FirstIntro'])

Add lagged features

In [None]:
def laggedFeature(df, lags, col,ntype):
    """Add ``<col>_lag_<k>`` columns holding ``col`` from ``k`` months earlier.

    Parameters
    ----------
    df : DataFrame with 'date_block_num', 'shop_id', 'item_id' and ``col``.
    lags : iterable of month offsets.
    col : name of the column to lag.
    ntype : dtype the new lag columns are cast to.

    Returns
    -------
    A new DataFrame with the key columns first, a fresh integer index, one
    extra column per lag, and every remaining NaN replaced by 0.
    """
    keys = ['date_block_num','shop_id','item_id']
    # Values are always taken from the frame as passed in, not from the growing result.
    source = df[keys + [col]]
    for lag in lags:
        lag_name = col+'_lag_' + str(lag)
        lagged = source.copy().rename(columns={col:lag_name})
        # Moving the month forward by `lag` makes month m line up with the value of month m-lag.
        lagged['date_block_num'] += lag
        lagged = lagged.set_index(keys)
        df = df.set_index(keys).join(lagged).reset_index()
        df[lag_name] = df[lag_name].astype(ntype)
    return df.fillna(0)

In [None]:
%%time
# Add item_cnt_month lagged by 1, 2 and 3 months as float32 columns.
EnrichedSales2=laggedFeature(EnrichedSales1,[1,2,3],'item_cnt_month',np.float32)

Lets check if it was populated for test part

In [None]:
# Sanity check: the 1-month lag stored on the test month (34) must equal the
# actual item_cnt_month of month 33 for every (shop_id, item_id) pair.
ESL1=EnrichedSales2[EnrichedSales2.date_block_num==34].set_index(['shop_id','item_id']).item_cnt_month_lag_1
ESL2=EnrichedSales2[EnrichedSales2.date_block_num==33].set_index(['shop_id','item_id']).item_cnt_month
testdf=pd.DataFrame(ESL1)
testdf=testdf.join(ESL2).fillna(0)
testdf['diff']=testdf.item_cnt_month_lag_1-testdf.item_cnt_month
# Every difference has to be exactly zero; a single unique value alone would also
# accept a constant non-zero offset.
if (testdf['diff']==0).all():
    print('Good')
else:
    print('There must be an error')

Lets add the change of prices as a feature

In [None]:
%%time
# Add price_diff lagged by 1-3 months, then drop the same-month price_diff column
# so that only the lagged versions remain.
EnrichedSales3=laggedFeature(EnrichedSales2,[1,2,3],'price_diff',np.float32).fillna(0).drop(columns=['price_diff'])
EnrichedSales3.info()

Add previous Target Encodings to EnrichedSales

In [None]:
%%time
# Each block: mean item_cnt_month per (month, group), joined back as a new column.
# These same-month means are lagged, and the unlagged columns dropped, in a later cell.
#Item-item_cnt_month Encoding
ItemTargetEncoding=EnrichedSales3.groupby(['date_block_num','item_id'])['item_cnt_month'].mean().reset_index().rename(columns={"item_cnt_month": "item_target_enc"})
EnrichedSales4=EnrichedSales3.set_index(['date_block_num','item_id']).join(ItemTargetEncoding.set_index(['date_block_num','item_id'])).reset_index()
#Item-item+Shop_id_cnt_month Encoding
# NOTE(review): if (month, shop, item) is unique per row, this mean equals item_cnt_month itself — confirm it adds information.
ShopItemTargetEncoding=EnrichedSales4.groupby(['date_block_num','shop_id','item_id'])['item_cnt_month'].mean().reset_index().rename(columns={"item_cnt_month": "shopitem_target_enc"})
EnrichedSales5=EnrichedSales4.set_index(['date_block_num','shop_id','item_id']).join(ShopItemTargetEncoding.set_index(['date_block_num','shop_id','item_id'])).reset_index()
#Item+item_category_id-item_cnt_month Encoding
# NOTE(review): item_id is part of the group key, so this presumably matches item_target_enc when each item has one category — confirm.
ItemCategoryTargetEncoding=EnrichedSales5.groupby(['date_block_num','item_id','item_category_id'])['item_cnt_month'].mean().reset_index().rename(columns={"item_cnt_month": "item_target_cat_enc"})
EnrichedSales6=EnrichedSales5.set_index(['date_block_num','item_id','item_category_id']).join(ItemCategoryTargetEncoding.set_index(['date_block_num','item_id','item_category_id'])).reset_index()

#Item+item_category_name_2-item_cnt_month Encoding
# The name ItemCategoryTargetEncoding is reused here for the category-group variant.
ItemCategoryTargetEncoding=EnrichedSales6.groupby(['date_block_num','item_id','item_category_name_2'])['item_cnt_month'].mean().reset_index().rename(columns={"item_cnt_month": "item_target_cat_enc2"})
EnrichedSales7=EnrichedSales6.set_index(['date_block_num','item_id','item_category_name_2']).join(ItemCategoryTargetEncoding.set_index(['date_block_num','item_id','item_category_name_2'])).reset_index()

#Item+CityEn-item_cnt_month Encoding
ItemCategoryCityTargetEncoding=EnrichedSales7.groupby(['date_block_num','item_id','CityEn'])['item_cnt_month'].mean().reset_index().rename(columns={"item_cnt_month": "item_target_city_enc"})
EnrichedSales8=EnrichedSales7.set_index(['date_block_num','item_id','CityEn']).join(ItemCategoryCityTargetEncoding.set_index(['date_block_num','item_id','CityEn'])).reset_index()

In [None]:
#Clean memory
# Request removal of the encoding tables and the intermediate frames; EnrichedSales8 is kept
# for the next cell. ('ItemCategoryTargetEncoding' appears twice in the list.)
delifexistslist(['ItemTargetEncoding','ShopItemTargetEncoding','ItemCategoryTargetEncoding','ItemCategoryTargetEncoding','ItemCategoryCityTargetEncoding','EnrichedSales3','EnrichedSales4','EnrichedSales5','EnrichedSales6','EnrichedSales7'])

In [None]:
%%time
# Lag each target encoding by 1, 2 and 3 months.
EnrichedSales9=laggedFeature(EnrichedSales8,[1,2,3],'item_target_enc',np.float32)
EnrichedSales10=laggedFeature(EnrichedSales9,[1,2,3],'shopitem_target_enc',np.float32)
EnrichedSales11=laggedFeature(EnrichedSales10,[1,2,3],'item_target_cat_enc',np.float32)
EnrichedSales12=laggedFeature(EnrichedSales11,[1,2,3],'item_target_cat_enc2',np.float32)
EnrichedSales13=laggedFeature(EnrichedSales12,[1,2,3],'item_target_city_enc',np.float32)
#*************************
# It is extremely important to delete below columns after introducing lagged values 
# as they are a source of target leakage
#*************************
EnrichedSales13=EnrichedSales13.drop(columns=['item_target_enc','shopitem_target_enc','item_target_cat_enc','item_target_cat_enc2','item_target_city_enc'])
# Months 0-2 cannot have complete 3-month lags, so they are dropped. The mask is built
# from EnrichedSales13 itself (it was previously taken from EnrichedSales11, a different frame).
EnrichedSales13=EnrichedSales13[EnrichedSales13['date_block_num']>2]
EnrichedSales13.info()

Convert data and lower mem usage

In [None]:
# Downcast the key / categorical integer columns to reduce memory usage.
coltypes={'date_block_num':np.int8,'shop_id':np.int8,'item_id':np.int16,'CityEn':np.int8,'item_category_id':np.int8}
EnrichedSales13=LowerTypes(EnrichedSales13,coltypes)

save and clear data

In [None]:
# Persist the final feature table; the context manager closes the file after writing.
EnrichedSales=EnrichedSales13
with open("EnrichedSales.pkl", "wb") as PickleFile:
    pickle.dump(EnrichedSales, PickleFile)
# Release the raw tables and every intermediate frame that is no longer needed.
delifexistslist(['RawShops','RawItems','Items','Categories']) 
delifexistslist(['SalesTrain','SalesTable','SalesPerPeriodPerItem1','SalesPerPeriodPerItem2','SalesPerPeriodPerItem3','SalesPerPeriodPerItem'])
delifexistslist(['EnrichedSales1','EnrichedSales2','EnrichedSales3','EnrichedSales4','EnrichedSales5'])
delifexistslist(['EnrichedSales6','EnrichedSales7','EnrichedSales8','EnrichedSales9','EnrichedSales10'])
delifexistslist(['EnrichedSales11','EnrichedSales12'])

Lets look at the correlation matrix

In [None]:
# Pairwise correlation of all columns of the feature table, drawn as a heat map without cell labels.
corrMatrix1=EnrichedSales.corr()
sns.heatmap(corrMatrix1, annot=False)

# 3.Create and Tune Model

First, let's use the previous month's sales (2015-09) as a naive prediction for the last available month (2015-10) to get a baseline.

In [None]:
MyPath='../input/future-sales-prediction/'
CompPath='../input/competitive-data-science-predict-future-sales/'
TRPath='../input/translated/'
# The feature table is read from the MyPath dataset, not from the working directory
# the earlier cell wrote to. pickle.load must only be used on trusted files.
with open(MyPath+"EnrichedSales.pkl", "rb") as PickleFile:
    ModelData = pickle.load(PickleFile)
SalesTest1=pd.read_csv(CompPath+'test.csv')
SalesTest=SalesTest1.set_index(['shop_id','item_id'])
# Cap the target at 20. Assigning the clipped column back avoids the chained
# assignment `ModelData.item_cnt_month[mask]=20`, which is not guaranteed to
# modify ModelData.
ModelData['item_cnt_month']=ModelData['item_cnt_month'].clip(upper=20)
# Month 32 (2015-09) values aligned to the test pairs.
ModelData201509=ModelData[ModelData.date_block_num==32]
ModelData201509y=SalesTest.join(ModelData201509.set_index(['shop_id','item_id']).item_cnt_month).fillna(0).item_cnt_month.values
# Month 33 (2015-10) values aligned to the test pairs.
ModelData201510=ModelData[ModelData.date_block_num==33]
ModelData201510y=SalesTest.join(ModelData201510.set_index(['shop_id','item_id']).item_cnt_month).fillna(0).item_cnt_month.values
#Lets check RSME value if 201509 is used as prediction for 201510
# sqrt(MSE) works on every scikit-learn version; `squared=False` is not accepted by recent ones.
rms = np.sqrt(mean_squared_error(ModelData201510y, ModelData201509y))
print('RMSE for 201509->201510 Regession:'+str(rms))

In [None]:
#Create Train, Validation Set
# Months before 33 are used for training, month 33 for validation, and
# month 34 (the placeholder rows) is the prediction target.
FeatureColumns=[name for name in ModelData.columns if name!='item_cnt_month']
IsTrainMonth=ModelData.date_block_num<33
IsValidMonth=ModelData.date_block_num==33
IsTargetMonth=ModelData.date_block_num==34
X_train=ModelData.loc[IsTrainMonth,FeatureColumns]
y_train=ModelData.loc[IsTrainMonth,'item_cnt_month']
X_valid=ModelData.loc[IsValidMonth,FeatureColumns]
y_valid=ModelData.loc[IsValidMonth,'item_cnt_month'].values
X_target=ModelData.loc[IsTargetMonth,FeatureColumns]

We will train a linear regression model and test it on validation set

In [None]:
# Ordinary least squares on all features, scored on month 33.
model1=LinearRegression()
model1.fit(X_train,y_train)
y_pred1=model1.predict(X_valid)
# Round to whole items and cap at 20, like the target.
y_pred=np.rint(y_pred1)
y_pred[y_pred>20]=20
# sqrt(MSE) works on every scikit-learn version; `squared=False` is not accepted by recent ones.
rms = np.sqrt(mean_squared_error(y_valid, y_pred))
print('RMSE for Linear Regession:'+str(rms))
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# matplotlib; use whichever name this installation provides.
GridStyle='seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available else 'seaborn-v0_8-whitegrid'
plt.style.use(GridStyle)
# Red: actual values, blue: predictions.
plt.plot(y_valid,'.',color='red')
plt.plot(y_pred,'.',color='blue')
plt.show()

We will train catboostregressor model and test it on validation set

In [None]:
%%time
# Columns that CatBoost should treat as categorical rather than numeric.
categorical=['date_block_num','shop_id','item_id','CityEn','item_category_id','name2','name3','TypeEn']
model2 = CatBoostRegressor(
            iterations=50,
            learning_rate=1,
            depth=2)

model2.fit(
    X_train, y_train,
    cat_features=categorical,
    eval_set=(X_valid, y_valid),
    logging_level='Silent'
)
print('Model is fitted: ' + str(model2.is_fitted()))
# The return value is not used or shown here.
model2.get_best_score()
y_pred1=model2.predict(X_valid)
# Round to whole items and cap at 20, like the target.
y_pred=np.rint(y_pred1)
y_pred[y_pred>20]=20
# sqrt(MSE) works on every scikit-learn version; `squared=False` is not accepted by recent ones.
rms = np.sqrt(mean_squared_error(y_valid, y_pred))
print('RMSE for Catboost:'+str(rms))
# Red: actual values, blue: predictions.
plt.plot(y_valid,'.',color='red')
plt.plot(y_pred,'.',color='blue')
plt.show()
# Save the fitted model; the context manager closes the file after writing.
with open("model2.pkl", "wb") as ModelFile:
    pickle.dump(model2, ModelFile)

We will train a random forest model and test it on the validation set. Note that the hyperparameters below were chosen with a grid search (GridSearchCV), which is left commented out.

In [None]:
%%time
# Grid that was searched to pick the parameters used below (kept for reference):
#params_rf = {
#    'n_estimators': [20,30,40,50], 
#    'max_depth':[2,4,6], 
#    'max_features': ['auto', 'sqrt'], 
#}
#n_estimators larger than 20 does not improve score considerably
#max_depth is 6
# Below paramaters results better score
#RFR=ens.RandomForestRegressor()
#model3 = GridSearchCV(RFR, params_rf, cv=None, refit=True,n_jobs=-1)
model3 = ens.RandomForestRegressor(n_estimators=20,max_depth=6, random_state=0,n_jobs=-1)
model3.fit(X_train, y_train)
#print(model3.best_params_)
y_pred1=model3.predict(X_valid)
# Round to whole items and cap at 20, like the target.
y_pred=np.rint(y_pred1)
y_pred[y_pred>20]=20
# sqrt(MSE) works on every scikit-learn version; `squared=False` is not accepted by recent ones.
rms = np.sqrt(mean_squared_error(y_valid, y_pred))
print('RMSE for RandomForest Regession:'+str(rms))
# Red: actual values, blue: predictions.
plt.plot(y_valid,'.',color='red')
plt.plot(y_pred,'.',color='blue')
plt.show()
# Save the fitted model; the context manager closes the file after writing.
with open("model3.pkl", "wb") as ModelFile:
    pickle.dump(model3, ModelFile)

Lets also try xgboost

In [None]:
%%time
# Gradient-boosted trees: depth 10, up to 100 rounds, row and column subsampling of 0.8.
model4 = XGBRegressor(
    max_depth=10,
    booster='gbtree',
    n_estimators=100,
    min_child_weight=0.5, 
    subsample=0.8,
    sampling_method="uniform",
    colsample_bynode=1,
    colsample_bytree=0.8, 
    eta=0.1,
    seed=0)

# RMSE is tracked on both the training and validation sets; training stops early
# after 10 rounds without improvement.
# NOTE(review): newer xgboost releases expect eval_metric / early_stopping_rounds on
# the constructor rather than on fit() — confirm against the installed version.
model4.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

In [None]:
%%time
y_pred1=model4.predict(X_valid)
# Round to whole items and cap at 20, like the target.
y_pred=np.rint(y_pred1)
y_pred[y_pred>20]=20
# sqrt(MSE) works on every scikit-learn version; `squared=False` is not accepted by recent ones.
rms = np.sqrt(mean_squared_error(y_valid, y_pred))
# This cell scores the XGBoost model (the message previously said RandomForest).
print('RMSE for XGBoost Regression:'+str(rms))
# Red: actual values, blue: predictions.
plt.plot(y_valid,'.',color='red')
plt.plot(y_pred,'.',color='blue')
plt.show()
# Save the fitted model; the context manager closes the file after writing.
with open("model4.pkl", "wb") as ModelFile:
    pickle.dump(model4, ModelFile)

In [None]:
# Feature importance chart for the XGBoost model on a tall figure.
fig, ax = plt.subplots(1,1,figsize=(15,20))
plot_importance(booster=model4, ax=ax)

# 4.Make Predictions and submit

Use the most promising model (the saved XGBoost model) for the final predictions.

In [None]:
# Every labelled month (0-33). The month-34 rows only carry a placeholder target
# of 0 and must not be part of a training set.
LabelledRows=ModelData.date_block_num<34
X_all=ModelData[LabelledRows].drop(columns=['item_cnt_month'])
y_all=ModelData.loc[LabelledRows,'item_cnt_month'].values

# The final model is the XGBoost model saved earlier in this notebook. It is loaded
# as-is and not refitted on X_all / y_all. (A RandomForestRegressor used to be
# assigned to modelFinal here and was immediately overwritten by this load.)
with open("./model4.pkl", "rb") as ModelFile:
    modelFinal=pickle.load(ModelFile)

In [None]:
#Predict Final Model Output
ModelOutput1=modelFinal.predict(X_target)
# Round to whole items, then limit the values to the range [0, 20].
Predictions=np.clip(np.rint(ModelOutput1),0,20)
df=pd.DataFrame(Predictions)

In [None]:
# Re-read the test set and index it by (shop_id, item_id) for the submission join.
SalesTest1=pd.read_csv(CompPath+'test.csv')
SalesTest=SalesTest1.set_index(['shop_id','item_id'])

Organize output data for submission

In [None]:
# Pair every prediction with its (shop_id, item_id). assign() builds a new frame, so
# no column is written onto a slice of X_target (which triggers SettingWithCopyWarning).
PredsDF=X_target[['shop_id','item_id']].assign(item_cnt_month=Predictions)
# Align the predictions to the test IDs and keep only ID and item_cnt_month.
Submission=SalesTest.join(PredsDF.set_index(['shop_id','item_id'])).reset_index().drop(columns=['shop_id','item_id'])
plt.plot(Submission.item_cnt_month,'.')

For submission, encapsulate data to create csv file

In [None]:
# Write the submission file without the DataFrame index column.
Submission.to_csv('submission10.csv',index=False)