<a href="https://colab.research.google.com/github/riemann1859/store-item-demand-forecasting-challenge/blob/master/Lasso%2BRecursiveStrategy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#load datasets
df_train=pd.read_csv('train.csv',sep=',')
df_test=pd.read_csv('test.csv',sep=',')

#drop id column when it is present; errors='ignore' makes this a no-op otherwise
df_test=df_test.drop(columns='id',errors='ignore')
df_train=df_train.drop(columns='id',errors='ignore')

# create sales column for df_test which is to be predicted
# (np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0)
df_test['sales']=np.nan

# tag every row with the split it belongs to so the frames can be separated again later
df_test['purpose']='test'
df_train['purpose']='train'

#combine df_train and df_test into one dataframe
# NOTE: the original row labels of both frames are kept, so labels are not unique in df;
# the later cells only rely on row order / boolean masks, not on label lookups.
df=pd.concat([df_train, df_test], axis=0)

#convert to date_time object (done before the split so dates are compared as dates, not strings)
df['date'] = pd.to_datetime(df['date'])

# training rows dated after 2016-12-31 are held out as the validation set
df.loc[(df.purpose=='train')&(df.date>pd.Timestamp('2016-12-31')),'purpose']='validation'

# order rows by series (item, store) and then chronologically; later cells depend on this order
df=df.sort_values(by=['item','store','date'])



In [0]:
# prepare dataframe
# if we denote sales column with S_t, we add S_(t-1), S_(t-2),.... to this new dataframe

N_LAGS=399  # number of lagged sales columns: sales_lagged_1 ... sales_lagged_399
lag_column_names=['sales_lagged_{}'.format(i) for i in range(1,N_LAGS+1)]

# Collect one frame per (item, store) series and concatenate once at the end;
# growing new_df with pd.concat inside the loop re-copies all previous rows on every iteration.
series_frames=[]
for item in df.item.unique():
    item_rows=df.loc[df.item==item]
    for store in df.store.unique():
        series_rows=item_rows.loc[item_rows.store==store]
        #add lagged sales column to  the related part of the original dataframe
        # (shifting within a single series so lags never leak across item/store boundaries)
        new=pd.concat([series_rows]+[series_rows['sales'].shift(i) for i in range(1,N_LAGS+1)],axis=1)
        new.columns=list(df.columns)+lag_column_names
        series_frames.append(new)

# same row order as before: item outer, store inner, dates ascending within each series
new_df=pd.concat(series_frames,axis=0) if series_frames else pd.DataFrame()

In [0]:
#create some new categorical features from date

# read each calendar attribute off the row's timestamp; columns are added in this order
for calendar_attr in ('weekofyear','dayofweek','year','month'):
    new_df[calendar_attr]=new_df.date.apply(lambda ts,name=calendar_attr:getattr(ts,name))

# treat the calendar fields and the series identifiers as categoricals
# so that they are one-hot encoded rather than used as numbers
for categorical_col in ('weekofyear','dayofweek','year','month','store','item'):
    new_df[categorical_col]=new_df[categorical_col].astype('category')


In [8]:
# convert categoricals to dummy variables
# ('purpose' is set aside during encoding and re-attached so it is not one-hot encoded itself)

new_df_with_dummies=pd.concat([pd.get_dummies(new_df.drop(labels='purpose',axis=1)),new_df.purpose],axis=1)

# Rows whose lag window reaches before the start of the series contain NaNs and are dropped.
# dropna() returns a new frame; calling dropna(inplace=True) on a filtered slice is what
# raised SettingWithCopyWarning.
train=new_df_with_dummies[new_df_with_dummies.purpose=='train'].dropna()
validation=new_df_with_dummies[new_df_with_dummies.purpose=='validation'].dropna()
# test rows keep their NaNs: 'sales' and the most recent lags are unknown until forecast
test=new_df_with_dummies[new_df_with_dummies.purpose=='test']


# feature matrices exclude the target, the raw date and the split label
xtrain=train.drop(labels=['sales','date','purpose'],axis=1)
ytrain=train.sales.values
xvalidation=validation.drop(labels=['sales','date','purpose'],axis=1)
yvalidation=validation.sales.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [9]:
from sklearn.linear_model import  LassoLarsIC



# Lasso fitted with LARS; the penalty strength is chosen by the BIC criterion on the training set
model_bic = LassoLarsIC(criterion='bic')
model_bic.fit(X=xtrain, y=ytrain)

LassoLarsIC(copy_X=True, criterion='bic', eps=2.220446049250313e-16,
            fit_intercept=True, max_iter=500, normalize=True, positive=False,
            precompute='auto', verbose=False)

In [10]:
# Lasso fitted with LARS; the penalty strength is chosen by the AIC criterion on the training set
model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X=xtrain, y=ytrain)



LassoLarsIC(copy_X=True, criterion='aic', eps=2.220446049250313e-16,
            fit_intercept=True, max_iter=500, normalize=True, positive=False,
            precompute='auto', verbose=False)

In [11]:

from sklearn.linear_model import  LassoLarsCV


# Lasso fitted with LARS; the penalty strength is chosen by 10-fold cross-validation
model_CV = LassoLarsCV(max_iter=1000, cv=10)
model_CV.fit(X=xtrain, y=ytrain)

LassoLarsCV(copy_X=True, cv=10, eps=2.220446049250313e-16, fit_intercept=True,
            max_iter=1000, max_n_alphas=1000, n_jobs=None, normalize=True,
            positive=False, precompute='auto', verbose=False)

In [12]:
#best alphas according to the three methods above
# printed in the order AIC, BIC, cross-validation

for fitted_model in (model_aic, model_bic, model_CV):
    print(fitted_model.alpha_)



8.02253852187597e-06
0.00013497300365540607
5.200219542816652e-06


In [13]:
# performances on validation set with respect to smape metric

def smape(A, F):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100/len(A) * sum(2*|F - A| / (|A| + |F|))``. The expression is
    symmetric in its two arguments, so the order of actuals and forecasts does
    not affect the result.

    Parameters
    ----------
    A, F : array-like of numbers with matching (broadcastable) shapes.

    Returns
    -------
    float
        The SMAPE value. Terms where both ``A`` and ``F`` are zero contribute 0
        (a perfect forecast) instead of producing 0/0 = NaN.
    """
    A = np.asarray(A, dtype=float)
    F = np.asarray(F, dtype=float)
    numerator = 2 * np.abs(F - A)
    denominator = np.abs(A) + np.abs(F)
    # Divide only where the denominator is non-zero; the remaining entries keep the 0 from `out`.
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator), where=denominator != 0)
    return 100/len(A) * np.sum(terms)

# validation SMAPE for the AIC-, BIC- and CV-selected models, in that order
for fitted_model in (model_aic, model_bic, model_CV):
    validation_forecast = fitted_model.predict(xvalidation)
    print(smape(validation_forecast, yvalidation))

12.434772034534191
12.610491233115054
12.421586803620523


In [14]:
# we now construct a Lasso model with alpha=model_CV.alpha_ on the training set xtrain+xvalidation

from sklearn.linear_model import Lasso

# stack the validation rows underneath the training rows (features and targets in the same order)
xtrain_val = pd.concat([xtrain, xvalidation])
ytrain_val = np.concatenate([ytrain, yvalidation])

# NOTE(review): alpha comes from LassoLarsCV; check that both estimators scale
# the features the same way, otherwise the penalty is not directly comparable.
lasso = Lasso(alpha=model_CV.alpha_, max_iter=1000)
lasso.fit(xtrain_val, ytrain_val)

  positive)


Lasso(alpha=5.200219542816652e-06, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [48]:
# predictions with Recursive strategy
#
# Each (item, store) series is forecast one day at a time. For the k-th test day the
# lag columns sales_lagged_1 ... sales_lagged_k refer to earlier *test* days whose sales
# are unknown, so they are filled with the forecasts already made for those days.

HORIZON=90            # test days per (item, store) series
STORES_PER_ITEM=10    # series per item; rows of `test` are ordered item -> store -> date
                      # assumes item and store ids are 1-based consecutive integers

# Same feature layout as xtrain (target, raw date and split label removed), extracted once.
test_feature_frame=test.drop(labels=['sales','date','purpose'],axis=1)
assert test_feature_frame.columns[0]=='sales_lagged_1', "lag columns must come first"
test_features=test_feature_frame.values.astype(float)

predictions={} # two dimensional dictionary: predictions[item][store] -> list of HORIZON forecasts
for item in df.item.unique():
    predictions[item]=dict()
    for store in df.store.unique():
        start=HORIZON*STORES_PER_ITEM*(item-1)+HORIZON*(store-1)
        series_forecasts=list()
        for step in range(HORIZON):
            features=test_features[start+step].copy()
            # sales_lagged_j is the value j days back, i.e. the forecast made j steps ago:
            # the most recent forecast goes first, hence the reversal.
            features[:step]=series_forecasts[::-1]
            series_forecasts.append(lasso.predict(features.reshape(1,-1))[0])
        predictions[item][store]=series_forecasts
     
    


1565974974.106983
1565975015.4061806


In [0]:
#prepare the submission file

# copy each series' 90 forecasts into the 'sales' column (position 3) of df_test;
# the row offsets mirror the item -> store ordering used when forecasting
for item in df.item.unique():
    for store in df.store.unique():
        first_row=900*(item-1)+90*(store-1)
        df_test.iloc[first_row:first_row+90,3]=predictions[item][store]

# the submission has two columns: the row id (df_test's index) and the forecast sales
pred_submission=pd.DataFrame({'id':df_test.index,'sales':df_test.sales}, columns=['id','sales'])

from google.colab import files

# write the CSV locally, then trigger a browser download from the Colab runtime
with open('pred.csv', 'w') as csv_file:
    pred_submission.to_csv(csv_file, index=False)

files.download('pred.csv')
        