In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Loading Data 

In [None]:
sales = pd.read_csv("../input/train.csv")
sales.head()

In [None]:
sales.info()

In [None]:
stores = pd.read_csv("../input/store.csv")
stores.head()

In [None]:
stores.info()

In [None]:
test = pd.read_csv('../input/test.csv')
test = pd.merge(test, stores, 'left', 'Store')
test.head()

In [None]:
# Attach the store metadata to every daily sales record and shuffle the rows.
# A fixed random_state keeps the shuffle reproducible across kernel restarts.
data = pd.merge(sales, stores, how='left', on='Store').sample(frac=1, random_state=10)
data['Date'] = pd.to_datetime(data['Date'])
# Calendar features derived from the parsed date.
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data['Day'] = data['Date'].dt.day
data['WeekOfYear'] = data['Date'].dt.weekofyear
data['Quarter'] = data['Date'].dt.quarter
# Replace the integer 0 with the string '0' so StateHoliday uses one
# representation for that value.
data['StateHoliday'] = data['StateHoliday'].replace(0, '0')
data.head()

In [None]:
data.info()

In [None]:
test.head()

In [None]:
#Number of stores in sales not in stores data

print(len(set(sales['Store']) - set(stores['Store'])))

In [None]:
#Number of test stores not in train stores

print(len(set(test['Store']) - set(data['Store'])))

In [None]:
#Number of train stores not in test stores

print(len(set(data['Store']) - set(test['Store'])))

# EDA 

In [None]:
g = sns.distplot(data['Sales'])
g.set_title("Data Distribution")


Sales follow a roughly log-normal distribution, with a noticeable number of zero-sales records. Let's investigate those.

### Zero Sales

In [None]:
# Set the zero-sales records aside for inspection, then keep only the rows
# with sales (without the 'Open' column) for the rest of the analysis.
is_zero_sale = data['Sales'] == 0
zero_sales = data[is_zero_sale].copy()
data = data[~is_zero_sale].drop(columns='Open')

In [None]:
fig, ax = plt.subplots (1,4, figsize=(20,4))
sns.barplot(['Size'], [len(zero_sales)], ax=ax[0])
sns.countplot('DayOfWeek', data=zero_sales, ax=ax[1])
sns.countplot('Open', data=zero_sales, ax=ax[2])
sns.countplot('Promo', data=zero_sales, ax=ax[3])
plt.tight_layout()

Most zero-sales records fall on Sundays, when the stores were closed, and mainly on non-promo days.

In [None]:
# Records where the store is flagged open but still reports zero sales:
# print how many there are, then preview them.
open_without_sales = sales[(sales['Sales'] == 0) & (sales['Open'])]
print (len(open_without_sales))
open_without_sales.head()

Will be removed

stores_id = np.random.choice(stores['Store'], 200)
data_sub = data[data['Store'].isin(stores_id)].copy()

In [None]:
# Group the feature columns by dtype for the plots below. Store, Customers and
# Sales are left out of the integer group; CompetitionDistance is left out of
# the float group.
int_cols = data.select_dtypes(include='int').columns.drop(['Store', 'Customers', 'Sales'])
categorical_cols = data.select_dtypes(include='object').columns
float_cols = data.select_dtypes(include='float').columns.drop('CompetitionDistance')

In [None]:
fig, ax = plt.subplots(2, len(int_cols)//2, figsize=(25,10))

for i,j in enumerate(ax.flatten()):
    
    if int_cols[i] !='Sales':
#         sns.boxplot(int_cols[i], 'Sales', data=data, ax=j)
        sns.pointplot(int_cols[i], 'Sales', data=data, ax=j, n_boot=50)
    
plt.tight_layout()
plt.show()

Sales are highest on promo days, during school holidays, at the end of the year, and when Promo2 = 0.

In [None]:
fig, ax = plt.subplots(1, len(categorical_cols), figsize=(25,6))

# One panel per categorical feature. Each axis is paired directly with its own
# column, so no index into an unrelated column list is needed;
# np.atleast_1d also covers the single-column case where `ax` is not an array.
for col, axis in zip(categorical_cols, np.atleast_1d(ax).ravel()):
    sns.pointplot(col, 'Sales', data=data, ax=axis, n_boot=100)

plt.tight_layout()
plt.show()

High variation in sales is observed across these features.

In [None]:
fig, ax = plt.subplots(2, len(float_cols)//2, figsize=(25,12))

for i,j in enumerate(ax.flatten()):
    
#     sns.boxplot(float_cols[i], 'Sales', data=data_sub, ax=j)
    sns.pointplot(float_cols[i], 'Sales', data=data, ax=j, n_boot=100)
    
plt.tight_layout()
plt.show()

Useful information could be extracted from these features.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15,6))
cols_to_plot = ['CompetitionDistance', 'Customers']
for i,j in enumerate(ax.flatten()):
    
    sns.regplot(cols_to_plot[i], 'Sales', data=data, ax=j, n_boot=100)
    
plt.tight_layout()
plt.show()

Strong correlation between Sales and Customers, worth creating features about customers.

In [None]:
plt.figure(figsize=(8,8))
# Correlations are only defined for numeric data: select those columns
# explicitly instead of relying on corr() to silently skip dates and strings.
sns.heatmap(data.select_dtypes(include=['number', 'bool']).corr(), cmap='coolwarm')

## Feature engineering

In [None]:
# Missing values are encoded as 0 in BOTH frames, so the same store metadata
# gets the same value in train and test (they are concatenated later).
data.fillna(0, inplace=True)

# NOTE(review): 'Open' is the one test column where a missing value is filled
# with 1 (treated as open) -- confirm this matches the intended handling.
test['Open'] = test['Open'].fillna(1)
test.fillna(0, inplace=True)

In [None]:
# Combining train and test data
# Decomposing date features

# Tag the origin of every row so the combined frame can be split again later.
data['part'] = 'train'
test['part'] = 'test'
# Keep the train columns plus the test 'Id'; columns missing on one side
# become NaN in the rows coming from that side.
all_data = pd.concat([data, test], axis=0)[data.columns.tolist()+['Id']]
all_data['Date'] = pd.to_datetime(all_data['Date'])
# Every calendar feature is derived from all_data's own Date column. Taking
# them from `data` would align on index labels, which train and test share,
# and give the test rows values belonging to unrelated train rows.
all_data['Month'] = all_data['Date'].dt.month
all_data['Year'] = all_data['Date'].dt.year
all_data['Day'] = all_data['Date'].dt.day
all_data['WeekOfYear'] = all_data['Date'].dt.weekofyear
all_data['Quarter'] = all_data['Date'].dt.quarter
all_data.sort_values('Date', inplace=True)

In [None]:
all_data.head()

In [None]:
# Months elapsed between the competitor's opening and the record's date:
# 12 * (year difference) + (month difference).
# NOTE(review): rows whose CompetitionOpenSince* values are fill placeholders
# rather than real dates produce very large durations -- confirm whether those
# should be reset to 0.
all_data['CompetitionOpen'] = 12 * (all_data.Year - all_data.CompetitionOpenSinceYear) + \
        (all_data.Month - all_data.CompetitionOpenSinceMonth)
# Records dated before the competitor opened get 0 instead of a negative duration.
all_data['CompetitionOpen'] = all_data['CompetitionOpen'].apply(lambda x: x if x > 0 else 0)

In [None]:
tmp = all_data[all_data['part']=='train']
sns.regplot('CompetitionOpen', 'Sales', data=tmp, ci=None)

In [None]:
# Approximate number of months since the store joined Promo2:
# 12 * (year difference) + (week difference) / 4.
promo_open = (
    12 * (all_data['Year'] - all_data['Promo2SinceYear'])
    + (all_data['WeekOfYear'] - all_data['Promo2SinceWeek']) / 4.0
)
# Durations that are not strictly positive (including missing ones) become 0.
all_data['PromoOpen'] = promo_open.where(promo_open > 0, 0)

In [None]:
tmp = all_data[all_data['part']=='train']
sns.regplot('PromoOpen', 'Sales', data=tmp, ci=None)

In [None]:
month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
         7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
all_data['month_str'] = all_data.Month.map(month2str)

def check(row):
    """Return 1 if the row's month abbreviation occurs in its PromoInterval string, else 0.

    `row` must provide the keys 'PromoInterval' and 'month_str'. A
    PromoInterval that is not a string always yields 0.
    """
    interval = row['PromoInterval']
    if not isinstance(interval, str):
        return 0
    return int(row['month_str'] in interval)

all_data['IsPromoMonth'] =  all_data.apply(lambda row: check(row),axis=1)    

In [None]:
all_data.head()

In [None]:
tmp = all_data[all_data['part']=='train']
sns.boxplot('IsPromoMonth', 'Sales', data=tmp)

In [None]:
# 1 when the record's year precedes the competitor's opening year, otherwise 0.
all_data['isBeforeCompetition'] = (all_data['Year'] < all_data['CompetitionOpenSinceYear']).astype(int)

In [None]:
tmp = all_data[all_data['part']=='train']
sns.boxplot('isBeforeCompetition', 'Sales', data=tmp)

### Time Analysis

In [None]:
fig, ax = plt.subplots(5, 1, figsize=(15,10))
for p in range (5):
    i = np.random.choice(data['Store'].unique())
    # `data` was shuffled when it was built, so the rows must be ordered by
    # date first; otherwise the line joins points in random order.
    data[data['Store']== i ].sort_values('Date').plot('Date', 'Sales', ax=ax[p])
    ax[p] .set_title("Store %d" %i)
    
plt.tight_layout()
plt.show()

Several stores have missing data

### Stores Performance

In [None]:
fig, ax = plt.subplots (1,5, figsize=(25,4))
sns.boxplot('StoreType',  'Sales','Promo', data=data, ax=ax[0])
sns.boxplot('StoreType', 'Sales', 'SchoolHoliday', data=data, ax=ax[1])
sns.boxplot('StoreType','Sales','Assortment',  data=data, ax=ax[2])
sns.boxplot('StoreType', 'Sales', 'StateHoliday', data=data, ax=ax[3])
sns.boxplot('StoreType', 'Sales', 'Promo2', data=data, ax=ax[4])
plt.tight_layout()

Small differences are observed by promo. 

StoreType b is somehow different / assortment b

In [None]:
grid = sns.FacetGrid(data, col="StoreType", row="Promo", palette="tab10", col_order="abcd")
grid.map(sns.pointplot, "Month", "Sales")
plt.show()

StoreType b has the highest total sales.
Average sales are higher during promotions.

In [None]:
# Computed from all_data's own columns. Dividing the columns of `data` would be
# aligned on index labels, which the train and test rows of all_data share, so
# test rows would receive ratios from unrelated train rows. Computed here, the
# test rows (which have no Sales/Customers) stay NaN.
all_data['SalesPerCustomer'] = all_data['Sales'] / all_data['Customers']

In [None]:
grid = sns.FacetGrid(all_data, col="StoreType", row="Promo", palette="tab10", col_order="abcd")
grid.map(sns.pointplot, "Month", "SalesPerCustomer")
plt.show()

But it has lower sales per customer!

In [None]:
# Encode the letter-coded categories as integers.
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
# Assign the result back to the frame: an in-place replace on a column reached
# through attribute access acts on an intermediate object and is not
# guaranteed to update all_data itself.
for col in ['StoreType', 'Assortment', 'StateHoliday']:
    all_data[col] = all_data[col].replace(mappings)

In [None]:
def _merge_group_means(df, stats_source, targets, extra_keys, cols, tag):
    """Left-join per-group means of each target column onto `df`.

    For every target in `targets` and every column j in `cols`, the mean of the
    target is computed over `stats_source` grouped by ['Store'] + extra_keys + [j]
    and merged onto `df` as a new column named target + tag + j.
    """
    for target in targets:
        for j in cols:
            keys = ['Store'] + extra_keys + [j]
            means = (pd.pivot_table(stats_source, values=target, index=keys, aggfunc='mean')
                     .reset_index()
                     .rename(columns={target: target + tag + j}))
            df = pd.merge(df, means, how='left', on=keys)
    return df


def _add_lag_features(df, shift):
    """Add per-store rolling means and lagged copies of Sales and Customers.

    Lags and windows are counted in ROWS within each store, so the rows of
    `df` are expected to be in chronological order.
    """
    series = [('Sales', 'sales'), ('Customers', 'customer')]
    windows = (7, 30, 360)

    for source, suffix in series:
        for window in windows:
            df['rolling_mean_t%d_%s' % (window, suffix)] = df.groupby(['Store'])[source].transform(
                lambda x, window=window: x.shift(shift).rolling(window).mean())

    for source, suffix in series:
        df['shift_' + suffix] = df.groupby(['Store'])[source].transform(lambda x: x.shift(shift))
        for lag in windows:
            df['shift_t%d_%s' % (lag, suffix)] = df.groupby(['Store'])[source].transform(
                lambda x, lag=lag: x.shift(shift + lag))
    return df


def prepareDf (df, submission=False):
    """Enrich `df` with per-store aggregate and lag features.

    Parameters
    ----------
    df : DataFrame
        Combined train/test frame with a 'part' column ('train' / 'test'), a
        'Date' column, and the feature columns referenced below.
    submission : bool, default False
        If False, the aggregate statistics use only training rows dated before
        2015-06-12. If True, they use every training row.

    Returns
    -------
    DataFrame
        A new frame with the same rows as `df` plus the added feature columns.

    The statistics are computed from the `df` argument itself (not from a
    notebook-level global), so the function only depends on what is passed in.
    """
    tests_date = "2015-06-12"
    shift = 48

    # Rows the aggregate statistics are computed from.
    tmp_data = df[df['part']=='train'].copy()
    if not submission:
        tmp_data = tmp_data[tmp_data['Date']<tests_date]

    # Means per (Store, StoreType, feature).
    cols = ['isBeforeCompetition', 'IsPromoMonth', 'PromoOpen', 'DayOfWeek', 'Promo', 'Promo2', 'Month', 'Year', 'Day', 'StateHoliday', 'Assortment', 'Quarter']
    df = _merge_group_means(df, tmp_data, ['Sales', 'Customers'], ['StoreType'], cols, '_StrType_')

    # Means per (Store, feature).
    cols = ['DayOfWeek', 'Promo', 'Promo2', 'Month', 'Year', 'Day', 'StateHoliday', 'StoreType', 'Assortment', 'PromoInterval', 'Quarter',
           'isBeforeCompetition', 'IsPromoMonth', 'PromoOpen']
    df = _merge_group_means(df, tmp_data, ['Sales', 'Customers'], [], cols, '_')

    # Rolling means and lagged values, all shifted by `shift` rows per store.
    df = _add_lag_features(df, shift)

    # Means per (Store, Year, Quarter).
    df = _merge_group_means(df, tmp_data, ['Sales', 'Customers'], ['Year'], ['Quarter'], '_Year_')

    # Sales-per-customer means per (Store, StoreType, feature).
    cols = ['DayOfWeek', 'Promo', 'Promo2', 'Month', 'Year', 'Day', 'StateHoliday', 'Assortment']
    df = _merge_group_means(df, tmp_data, ['SalesPerCustomer'], ['StoreType'], cols, '_StrType_')

    return df

In [None]:
df = prepareDf(all_data)

In [None]:
df[df['Store']== 696 ].sort_values('Date')

In [None]:
fig, ax = plt.subplots(5, 1, figsize=(15,10))
for p in range (5):
    i = np.random.choice(data['Store'].unique())
    df[df['Store']== i ].plot('Date', 'Sales', ax=ax[p])
    df[df['Store']== i ].plot('Date', 'shift_sales', ax=ax[p])
    ax[p] .set_title("Store %d" %i)
    ax[p].legend(bbox_to_anchor=(1,.5))
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(5, 1, figsize=(15,10))
for p in range (5):
    i = np.random.choice(data['Store'].unique())
    df[df['Store']== i ].plot('Date', 'Sales', ax=ax[p])
    df[df['Store']== i ].plot('Date', 'shift_t7_sales', ax=ax[p])
    ax[p] .set_title("Store %d" %i)
    ax[p].legend(bbox_to_anchor=(1,.5))
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(5, 1, figsize=(15,10))
for p in range (5):
    i = np.random.choice(data['Store'].unique())
    df[df['Store']== i ].plot('Date', 'Sales', ax=ax[p])
    df[df['Store']== i ].plot('Date', 'rolling_mean_t7_sales', ax=ax[p])
    ax[p] .set_title("Store %d" %i)
    ax[p].legend(bbox_to_anchor=(1,.5))
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(5, 1, figsize=(15,10))
for p in range (5):
    i = np.random.choice(data['Store'].unique())
    df[df['Store']== i ].plot('Date', 'Customers', ax=ax[p])
    df[df['Store']== i ].plot('Date', 'shift_t30_customer', ax=ax[p])
    ax[p] .set_title("Store %d" %i)
    ax[p].legend(bbox_to_anchor=(1,.5))
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(5, 1, figsize=(15,10))
for p in range (5):
    i = np.random.choice(data['Store'].unique())
    df[df['Store']== i ].plot('Date', 'Customers', ax=ax[p])
    df[df['Store']== i ].plot('Date', 'rolling_mean_t7_customer', ax=ax[p])
    ax[p] .set_title("Store %d" %i)
    ax[p].legend(bbox_to_anchor=(1,.5))
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(25,12))
# Compute the correlation matrix once, on the numeric columns only, and reuse
# it for both the mask and the heatmap.
corr = df.select_dtypes(include=['number', 'bool']).corr()
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones(corr.shape, dtype=bool))
sns.heatmap(corr, cmap='coolwarm', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5)
plt.show() 

In [None]:
df.head()

# Training models

In [None]:
df_train = df[df['part']== 'train']

In [None]:
cols_train = ['Store', 'DayOfWeek', 'Date', 'Sales', 'Promo','PromoOpen',
#               'shift_sales','shift_t7_sales', 'shift_t30_sales','shift_customer','shift_t7_customer', 'shift_t30_customer',
           'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
           'CompetitionDistance', 'Promo2', 'Month', 'Year', 'Day','IsPromoMonth',
#            'Sales_DayOfWeek', 'Sales_Promo', 'Sales_Promo2', 'Sales_Month', 'Sales_Year', 'Sales_Day', 'Sales_StateHoliday', 'Sales_StoreType',
#            'Sales_Assortment', 'Customers_DayOfWeek', #'isBeforeCompetition', 'Customers_Promo', 'Customers_Promo2', 'Customers_Month',
#            'Customers_Year', 'Customers_Day', 'Customers_StateHoliday','Customers_StoreType', 'Customers_Assortment',
#            'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
           'Sales_StrType_DayOfWeek','SalesPerCustomer_StrType_Assortment', 'Customers_StrType_Promo2',
           'Sales_StrType_Promo', 'Sales_StrType_Promo2', 'Sales_StrType_Month','Customers_StrType_StateHoliday',
           'Sales_StrType_Year', 'Sales_StrType_Day', 'Sales_StrType_StateHoliday','SalesPerCustomer_StrType_Promo2',
           'Sales_StrType_Assortment', 'Customers_StrType_DayOfWeek', 'Customers_StrType_Promo', 
           'Customers_StrType_Month', 'Customers_StrType_Year', 'Customers_StrType_Day', 'SalesPerCustomer_StrType_StateHoliday',
           'Customers_StrType_Assortment', 'SalesPerCustomer_StrType_DayOfWeek', 'SalesPerCustomer_StrType_Promo', 
           'SalesPerCustomer_StrType_Month', 'SalesPerCustomer_StrType_Year', 'SalesPerCustomer_StrType_Day', 
#          'Sales_StrType_Quarter', 'Customers_StrType_Quarter', 'Sales_Year_Quarter','Customers_Year_Quarter',   'Customers_Quarter','Sales_Quarter',
#            'Quarter'
             ]  

In [None]:
params = {"objective": "reg:linear", # for linear regression
          "booster" : "gbtree",   # use tree based models 
          "eta": 0.02,   # learning rate
          "max_depth": 11,    # maximum depth of a tree
          "subsample": 0.9,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10,   # Random number seed
          'tree_method': 'gpu_hist',
          }
num_boost_round = 800


def _rmspe(y, yhat):
    """Root mean squared percentage error of predictions `yhat` against true values `y`.

    Entries whose true value is 0 are skipped, since the relative error is
    undefined there. Returns 0.0 when no entry is left.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    nonzero = y != 0
    if not nonzero.any():
        return 0.0
    return float(np.sqrt(np.mean(((y[nonzero] - yhat[nonzero]) / y[nonzero]) ** 2)))


def rmspe_xg(yhat, y):
    """Custom XGBoost evaluation function: RMSPE on the original sales scale.

    Parameters
    ----------
    yhat : array-like
        Predictions on the log scale.
    y : object exposing get_label()
        Holds the true log-scale targets (an xgboost DMatrix in this notebook).

    Returns
    -------
    tuple
        ("rmspe", value).

    The model in this notebook is trained on np.log(Sales) and its predictions
    are inverted with np.exp, so the same inverse is applied here.
    """
    y = np.exp(y.get_label())
    yhat = np.exp(yhat)
    return "rmspe", _rmspe(y, yhat)

import xgboost as xgb

#### Use only top Stores to train models

In [None]:
# Count the records per store in `data`, sort the stores by that count in
# descending order and keep the first 300.
tmp= pd.pivot_table(data, ['Date'], "Store", aggfunc="count").reset_index().sort_values('Date', ascending=False).head(300)
# Store identifiers of the selected stores, in descending order of record count.
top_stores = tmp["Store"].values

In [None]:
# from sklearn.manifold import TSNE
# from sklearn.preprocessing import StandardScaler


def process(x, cols=None, all_stores=False):
    """Split a feature frame at 2015-06-12 and build model-ready arrays.

    Parameters
    ----------
    x : DataFrame
        Must contain 'Date', 'Sales' and 'Store'. It is not modified.
    cols : list-like, optional
        Columns to keep (must include 'Date', 'Sales' and 'Store'). Defaults to
        every column of `x`.
    all_stores : bool, default False
        If True, 'Store' is kept as a feature; otherwise it is dropped.

    Returns
    -------
    tuple
        (train feature names, train feature array, log-sales target Series,
        hold-out DataFrame restricted to `cols`, hold-out feature array).
        The hold-out array has exactly the train feature columns, in the same
        order.
    """
    # sort_values returns a new frame, so the caller's object is left untouched.
    x = x.sort_values("Date")

    if cols is None:
        cols = x.columns

    # Medians only exist for numeric columns; dates and strings are skipped.
    x = x.fillna(x.median(numeric_only=True))

    x_train = x[x["Date"]<="2015-06-12"][cols].copy()
    x_test  = x[x["Date"]>"2015-06-12"][cols].copy()

    # Train only on stores that also appear in the hold-out period.
    store_test = x_test['Store'].unique().tolist()
    x_train = x_train[(x_train['Store'].isin(store_test))]

    y_train = np.log(x_train['Sales'])

    if all_stores:
        rmv = ['Date', 'Sales']
    else:
        rmv = ['Date', 'Sales', 'Store']

    x_train = pd.get_dummies(x_train.drop(columns=rmv))
    # Encode the hold-out rows, then align them to the training columns: a
    # category missing from one split would otherwise shift or drop a feature.
    x_test_features = pd.get_dummies(x_test.drop(columns=rmv)).reindex(
        columns=x_train.columns, fill_value=0)

    return x_train.columns, x_train.values, y_train, x_test, x_test_features.values

### Xgboost Model

In [None]:
fig, ax = plt.subplots(5, 2, figsize=(25, 15))

# Train on the top stores and predict the hold-out period.
X = df_train[df_train['Store'].isin(top_stores)] .copy()
X_train_col, X_train_arr, Y_train, X_test, X_test_arr = process(X, cols_train, True)

dtrain = xgb.DMatrix(X_train_arr, Y_train)
estimator = xgb.train(params, dtrain, num_boost_round, feval=rmspe_xg,)
Y_pred = estimator.predict(xgb.DMatrix(X_test_arr))
# The model is fitted on log(Sales); exp brings predictions back to sales units.
X_test["Pred"] = np.exp(Y_pred)

# Overall hold-out MSE across all selected stores.
scores = np.round(mean_squared_error(X_test['Sales'], X_test["Pred"]))

for cpt, i in enumerate(top_stores[:5]):

    x_train = df_train[df_train["Store"]==i]
    x_test = X_test[X_test["Store"]==i]
    # Left panel: actual sales with the hold-out predictions on top.
    ax[cpt, 0].plot(x_train["Date"], x_train["Sales"])
    ax[cpt, 0].plot(x_test["Date"], x_test["Pred"])
    ax[cpt, 0].set_title(i)

    # Right panel: hold-out residuals around a zero line.
    ax[cpt, 1].scatter(x_test["Date"].values, x_test['Sales'].values - x_test["Pred"].values)
    ax[cpt, 1].plot(x_test["Date"], [0 for _ in range(len(x_test))])
    # Title with THIS store's MSE (not the global one); skipped when the store
    # has no hold-out rows, where the metric is undefined.
    if len(x_test):
        ax[cpt, 1].set_title(np.round(mean_squared_error(x_test['Sales'], x_test["Pred"])))


plt.tight_layout()
print (scores)

In [None]:
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

fig, ax = plt.subplots(5, 2, figsize=(25, 15))

for cpt, i in enumerate(top_stores[:5]):
    X = data[data['Store']== i].sort_values('Date').copy()
    # Second element of the adfuller result for this store's sales series.
    print(adfuller(X['Sales'])[1])
    # ACF / PACF of the first-differenced series, one row of panels per store.
    differenced = X['Sales'].diff().dropna()
    sm.graphics.tsa.plot_acf(differenced, lags=40, ax=ax[cpt, 0])
    sm.graphics.tsa.plot_pacf(differenced, lags=40, ax=ax[cpt, 1])

plt.tight_layout()

In [None]:
fig, ax = plt.subplots(5, 2, figsize=(25, 15))
for cpt, i in enumerate(top_stores[:5]):
    X = data[data['Store']==i] .sort_values('Date').copy()

    # Fit on log(Sales) before the split date, forecast the remaining rows.
    X_train = X[X["Date"]<"2015-06-12"][['Date', 'Sales']].copy()
    X_train.index = X_train.Date
    X_train= np.log(X_train[[ 'Sales']])
    X_test  = X[X["Date"]>="2015-06-12"].copy()

    reg = sm.tsa.statespace.SARIMAX(X_train,order=(7,1,5),seasonal_order=(2,1,1,7),trend='c',enforce_invertibility=False)
    res = reg.fit()

    Y_pred = res.forecast(len(X_test)).values
    X_test["Pred"] = np.exp(Y_pred)

    score = np.round(mean_squared_error(X_test[ 'Sales'], X_test["Pred"]))
    # Plot the date-sorted frame: `data` itself is shuffled, so plotting it
    # directly would join the points in random order.
    X.plot("Date",  'Sales', ax=ax[cpt, 0])
    ax[cpt, 0].plot(X_test["Date"], X_test["Pred"], label='Predictions')
    ax[cpt, 0].set_title(i)

    # Residuals of the fitted model, titled with the store's hold-out MSE.
    res.resid.plot(ax=ax[cpt, 1] )
    ax[cpt, 1].set_title(score)

plt.tight_layout()
plt.show()

# Training for submissions

In [None]:
num_boost_round = 500

params = {"objective": "reg:linear", # for linear regression
          "booster" : "gbtree",   # use tree based models 
          "eta": 0.03,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 0.9,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10,   # Random number seed
          'tree_method': 'gpu_hist',
          }


# Rebuild the features with statistics taken from the full training period.
tmp = prepareDf(all_data, True)
# Fill gaps with per-column medians. The medians are taken over the whole
# frame: train rows have no 'Id' and test rows have no 'Sales', so dropping
# incomplete rows first would leave nothing to compute a median from.
tmp = tmp.fillna(tmp.median(numeric_only=True))

test = tmp[tmp['part']=='test'].sort_values('Id')
df_train = tmp[tmp['part']=='train']

# Train only on stores that appear in the test set.
store_test = test['Store'].unique().tolist()
X_train = df_train.loc[df_train['Store'].isin(store_test), cols_train]

store_ind = test["Id"]
Y_train = np.log(X_train["Sales"])

X_train = pd.get_dummies(X_train.drop(columns=['Date', 'Sales']))
# Align the test features to the training columns so both matrices have the
# same features in the same order.
X_test = pd.get_dummies(test[cols_train].drop(columns=['Date', 'Sales'])).reindex(
    columns=X_train.columns, fill_value=0)
X_train = X_train.values
X_test = X_test.values

dtrain = xgb.DMatrix(X_train, Y_train)
estimator = xgb.train(params, dtrain, num_boost_round, feval=rmspe_xg,)
# Predictions are on the log scale; exp converts them back to sales.
Y_pred = np.exp(estimator.predict(xgb.DMatrix(X_test)))
# Build the Id -> prediction table directly, without appending to an empty Series.
result = pd.Series(Y_pred, index=store_ind.values)
results = pd.DataFrame({ "Id": result.index, "Sales": result.values})

In [None]:
df_test = pd.read_csv('../input/test.csv')
df_test = pd.merge(df_test, stores, 'left', 'Store')
df_test.head()

In [None]:
# Attach the predictions to the raw test rows by Id.
merged_test = pd.merge(df_test, results, how='left', on=['Id'])
# Rows flagged as closed are set to zero sales.
is_closed = merged_test.Open == 0
merged_test.loc[is_closed, 'Sales'] = 0
sub = merged_test[['Id', 'Sales']].copy()

# Any Id left without a prediction is written as 0.
sub['Sales'] = sub['Sales'].fillna(0)
sub.to_csv('submission.csv', index=False)

sub.info()