In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Use the fully-qualified option names. pandas resolves option names by pattern
# matching, and the bare 'max_columns' / 'max_rows' patterns also match the
# 'styler.render.*' options in newer pandas releases, which raises OptionError.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import pyplot

import plotly.graph_objs as go
import plotly as py
from plotly.offline import init_notebook_mode
# Initialise plotly's offline notebook mode before any plotly figures are drawn.
# NOTE(review): connected=True presumably makes plotly load plotly.js from the CDN
# instead of embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [None]:
# Load the competition tables; all of them live under the same input folder.
path = '../input/walmart-recruiting-store-sales-forecasting/'

# The files are read in the same order as the names on the left-hand side.
features, train, test, stores, sample_submission = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'features.csv.zip',
        'train.csv.zip',
        'test.csv.zip',
        'stores.csv',
        'sampleSubmission.csv.zip',
    )
)

In [None]:
# Left-join the store table, then the features table, onto the training rows.
# No join keys are passed, so each merge joins on the columns both frames share.
train_with_stores = pd.merge(train, stores, how='left')
dataset = pd.merge(train_with_stores, features, how='left')
dataset.shape

In [None]:
# Same two left joins as for the training data, applied to the test rows.
# No join keys are passed, so each merge joins on the columns both frames share.
test_with_stores = pd.merge(test, stores, how='left')
dataset_test = pd.merge(test_with_stores, features, how='left')
dataset_test.shape

In [None]:
# Parse the date column once and derive the calendar features from the parsed values
# (the original re-ran pd.to_datetime on the already-parsed column for every feature).
dataset['Date'] = pd.to_datetime(dataset['Date'])
train_dates = dataset['Date'].dt
dataset['Year'] = train_dates.year
dataset['Month'] = train_dates.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week
# is the same ISO week number. It comes back as UInt32, so cast to a plain int64.
# NOTE(review): the cast assumes 'Date' contains no NaT — confirm.
dataset['Week'] = train_dates.isocalendar().week.astype('int64')
dataset['DayOfTheWeek'] = train_dates.dayofweek
dataset['Day'] = train_dates.day

# Map the letter codes to integers wherever they occur in the frame.
# NOTE(review): presumably this targets the store-type column — confirm.
# infer_objects() makes the resulting integer dtype explicit instead of relying on
# replace()'s implicit downcasting, which newer pandas versions deprecate.
dataset = dataset.replace({'A': 1, 'B': 2, 'C': 3}).infer_objects()

# Parse the date column once and derive the calendar features from the parsed values
# (the original re-ran pd.to_datetime on the already-parsed column for every feature).
dataset_test['Date'] = pd.to_datetime(dataset_test['Date'])
test_dates = dataset_test['Date'].dt
dataset_test['Year'] = test_dates.year
dataset_test['Month'] = test_dates.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week
# is the same ISO week number. It comes back as UInt32, so cast to a plain int64.
# NOTE(review): the cast assumes 'Date' contains no NaT — confirm.
dataset_test['Week'] = test_dates.isocalendar().week.astype('int64')
dataset_test['DayOfTheWeek'] = test_dates.dayofweek
dataset_test['Day'] = test_dates.day

# Map the letter codes to integers wherever they occur in the frame.
# NOTE(review): presumably this targets the store-type column — confirm.
# infer_objects() makes the resulting integer dtype explicit instead of relying on
# replace()'s implicit downcasting, which newer pandas versions deprecate.
dataset_test = dataset_test.replace({'A': 1, 'B': 2, 'C': 3}).infer_objects()

In [None]:
# Threshold for removing correlated variables
threshold = 0.92

# Absolute value correlation matrix, computed over numeric and boolean columns only.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns on its own
# (numeric_only defaults to False), and `dataset` holds a datetime 'Date' column, so
# the column selection is made explicit to keep the result independent of the
# installed pandas version.
numeric_view = dataset.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_view.corr().abs()
corr_matrix.head()

In [None]:
# Fill missing values in float columns with the column mean.
# Series.fillna returns a new Series, so the result must be assigned back; the
# original discarded it, which left every gap to be filled with -999 below.
for c in dataset.columns:
    if pd.api.types.is_float_dtype(dataset[c]):
        dataset[c] = dataset[c].fillna(dataset[c].mean())

# Anything still missing (non-float columns, or float columns that are entirely NaN)
# gets the -999 sentinel.
dataset = dataset.fillna(-999)

# Label encoding: replace each object column with integer codes 0..n-1 assigned in
# sorted order of the distinct values. pd.factorize(sort=True) is used because
# LabelEncoder is never imported in this notebook, so the original raised NameError
# as soon as an object column was encountered.
for f in dataset.columns:
    if dataset[f].dtype == 'object':
        codes, _ = pd.factorize(dataset[f], sort=True)
        dataset[f] = codes

print('Labelling done.')

In [None]:
# Fill missing values in float columns with the column mean.
# Series.fillna returns a new Series, so the result must be assigned back; the
# original discarded it, which left every gap to be filled with -999 below.
# NOTE(review): the means are taken from the test frame itself, not from the
# training data — confirm that is intended.
for c in dataset_test.columns:
    if pd.api.types.is_float_dtype(dataset_test[c]):
        dataset_test[c] = dataset_test[c].fillna(dataset_test[c].mean())

# Anything still missing (non-float columns, or float columns that are entirely NaN)
# gets the -999 sentinel.
dataset_test = dataset_test.fillna(-999)

# Label encoding: replace each object column with integer codes 0..n-1 assigned in
# sorted order of the distinct values. pd.factorize(sort=True) is used because
# LabelEncoder is never imported in this notebook, so the original raised NameError
# as soon as an object column was encountered.
for f in dataset_test.columns:
    if dataset_test[f].dtype == 'object':
        codes, _ = pd.factorize(dataset_test[f], sort=True)
        dataset_test[f] = codes

print('Labelling done.')

In [None]:
# Time-based split: rows dated on or before the split date are used for fitting,
# rows dated after it are held out.
split_date = '2012-06-01'
on_or_before_split = dataset['Date'] <= split_date
after_split = dataset['Date'] > split_date
data_train = dataset[on_or_before_split].copy()
data_test = dataset[after_split].copy()

In [None]:
# Model inputs and target, shared by the training and hold-out splits.
feature_cols = [
    'Store', 'Size', 'Dept', 'Week', 'Year', 'IsHoliday', 'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
]
target_col = 'Weekly_Sales'

X_train = data_train[feature_cols]
y_train = data_train[target_col]

X_test = data_test[feature_cols]
y_test = data_test[target_col]

# Shapes of the four pieces, in the order train X, train y, test X, test y.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

In [None]:
# Sum weekly sales for every (Date, Store, Dept, CPI) combination and turn the
# group keys back into ordinary columns.
sales_by_key = dataset.groupby(['Date', 'Store', 'Dept', 'CPI'])['Weekly_Sales']
dataset_store = sales_by_key.sum().reset_index()

# Keep only the rows whose Store value is 1.
in_store_1 = dataset_store['Store'].isin([1])
Store1 = dataset_store[in_store_1]

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, dept, window=52):
    """Plot rolling statistics of a series and print its Dickey-Fuller test results.

    Parameters
    ----------
    timeseries : pandas.Series
        Observations to analyse, in chronological order.
    dept : object
        Label interpolated into the plot title and the printed header.
    window : int, default 52
        Number of observations in the rolling window (the default covers 52 weeks).

    Returns
    -------
    None
        The figure is shown and the test summary is printed.
    """
    # Rolling statistics over `window` observations.
    rolling = timeseries.rolling(window)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    # Plot the raw series together with its rolling mean and standard deviation.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation for dept (%s)' % dept)
    plt.show(block=False)

    # Perform the Dickey-Fuller test; the lag order is selected by AIC.
    print('Results of Dickey-Fuller Test for dept :', dept)
    dftest = adfuller(timeseries, autolag='AIC')
    # The first four entries of the result are labelled here; entry 4 is a mapping
    # of critical values that is appended to the summary one key at a time.
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['critical value (%s) ' % key] = value
    print(dfoutput)


In [None]:
# Distinct department identifiers present in the store-1 subset.
dpts = pd.unique(Store1["Dept"])

In [None]:
import sys
# Run the stationarity check for every department of store 1.
for e in dpts:
    ts = Store1[Store1["Dept"] == e]['Weekly_Sales']
    try:
        test_stationarity(ts, e)
    except Exception as exc:
        # Best effort: a failure for one department must not stop the loop.
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt still stops the cell, and report the department and
        # the error message, not just the exception class.
        print("Unexpected error:", type(exc), "for dept", e, "-", exc)
    