In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found beneath the Kaggle input directory,
# then print the paths one per line in walk order.
input_file_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/statsmodels deprecation and chained-assignment warnings.
warnings.simplefilter(action='ignore')
# Add the parent directory to the module search path.
# NOTE(review): no project-local import is visible in this notebook — confirm this is still needed.
sys.path.append("../")

In [None]:
# Read the training CSV from the Kaggle input directory and display the resulting frame.
data = pd.read_csv('/kaggle/input/sales-forecasting/train.csv')
data

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Keep only the two columns used by the time-series analysis.
# .copy() makes data_ts an independent frame, so later assignments to its
# columns are not writes into a selection taken from `data`.
data_ts = data[['Order Date', 'Sales']].copy()
data_ts

In [None]:
# Summary statistics of the numeric columns of data_ts.
data_ts.describe()

In [None]:
# Check the dtypes of data_ts before the date column is converted.
data_ts.info()

In [None]:
# Convert the 'Order Date' column to datetime values. assign() returns a new frame
# instead of writing into data_ts, which was created as a column selection of `data`.
# NOTE(review): no explicit format/dayfirst is passed, so pandas guesses the day/month
# order — confirm against the raw date strings in train.csv.
data_ts = data_ts.assign(**{'Order Date': pd.to_datetime(data_ts['Order Date'])})
data_ts

In [None]:
# Re-check the dtypes of data_ts after the date conversion.
data_ts.info()

In [None]:
# Scatter plot of each row's Sales value against its Order Date.
plt.scatter(data_ts['Order Date'],data_ts['Sales'])

In [None]:
# Order the rows chronologically; the original row labels are kept as the index.
data_ts = data_ts.sort_values(by="Order Date")

In [None]:
# Display the date-sorted frame.
data_ts

In [None]:
# Replace the index with a fresh 0..n-1 range; the previous row labels are moved
# into a new 'index' column.
data_ts = data_ts.reset_index()

In [None]:
# Discard the 'index' column left behind by reset_index().
# errors='ignore' lets this cell be re-run after the column is already gone
# instead of raising KeyError.
data_ts = data_ts.drop(columns=['index'], errors='ignore')

In [None]:
# Display the frame after the index has been rebuilt.
data_ts

In [None]:
# Line plot of every row's Sales value in date order on a 10x5 figure.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(data_ts['Order Date'], data_ts['Sales'])
plt.show()

In [None]:
# Box plot of the Sales distribution. The series is passed explicitly as `x`
# so it is not interpreted through the positional argument, whose meaning
# differs between seaborn releases.
sns.boxplot(x=data_ts['Sales'])

In [None]:
# Scatter plot of Sales against Order Date, drawn again after sorting and re-indexing.
plt.scatter(data_ts['Order Date'],data_ts['Sales'])

In [None]:
# Collapse all rows sharing an Order Date into one row holding their summed values;
# 'Order Date' becomes the index of the result.
data_ts = data_ts.groupby(['Order Date']).sum()

In [None]:
# Display the per-date totals.
data_ts

In [None]:
# Series of summed sales per order date (data_ts is indexed by 'Order Date' at this point).
y = data_ts['Sales']

fig, ax = plt.subplots(figsize=(20, 6))
# One point per order date, so the series is labelled as daily rather than monthly.
ax.plot(y, marker='.', linestyle='-', linewidth=0.5, label='Daily')
# Mean of the per-date totals within each year-sized resampling bin.
ax.plot(y.resample('y').mean(), marker='o', markersize=8, linestyle='-', label='Yearly Mean Resample')
# The plotted quantity is the Sales column, not a count of orders.
ax.set_ylabel('Sales')
ax.legend()

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
def testing(timeseries, window=12):
    """Plot rolling statistics and print an Augmented Dickey-Fuller report.

    Parameters
    ----------
    timeseries : pandas object supporting ``.rolling``
        The series to inspect. Callers in this notebook pass single-column
        DataFrames indexed by date.
    window : int, default 12
        Number of observations in the rolling window used for the rolling
        mean and rolling standard deviation.

    Returns
    -------
    None
        A figure is drawn and the test summary is printed.
    """
    # Rolling statistics share one rolling object so the window is set in one place.
    rolling = timeseries.rolling(window=window)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    # Plot the original series together with its rolling mean and std.
    plt.figure(figsize=(20, 6))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test: the first four entries of the result are labelled below,
    # and the fifth is a mapping of critical values.
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput.round(2))

In [None]:
# Rolling statistics and Dickey-Fuller report for the untransformed per-date totals.
testing(data_ts)

In [None]:
# Element-wise natural logarithm of the per-date totals.
# NOTE(review): a zero or negative total would become -inf/NaN here — confirm Sales is strictly positive.
ts_log = np.log(data_ts)

In [None]:
# Plot the log-transformed series.
plt.plot(ts_log)

In [None]:
# Rolling statistics and Dickey-Fuller report for the log-transformed series.
testing(ts_log)

In [None]:
# 12-observation rolling mean of the log series, drawn in red beneath the log series itself.
mov_avg = ts_log.rolling(12).mean()
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(mov_avg, c='r')
ax.plot(ts_log)
plt.show()

In [None]:
# Deviation of the log series from its rolling mean; the first rows are NaN
# because the rolling window is not yet full there.
ts_log_moving_avg_diff = ts_log.sub(mov_avg)
ts_log_moving_avg_diff.head(20)

In [None]:
# Remove the rows that contain missing values.
ts_log_moving_avg_diff = ts_log_moving_avg_diff.dropna()

In [None]:
# Display the cleaned deviation-from-rolling-mean frame.
ts_log_moving_avg_diff

In [None]:
# Rolling statistics and Dickey-Fuller report for the deviation from the rolling mean.
testing(ts_log_moving_avg_diff)

In [None]:
# Exponentially weighted mean of the log series (halflife of 12 observations),
# drawn in red over the log series.
expweighted_avg = ts_log.ewm(halflife=12).mean()
fig, ax = plt.subplots()
ax.plot(ts_log)
ax.plot(expweighted_avg, color='red')

In [None]:
# Deviation of the log series from its exponentially weighted mean, then the
# rolling-statistics / Dickey-Fuller report for that deviation.
ts_log_ewma_diff = ts_log.sub(expweighted_avg)
testing(ts_log_ewma_diff)

In [None]:
# First difference of the log series: each row minus the row before it
# (the first row has no predecessor and is NaN).
ts_log_diff = ts_log.diff()
plt.plot(ts_log_diff)

In [None]:
# Drop the rows with missing values, then run the rolling-statistics /
# Dickey-Fuller report on the differenced series.
ts_log_diff = ts_log_diff.dropna()
testing(ts_log_diff)

In [None]:
from statsmodels.tsa.stattools import acf, pacf

In [None]:
# Autocorrelation and partial autocorrelation of the differenced log series,
# both requested with nlags=20; the PACF is estimated with the 'ols' method.
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')

In [None]:
# Side-by-side ACF and PACF panels, each with dashed reference lines at
# zero and at +/- 1.96 / sqrt(number of observations).
conf_bound = 1.96 / np.sqrt(len(ts_log_diff))
panels = (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
)

plt.figure(figsize=(20, 6))
for position, lag_values, panel_title in panels:
    plt.subplot(position)
    plt.plot(lag_values)
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.grid()
    plt.title(panel_title)
plt.tight_layout()

In [None]:
from statsmodels.tsa.arima_model import ARIMA

In [None]:
# Fit an ARIMA model of order (1, 1, 1) to the log series and overlay its fitted
# values (red) on the first-differenced log series.
# NOTE(review): the statsmodels.tsa.arima_model import and the disp= argument look like the
# legacy statsmodels ARIMA API — confirm the installed statsmodels version still provides it.
model = ARIMA(ts_log, order=(1,1,1))  
results_AR = model.fit(disp=-1)  
plt.plot(ts_log_diff)
plt.plot(results_AR.fittedvalues, color='red')

In [None]:
# Copy the model's fitted values into a separate Series and preview the first entries.
# NOTE(review): the later cumsum treats these as per-step differences of the log series — confirm.
predictions_ARIMA_diff = pd.Series(results_AR.fittedvalues, copy=True)
predictions_ARIMA_diff.head()

In [None]:
# Running total of the fitted values.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.head()

In [None]:
# Rebuild log-scale predictions from the cumulative fitted differences.
# The base must be the FIRST log value repeated over the whole index; using the
# full observed series as the base would add the cumulative differences on top of
# the actual values instead of reconstructing the model's own path.
base_log_value = ts_log['Sales'].iloc[0]
predictions_ARIMA_log = pd.Series(base_log_value, index=ts_log.index)
# fill_value=0 keeps the base value for dates that have no cumulative difference.
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA_log.head()

In [None]:
# Exponentiate to undo the np.log transform, then show the values rounded to 2 decimals.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
predictions_ARIMA.round(2)

In [None]:
# Wrap the prediction Series in a DataFrame.
my_pred = pd.DataFrame(predictions_ARIMA)

In [None]:
# Display the prediction frame.
my_pred

In [None]:
# Name the frame's single column 'Value' (the assignment requires exactly one column).
my_pred.columns = ['Value']

In [None]:
# Display the prediction frame with its renamed column.
my_pred

In [None]:
# Rows whose predicted value exceeds 40000, shown rounded to 3 decimals.
exceeds_threshold = my_pred['Value'] > 40000
predictions_ARIMA_show = my_pred[exceeds_threshold]
predictions_ARIMA_show.round(3)

In [None]:
# Keep every row except those whose predicted value exceeds 40000
# (rows with a missing value are retained, since they do not exceed it).
my_pred = my_pred[~(my_pred['Value'] > 40000)]

In [None]:
# Plot the remaining predictions in red.
plt.plot(my_pred,c = 'r')

In [None]:
# Overlay the remaining predictions (red) and the per-date totals in data_ts (yellow).
plt.plot(my_pred,c = 'r')
plt.plot(data_ts, c='y')