In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## EDA of Air Passenger with Timeseries Analysis


Objective: Build a model to forecast the demand (passenger traffic) for air travel. The data is indexed by date and records the number of passengers travelling per month.




In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline 
from matplotlib.pylab import rcParams
# Default size for every figure drawn in this notebook: 15 wide by 6 high.
rcParams['figure.figsize'] = 15, 6

from datetime import datetime as dt
from statsmodels.tsa.stattools import adfuller,acf,pacf
from statsmodels.tsa.arima_model import ARIMA
import math

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; consider narrowing the filter so real problems stay visible.
warnings.filterwarnings('ignore')

In [None]:
# Load the AirPassengers CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/air-passengers/AirPassengers.csv')

In [None]:
# Peek at the raw 'Month' values before they are parsed into dates.
data['Month'].head()

In [None]:
# Turn the 'Month' strings into timestamps and use them as the index.
# Characters [0:4] are read as the year and everything from position 5 on as
# the month; each observation is pinned to the 15th of its month.
# The guard keeps the cell re-runnable: after the first run 'Month' is the
# index rather than a column, so parsing it again would raise a KeyError.
if 'Month' in data.columns:
    month_text = data['Month']
    date_parts = pd.DataFrame({
        'year': month_text.str[:4].astype(int),
        'month': month_text.str[5:].astype(int),
        'day': 15,
    })
    data['Month'] = pd.to_datetime(date_parts)
    data = data.set_index('Month')
data.head()

In [None]:
# Passenger counts as a Series; this is the series analysed in the rest of the notebook.
ts = data['#Passengers']

In [None]:
# Plot the raw passenger series against its index.
plt.plot(ts)

In [None]:
# Natural log of the series; the detrending and ARIMA cells below work on this scale.
ts_log = np.log(ts)

In [None]:
def test_stationarity(timeseries, window=52):
    """Plot rolling statistics and print augmented Dickey-Fuller test results.

    Draws the series together with its rolling mean and rolling standard
    deviation, then runs ``adfuller`` (lag length chosen by AIC) and prints
    the test statistic, p-value, lags used, number of observations and the
    critical values.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to examine.
    window : int, default 52
        Number of observations in the rolling window. The default keeps the
        value this function has always used.
        NOTE(review): the data in this notebook is monthly and other cells
        use a 12-observation window; ``window=12`` is probably the better
        choice here - confirm and pass it explicitly.

    Returns
    -------
    None
        Results are shown as a plot and printed text only.
    """
    # One rolling object feeds both statistics so they share the same window.
    rolling = timeseries.rolling(window=window, center=False)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    # The first four entries of the result are scalars; the fifth is a dict
    # of critical values keyed by significance level.
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used',
                                             'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Check the raw (untransformed) passenger series for stationarity.
test_stationarity(data['#Passengers'])

From the plot and the results of the Dickey-Fuller test, we see that the test statistic is greater than the critical values and that the moving average is not constant over time, so the null hypothesis of the Dickey-Fuller test cannot be rejected.
* This shows that the time series is not stationary.

In [None]:
# Plot the log-transformed series (the moving average is overlaid in the next cell).

plt.plot(ts_log)

In [None]:
# Rolling mean and standard deviation of the log series over 12 observations,
# with the rolling mean drawn on top of the series.
log_rolling = ts_log.rolling(window=12)
movingAverage = log_rolling.mean()
movingSTD = log_rolling.std()
plt.plot(ts_log)
plt.plot(movingAverage, color='green')

In [None]:
# Remove the trend by subtracting the rolling mean from the log series.
# The rolling mean is undefined until its window is full, so the leading
# NaN rows are dropped before the result is used.
ts_log_mv_diff = (ts_log - movingAverage).dropna()
ts_log_mv_diff.head(10)

In [None]:
# Check the detrended series (log minus moving average) for stationarity.
test_stationarity(ts_log_mv_diff)

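The test statistic is below the 1% critical value, which tells us that we can reject the null hypothesis with 99% confidence: the series is stationary. (Check this against the critical values printed above; the null hypothesis is rejected only at levels whose critical value is greater than the test statistic.)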

New Problem Statement:
* Forecasting Time Series

In [None]:
# Autocorrelation of the detrended series for lags 0..10, with the
# approximate 95% confidence band for white noise, +/- 1.96 / sqrt(n).
# (The band previously used 7.96, which made it about four times too wide.)
acf_band = 1.96 / np.sqrt(len(ts_log_mv_diff))
plt.plot(np.arange(0, 11), acf(ts_log_mv_diff, nlags=10))
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-acf_band, linestyle='--', color='gray')
plt.axhline(y=acf_band, linestyle='--', color='gray')
plt.title('Autocorrelation Function')
plt.show()

The ACF curve crosses the upper confidence bound when the lag is between 0 and 1. Thus the optimal value of q (the MA order) for ARIMA can be 0 or 1.

In [None]:
# Partial autocorrelation of the detrended series for lags 0..10, with the
# approximate 95% confidence band for white noise, +/- 1.96 / sqrt(n).
# (The band previously used 7.96, which made it about four times too wide.)
pacf_band = 1.96 / np.sqrt(len(ts_log_mv_diff))
plt.plot(np.arange(0, 11), pacf(ts_log_mv_diff, nlags=10))
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-pacf_band, linestyle='--', color='gray')
plt.axhline(y=pacf_band, linestyle='--', color='gray')
plt.title('Partial Autocorrelation Function')
plt.show()

In [None]:
# Fit an ARIMA(p=1, d=1, q=0) model on the log series.
# NOTE(review): ARIMA is imported from the legacy statsmodels.tsa.arima_model
# module; the code below relies on its fitted values being on the
# first-differenced scale - confirm against the installed statsmodels version.
model = ARIMA(ts_log, order=(1, 1, 0))
results_ARIMA = model.fit(disp=-1)

# Because d=1, the fitted values are differences of the log series, so they
# are compared with the first difference of ts_log. Comparing them with
# ts_log_mv_diff (log minus moving average) mixed two different quantities,
# and the partly overlapping indexes made the RSS evaluate to NaN.
ts_log_diff = ts_log.diff().dropna()
plt.plot(ts_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
residual_sq = (results_ARIMA.fittedvalues - ts_log_diff) ** 2
plt.title('RSS: %.4f' % residual_sq.sum())

In [None]:
#Model Predictions
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff.head()

As we can see, the series starts at 1949-02-15 rather than at the first month: we took a lag of one, and the first element has nothing before it to subtract from. To convert the differences back to the log scale, add them up consecutively (a cumulative sum) and then add the result to the base number.

In [None]:
# Running total of the fitted values; the base level is added in the next cell.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.head()

In [None]:
# Start from a constant series holding the first log value on the full index,
# then add the cumulative fitted values. Dates missing from the cumulative
# series count as 0, so they keep the base level.
base_level = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = base_level.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA_log.head()

In [None]:
# Undo the log transform and compare the predictions with the original series;
# the plot title reports the root-mean-square error over the whole series.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ts)
plt.plot(predictions_ARIMA)
squared_error = (predictions_ARIMA - ts) ** 2
rmse = np.sqrt(sum(squared_error) / len(ts))
plt.title('RMSE: %.4f' % rmse)

Our model is good at predicting the future but fails to capture the seasonality.

# Thank You For Checking Out

* Do help me to improve this.