## Importing Library and data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the passenger CSV; the 'Month' column is parsed as dates and used as the index.
data = pd.read_csv(
    '../input/AirPassengers.csv',
    index_col='Month',
    parse_dates=['Month'],
)

In [None]:
# Preview the first rows of the loaded frame to confirm the CSV parsed as expected.
data.head()

In [None]:
# Show the dtype of each column.
data.dtypes

In [None]:
# Inspect the index built from the 'Month' column (parsed as dates in the load cell).
data.index

In [None]:
# Quick plot of the frame's columns against its date index.
data.plot()

### Time series analysis

In [None]:
# Pull the passenger-count column out as a Series; `ts` is the series analysed below.
ts = data["#Passengers"] 
ts.head(10)

In [None]:
# Look up a single observation by its timestamp label (1 August 1949).
ts[datetime(1949,8,1)]

In [None]:
# Partial-string indexing on the date index: selects the observations that fall in 1949.
ts['1949']

## Stationarity Check

What is a stationary process?

- Mean = constant over all intervals.
- Variance = constant over all intervals.

## Dickey Fuller Test

Here the null hypothesis is that the TS is non-stationary. The test results consist of a Test Statistic and some Critical Values for different confidence levels. If the Test Statistic is less than the Critical Value, we can reject the null hypothesis and say that the series is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, window=12):
    """Plot rolling statistics and print an Augmented Dickey-Fuller test report.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to check. It is handed to ``adfuller`` unchanged; the callers in
        this notebook drop missing values before calling.
    window : int, default 12
        Number of observations in the rolling mean / rolling standard deviation
        window. The default keeps the previously hard-coded value.

    Returns
    -------
    None
        The figure is shown and the test summary is printed.
    """
    # Determine rolling statistics
    rolmean = timeseries.rolling(window=window).mean()
    rolstd = timeseries.rolling(window=window).std()

    # Plot rolling statistics (return values of plt.plot are not needed)
    plt.figure(figsize=(8, 5))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    # The first four entries are labelled below; dftest[4] maps each
    # significance level to its critical value.
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Baseline check on the untransformed passenger series.
test_stationarity(ts)

Therefore, the time series is not stationary.

There are 2 major reasons behind the non-stationarity of a TS:

1. **Trend** – varying mean over time. For example, in this case we saw that, on average, the number of passengers was growing over time.

2. **Seasonality** – variations at specific time-frames. For example, people might have a tendency to buy cars in a particular month because of pay increments or festivals.

### Using a Log Transform to Reduce the Trend

In [None]:
# Natural log of the passenger series; `ts_log` is the input to the later transformation steps.
ts_log = np.log(ts)
plt.plot(ts_log)

We will deal with the noise by taking a rolling mean, i.e. smoothing the series.

In [None]:
# 12-observation rolling mean of the log series, drawn in red over the log series itself.
ts_smooth = ts_log.rolling(window = 12).mean()
plt.plot(ts_smooth, color = 'red')
plt.plot(ts_log)
plt.show()

#### Case 1 : No subtraction

In [None]:
# Test the smoothed series on its own. dropna() returns a new Series, so
# `ts_smooth` is left untouched; the previous alias plus in-place dropna
# silently removed rows from `ts_smooth` as well.
no_sub_ts = ts_smooth.dropna()
test_stationarity(no_sub_ts)

#### Case 2: Subtraction

In [None]:
# Log series minus its rolling mean; rows without a rolling-mean value become NaN and are dropped.
sub_ts = (ts_log - ts_smooth).dropna()
test_stationarity(sub_ts)

### Exponential weighted average 

In [None]:
# Exponentially weighted mean of the log series (halflife=12), drawn in red over the log series.
expwighted_avg = ts_log.ewm(halflife=12).mean()
plt.plot(expwighted_avg, color='red')
plt.plot(ts_log)

#### Case 1: No Subtraction

In [None]:
# Test the exponentially weighted mean on its own. dropna() returns a new
# Series, so `expwighted_avg` is not modified through the `no_sub_ts` alias.
no_sub_ts = expwighted_avg.dropna()
test_stationarity(no_sub_ts)

#### Case 2: Subtraction

In [None]:
# Log series minus its exponentially weighted mean, with any NaN rows removed.
exp_ts_diff = (ts_log - expwighted_avg).dropna()
test_stationarity(exp_ts_diff)


Since the test statistic is below the 1% critical value, we can be 99% confident that this series is stationary.

## Differencing

In [None]:
# First-order difference of the log series; the first row has no predecessor, so it is NaN and dropped.
ts_diff = ts_log.diff().dropna()
test_stationarity(ts_diff)

#### Differencing + Exponential weighting

In [None]:
# Differenced series minus its own exponentially weighted mean (halflife=12).
ts_diff_ewm = ts_diff.ewm(halflife=12).mean()
ts_diff_exp = (ts_diff - ts_diff_ewm).dropna()
test_stationarity(ts_diff_exp)

## Decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the log series into trend, seasonal and residual components.
decomposition = seasonal_decompose(ts_log)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per series, top to bottom, each with its own legend.
panels = [
    (ts_log, 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residual, 'Residuals'),
]
for position, (series, label) in enumerate(panels, start=1):
    plt.subplot(4, 1, position)
    plt.plot(series, label=label)
    plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Test the residual component. dropna() returns a new Series, so `residual`
# is not modified through the `ts_log_decompose` alias.
ts_log_decompose = residual.dropna()
test_stationarity(ts_log_decompose)

### Forecasting

Let's build a model on the TS after differencing, as it is a very popular technique. Also, it's relatively easier to add noise and seasonality back into the predicted residuals in this case.

### ARIMA model

In [None]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# NOTE(review): alpha=1 looks intended to suppress plot_acf's own confidence band
# so that the manual +/-1.96/sqrt(N) reference lines can be drawn instead - confirm.
plot_acf(ts_diff, lags=20, alpha=1)
conf_bound = 1.96 / np.sqrt(len(ts_diff))
for level in (0, -conf_bound, conf_bound):
    plt.axhline(y=level, linestyle='--', color='gray')
plt.title('Autocorrelation Function')


In [None]:
# NOTE(review): alpha=1 looks intended to suppress plot_pacf's own confidence band
# so that the manual +/-1.96/sqrt(N) reference lines can be drawn instead - confirm.
plot_pacf(ts_diff, lags=20, alpha=1)
conf_bound = 1.96 / np.sqrt(len(ts_diff))
for level in (0, -conf_bound, conf_bound):
    plt.axhline(y=level, linestyle='--', color='gray')
plt.title('Partial Autocorrelation Function')

Based on the ACF and PACF plots above, we choose p = 2 and q = 2.

Now let's fit AR, MA and ARIMA models to the data.

In [None]:
# NOTE(review): statsmodels.tsa.arima_model was deprecated in statsmodels 0.12 and is
# no longer usable from 0.13 onward, so this cell needs an older statsmodels - confirm
# the pinned version.
from statsmodels.tsa.arima_model import ARMA

# (label, (p, q)) pairs, fitted in this order on the differenced log series.
candidate_orders = [
    ("AR(1)", (1, 0)),
    ("AR(2)", (2, 0)),
    ("AR(3)", (3, 0)),
    ("MA(1)", (0, 1)),
    ("MA(2)", (0, 2)),
    ("MA(3)", (0, 3)),
    ("ARMA(1,1)", (1, 1)),
    ("ARMA(2,2)", (2, 2)),
    ("ARMA(3,3)", (3, 3)),
]

# Fit each candidate and print its AIC; `mod` and `res` end up holding the last fit.
for label, order in candidate_orders:
    mod = ARMA(ts_diff, order=order)
    res = mod.fit()
    print("The AIC for an %s is: " % label, res.aic)

In [None]:
# Summary table of AIC per candidate model, indexed by model name.
# NOTE(review): the AIC values are hard-coded, presumably transcribed from the
# printout of the fitting cell - re-check them after any re-run.
names = ['AR(1)', 'AR(2)', 'AR(3)', 'MA(1)', 'MA(2)', 'MA(3)', 'ARMA(1,1)', 'ARMA(2,2)', 'ARMA(3,3)']
aic = [
    -235.38589888263135,
    -237.6046356975284,
    -236.95178478978522,
    -237.5073149855421,
    -240.3789540202477,
    -257.8902625951644,
    -241.60771402612232,
    -287.2808079212986,
    -289.4747225981437,
]
model = pd.DataFrame({'AIC': aic}, index=pd.Index(names, name='Model Name'))
model

#### AR(2) Model

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# ARIMA(2, 1, 0) on the log series, i.e. an AR(2) on its first difference.
model = ARIMA(ts_log, order=(2, 1, 0))
results_AR = model.fit(disp=-1)

# Overlay the fitted values (red) on the differenced series; the residual sum of squares goes in the title.
plt.plot(ts_diff)
plt.plot(results_AR.fittedvalues, color='red')
rss = sum((results_AR.fittedvalues - ts_diff) ** 2)
plt.title('RSS: %.4f' % rss)

#### MA(2) Model

In [None]:
# ARIMA(0, 1, 2) on the log series, i.e. an MA(2) on its first difference.
# The result name `results_AR` is kept unchanged even though this is an MA fit.
model = ARIMA(ts_log, order=(0, 1, 2))
results_AR = model.fit(disp=-1)

# Overlay the fitted values (red) on the differenced series; the residual sum of squares goes in the title.
plt.plot(ts_diff)
plt.plot(results_AR.fittedvalues, color='red')
rss = sum((results_AR.fittedvalues - ts_diff) ** 2)
plt.title('RSS: %.4f' % rss)

#### ARIMA(2,1,2)

In [None]:
# Combined ARIMA(2, 1, 2) on the log series; `results_AR` from this cell is what the
# back-transformation cells below read.
model = ARIMA(ts_log, order=(2, 1, 2))
results_AR = model.fit(disp=-1)

# Overlay the fitted values (red) on the differenced series; the residual sum of squares goes in the title.
plt.plot(ts_diff)
plt.plot(results_AR.fittedvalues, color='red')
rss = sum((results_AR.fittedvalues - ts_diff) ** 2)
plt.title('RSS: %.4f' % rss)

In [None]:
# Copy the fitted values, then accumulate them into a running total.
preds = pd.Series(results_AR.fittedvalues, copy=True)
preds_cumsum = preds.cumsum()
print(preds_cumsum.head())

In [None]:
# Start every date at the first log value, then add the cumulative fitted values;
# dates missing from preds_cumsum are treated as 0 and keep the base level.
preds_log = pd.Series(ts_log.iloc[0], index=ts_log.index).add(preds_cumsum, fill_value=0)
preds_log.head()

In [None]:
# Undo the log transform and compare against the original series.
preds_ARIMA = np.exp(preds_log)

plt.plot(ts)
plt.plot(preds_ARIMA)
plt.xlabel('Years')
plt.ylabel("Number of Passengers")
# Root-mean-square error between back-transformed predictions and the original series.
squared_errors = (preds_ARIMA - ts) ** 2
plt.title('RMSE: %.4f' % np.sqrt(sum(squared_errors) / len(ts)))

In [None]:
# Plot the fitted model's predictions for the date range July 1953 to December 1962.
results_AR.plot_predict(start='1953-07-01', end='1962-12-01')
plt.show()

### ARIMA(1,1,1)

In [None]:
# ARIMA(1, 1, 1) on the log series; kept in its own result name, `results_AR_3`.
model = ARIMA(ts_log, order=(1, 1, 1))
results_AR_3 = model.fit(disp=-1)

# Overlay the fitted values (red) on the differenced series; the residual sum of squares goes in the title.
plt.plot(ts_diff)
plt.plot(results_AR_3.fittedvalues, color='red')
rss = sum((results_AR_3.fittedvalues - ts_diff) ** 2)
plt.title('RSS: %.4f' % rss)

In [None]:
# Copy the ARIMA(1, 1, 1) fitted values, then accumulate them into a running total.
preds = pd.Series(results_AR_3.fittedvalues, copy=True)
preds_cumsum = preds.cumsum()
print(preds_cumsum.head())

In [None]:
# Start every date at the first log value, then add the cumulative fitted values;
# dates missing from preds_cumsum are treated as 0 and keep the base level.
preds_log = pd.Series(ts_log.iloc[0], index=ts_log.index).add(preds_cumsum, fill_value=0)
preds_log.head()

In [None]:
# Undo the log transform and compare against the original series.
preds_ARIMA = np.exp(preds_log)

plt.plot(ts)
plt.plot(preds_ARIMA)
plt.xlabel('Years')
plt.ylabel("Number of Passengers")
# Root-mean-square error between back-transformed predictions and the original series.
squared_errors = (preds_ARIMA - ts) ** 2
plt.title('RMSE: %.4f' % np.sqrt(sum(squared_errors) / len(ts)))

In [None]:
# Plot the ARIMA(1, 1, 1) model's predictions for the date range July 1953 to December 1962.
results_AR_3.plot_predict(start='1953-07-01', end='1962-12-01')
plt.show()

This is overfitting the data.