In [None]:
# This is a summary of the articles at the links below:

#Link1: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
#Link2: https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/

import numpy as np 
import pandas as pd 
from datetime import datetime
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA

from pylab import rcParams

# Default figure size (width, height) applied to every plt.plot figure
rcParams['figure.figsize'] = (15, 5)

# Load the dataset; the 'Month' column becomes the index and is parsed as dates
df = pd.read_csv(
    '../input/airpassengers/AirPassengers.csv',
    index_col='Month',
    parse_dates=True,
)

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Inspect the index built from the 'Month' column
df.index

In [None]:
# Work with the passenger counts as a single Series
ts = df.loc[:, '#Passengers']
ts

In [None]:
# Look up a single observation: first by date string, then by datetime object
# (only the last expression of the cell is displayed)
ts.loc['1949-01-01']
ts.loc[datetime(1949, 1, 1)]

In [None]:
# Label-based slice: everything up to and including 1949-05-01
ts.loc[:'1949-05-01']

In [None]:
# Partial-string selection: all observations from the year 1949
ts.loc['1949']

**********************
# **How to Check Stationarity of a Time Series?**

#### **Stationarity is defined using a very strict criterion. However, for practical purposes we can assume the series to be stationary if it has constant statistical properties over time, i.e. the following:**

* Constant mean
* Constant variance
* An autocovariance that does not depend on time.
********************

In [None]:
# Plot the raw passenger series for a first visual check of stationarity
plt.plot(ts)

# **More formally, we can check stationarity using the following:**
*****
* **Plotting Rolling Statistics:** We can plot the moving average or moving variance and see if it varies with time. By moving average/variance I mean that at any instant ‘t’, we’ll take the average/variance of the last year, i.e. last 12 months. But again this is more of a visual technique.
* **Dickey-Fuller Test:** This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the TS is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary.
*****

In [None]:
def test_stationarity(timeseries, window=12):
    """Plot rolling statistics and print the Dickey-Fuller test for a series.

    Draws the series together with its rolling mean and rolling standard
    deviation, then runs ``adfuller`` (lag selection by AIC) and prints the
    test statistic, p-value, lags used, number of observations and the
    critical values.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to check. It is passed to ``adfuller`` unchanged, so callers
        are expected to drop missing values beforehand.
    window : int, default 12
        Number of observations in the rolling window used for the mean and
        the standard deviation. The default keeps the original 12-sample
        window.

    Returns
    -------
    None
        The function only plots and prints.
    """
    # Determine rolling statistics
    rolmean = timeseries.rolling(window=window).mean()
    rolstd = timeseries.rolling(window=window).std()

    # Plot rolling statistics (return values of plt.plot are not needed)
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    # The first four entries of the result are labelled below; entry 4 is the
    # mapping of critical values, appended one row per level
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Rolling statistics and Dickey-Fuller test on the raw series
test_stationarity(ts)

# **How to make a Time Series Stationary?**
***
**Let's understand what makes a TS non-stationary. There are 2 major reasons behind the non-stationarity of a TS:**
* **Trend –** varying mean over time. For example, in this case we saw that on average, the number of passengers was growing over time.
* **Seasonality –** variations at specific time-frames. E.g., people might have a tendency to buy cars in a particular month because of pay increments or festivals.
***

# **Estimating & Eliminating Trend**
***
One of the first tricks to reduce trend can be transformation. For example, in this case we can clearly see that there is a significant positive trend. So we can apply a transformation which penalizes higher values more than smaller values. This can be taking a **log, square root, cube root, etc**. Let's take a log transform here for simplicity:
***

In [None]:
# Natural-log transform of the series, then plot the transformed values
ts_log = np.log(ts)
plt.plot(ts_log)

In this simpler case, it is easy to see a forward trend in the data. But it's not very intuitive in the presence of noise. So we can use some techniques to estimate or model this trend and then remove it from the series. There can be many ways of doing it and some of the most commonly used are:

* **Aggregation –** taking average for a time period like monthly/weekly averages
* **Smoothing –** taking rolling averages
* **Polynomial Fitting –** fit a regression model

In [None]:
# 12-observation rolling mean of the log series
moving_avg = ts_log.rolling(window=12).mean()

# Log series with its rolling mean overlaid in red
plt.plot(ts_log)
plt.plot(moving_avg, color='red')

The red line shows the rolling mean. Let's subtract this from the original series. Note that since we are taking the average of the last 12 values, the rolling mean is not defined for the first 11 values. This can be observed as:

In [None]:
# Remove the rolling mean from the log series; show the first 12 rows of the result
ts_moving_avg_diff = ts_log - moving_avg
ts_moving_avg_diff.head(12)

In [None]:
# Discard the rows with missing values before running the stationarity check
ts_moving_avg_diff = ts_moving_avg_diff.dropna()
test_stationarity(ts_moving_avg_diff)

### **Exponentially weighted moving average**

Note that here the parameter ‘halflife’ is used to define the amount of exponential decay. This is just an assumption here and would depend largely on the business domain. Other parameters like span and center of mass can also be used to define decay which are discussed in the link shared above. Now, let’s remove this from series and check stationarity:

In [None]:
# Exponentially weighted mean of the log series, decay set through halflife=12
expweighted_mavg = ts_log.ewm(halflife=12).mean()

# Log series with the exponentially weighted mean overlaid in red
plt.plot(ts_log)
plt.plot(expweighted_mavg, color='red')

In [None]:
# Remove the exponentially weighted mean from the log series and check stationarity
ts_log_ewma_diff = ts_log - expweighted_mavg
test_stationarity(ts_log_ewma_diff)

# **Eliminating Trend and Seasonality**
****

The simple trend reduction techniques discussed before don’t work in all cases, particularly the ones with high seasonality. Let’s discuss two ways of removing trend and seasonality:

* **Differencing** – taking the difference with a particular time lag
* **Decomposition** – modeling both trend and seasonality and removing them from the model.
****

#### **Differencing**

In [None]:
# First-order difference of the log series (each value minus the previous one)
ts_log_diff = ts_log.diff()
plt.plot(ts_log_diff)

In [None]:
# Discard the rows with missing values before running the stationarity check
ts_log_diff = ts_log_diff.dropna()
test_stationarity(ts_log_diff)

#### **Decomposing**

In [None]:
# Decompose the log series and keep each component under its own name
decomposing = seasonal_decompose(ts_log)

trend = decomposing.trend
seasonal = decomposing.seasonal
residual = decomposing.resid

# One stacked panel per series: (subplot position, data, legend label)
panels = [
    (411, ts_log, 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonality'),
    (414, residual, 'Residuals'),
]
for position, series, label in panels:
    plt.subplot(position)
    plt.plot(series, label=label)
    plt.legend(loc='upper left')
plt.tight_layout()

In [None]:
# Check stationarity of the decomposition residuals.
# dropna() returns a new Series, so `residual` itself is left untouched
# (an in-place drop on an alias would silently modify `residual` as well).
ts_log_decompose = residual.dropna()
test_stationarity(ts_log_decompose)

***
# **Forecasting**


Let me give you a brief introduction to ARIMA. I won’t go into the technical details but you should understand these concepts in detail if you wish to apply them more effectively. ARIMA stands for **Auto-Regressive Integrated Moving Averages**. The ARIMA forecasting for a stationary time series is nothing but a linear (like a linear regression) equation. The predictors depend on the parameters (p,d,q) of the ARIMA model:

* **Number of AR (Auto-Regressive) terms (p):** AR terms are just lags of dependent variable. For instance if p is 5, the predictors for x(t) will be x(t-1)….x(t-5).

* **Number of MA (Moving Average) terms (q):** MA terms are lagged forecast errors in prediction equation. For instance if q is 5, the predictors for x(t) will be e(t-1)….e(t-5) where e(i) is the difference between the moving average at ith instant and actual value.

* **Number of Differences (d):** These are the number of nonseasonal differences, i.e. in this case we took the first order difference. So either we can pass that variable and put d=0 or pass the original variable and put d=1. Both will generate same results.

An important concern here is how to determine the value of ‘p’ and ‘q’. We use two plots to determine these numbers. Let’s discuss them first.

**Autocorrelation Function (ACF):** It is a measure of the correlation between the TS and a lagged version of itself. For instance at lag 5, ACF would compare series at time instant ‘t1’…’t2’ with series at instant ‘t1-5’…’t2-5’ (t1-5 and t2 being end points).

**Partial Autocorrelation Function (PACF):** This measures the correlation between the TS with a lagged version of itself but after eliminating the variations already explained by the intervening comparisons. Eg at lag 5, it will check the correlation but remove the effects already explained by lags 1 to 4.
***

In [None]:
# Autocorrelation and partial autocorrelation of the differenced log series, up to lag 20
lag_acf = acf(ts_log_diff, nlags = 20)
# PACF computed with the 'ols' method
lag_pacf = pacf(ts_log_diff, nlags=20, method = 'ols')

In [None]:
# ACF and PACF drawn on the same axes
plt.plot(lag_acf, label='acf')
plt.plot(lag_pacf, label='pacf')
plt.legend(loc='upper right')

# Dashed confidence limits at +/- 1.96 / sqrt(number of observations)
conf_bound = 1.96 / np.sqrt(len(ts_log_diff))
for level in (-conf_bound, conf_bound):
    plt.axhline(y=level, linestyle='--', color='gray')

# The x-range is restricted to 0..5 after analysing the full chart, to read off
# the exact lag at which acf and pacf cross the confidence limits
plt.xlim([0, 5])

The p,d,q values can be specified using the order argument of ARIMA, which takes a tuple (p,d,q).

* **p –** The lag value where the **PACF** chart crosses the upper confidence interval for the first time. If you notice closely, in this case p=2.

* **q –** The lag value where the **ACF** chart crosses the upper confidence interval for the first time. If you notice closely, in this case q=3.

***
# **Modelling**

In [None]:
# Fit an ARIMA model on the log series; `order` is the (p, d, q) tuple.
# NOTE(review): the text preceding this section reports q=3, while the order
# used here has q=2 — confirm which value is intended.
model = ARIMA(ts_log, order = (2,1,2))
results_ARIMA = model.fit(disp=-1)
# Overlay the fitted values (red) on the differenced log series
plt.plot(ts_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
# Title reports the sum of squared differences between fitted values and the differenced series
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log_diff)**2))

***
# **Re-Scaling**

In [None]:
# Copy the fitted values into a standalone Series and preview it
predicted_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy = True)
predicted_ARIMA_diff.head()

In [None]:
# Cumulative sum of the fitted differences (running total of the predicted changes)
predicted_ARIMA_diff_cumsum = predicted_ARIMA_diff.cumsum()
predicted_ARIMA_diff_cumsum.head()

In [None]:
# Start from a Series holding the first log value at every date of ts_log ...
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
# ... then add the cumulative differences; dates missing from the cumsum count as 0
predictions_ARIMA_log = predictions_ARIMA_log.add(predicted_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()

In [None]:
# Undo the log transform to get predictions on the original scale
predictions_ARIMA = np.exp(predictions_ARIMA_log)

# Original series with the back-transformed predictions overlaid
plt.plot(ts)
plt.plot(predictions_ARIMA)

# Root-mean-square error between predictions and the original series
squared_error = (predictions_ARIMA - ts) ** 2
rmse = np.sqrt(sum(squared_error) / len(ts))
plt.title('RMSE: %.4f' % rmse)

In [None]:
# Now let's forecast the next year
# Plot the model's predictions from position 1 through position 156
# NOTE(review): 156 presumably equals the number of observations plus 12 forecast months — confirm against len(ts)
results_ARIMA.plot_predict(1,156)
plt.legend(loc='upper left')

In [None]:
# Some extra info just in case

#x=results_ARIMA.forecast(steps=12)
#print(x[1])
#print(len(x[1]))
#print(np.exp(x[1]))