In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the DJIA table with a DatetimeIndex parsed from the 'Date' column.
# sort_index() puts the rows in chronological order, which every rolling
# window, shift/difference and ARIMA fit further down relies on; it is a
# no-op if the file is already stored oldest-first.
djia = pd.read_csv(
    '../input/stocknews/upload_DJIA_table.csv',
    parse_dates=['Date'],
    index_col='Date',
).sort_index()
djia.head()


In [None]:
# Use the 'Open' column as the series analysed in the rest of the notebook.
ts = djia.loc[:, 'Open']
plt.plot(ts)
plt.show()

**It is clearly evident that there is an overall increasing trend in the data along with some seasonal variations. However, it might not always be possible to make such visual inferences (we’ll see such cases later). So, more formally, we can check stationarity using the following:**

# Time Series Stationarity

**What is time series stationarity?**



First of all, we define a time series (TS): a collection of data points recorded at specific time intervals, i.e. time-varying data.
A time series is said to be stationary if its statistical properties, such as mean and variance, remain constant over time.

# Plotting Rolling Statistics

**We can plot the moving average or moving variance and see if it varies with time. By moving average/variance I mean that at any instant ‘t’, we’ll take the average/variance of the last 12 observations (the `window=12` used in the code below; this spans one year only when the data are monthly). But again, this is more of a visual technique.**

# Dickey Fuller Test

**This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the TS is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary.**

**We’ll be using the rolling statistics plots along with the Dickey-Fuller test results a lot, so I have defined a function which takes a TS as input and generates them for us. Please note that I’ve plotted standard deviation instead of variance to keep the unit similar to the mean.**

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, window=12):
    """Plot rolling statistics and print Dickey-Fuller test results.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to examine. It is passed unchanged to ``adfuller``; the
        callers in this notebook drop NaN values before calling.
    window : int, default 12
        Number of consecutive observations in each rolling window used
        for the rolling mean and rolling standard deviation.

    Returns
    -------
    None
        The function only draws a figure and prints the test summary.
    """
    # Rolling statistics over the requested window
    rolling = timeseries.rolling(window=window)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    # Plot the series together with its rolling mean and standard deviation
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test; the lag length is chosen by AIC
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )

    # dftest[4] maps each significance level label to its critical value
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)
    

In [None]:
# Rolling statistics and Dickey-Fuller summary for the untransformed series.
test_stationarity(timeseries=ts)

**Though the variation in standard deviation is small, mean is clearly increasing with time and this is not a stationary series. Also, the test statistic is way more than the critical values.**

# Making the time series stationary

**To make a TS stationary, we must first understand what makes it non-stationary. There are 2 reasons:**

Trend and Seasonality

**Therefore to get a stationary TS we must identify and eliminate Trend and Seasonality**

# Estimating and Eliminating Trend

**Here we use transformation. Using transformation such as log or square root function can penalize higher values more than smaller values**

In [None]:
# Log-transform the series, then plot the transformed values.
ts_log = ts.pipe(np.log)

plt.plot(ts_log)

In [None]:
import matplotlib.gridspec as gridspec

# Compare the log-transformed series (top panel) with the original series
# (bottom panel) in a single figure.
ts_log = np.log(ts)
fig = plt.figure(constrained_layout = True)
gs_1 = gridspec.GridSpec(2, 3, figure = fig)

# Top panel spans the whole first grid row
ax_1 = fig.add_subplot(gs_1[0, :])
ax_1.plot(ts_log)
ax_1.set_xlabel('Year')
ax_1.set_ylabel('Data')
ax_1.set_title('Logarithmic time series')

# Bottom panel spans the whole second grid row; the labels are set on ax_2
# itself so that this panel is labelled as well.
ax_2 = fig.add_subplot(gs_1[1, :])
ax_2.plot(ts)
ax_2.set_xlabel('Year')
ax_2.set_ylabel('Data')
ax_2.set_title('Original time series')
plt.show()

**Moving Average**

In this approach, we take the average of ‘k’ consecutive values depending on the frequency of the time series. Here we take the average over the last 12 values (which corresponds to one year only if the data are monthly). Pandas has specific functions defined for determining rolling statistics.

In [None]:
# Rolling mean of the log series over 12 observations, drawn in red on top
# of the log series itself.
mov_avg = ts_log.rolling(12).mean()
plt.plot(ts_log)
plt.plot(mov_avg, color='red')

**Linear Regression**

In [None]:
from sklearn import datasets, linear_model

# Fit a straight line to the log series, using the observation number
# (0, 1, 2, ...) as the single regressor.
ts_wi = ts_log.reset_index()
df_values = ts_wi.values
# Column 1 holds the log values; cast to float because df_values is an
# object array (it also contains the dates), then make it a column vector.
train_y = df_values[:, 1].astype(float)[:, np.newaxis]
# Build the regressor with NumPy: indexing a pandas Index with
# [:, np.newaxis] is not supported by recent pandas versions.
train_x = np.arange(len(ts_wi))[:, np.newaxis]
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)
pred = regr.predict(train_x)
plt.plot(ts_wi.Date, pred)
plt.plot(ts_log)

**Eliminating Trend**

**Eliminating trends is absolutely necessary, as TS are time dependent and developing a regression model requires stationarity.**

There are 3 ways to eliminate trends
1. Aggregation - taking average of a time period
2. Smoothing - taking rolling averages
3. Polynomial fitting - taking a regression model

Since we have already done moving averages, let us use it (smoothing) to eliminate the trends

In [None]:
# Subtract the rolling mean from the log series. The first windows have no
# rolling mean yet, so the resulting NaNs are dropped before testing.
ts_log_moving_avg_diff = (ts_log - mov_avg).dropna()
test_stationarity(ts_log_moving_avg_diff)


**Decomposing**

**Seasonal decomposition is the fastest way to remove the trend and seasonality components from a time series so that it becomes stationary.**

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Additive decomposition of the log series with freq=4.
decomposition = seasonal_decompose(ts_log, model='additive', freq=4)

trend, seasonal, residual = (
    decomposition.trend,
    decomposition.seasonal,
    decomposition.resid,
)

# One stacked panel per component, from top to bottom.
panels = [
    (ts_log, 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residual, 'Residuals'),
]
for row, (series, label) in enumerate(panels, start=1):
    plt.subplot(4, 1, row)
    plt.plot(series, label=label)
    plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# First-order difference of the log series (the decomposition residual is
# not used here). shift(1) leaves a NaN in the first position; it is
# dropped so that both names below refer to the same NaN-free series.
ts_log_diff = (ts_log - ts_log.shift(1)).dropna()
ts_decompose = ts_log_diff
test_stationarity(ts_decompose)

# Forecasting the time series

Since the series is now stationary, we can perform forecasting on the data. After removing trend and seasonality, there are two possible situations:

1. A strictly stationary series with no dependence among the values. This is the easy case wherein we can model the residuals as white noise. But this is very rare.
2. A series with significant dependence among values. In this case we need to use some statistical models like ARIMA to forecast the data.

# Using ARIMA to forecast the data

In [None]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf

lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')

# Height of the dashed confidence lines drawn on either side of zero
conf_bound = 1.96/np.sqrt(len(ts_log_diff))

# Left panel: ACF, right panel: PACF; both get the same three reference lines
for position, values, heading in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.subplot(position)
    plt.plot(values)
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(heading)
plt.tight_layout()

In this plot, the two dotted lines on either side of 0 are the confidence intervals. These can be used to determine the ‘p’ and ‘q’ values as:

1. p – The lag value where the PACF chart crosses the upper confidence interval for the first time. If you notice closely, in this case p=2.
2. q – The lag value where the ACF chart crosses the upper confidence interval for the first time. If you notice closely, in this case q=2.

Now, let’s build 2 different ARIMA models: one with only the AR terms and one with only the MA terms. I will also print the RSS for each. Please note that here the RSS is for the values of the residuals and not the actual series.

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# AR Model

In [None]:
# ARIMA with order (2, 1, 0): AR terms only, fitted on the log series.
model = ARIMA(ts_log, order=(2, 1, 0))
results_AR = model.fit(disp=-1)

# Compare the fitted values with the differenced log series
ar_residuals = results_AR.fittedvalues - ts_log_diff
plt.plot(ts_log_diff)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum(ar_residuals**2))

# MA Model

In [None]:
# ARIMA with order (0, 1, 2): MA terms only, fitted on the log series.
model = ARIMA(ts_log, order=(0, 1, 2))
results_MA = model.fit(disp=-1)

# Compare the fitted values with the differenced log series
ma_residuals = results_MA.fittedvalues - ts_log_diff
plt.plot(ts_log_diff)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum(ma_residuals**2))

We can see that RSS for AR model is slightly better

# Taking it back to original scale

In [None]:
# Independent copy of the AR model's fitted values.
predictions_ARIMA_diff = pd.Series(results_AR.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head(5))

In [None]:
# Running total of the fitted differences.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print(predictions_ARIMA_diff_cumsum.head(5))

In [None]:
# Start from a constant series holding the first log value, then add the
# cumulative fitted differences; dates missing from the cumulative sum
# contribute 0 through fill_value.
base_level = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = base_level.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA_log.head()

In [None]:
# Undo the log transform and compare with the original series.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
squared_errors = (predictions_ARIMA - ts)**2
plt.plot(ts)
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f' % np.sqrt(sum(squared_errors)/len(ts)))