In [1]:
#!pip install pmdarima
#!pip install yfinance
import pandas as pd
import numpy as np
import scipy
import yfinance
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
from pmdarima.arima import auto_arima
from arch import arch_model

### Auto ARIMA
- Returns the model with the lowest AIC. The AIC formula is $2k - 2\,LL$ (k = number of estimated parameters, LL = log-likelihood), so, all else equal, a lower criterion value means a higher LL.
#### Advantages:
- Saves time
- removes ambiguity
- reduces risk of human error
#### Disadvantages:
- Blindly putting faith into one criterion
- We never really see how well other models perform 
- Topic expertise is hindered because we don't get to compare models. In particular, if another model would save degrees of freedom at little cost in fit, we never see it, so we cannot make that comparison
- Improper interpretation is another form of human error

In [2]:
#importing the data
# Download daily, auto-adjusted price data for the four indices. With
# group_by='ticker' the ticker symbol is the outer column level, which is
# what the per-ticker lookups below rely on.
raw_data = yfinance.download(tickers = "^GSPC ^FTSE ^N225 ^GDAXI", start = "1994-01-07", end = "2018-01-29",
                             interval = "1d", group_by = 'ticker', auto_adjust = True, threads = True)
df_comp = raw_data.copy()  # work on a copy so the raw download stays untouched

#defining columns
# Keep only the closing price of each index under a short, readable name.
df_comp['spx'] = df_comp['^GSPC']['Close']
df_comp['dax'] = df_comp['^GDAXI']['Close']
df_comp['ftse'] = df_comp['^FTSE']['Close']
df_comp['nikkei'] = df_comp['^N225']['Close']

# Drop the original per-ticker column groups; only the four close columns remain.
del df_comp['^N225'], df_comp['^GSPC'], df_comp['^GDAXI'], df_comp['^FTSE']
# Re-index to business-day frequency, then forward-fill the gaps this creates
# (days on which an index has no quote).
df_comp=df_comp.asfreq('b')
df_comp=df_comp.ffill()

#adding returns columns
# One-period percentage returns; the first row of each is NaN by construction.
df_comp['ret_spx'] = df_comp.spx.pct_change(1)*100
df_comp['ret_dax'] = df_comp.dax.pct_change(1)*100
df_comp['ret_ftse'] = df_comp.ftse.pct_change(1)*100
df_comp['ret_nikkei'] = df_comp.nikkei.pct_change(1)*100

#split data
# Chronological 80/20 split: the first 80% of rows for fitting, the rest for testing.
size = int(len(df_comp)*0.8)
df, df_test = df_comp.iloc[:size], df_comp.iloc[size:]

[*********************100%***********************]  4 of 4 completed


In [3]:
# Baseline: let auto_arima choose the order with every setting left at its default.
# The first observation is skipped because the return series starts with a NaN
# (there is no previous price for pct_change to compare against).
ftse_returns_train = df['ret_ftse'][1:]
model_auto = auto_arima(ftse_returns_train)
model_auto

ARIMA(maxiter=50, method='lbfgs', order=(2, 0, 5), out_of_sample_size=0,
      scoring='mse', scoring_args=None, seasonal_order=(0, 0, 0, 0),
      with_intercept=True)

In [4]:
# Show the selected model's summary: fit statistics (log-likelihood, AIC, BIC, HQIC),
# the coefficient table with p-values, and the residual diagnostics.
model_auto.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,5020.0
Model:,"SARIMAX(2, 0, 5)",Log Likelihood,-7885.69
Date:,"Sun, 12 Apr 2020",AIC,15789.379
Time:,18:18:14,BIC,15848.07
Sample:,0,HQIC,15809.945
,- 5020,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0309,0.024,1.289,0.197,-0.016,0.078
ar.L1,0.1766,0.039,4.544,0.000,0.100,0.253
ar.L2,-0.8128,0.035,-22.984,0.000,-0.882,-0.743
ma.L1,-0.2005,0.038,-5.239,0.000,-0.275,-0.125
ma.L2,0.7654,0.037,20.436,0.000,0.692,0.839
ma.L3,-0.0953,0.012,-8.246,0.000,-0.118,-0.073
ma.L4,0.0112,0.009,1.229,0.219,-0.007,0.029
ma.L5,-0.1113,0.009,-12.960,0.000,-0.128,-0.094
sigma2,1.3550,0.014,94.014,0.000,1.327,1.383

0,1,2,3
Ljung-Box (Q):,69.64,Jarque-Bera (JB):,6575.67
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,2.0,Skew:,-0.18
Prob(H) (two-sided):,0.0,Kurtosis:,8.6


In [None]:
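### TODO: Add an interpretation of the model above. With d = 0, the selected ARIMA(2, 0, 5) is effectively an ARMA(2, 5) - explain why. Note the insignificant coefficients (intercept, ma.L4), which show that the "best" model by AIC is not necessarily one in which every term is significant.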

### Important Arguments
http://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html#pmdarima.arima.auto_arima
- endogenous -> the series we are trying to model
- exogenous -> outside factors (e.g other time series). Make sure the exogenous and endogenous data are in the same forms e.g. returns vs returns. Also make sure they are in the same state (stationary/non-stationary)
- m -> seasonal cycle length. This is the **s argument** in SARIMAX models
- max_order -> maximum amount of variables to be used in the regression (p + q). Total number of non-seasonal AR and MA components the model can have. Can set it to None if you dont have a max.
- max_p -> maximum number of non-seasonal AR components (max_P is the seasonal counterpart)
- max_q -> maximum number of non-seasonal MA components (max_Q is the seasonal counterpart)
- max_d -> maximum order of non-seasonal integration (max_D is the seasonal counterpart)
- maxiter -> maximum iterations we're giving the model to converge the coefficients (becomes harder as the order increases)
- return_valid_fits -> whether to return every valid fitted model as a list (True) instead of only the best one (False, the default)
- alpha -> level of significance, default is 5%, which we should be using most of the time
- n_jobs -> how many models to fit at a time (-1 indicates "as many as possible"). Max may slow up computation
- trend -> deterministic trend set ->"ct" usually. ct = constant and trend, ctt = when there is a quadratic trend relationship
- information_criterion -> 'aic', 'aicc', 'bic', 'hqic', 'oob' (Akaike Information Criterion, Corrected Akaike Information Criterion, Bayesian Information Criterion, Hannan-Quinn Information Criterion, or "out of bag"--for validation scoring--respectively). Default arg is aic
- out_of_sample_size -> number of observations at the end of the series held out to validate the model selection. Pass the entire dataset (no need to split it into train and test yourself) and set, e.g., 20% of it as the hold-out: out_of_sample_size = int(len(df_comp) * 0.2)


### Fitting a Model

In [6]:
#Model with more variables set
model_auto = auto_arima(df_comp.ftse[1:], exogenous= df_comp[['spx', 'dax', 'nikkei']][1:], m=5, 
                       max_order= None, max_p=7, max_q = 7,max_d= 2, max_P = 4, max_Q = 4, max_D = 2,
                       maxiter=50, alpha = 0.05, n_jobs = -1, trend = 'ct', information_criterion = 'oob',
                       out_of_sample_size = int(len(df_comp)*0.2))

KeyboardInterrupt: 

In [None]:
# Show the summary of whatever model `model_auto` currently refers to.
# NOTE(review): in the recorded run the configured search was interrupted before it
# could assign a result, so `model_auto` would still be the earlier default fit -
# re-run the search to completion before reading this summary.
model_auto.summary()

In [None]:
# drift = constant term
#try out different values for the arguments to see what the best model is!