# ARIMA 2-min return

In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the 2-min return data and drop the timestamp column; the remaining
# columns are iterated as tickers further down.
ret_df = pd.read_csv('returns.csv')
ret_df = ret_df.drop(columns=['Datetime'])
# Sanity check. A bare expression is only displayed when it is the last
# statement of a notebook cell, so it lives here rather than mid-cell.
ret_df.head()

In [None]:
# 80/20 train/test split of the return data
# NOTE(review): assumes pmdarima's splitter keeps row order (no shuffling) - confirm
train, test = train_test_split(ret_df, train_size=0.8)

In [None]:
def fit_auto_arima(train, ticker, m=None):
    """
    Run pmdarima's automatic ARIMA search on one column of the training data.

    train  : table indexable by ticker; ``train[ticker]`` is the series to fit
    ticker : name of the column to model
    m      : optional seasonal period. When given, the search is run with
             ``seasonal = True`` and this ``m``; when omitted, ``pm.auto_arima``
             is called with its defaults.

    Returns whatever ``pm.auto_arima`` returns (the selected, fitted model).
    """
    series = train[ticker]
    if m is not None:
        return pm.auto_arima(series, seasonal=True, m=m)
    return pm.auto_arima(series)

def predict_auto_arima(model, test, ticker):
    """
    Forecast over the test horizon and score the forecast against the test data.

    model  : fitted model exposing ``predict(n_periods)``
    test   : held-out table; its row count sets the forecast horizon and
             ``test[ticker]`` is the series the forecast is compared to
    ticker : name of the column to score against

    Returns ``(forecasts, rmse)`` where ``rmse`` is the root mean squared error
    between ``test[ticker]`` and the forecasts.
    """
    forecasts = model.predict(test.shape[0])
    target = test[ticker]
    # Take the square root explicitly instead of passing ``squared = False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in later
    # releases, while sqrt(MSE) gives the same value on every version.
    rmse = np.sqrt(mean_squared_error(target, forecasts))
    return forecasts, rmse

In [None]:
# Seasonal auto-ARIMA fit (m = 20) and test-horizon forecast for a single company
ticker = 'MMM'
model = fit_auto_arima(train, ticker, m=20)
forecasts, rmse = predict_auto_arima(model, test, ticker)

In [None]:
# Training series (blue) followed by the forecast (green) on a shared integer time axis
plt.plot(range(len(train)), train[ticker], color='blue')
plt.plot(range(len(train), len(ret_df)), forecasts, color='green')
#plt.plot(range(train.shape[0], ret_df.shape[0]), test['A'], c = 'yellow')
plt.show()

In [None]:
# Fit a non-seasonal auto-ARIMA model for every company and collect the test
# RMSEs. Models whose selected order is not (0, 0, 0) are kept, together with
# their tickers, in model_list / ticker_list.
rmse_list = []
model_list = []
ticker_list = []
start = time.time()
for position, ticker in enumerate(ret_df.columns, start=1):
    if ticker == 'COP':
        # The original branch bumped the counter and then hit `pass`, which
        # skips nothing: COP was still fitted and the counter was incremented
        # twice. `continue` implements the skip the branch was written for.
        continue
    print(position)  # progress indicator: 1-based column position
    model = fit_auto_arima(train, ticker)
    forecasts, rmse = predict_auto_arima(model, test, ticker)
    rmse_list.append(rmse)
    if model.order != (0, 0, 0):
        model_list.append(model)
        ticker_list.append(ticker)
print('Elapsed ' + str(time.time() - start))

In [None]:
# Mean test RMSE over all fitted tickers. `rmse` alone holds only the value
# from the most recent predict_auto_arima call.
np.mean(rmse_list)

# ARIMA 10-min volatility

In [None]:
# Load the 10-min volatility targets and reshape them from long format to one
# column per ticker, indexed by Datetime. Cells missing after the pivot are
# filled with 0.

#vol_df = pd.read_csv('target.csv')
vol_df = (
    pd.read_json('data.json')[['Datetime', 'ticker', 'target']]
    .pivot(index='Datetime', columns='ticker', values='target')
    .fillna(0)
)

In [None]:
# 80/20 train/test split of the volatility data
# NOTE(review): assumes pmdarima's splitter keeps row order (no shuffling) - confirm
train, test = train_test_split(vol_df, train_size=0.8)

In [None]:
# Fit a non-seasonal auto-ARIMA model for every company's volatility series
# and collect the test RMSEs.
rmse_list = []
start = time.time()
# enumerate replaces the hand-maintained counter, which was named `iter` and
# shadowed the builtin of the same name.
for position, ticker in enumerate(vol_df.columns, start=1):
    print(position)  # progress indicator: 1-based column position
    model = fit_auto_arima(train, ticker)
    forecasts, rmse = predict_auto_arima(model, test, ticker)
    rmse_list.append(rmse)
print('Elapsed ' + str(time.time() - start))

In [None]:
# Mean test RMSE over all fitted tickers. `rmse` alone holds only the value
# from the most recent predict_auto_arima call.
np.mean(rmse_list)

In [None]:
# Seasonal auto-ARIMA fit (m = 200) and test-horizon forecast for a single company
ticker = 'MMM'
model = fit_auto_arima(train, ticker, m=200)
forecasts, rmse = predict_auto_arima(model, test, ticker)

In [None]:
# Plot training data, forecast and held-out test data for the selected company
plt.plot(range(train.shape[0]), train[ticker], c = 'blue', label = 'Training data')
plt.plot(range(train.shape[0], vol_df.shape[0]), forecasts, c = 'green', label = 'Predicted data')
# Use `ticker` rather than a hard-coded 'MMM' so all three series always
# belong to the same company.
plt.plot(range(train.shape[0], vol_df.shape[0]), test[ticker], c = 'orange', label = 'Testing data')
plt.legend()
plt.xlabel('Time index')
plt.ylabel('Volatility')
# NOTE(review): if the savefig below is re-enabled, it likely needs to run
# before plt.show(); saving after show() can produce an empty image - confirm.
#plt.savefig('6785 plots/3M_actual_vs_predicted', dpi = 500)
plt.show()

In [None]:
# Histogram of the first differences of the selected company's volatility
# diff() leaves a NaN in the first position (no previous value to subtract);
# drop it so only real differences are binned.
first_diff = vol_df[ticker].diff().dropna()
plt.hist(first_diff)
plt.xlabel("First Difference")
plt.ylabel("Number of Observations")
plt.savefig('6785 plots/3M_first_diff_hist.png', dpi = 500)

In [None]:
# Autocorrelation plot of the first-differenced volatility of the selected company
from statsmodels.graphics.tsaplots import plot_acf
# diff() leaves a NaN in the first position; plot_acf does no missing-value
# handling by default, so that single NaN would make the autocorrelations come
# out NaN. Drop it before plotting.
plot_acf(vol_df[ticker].diff().dropna())