In [None]:
import classifiers.labeller as labeller
import classifiers.arima as arima
import nn_train_driver
import plotting.plot_shortcuts as ps
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf

import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics import classification_report

import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Column names shared by the rest of the notebook: the signal being modelled
# and the name given to its forecast.
sig_col = 'Close'
forecast_sig_col = f'Forecast {sig_col}'

# Load the input data from CSV.
data_df = pd.read_csv('./data/qqq_2022.csv')

# Produce the truth labels for the signal column.
# NOTE(review): the meaning of the positional argument 21 is defined by
# labeller.driver — confirm before changing it.
print('Computing Label Space...')
label_df = labeller.driver(data_df, 21, sig_col)
print(label_df.head())

# Plot the labels over the signal, restricted to the last 390 rows of each frame.
# NOTE(review): 390 is presumably one trading day of 1-minute bars — confirm.
data_tail = data_df.iloc[-390:]
label_tail = label_df.iloc[-390:]
ps.plot_label_over_signal(data_tail, label_tail, sig_col)

#### Labelling Explanation

Stock market data is very noisy and is often likened to a "random walk". Because of this, raw price differentials ($price_t - price_{t-1}$) do not produce a coherent, noise-free signal, so some form of "truth" data must be computed instead.

In order to best classify the direction of the market's movement, the following process was conducted to compute "truth" data:

1. To account for growth, the coefficients of the following exponential function were optimized to the data:
$\newline$
$y = Ax^B + C$
$\newline$

2. To account for fluctuations around the fitted growth function, an FFT was computed on the detrended prices (the raw data minus the growth model):
$\newline$
$clean\_signal = FFT(raw\_data - growth\_model)$
$\newline$

3. Produce a clean version of the market data:
$\newline$
$clean\_market\_signal = growth\_model + clean\_signal$
$\newline$

4. Compute the velocity of the clean market data:
$\newline$
$velocity = \frac{\Delta clean\_market\_signal}{\Delta t}$
$\newline$

5. Compute labels from the velocity:
$\newline$
$ y = 0;\space velocity < 0 \newline$
$ y = 1;\space velocity > 0$

# ARIMA Assessment

## Choosing ARIMA Order

In [None]:
# Partial autocorrelation of the first-differenced signal.
# The trailing semicolon keeps Jupyter from rendering the figure twice, which
# otherwise happens because of an incompatibility between statsmodels and
# notebooks: https://github.com/statsmodels/statsmodels/issues/4155#issuecomment-445913264
differenced_signal = data_df[[sig_col]].diff().dropna()
plot_pacf(differenced_signal);

# The plot spikes at lag 1, then drops off sharply and stays around 0, so 1 is
# used as the initial autoregression param.
p = 1

In [None]:
# Compare the raw series with its first and second differences to choose d.
signal_frame = data_df[[sig_col]]
first_difference = signal_frame.diff()
second_difference = first_difference.diff()

fig, (ax1, ax2, ax3) = plt.subplots(3)

# Top panel: the untouched series (x axis hidden; only the bottom panel shows it).
ax1.plot(signal_frame)
ax1.set_title('Original Series')
ax1.axes.xaxis.set_visible(False)

# Middle panel: one round of differencing.
ax2.plot(first_difference)
ax2.set_title('1st Order Differencing')
ax2.axes.xaxis.set_visible(False)

# Bottom panel: two rounds of differencing.
ax3.plot(second_difference)
ax3.set_title('2nd Order Differencing')
plt.show()

# The series appears relatively stabilized after a first order difference,
# so that is used as the d param.
d = 1

In [None]:
# Autocorrelation of the first-differenced signal (trailing semicolon
# suppresses the duplicate figure in Jupyter).
differenced_signal = data_df[[sig_col]].diff().dropna()
plot_acf(differenced_signal);

# The plot spikes at lag 1, then drops off sharply and stays around 0, so 1 is
# used as the initial moving average param.
q = 1

## Evaluating ARIMA

In [None]:
# Build a datetime-indexed copy of the input for ARIMA: EpochTime is parsed as
# seconds since the epoch into a new DTS column, which then becomes the index.
# data_df itself is left untouched.
date_indexed_df = (
    data_df
    .copy(deep=True)
    .assign(DTS=pd.to_datetime(data_df['EpochTime'], unit='s'))
    .set_index('DTS')
)

In [None]:
# set up a day-wise dataframe to perform evaluation on:
#   1. rank the signal values within each calendar day (daily Grouper on the DTS index)
#   2. keep the rows whose rank equals 1, then drop the first of those rows ([1:])
#   3. keep only the signal column, as a one-column DataFrame
# NOTE(review): rank() orders by value, so `rank() == 1` picks each day's
# lowest signal value rather than its first observation, and with pandas'
# default 'average' tie method a day whose lowest value is tied has no row
# with rank exactly 1 — confirm this is the intended selection.
true_values = date_indexed_df[date_indexed_df[sig_col].groupby(pd.Grouper(freq='D')).rank() == 1][1:][sig_col].to_frame()
# re-parse the index as datetimes, then replace it with 'YYYY-MM-DD' strings
# NOTE(review): presumably so the index lines up with the forecast frame's
# index in the later index-on-index merge — confirm against arima.fit_forecast.
true_values.index = pd.to_datetime(true_values.index)
true_values.index = true_values.index.strftime('%Y-%m-%d')

In [None]:
# Fit ARIMA on the datetime-indexed signal and produce the forecast, using the
# (p, d, q) order chosen in the cells above.
forecast_df = arima.fit_forecast(
    date_indexed_df,
    sig_col,
    p,
    d,
    q,
)

In [None]:
# Join the true values and the forecast on their indexes, then give the
# forecast column its descriptive name.
# NOTE(review): assumes the forecast values arrive in a column labelled 1 —
# confirm against arima.fit_forecast.
arima_df = (
    true_values
    .merge(forecast_df, left_index=True, right_index=True)
    .rename(columns={1: forecast_sig_col})
)

In [None]:
# plot the actual vs forecast values (columns sig_col and forecast_sig_col of arima_df)
ps.plot_forecast(arima_df, sig_col, forecast_sig_col)

# classify each data point as increasing or decreasing
# NOTE(review): the result is indexed as a 2-D array in the next cell, with
# column 0 used as the true labels and column 1 as the predicted labels —
# confirm that column order against arima.convert_forecast_to_classification.
arima_classifications = arima.convert_forecast_to_classification(arima_df)

In [None]:
# Column 0 is passed as the true labels and column 1 as the predicted labels.
arima_y_true = arima_classifications[:, 0]
arima_y_pred = arima_classifications[:, 1]
arima_report = classification_report(arima_y_true, arima_y_pred)
print(f'\nARIMA Classification Report:\n{arima_report}')

# Neural Net Assessment

In [None]:
# Start from the driver's default configuration for building/training the
# Multilayer Perceptron and show it.
nn_config = nn_train_driver.default_training_config()
print('MLP Hyperparameter Configuration:')
pprint(nn_config)

# Build and train the MLP; the driver returns the model together with a test frame.
print('\nTraining MLP:')
model, test_df = nn_train_driver.train_mlp(data_df, label_df=label_df, config=nn_config)

# Every column of the test frame except the timestamp and the label is a feature.
feature_cols = list(test_df.columns)
for non_feature in ('EpochTime', 'Label'):
    feature_cols.remove(non_feature)

# Predict on the test features and report against the true labels.
y_test_true = test_df['Label']
raw_predictions = model.predict(test_df[feature_cols])
yhat_test = nn_train_driver.inverse_onehot(raw_predictions)
test_report = classification_report(y_test_true, yhat_test)
print(f'\nTest Space Classification Report:\n{test_report}')

# Plot the NN classifications over the last 390 rows of the data.
yhat_test_df = pd.DataFrame({
    'EpochTime': test_df['EpochTime'].values,
    'Label': yhat_test,
})
ps.plot_label_over_signal(data_df.iloc[-390:], yhat_test_df.iloc[-390:], 'Close')