Load packages

In [11]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.proportion import proportions_ztest
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, matthews_corrcoef
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from ticktock import tick, tock
import utils

Load the data, either by generating it from the raw .txt files:

In [None]:
# Build `data` from the raw price and volume files through the project helper.
# NOTE(review): presumably the first two paths are inputs and 'data/panel.p' is
# where the resulting panel gets cached -- confirm against utils.panelizeRawData.
data = utils.panelizeRawData('data/price_raw_hr.p', 'data/vol_raw_hr.p', 'data/panel.p')

Or loading it from a pickled file:

In [13]:
# Load the previously pickled panel instead of regenerating it.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
data = pd.read_pickle('data/panel.p')

The data should now have the form:
```
<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 95208 (major_axis) x 4 (minor_axis)
Items axis: price to vol
Major_axis axis: 2002-01-02 19:00:00-05:00 to 2017-05-18 18:00:00-04:00
Minor_axis axis: 0 to 3
```
Now the slow part: computing the technical indicators for each desired timestep.

In [None]:
# Rebinds `data` to the output of utils.computeData. Because the input name is
# overwritten, this cell is not idempotent: re-running it feeds the computed
# result back into computeData instead of the original panel.
data = utils.computeData(data)

Or load the computed data from a file!

In [None]:
# Load the pre-computed data from disk, skipping the slow computeData step.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
data = pd.read_pickle('data/dataframe.p')

In [None]:
def summarize(df):
	"""Score the 's' column of ``df`` against its 'w' column.

	'w' is passed to the scikit-learn metrics as ``y_true`` and 's' as
	``y_pred``.

	Parameters
	----------
	df : pandas.DataFrame
		Frame with boolean columns 's' and 'w' (they are combined with ``&``).

	Returns
	-------
	pandas.Series
		'z': statistic of a two-sided proportion z-test of
		(rows where both 's' and 'w' are true) / (rows where 's' is true)
		against 0.5, or NaN when 's' is never true;
		'confusion_matrix', 'f1', 'accuracy', 'precision', 'matthews':
		the corresponding scikit-learn metrics.
	"""
	s = df['s']
	w = df['w']

	# Vectorised counts. The previous `sum(s & w).sum()` called `.sum()` on the
	# result of the builtin `sum`, which is a plain int on current pandas.
	n_signals = s.sum()
	n_hits = (s & w).sum()

	if n_signals:
		z = proportions_ztest(n_hits, n_signals, value=0.5, alternative='two-sided')[0]
	else:
		# No positive 's' rows: the proportion is undefined.
		z = np.nan

	return pd.Series({
		'z': z,
		'confusion_matrix': confusion_matrix(w, s),
		'f1': f1_score(w, s),
		'accuracy': accuracy_score(w, s),
		'precision': precision_score(w, s),
		'matthews': matthews_corrcoef(w, s)
	})

# Time the per-group summary: one summarize() row per
# (market, lookback, weekday, week number, time) combination.
tick()
group_keys = ['mkt', 'lkbk', 'wkdy', 'wk#', 'time']
summ = data.groupby(group_keys).apply(summarize)

# Magnitude of the z statistic, regardless of sign.
summ['abs z'] = np.abs(summ['z'])
# Number of observations in each group = total of the confusion-matrix cells.
summ['NOBS'] = summ['confusion_matrix'].map(lambda cm: cm.flatten().sum())
tock('summarize')

In [None]:
# Preview the first 30 rows of the per-group summary.
summ.head(30)

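Now it's time to pick a single slice of the data (one market, lookback, weekday, week number and time) and choose which feature columns to train on.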

In [None]:
# Restrict the analysis to one (market, lookback, weekday, week number, time) slice.
# NOTE(review): this cell originally filtered `d2`, a name that is never defined
# in the notebook (NameError on Restart & Run All). `data` is the only frame in
# scope with these columns, so it is assumed to be the intended source -- confirm.
x = data[
	(data.mkt == 0) &
	(data.lkbk == 5) &
	(data.wkdy == 4) &
	(data['wk#'] == 1) &
	(data.time == 14)
	]

# Categorical feature columns; one-hot encoded before training when non-empty.
# Every candidate is currently switched off -- move a name into the list to use it.
# Candidates: 'mkt', 'time', 'lkbk', 'wkdy', 'wk#'
catcols = []

# Numeric feature columns; standardised before training.
# Only 'prd_rtn' is active. Disabled candidates:
#   'macd', 'obv', 'rsi',
#   'p_2ma', 'p_avg', 'p_krt', 'p_med', 'p_skw', 'p_std',
#   'v_avg', 'v_cur', 'v_krt', 'v_med', 'v_skw', 'v_std'
numcols = ['prd_rtn']

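Since the data is a time series, splitting it for validation is a little easier: the test set is always the most recent data points (here, everything from 2017-01-01 onward), and the model is trained on everything before that.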

In [None]:
# Chronological hold-out: rows before `split` train, rows from `split` on test.
split = dt.datetime(2017, 1, 1)
is_train = x.index < split
is_test = x.index >= split
feature_cols = catcols + numcols

# Baseline metrics from summarize() ('s' scored against 'w') on the held-out rows.
man_results = summarize(x[is_test])

x1, y1 = x.loc[is_train, feature_cols], x.loc[is_train, 'fwd_rtn']
x2, y2 = x.loc[is_test, feature_cols], x.loc[is_test, 'fwd_rtn']

tick()

# Fit the transformers on the training rows only, then apply them to both sets.
ss = StandardScaler().fit(x1[numcols])
if not catcols:
	x1 = ss.transform(x1[numcols])
	x2 = ss.transform(x2[numcols])
else:
	oe = OneHotEncoder().fit(x1[catcols])
	x1 = np.concatenate([oe.transform(x1[catcols]).toarray(), ss.transform(x1[numcols])], axis=1)
	x2 = np.concatenate([oe.transform(x2[catcols]).toarray(), ss.transform(x2[numcols])], axis=1)

# Optional dimensionality reduction, currently disabled:
# pca = PCA(n_components=3).fit(x1)
# x1 = pca.transform(x1)
# x2 = pca.transform(x2)

# Classifiers to compare. Each entry maps a short name to
# [estimator, param_grid]; the pair is unpacked positionally into GridSearchCV.
# Option names follow the scikit-learn release this notebook was written against.
clfs = {
	'LSVC': [LinearSVC(), {
		'C': [1, 10, 100],
	}],
	'SVC': [SVC(), {
		'C': [1, 10, 100],
		'kernel': ['linear', 'rbf'],
		# scikit-learn deprecated gamma=0 in favour of 'auto' (1 / n_features).
		'gamma': ['auto', 0.1, 1, 10],
		'probability': [True, False]
	}],
	'SGD': [SGDClassifier(), {
		'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
		'penalty': ['none', 'l1', 'l2', 'elasticnet'],
		'alpha': [0.0001, 0.001, 0.01],
	}],
	'RFC': [RandomForestClassifier(), {
		'n_estimators': [10, 50, 100],
		'criterion': ['gini', 'entropy'],
		'max_features': [None, 'auto'],
		'min_samples_split': [10, 50, 100],
	}],
	'KNN': [KNeighborsClassifier(), {
		'n_neighbors': [5, 10, 30],
		'weights': ['uniform', 'distance'],
		'algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto'],
		'p': [1, 2, 3],
	}],
	'GBC': [GradientBoostingClassifier(), {
		# 'deviance' was listed twice, which refit every identical candidate.
		'loss': ['deviance', 'exponential'],
		'learning_rate': [0.01, 0.1, 1],
		'max_depth': [2, 3, 4],
	}],
	'MLP': [MLPClassifier(), {
		'hidden_layer_sizes': [(5, 2)],
		'activation': ['logistic', 'relu'],
		'solver': ['lbfgs'],
		'alpha': [1e-5],
		'learning_rate': ['constant', 'adaptive'],
	}]
}

def classify(params, x1, y1, x2, y2):
	"""Grid-search one classifier on the training set and score it on the test set.

	Parameters
	----------
	params : sequence
		``(estimator, param_grid)`` pair, unpacked positionally into
		``GridSearchCV``.
	x1, y1 : array-like
		Training features and labels.
	x2, y2 : array-like
		Test features and labels.

	Returns
	-------
	dict
		'metrics': precision, accuracy, f1 and Matthews coefficient on the
		test set, plus 'training accuracy' of the best estimator on the
		training set;
		'best_classifier': the refitted best estimator;
		'other': test confusion matrix, best parameters, and the elapsed
		grid-search time in seconds.
	"""
	import time

	# time.clock() was removed in Python 3.8; perf_counter() is the
	# recommended replacement for timing a code section.
	started = time.perf_counter()
	gscv = GridSearchCV(*params, scoring='f1').fit(x1, y1)
	elapsed = time.perf_counter() - started

	best = gscv.best_estimator_
	pred2 = gscv.predict(x2)
	return {'metrics': {
		'precision': precision_score(y2, pred2),
		'accuracy': accuracy_score(y2, pred2),
		'f1': f1_score(y2, pred2),
		'matthews': matthews_corrcoef(y2, pred2),
		'training accuracy': accuracy_score(y1, best.predict(x1)),
	},
		'best_classifier': best,
		'other': {
			'confusion_matrix': confusion_matrix(y2, pred2),
			'best_params': gscv.best_params_,
			'training time': elapsed,
		},
	}
	
# Run the grid search for every configured classifier.
results = {name: classify(params, x1=x1, y1=y1, x2=x2, y2=y2) for (name, params) in clfs.items()}

# One row of test metrics per classifier plus a 'Manual' baseline row taken from
# summarize(). The frame is built in one step because DataFrame.append was
# removed in pandas 2.0. The baseline has no 'training accuracy', so that cell is NaN.
manual_metrics = man_results[['f1', 'accuracy', 'matthews', 'precision']].to_dict()
metrics = pd.DataFrame(
	[p['metrics'] for p in results.values()] + [manual_metrics],
	index=list(results.keys()) + ['Manual'],
)
metrics.index.name = 'Classifier'
metrics = metrics.reset_index()
# Long format (Classifier, Metric, value) for the grouped bar plot below.
metrics = pd.melt(metrics, id_vars='Classifier', var_name='Metric')

# One confusion-matrix heatmap per classifier; the last panel is the baseline.
fig, ax = plt.subplots(nrows=1, ncols=len(results) + 1)
for i, (n, p) in enumerate(results.items()):
	sns.heatmap(p['other']['confusion_matrix'], annot=True, cbar=False, square=True, ax=ax[i])
	ax[i].set_title(n)
sns.heatmap(man_results['confusion_matrix'], annot=True, cbar=False, square=True, ax=ax[-1])
ax[-1].set_title('Manual')

# Metric comparison across classifiers, drawn in a separate figure.
plt.figure()
sns.barplot(data=metrics, hue='Classifier', x='Metric', y='value');

# Report the winning hyper-parameters of each grid search.
for n, p in results.items():
	print(n)
	print(pd.Series(p['other']['best_params']))
	print('----------------------------')
plt.show()