In [33]:
import sys
sys.path.append("/home/ross/quantfin/lib")

%matplotlib inline 
import math
import time
from datetime import datetime
import importlib
import numpy as np # numerical computation packages in python
import matplotlib.pyplot as plt # plotting routines
import pandas as pd
import multiprocessing as mp
import AFMLlib as afml
importlib.reload(afml) # Reload module in case it changed in interactive mode

from statsmodels.tsa.stattools import adfuller
from joblib import dump, load
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import BaggingClassifier
from joblib import dump, load

# Import Data

In [18]:
# Column layout of the raw fills file. The names are supplied explicitly,
# so the first line of the CSV is read as data rather than as a header.
colNames = ["side", "readtime", "filltime", "volume", "price"]

# Load every fill, indexed by its read timestamp; both timestamp columns
# are parsed into datetimes.
rawTradesData = pd.read_csv(
    'data/XXXX.csv',
    sep=',',
    names=colNames,
    index_col='readtime',
    parse_dates=['readtime', 'filltime'],
)

In [19]:
# Use a subset of the data
# The full trade history is currently used as-is. This is an alias, not a copy:
# tradesData and rawTradesData refer to the same DataFrame object.
tradesData = rawTradesData
# Alternative for faster experiments: keep only the first 5% of rows, sorted by index.
# tradesData = rawTradesData.head(math.floor(rawTradesData.shape[0] * 0.05)).sort_index()
# Display (rows, columns) as a quick check on how much data was loaded.
tradesData.shape

(10970966, 4)

In [20]:
# Create volume candles from raw fill data
# Aggregates the individual fills into OHLCV bars via the project helper.
# NOTE(review): frequency=5 is presumably the traded volume per candle (every
# bar in the recorded output has Volume >= 5) — confirm in AFMLlib.
volCandles = afml.createVolumeCandles(tradesData, frequency=5)
# Display the resulting candles.
volCandles

Unnamed: 0,Open,High,Low,Close,Volume
2019-07-01 00:00:01.000,10855.6,10861.8,10849.0,10849.0,5.103393
2019-07-01 00:00:06.000,10850.0,10858.0,10849.0,10853.7,12.572330
2019-07-01 00:00:18.000,10837.0,10856.8,10837.0,10856.8,8.856697
2019-07-01 00:00:25.029,10852.0,10853.3,10813.6,10836.7,5.192057
2019-07-01 00:00:40.002,10851.8,10853.3,10849.7,10851.9,6.096606
...,...,...,...,...,...
2019-10-16 13:51:18.000,7979.9,7981.4,7979.8,7981.3,10.355099
2019-10-16 13:52:44.000,7980.2,7981.3,7979.5,7980.2,5.889109
2019-10-16 13:54:00.000,7977.0,7980.6,7977.0,7980.3,7.535009
2019-10-16 13:55:21.004,7987.2,7987.3,7975.1,7976.4,5.411302


# Extract Features 

In [21]:
# Fractional differencing of the candle close prices.
closePrices = volCandles['Close']
# Optional log transform of the prices, currently disabled:
# closePrices = closePrices.apply(math.log)

# Fractionally differenced close series from the project helper.
# NOTE(review): the second argument (0.35) is presumably the differencing
# order d — confirm against afml.frac_diff_ffd.
fracDiffClose = afml.frac_diff_ffd(closePrices, 0.35)
fracDiffClose

2019-07-01 00:00:01.000           NaN
2019-07-01 00:00:06.000           NaN
2019-07-01 00:00:18.000           NaN
2019-07-01 00:00:25.029           NaN
2019-07-01 00:00:40.002           NaN
                              ...    
2019-10-16 13:51:18.000    750.572243
2019-10-16 13:52:44.000    749.182603
2019-10-16 13:54:00.000    749.339693
2019-10-16 13:55:21.004    750.457366
2019-10-16 13:56:49.000    747.089618
Length: 243485, dtype: float64

In [22]:
# Higher-moment feature (this is kurtosis, not an entropy measure).
# Kurtosis of the fractionally differenced close over a trailing window of
# 400 bars; rows are NaN until a full window of non-NaN values is available.
fracDiffKurtosis = fracDiffClose.rolling(window=400).kurt()
fracDiffKurtosis

2019-07-01 00:00:01.000         NaN
2019-07-01 00:00:06.000         NaN
2019-07-01 00:00:18.000         NaN
2019-07-01 00:00:25.029         NaN
2019-07-01 00:00:40.002         NaN
                             ...   
2019-10-16 13:51:18.000   -0.200741
2019-10-16 13:52:44.000   -0.189063
2019-10-16 13:54:00.000   -0.176747
2019-10-16 13:55:21.004   -0.162921
2019-10-16 13:56:49.000   -0.149702
Length: 243485, dtype: float64

# Sampling Events 

In [23]:
# Sample event timestamps from the fractionally differenced series using the
# project's CUSUM helper.
# NOTE(review): the second argument (5) is presumably the filter threshold — confirm in AFMLlib.
cusumEvents = afml.getCUSUMEvents(fracDiffClose, 5) # Timestamps of CUSUM events
# Display the sampled timestamps.
cusumEvents

DatetimeIndex([       '2019-07-01 01:08:50',        '2019-07-01 01:09:00',
                      '2019-07-01 01:09:11', '2019-07-01 01:09:44.001000',
                      '2019-07-01 01:09:52', '2019-07-01 01:09:57.011000',
               '2019-07-01 01:10:01.004000', '2019-07-01 01:10:05.002000',
               '2019-07-01 01:10:40.006000', '2019-07-01 01:11:04.004000',
               ...
               '2019-10-16 13:20:42.004000',        '2019-10-16 13:21:16',
                      '2019-10-16 13:24:59', '2019-10-16 13:30:02.004000',
                      '2019-10-16 13:31:10',        '2019-10-16 13:31:24',
                      '2019-10-16 13:32:38', '2019-10-16 13:35:59.001000',
               '2019-10-16 13:41:59.005000',        '2019-10-16 13:48:55'],
              dtype='datetime64[ns]', length=136961, freq=None)

# Labeling events

In [24]:
barrierWidth = 1 # Width of the top and bottom barriers
minReturn = 0.01 # Minimum target return to run a triple barrier search
numThreads = 10 # Number of processor threads to use

# Target return for each candle: a constant 0.05 on the candle index.
# Built as a standalone Series (same values, index and name as before) so this
# cell no longer adds a column to volCandles, which an earlier cell created
# and displayed. Re-running the cell is therefore side-effect free.
targets = pd.Series(0.05, index=volCandles.index, name='targets')

# Series of timestamps for the third 'timeout' barriers.
# For every event, locate the position of the first candle at or after
# (event time + 4 hours).
timeBarriers = closePrices.index.searchsorted(cusumEvents + pd.Timedelta(hours=4))
# Discard positions that fall past the last candle (events too close to the
# end of the data to have a timeout candle).
timeBarriers = timeBarriers[timeBarriers < closePrices.shape[0]]
# Map event timestamp -> timeout timestamp. Only the leading events that still
# have a valid timeout are kept; the trailing events are absent from the Series.
timeBarriers = pd.Series(closePrices.index[timeBarriers], index=cusumEvents[:timeBarriers.shape[0]])

In [25]:
# Calculate returns for each event
# Passes the close prices, the sampled event timestamps, the barrier width,
# per-candle targets, minimum return, thread count and timeout barriers to the
# project helper afml.getEvents.
# This is the slow step of the notebook: the recorded run reports about
# 6.5 minutes for its applyPtSlOnT1 stage.
events = afml.getEvents(closePrices, cusumEvents, barrierWidth, targets, minReturn, numThreads, timeBarriers)

2019-12-20 12:19:26.591082 100.0% applyPtSlOnT1 done after 6.47 minutes. Remaining 0.0 minutes.


In [26]:
# Label events
bins = afml.getBins(events, closePrices)
# Relabel as 0 every event whose realised return is below minReturn (this
# includes all events with a negative return).
# A single .loc assignment is used instead of chained indexing
# (bins.bin[mask] = 0): the chained form may write to a temporary copy, raises
# SettingWithCopyWarning, and is a silent no-op under pandas Copy-on-Write.
bins.loc[bins['ret'] < minReturn, 'bin'] = 0

# Modelling

In [27]:
# Assemble the feature matrix and labels, then split into train and test sets.
# Features: the fractionally differenced close and its rolling kurtosis,
# restricted to the labelled event timestamps, with incomplete rows removed.
X = (
    pd.concat([fracDiffClose, fracDiffKurtosis], axis=1)
    .loc[bins.index]
    .dropna()
)
# Labels aligned to the rows that survived the NaN filter.
y = bins.loc[X.index, 'bin']

# Hold out 40% of the events for testing.
# NOTE(review): train_test_split shuffles rows by default, so test events are
# interleaved in time with training events — consider an ordered or purged
# split for time-series data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [28]:
# Fit a single RBF-kernel support vector classifier on the training split.
clf = svm.SVC(
    kernel='rbf',
    C=1.0,
    gamma=0.1,
)
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [29]:
# Evaluate the single SVM on the held-out split.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Precision for class 1: correctly predicted 1s divided by all predicted 1s
# (rows 0 and 1 of the "predicted 1" column of the confusion matrix).
score = cm[1, 1] / cm[:2, 1].sum()
print(cm, score, sep='\n')

[[41519   391]
 [12104   593]]
0.6026422764227642


In [43]:
# Ensemble: bag ten RBF-kernel SVMs configured like the single model above,
# trained with four parallel jobs and fixed seeds for repeatability.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; the keyword is kept to match the version this notebook ran on.
baggingSVM = svm.SVC(kernel='rbf', C=1.0, gamma=0.1, random_state=123)
ensembleModel = BaggingClassifier(
    base_estimator=baggingSVM,
    n_estimators=10,
    random_state=42,
    n_jobs=4,
)
ensembleModel.fit(X_train, y_train)

BaggingClassifier(base_estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                     coef0=0.0, decision_function_shape='ovr',
                                     degree=3, gamma=0.1, kernel='rbf',
                                     max_iter=-1, probability=False,
                                     random_state=123, shrinking=True,
                                     tol=0.001, verbose=False),
                  bootstrap=True, bootstrap_features=False, max_features=1.0,
                  max_samples=1.0, n_estimators=10, n_jobs=4, oob_score=False,
                  random_state=42, verbose=0, warm_start=False)

In [44]:
# Evaluate the bagged ensemble on the held-out split.
y_pred = ensembleModel.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Precision for class 1: correctly predicted 1s divided by all predicted 1s
# (rows 0 and 1 of the "predicted 1" column of the confusion matrix).
score = cm[1, 1] / cm[:2, 1].sum()
print(cm, score, sep='\n')

[[41553   357]
 [12143   554]]
0.6081229418221734


In [47]:
# Save model
# Persist the fitted bagging ensemble to 'VolumeML.joblib' in the working
# directory using joblib.
# NOTE(review): the recorded output for this cell names a different file
# ('wc16.joblib'), so it predates the current filename — re-run to refresh it.
dump(ensembleModel, 'VolumeML.joblib')

['wc16.joblib']

In [0]:
# Cross Validation

# Backtesting