## Rename the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_raw = pd.read_csv("../data/AAPL_05222012_0930_1300_LOB_2.csv", index_col='Index')

In [3]:
cols = [name.split("..")[0].split(".")[1].lower() for name in data_raw.columns.values if len(name) > 5]

In [4]:
data_raw.columns = ["Time"] + cols

In [5]:
data_raw.head()

Unnamed: 0_level_0,Time,bid_price1,bid_update_time1,bid_size1,ask_price1,ask_update_time1,ask_size1,bid_price2,bid_update_time2,bid_size2,...,bid_size9,ask_price9,ask_update_time9,ask_size9,bid_price10,bid_update_time10,bid_size10,ask_price10,ask_update_time10,ask_size10
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2012/05/22 09:30:00.000,569.02,2012/05/22 09:29:34.006,40,570,2012/05/22 09:29:43.573,400,568.8,2012/05/22 09:29:47.563,100,...,200,571.0,2012/05/22 09:29:49.424,120,567.16,2012/05/22 09:28:28.730,60,571.24,2012/05/22 08:13:36.797,200
2,2012/05/22 09:30:00.003,569.02,2012/05/22 09:29:34.006,40,570,2012/05/22 09:29:43.573,400,568.8,2012/05/22 09:29:47.563,100,...,200,570.99,2012/05/22 09:29:53.347,200,567.16,2012/05/22 09:28:28.730,60,571.0,2012/05/22 09:29:49.424,120
3,2012/05/22 09:30:00.003,569.02,2012/05/22 09:29:34.006,40,570,2012/05/22 09:29:43.573,400,568.8,2012/05/22 09:29:47.563,100,...,200,570.67,2012/05/22 09:29:29.034,10,567.16,2012/05/22 09:28:28.730,60,570.99,2012/05/22 09:29:53.347,200
4,2012/05/22 09:30:00.003,569.03,2012/05/22 09:30:00.003,8,570,2012/05/22 09:29:43.573,400,569.02,2012/05/22 09:29:34.006,40,...,200,570.67,2012/05/22 09:29:29.034,10,567.3,2012/05/22 09:29:38.985,200,570.99,2012/05/22 09:29:53.347,200
5,2012/05/22 09:30:00.003,569.03,2012/05/22 09:30:00.003,8,570,2012/05/22 09:29:43.573,400,569.02,2012/05/22 09:29:34.006,40,...,200,570.55,2012/05/22 09:30:00.003,8,567.3,2012/05/22 09:29:38.985,200,570.67,2012/05/22 09:29:29.034,10


In [6]:
data_raw.to_csv("../data/AAPL_LOB.csv")

## Fit the models

In [1]:
from imp import reload

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from label import mid_label, spread_label
from basic_set import get_basic
from insensitive_set import get_spread_midprice, get_price_diff, get_mean, get_accumulated_diff
from sensitive_set import get_derivatives
from sampling import sampling_labels
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier as RF

### Load data

In [3]:
data_raw = pd.read_csv('../data/AAPL_LOB.csv')

### Transform the data

In [4]:
def transform_data_tinsen(data_raw):
    spreads_mids = get_spread_midprice(data_raw)
    price_diff = get_price_diff(data_raw)
    means = get_mean(data_raw)
    accum_diff = get_accumulated_diff(data_raw)
    data_tfm = pd.merge(spreads_mids, price_diff, on = ['Index', 'Time'])
    data_tfm = pd.merge(data_tfm, means, on = ['Index', 'Time'])
    data_tfm = pd.merge(data_tfm, accum_diff, on = ['Index', 'Time'])
    return(data_tfm)

In [5]:
def transform_data(data_raw, feature_basic = True, feature_tinsen = True, feature_tsen = True, delta_t = 30):
    if feature_basic * feature_tinsen * feature_tsen:
        basics = get_basic(data_raw)
        data_tinsen = transform_data_tinsen(data_raw)
        data_tsen = get_derivatives(data_raw, delta_t)
        basics = basics.iloc[delta_t:]
        data_tinsen = data_tinsen.iloc[delta_t:]
        #print("basics_ind:" + str(basics.index[:5]) + "tinsen: "+ str(data_tinsen.index[:5]) + str(data_tsen.index[:5]))
        data_tfm = pd.merge(basics, data_tinsen, on = ['Index', 'Time'])
        data_tfm = pd.merge(data_tfm, data_tsen, on = ['Index', 'Time'])
        return(data_tfm)
    elif (feature_basic == True) and (feature_tinsen == True):
        basics = get_basic(data_raw)
        data_tinsen = transform_data_tinsen(data_raw)
        data_tfm = pd.merge(basics, data_tinsen, on = ['Index', 'Time'])
        return(data_tfm)
    elif (feature_basic == True):
        basics = get_basic(data_raw)
        return(basics)

In [6]:
def split_rawdata(data_raw, split_time = datetime(2012, 5, 22, 11, 0)):
    time = np.array([datetime.strptime(time, "%Y/%m/%d %H:%M:%S.%f") for time in data_raw['Time']])
    train_index = time < datetime(2012, 5, 22, 11, 0)
    train = data_raw.iloc[train_index]
    test = data_raw.iloc[np.logical_not(train_index)]
    return({"train": train, "test": test})

In [7]:
raw_data_sets = split_rawdata(data_raw)
train = raw_data_sets["train"]
test = raw_data_sets["test"]

In [56]:
train_tfm = transform_data(train, delta_t = 20)
test_tfm = transform_data(test, delta_t = 20)

In [36]:
#test_tfm.head()
train_tfm.shape

(203299, 128)

## Prepare y labels

#### Mid price based on delta_t = 20

***try without sampling***

Just use ~50% for training, ~25% for validating and ~25% for testing within the 9-11am dataset

In [17]:
# did not use
def get_sampling_labels_ind(train, test, sample_type = "mid", delta_t = 20, sample_size_train = 2000, sample_size_test = 10000, tick_size = 0.01,\
                           train_sample_random=False, test_sample_random = True, up_prob_train = 1/3, down_prob_train = 1/3,\
                           up_prob_test = 1/3, down_prob_test = 1/3):
    if sample_type == "mid":
        y_lab_train_all = mid_label(train, delta_t, tick_size)
        y_lab_test_all = mid_label(test, delta_t, tick_size)
        t_ind = sampling_labels(y_lab_train_all, sample_size_train, sample_random = train_sample_random, up_prob = up_prob_train, down_prob = down_prob_train)
        v_ind = sampling_labels(y_lab_test_all, sample_size_test, sample_random = test_sample_random, up_prob = up_prob_test, down_prob = down_prob_test)
#         y_lab_train = [y_lab_train_all[i] for i in range(len(t_ind)) if t_ind[i]]
#         y_lab_test = [y_lab_test_all[i] for i in range(len(v_ind)) if v_ind[i]]
        y_lab_train = pd.Series(y_lab_train_all).iloc[t_ind]
        y_lab_test = pd.Series(y_lab_test_all).iloc[v_ind]
        return({"y_lab_train": y_lab_train, "y_lab_test": y_lab_test, "t_ind": t_ind, "v_ind": v_ind})
    else:
        y_lab_train_all = spread_label(train, delta_t = delta_t)
        y_lab_test_all = spread_label(test, delta_t = delta_t)
        t_ind = sampling_labels(y_lab_train_all, sample_size_train, sample_random = train_sample_random, up_prob = up_prob_train, down_prob = down_prob_train)
        v_ind = sampling_labels(y_lab_test_all, sample_size_test, sample_random = test_sample_random, up_prob = up_prob_test, down_prob = down_prob_test)
#         y_lab_train = [y_lab_train_all[i] for i in range(len(t_ind)) if t_ind[i]]
#         y_lab_test = [y_lab_test_all[i] for i in range(len(v_ind)) if v_ind[i]]
        y_lab_train = pd.Series(y_lab_train_all).iloc[t_ind]
        y_lab_test = pd.Series(y_lab_test_all).iloc[v_ind]
        return({"y_lab_train": y_lab_train, "y_lab_test": y_lab_test, "t_ind": t_ind, "v_ind": v_ind})

In [56]:
# did not use
samples_inds = get_sampling_labels_ind(train_tfm, test_tfm, sample_type = "mid", delta_t = 30, tick_size = 0.04,\
                                      train_sample_random=False, test_sample_random = False, up_prob_train = 0.45, \
                                      down_prob_train = 0.45,sample_size_train = 10000, sample_size_test = 10000, \
                                             up_prob_test = 1/3, down_prob_test = 1/3)

In [57]:
# did not use
y_mid_train = samples_inds["y_lab_train"]
y_mid_test = samples_inds["y_lab_test"]
t_ind = samples_inds["t_ind"]
v_ind = samples_inds["v_ind"]

In [18]:
# did not use
y_mid_test.value_counts()

down          1700
stationary    6528
up            1772
dtype: int64

In [48]:
# did not use
train_samples_inds = get_sampling_labels_ind(train_tfm, train_tfm, sample_type = "mid", delta_t = 30, tick_size = 0.04,\
                                      train_sample_random=False, test_sample_random = False, up_prob_train = 1/3, \
                                      down_prob_train = 1/3,sample_size_train = 5000, sample_size_test = 5000, \
                                             up_prob_test = 1/3, down_prob_test = 1/3)

In [133]:
# did not use
y_mid_train_train = train_samples_inds["y_lab_train"]
y_mid_train_test = train_samples_inds["y_lab_test"]
t_ind = train_samples_inds["t_ind"]
v_ind = train_samples_inds["v_ind"]

In [57]:
y_lab_train_all = mid_label(train, 20, 0.01)
y_lab_train_all.shape

(203329,)

In [58]:
y_lab_test_all = mid_label(test, 20, 0.01)
y_lab_test_all.shape

(129304,)

In [33]:
y_lab_train_all = spread_label(train, delta_t = 50)
y_lab_train_all.value_counts()

stationary    194130
up              4924
down            4245
dtype: int64

In [34]:
y_lab_test_all = spread_label(test, 50)
y_lab_test_all.value_counts()

stationary    126053
down            1706
up              1515
dtype: int64

In [59]:
index = np.arange(train_tfm.shape[0])
np.random.seed(0)
# Shuffle the index and separate the data into train(50%), validation(25%), and test(25%) set.
np.random.shuffle(index)

# approximate subset
train_x = train_tfm.iloc[index[0:101659]]
validation_x = train_tfm.iloc[index[101659:152489]]
test_x = train_tfm.iloc[index[152489:]]

train_y = y_lab_train_all.iloc[index[0:101659]]
validation_y = y_lab_train_all.iloc[index[101659:152489]]
test_y = y_lab_train_all.iloc[index[152489:]]


In [60]:
print(train_x.shape)
print(validation_x.shape)
print(test_x.shape)
print(train_y.shape)
print(validation_y.shape)
print(test_y.shape)

(101659, 128)
(50830, 128)
(50840, 128)
(101659,)
(50830,)
(50840,)


In [39]:
clf = svm.SVC(C=100, gamma=10**-5, decision_function_shape='ovr')
clf.fit(train_x[train_x.columns[2:]].iloc[:10000], train_y[:10000])


SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1e-05, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
clf.score(train_x[train_x.columns[2:]].iloc[:10000], train_y[:10000])

0.99990000000000001

In [41]:
clf.score(validation_x[validation_x.columns[2:]].iloc[:10000], validation_y[:10000])

0.95630000000000004

In [42]:
clf.score(test_x[test_x.columns[2:]].iloc[:10000], test_y[:10000])

0.95809999999999995

In [26]:
predict_test = clf.predict(train_x[train_x.columns[2:]].iloc[:10000])
confusion_matrix(predict_test, train_y[:10000])

array([[3639,    5,    0],
       [   0, 2493,    0],
       [   1,    1, 3861]])

In [43]:
predict_test = clf.predict(validation_x[validation_x.columns[2:]].iloc[:10000])
confusion_matrix(predict_test, validation_y[:10000])

array([[  20,   12,    0],
       [ 196, 9498,  216],
       [   0,   13,   45]])

In [32]:
predict_test = clf.predict(test_x[test_x.columns[2:]].iloc[:10000])
confusion_matrix(predict_test, test_y[:10000])

array([[  33,    2,    1],
       [   1,   64,    0],
       [3470, 2490, 3939]])

In [44]:
clf.score(test_tfm[test_tfm.columns[2:]].iloc[:10000], y_lab_test_all[:10000])

0.97999999999999998

In [45]:
predict_test = clf.predict(test_tfm[test_tfm.columns[2:]].iloc[:10000])
CM = confusion_matrix(predict_test, y_lab_test_all[:10000])

In [46]:
CM

array([[   0,    0,    0],
       [ 113, 9800,   87],
       [   0,    0,    0]])

In [47]:
cal_measure(CM)

Unnamed: 0,Precision,Recall,F1_Measure
Up,0,,
Stationary,1,0.98,0.989899
Down,0,,


Mid-price can get OK results... but spread crossing is so bad because even if delta t is 50 or 100, there are still so few non-stationaries...

In [23]:
from sklearn.ensemble import RandomForestClassifier as RF

In [61]:
# max_depth=10 or 20, seems plenty good
cand = RF(n_estimators=500, max_depth=20, max_features='sqrt')
cand = cand.fit(train_x[train_x.columns[2:]], train_y)
# accuracy
cand.score(train_x[train_x.columns[2:]], train_y)

0.99911468733707787

In [62]:
cand.score(validation_x[validation_x.columns[2:]], validation_y)

0.98199881959472757

In [63]:
cand.score(test_x[test_x.columns[2:]], test_y)

0.9829858379228954

In [64]:
cand.score(test_tfm[test_tfm.columns[2:]].iloc[:-1], y_lab_test_all[:-1])

0.96519028947510888

In [65]:
prediction = cand.predict(validation_x[validation_x.columns[2:]])
confusion_matrix(prediction, validation_y)

array([[15915,   198,    15],
       [  223, 17086,   237],
       [   37,   205, 16914]])

In [66]:
predict_test = cand.predict(test_tfm[test_tfm.columns[2:]].iloc[:-1])
CM = confusion_matrix(predict_test, y_lab_test_all[:-1])

In [67]:
CM

array([[34331,   995,   166],
       [ 1046, 53355,  1123],
       [  158,  1013, 37116]])

In [31]:
def cal_measure(CM):
    '''
    input: 3 by 3 confusion matrix
    output: data frame of measurements
    '''
    # for Up (0)
    up = np.array([[CM[0,0],(CM[0,1]+CM[0,2])],[(CM[1,0]+CM[2,0]),(CM[1,1]+CM[2,2]+CM[1,2]+CM[2,1])]], dtype='float')
    station = np.array([[CM[1,1],(CM[1,0]+CM[1,2])],[(CM[0,1]+CM[2,1]),(CM[0,0]+CM[2,2]+CM[0,2]+CM[2,0])]], dtype='float')
    down = np.array([[CM[2,2],(CM[2,0]+CM[2,1])],[(CM[0,2]+CM[1,2]),(CM[0,0]+CM[1,1]+CM[0,1]+CM[1,0])]], dtype='float')
    p_up = up[0,0]/(up[0,0]+up[1,0])
    p_station = station[0,0]/(station[0,0]+station[1,0])
    p_down = down[0,0]/(down[0,0]+down[1,0])
    r_up = up[0,0]/(up[0,0]+up[0,1])
    r_station = station[0,0]/(station[0,0]+station[0,1])
    r_down = down[0,0]/(down[0,0]+down[0,1])
    f_up = 2*p_up*r_up/(p_up+r_up)
    f_station = 2*p_station*r_station/(p_station+r_station)
    f_down = 2*p_down*r_down/(p_down+r_down)
    measure=pd.DataFrame(index=['Up','Stationary','Down'])
    measure['Precision'] = [p_up,p_station,p_down]
    measure['Recall'] = [r_up,r_station,r_down]
    measure['F1_Measure'] = [f_up,f_station,f_down]
    return measure

In [68]:
cal_measure(CM)

Unnamed: 0,Precision,Recall,F1_Measure
Up,0.966118,0.967288,0.966703
Stationary,0.96373,0.960936,0.962331
Down,0.966437,0.969415,0.967924
