# Training Supervised Learning Algorithms

In [1]:
# matlotlib inline plotting
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from IPython.display import display

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

from utils import train_classifier as train_classifier
from utils import predict_labels as predict_labels
from utils import predict_labels_accuracy as predict_labels_accuracy
from utils import train_predict as train_predict

import pickle

In [6]:
import warnings
warnings.filterwarnings('ignore')

# Load dataset

In [7]:
dataset = pd.read_csv("data/curated_dataset.csv", sep="\t", header=0, index_col = 0)
print ("Number of data points:", len(dataset))

Number of data points: 2678


In [8]:
display(dataset)

Unnamed: 0,Days_Since_Open,Break_Coming,Overnight_Return,Overnight_VIX,O2O,1d_Ret,2d_Ret,3d_Ret,4d_Ret,5d_Ret,...,126d_Ret,189d_Ret,252d_Ret,1d_VIX,5d_VIX,1d_Rel_Vol,5d_Rel_Vol,1d_PtT,1d_VIX_PtT,Intraday_Increase
0,1.0,0,-0.001138,0.011299,0.003261,0.005296,0.011775,0.003086,-0.002669,0.003814,...,0.054196,0.044752,0.152477,-0.054198,-0.068421,0.150982,-0.002956,0.009513,0.105180,0
1,1.0,0,0.000889,-0.015267,0.007367,0.006444,-0.002199,-0.007923,-0.001475,0.009426,...,0.046503,0.050294,0.147818,-0.096552,-0.177136,-0.003421,-0.000493,0.009094,0.135299,1
2,1.0,0,-0.000028,-0.078621,-0.005791,-0.008588,-0.014275,-0.007868,0.002963,0.003263,...,0.044326,0.041296,0.111197,0.179821,-0.066323,-0.062535,-0.016959,0.006272,0.098703,1
3,3.0,0,-0.002841,0.078926,-0.006787,-0.005737,0.000726,0.011651,0.011954,0.011935,...,0.062659,0.048637,0.120301,0.022463,-0.200390,-0.022102,-0.021904,0.004509,0.053520,0
4,1.0,1,-0.001787,-0.001664,0.001082,0.006500,0.017488,0.017792,0.017773,0.013934,...,0.069373,0.067784,0.122976,-0.096241,-0.262577,-0.054481,0.101086,0.004163,0.128296,0
5,1.0,0,0.003615,0.006767,0.012291,0.010917,0.011220,0.011201,0.007386,0.017570,...,0.062066,0.070242,0.113438,-0.164573,-0.266814,0.010656,0.093209,0.011858,0.188227,1
6,1.0,0,0.002253,-0.053392,-0.000634,0.000299,0.000280,-0.003493,0.006581,0.005990,...,0.043887,0.066919,0.087846,0.025113,-0.108123,-0.174317,0.099463,0.005427,0.090615,1
7,1.0,0,0.003188,-0.035415,0.000910,-0.000019,-0.003791,0.006280,0.005689,-0.009226,...,0.042659,0.047618,0.092486,0.010410,0.024406,-0.218716,0.178595,0.008265,0.088686,0
8,3.0,0,0.002258,-0.014964,-0.001160,-0.003772,0.006299,0.005708,-0.009208,0.005334,...,0.043707,0.031880,0.074847,-0.057055,-0.121714,0.320440,0.240760,0.007131,0.114537,0
9,1.0,1,-0.000363,0.006748,0.009937,0.010109,0.009516,-0.005456,0.009141,-0.015605,...,0.052274,0.050822,0.076154,-0.101433,0.302958,-0.159209,0.210159,0.013551,0.135979,0


# split features and labels:


In [9]:
X = dataset.drop('Intraday_Increase', axis=1)
y = pd.DataFrame(dataset['Intraday_Increase'])

# Summarize the label data
display(y.describe())

Unnamed: 0,Intraday_Increase
count,2678.0
mean,0.54705
std,0.497874
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Split the data into train, validation and test sets: shuffling will not be used because test data should be drawn from the latest data points, to remove any risk of the learning algorithms glimpsing the future

In [10]:
dataPoints = int(len(X) / 5)
otherPoints = len(X) - dataPoints

X_train = X[dataPoints : ]
y_train = y[dataPoints : ]
y_train = np.reshape(y_train.values, [otherPoints, ])

X_test = X[ : dataPoints]
y_test = y[ : dataPoints]
y_test = np.reshape(y_test.values, [dataPoints, ])

In [11]:
display(X_train.describe())
display(X_test.describe())

display(pd.DataFrame(y_train).describe())
display(pd.DataFrame(y_test).describe())

Unnamed: 0,Days_Since_Open,Break_Coming,Overnight_Return,Overnight_VIX,O2O,1d_Ret,2d_Ret,3d_Ret,4d_Ret,5d_Ret,...,63d_Ret,126d_Ret,189d_Ret,252d_Ret,1d_VIX,5d_VIX,1d_Rel_Vol,5d_Rel_Vol,1d_PtT,1d_VIX_PtT
count,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,...,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0,2143.0
mean,1.44377,0.210453,-3.8e-05,0.004964,0.00025,0.000258,0.000508,0.000577,0.000809,0.001071,...,0.013431,0.030061,0.045256,0.059421,0.002912,0.009703,0.008766,0.01093,0.014292,0.080949
std,0.874735,0.407725,0.001661,0.041501,0.013067,0.013761,0.018226,0.021323,0.024203,0.026747,...,0.087797,0.136079,0.167004,0.19047,0.071468,0.135585,0.182688,0.158809,0.011931,0.047197
min,1.0,0.0,-0.014194,-0.300366,-0.087119,-0.09035,-0.124174,-0.139059,-0.172221,-0.183401,...,-0.417706,-0.464658,-0.513279,-0.488228,-0.295726,-0.351824,-0.758845,-0.66969,0.002009,0.014966
25%,1.0,0.0,-0.000311,-0.015144,-0.004312,-0.004548,-0.007171,-0.008623,-0.010272,-0.010659,...,-0.028295,-0.033698,-0.022339,-0.002082,-0.0384,-0.074558,-0.090838,-0.081309,0.007208,0.050708
50%,1.0,0.0,-1.2e-05,0.000477,0.000725,0.000693,0.001302,0.002277,0.002818,0.003082,...,0.028603,0.050801,0.072652,0.09888,-0.004887,-0.007285,-0.005849,0.000915,0.010946,0.070492
75%,1.0,0.0,0.000428,0.023461,0.005604,0.005816,0.009111,0.011159,0.013025,0.014775,...,0.06872,0.110226,0.153798,0.180711,0.033092,0.070613,0.09231,0.088021,0.017187,0.096508
max,5.0,1.0,0.015046,0.292127,0.106712,0.1158,0.132064,0.13948,0.179735,0.191112,...,0.388172,0.502372,0.634783,0.685735,0.642152,1.02874,2.045477,0.947982,0.107198,0.554874


Unnamed: 0,Days_Since_Open,Break_Coming,Overnight_Return,Overnight_VIX,O2O,1d_Ret,2d_Ret,3d_Ret,4d_Ret,5d_Ret,...,63d_Ret,126d_Ret,189d_Ret,252d_Ret,1d_VIX,5d_VIX,1d_Rel_Vol,5d_Rel_Vol,1d_PtT,1d_VIX_PtT
count,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,...,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0
mean,1.448598,0.213084,4e-05,0.009687,0.000259,0.000266,0.000524,0.000753,0.000998,0.001276,...,0.013517,0.027017,0.036694,0.060539,0.003089,0.016108,0.009031,0.008485,0.01049,0.105871
std,0.880858,0.40987,0.001346,0.057112,0.008519,0.009168,0.012973,0.015696,0.017895,0.019572,...,0.047084,0.058484,0.062711,0.076385,0.086434,0.216168,0.178229,0.145383,0.006349,0.057949
min,1.0,0.0,-0.005602,-0.235886,-0.034395,-0.039414,-0.070009,-0.089632,-0.101942,-0.109356,...,-0.121361,-0.123185,-0.128379,-0.115759,-0.213836,-0.42663,-0.832518,-0.437091,0.002467,0.031952
25%,1.0,0.0,-0.000676,-0.018759,-0.003936,-0.004028,-0.006384,-0.006562,-0.007371,-0.008446,...,-0.009836,-0.007099,-0.010483,-0.007226,-0.050692,-0.105659,-0.08557,-0.089628,0.005924,0.067959
50%,1.0,0.0,4.4e-05,0.004468,0.00028,0.000135,0.001287,0.001794,0.002061,0.003203,...,0.019826,0.031309,0.052046,0.067103,-0.007215,-0.018774,-0.006799,-0.008542,0.008792,0.092836
75%,1.0,0.0,0.000844,0.032803,0.004759,0.004996,0.007311,0.009285,0.011016,0.012714,...,0.042892,0.063646,0.083555,0.123741,0.050406,0.083196,0.085774,0.093352,0.013529,0.128475
max,4.0,1.0,0.0045,0.510725,0.037389,0.039034,0.06428,0.064928,0.055991,0.055947,...,0.13073,0.195022,0.15854,0.226706,0.493333,2.129032,1.308129,0.49108,0.050871,0.734623


Unnamed: 0,0
count,2143.0
mean,0.555763
std,0.496997
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Unnamed: 0,0
count,535.0
mean,0.51215
std,0.50032
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Calculate the benchmark values, against which the learners will be assessed. A suitable benchmark for the models would be comparing to the case where one simply predicts all 1s on the test set, i.e., simply assume the market will go up every day, as it historically has on 52-54% of days in the time range under consideration here:

- Benchmark F1 score results from simply predicting all '1' on the test set.

- Benchmark accuracy score results from simply predicting all '1' on the test set.


In [12]:
baseline = [1] * len(y_test)

print ("F1 score against predicting all \"up (1)\" on test set: {:.4f}".format(
    f1_score(y_test, baseline, pos_label=1, average='binary')))

print ("Accuracy score for predicting all \"up (1)\" on test set: {:.4f}".format(
    accuracy_score(y_test, baseline)))

NameError: name 'f1_score' is not defined

# Initialize the four models, with default values

Also, setting 3 different training set sizes.

In [13]:
clf_DT = DecisionTreeClassifier(random_state=4)
clf_GBN = GaussianNB()
clf_SVC = SVC(random_state=2)
clf_RF = RandomForestClassifier(random_state=0)

# Set up the training set sizes: 1/3, 2/3, 3/3
size13 = 800 #int(len(y_train) / 3)
size23 = 1600 #int(2 * len(y_train) / 3)
size = len(y_train) 

# Execute the 'train_predict' function for each classifier and each training set size
# loop models, and then train sizes
for clf in [clf_DT, clf_GBN, clf_SVC, clf_RF]:
    print ("**** {}: \n".format(clf.__class__.__name__))
    
    for n in [size13, size23, size]:
        train_predict(clf, X_train[ : n], y_train[ : n], X_test, y_test)
        print ("\n")

**** DecisionTreeClassifier: 

Training a DecisionTreeClassifier using a training set size of 800. . .
Trained model in 0.0147 seconds
Made predictions in 0.0007 seconds.
F1 score for training set: 1.0000.
Accuracy score for training set: 1.0000.
Made predictions in 0.0004 seconds.
F1 score for test set: 0.5964.
Accuracy score for test set: 0.5850.


Training a DecisionTreeClassifier using a training set size of 1600. . .
Trained model in 0.0317 seconds
Made predictions in 0.0008 seconds.
F1 score for training set: 1.0000.
Accuracy score for training set: 1.0000.
Made predictions in 0.0005 seconds.
F1 score for test set: 0.5863.
Accuracy score for test set: 0.5701.


Training a DecisionTreeClassifier using a training set size of 2143. . .
Trained model in 0.0440 seconds
Made predictions in 0.0011 seconds.
F1 score for training set: 1.0000.
Accuracy score for training set: 1.0000.
Made predictions in 0.0006 seconds.
F1 score for test set: 0.5766.
Accuracy score for test set: 0.5607.


*

# Optimize Decision Trees

Optimize the Decision Tree parameters by using grid search cross validation

In [14]:
# Create the parameters list one wants to tune.
parameters = {'max_depth': [1,2,3,4,5,6,7,8,9],
              'criterion':('gini', 'entropy'),
              'splitter':('best', 'random') }

# Initialize the classifier
clf = DecisionTreeClassifier(random_state=4)

# Perform grid search on the classifier using the default scoring method (accuracy)
grid_obj = GridSearchCV(clf, parameters)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(
    predict_labels(clf, X_train, y_train)))
print ("Tuned model has a training accuracy score of {:.4f}.".format(
    predict_labels_accuracy(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(
    predict_labels(clf, X_test, y_test)))
print ("Tuned model has a testing accuracy score of {:.4f}.".format(
    predict_labels_accuracy(clf, X_test, y_test)))

print("The best parameters are %s" % (grid_obj.best_params_))


importances = clf.feature_importances_
feature_importance = pd.Series(importances,index=['Days_Since_Open','Break_Coming','Overnight_Return',
                                                  'Overnight_VIX','O2O','1d_Ret','2d_Ret',
                                                  '3d_Ret','4d_Ret','5d_Ret','21d_Ret',
                                                  '63d_Ret','126d_Ret', '189d_Ret', '252d_Ret','1d_VIX',
                                                  '5d_VIX','1d_Rel_Vol','5d_Rel_Vol','1d_PtT',
                                                  '1d_VIX_PtT'])

print('\n')
print("Feature Importance:")
print(feature_importance)

Made predictions in 0.0004 seconds.
Tuned model has a training F1 score of 0.7430.
Tuned model has a training accuracy score of 0.6710.
Made predictions in 0.0005 seconds.
Tuned model has a testing F1 score of 0.6994.
Tuned model has a testing accuracy score of 0.6336.
The best parameters are {'criterion': 'entropy', 'max_depth': 7, 'splitter': 'random'}


Feature Importance:
Days_Since_Open     0.046558
Break_Coming        0.017469
Overnight_Return    0.103550
Overnight_VIX       0.548980
O2O                 0.020349
1d_Ret              0.022080
2d_Ret              0.019130
3d_Ret              0.025878
4d_Ret              0.018321
5d_Ret              0.019807
21d_Ret             0.007460
63d_Ret             0.026032
126d_Ret            0.011674
189d_Ret            0.020407
252d_Ret            0.000000
1d_VIX              0.023159
5d_VIX              0.006051
1d_Rel_Vol          0.012907
5d_Rel_Vol          0.031269
1d_PtT              0.009487
1d_VIX_PtT          0.009431
dtype: float

In [15]:
np.savetxt("data/eval-DT.csv", clf.predict(X_test), delimiter=",")

with open('data/DT.model', 'wb') as f:
    pickle.dump(clf, f)

# Optimize SVM

Optimize the SVM parameters by using grid search cross validation

In [16]:
# Create the parameters list you wish to tune
parameters = {'kernel':('linear', 'rbf','sigmoid'), 
              'C':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], 
              'gamma':[.000001,.000005,.00005,.0005,.001,.005,.01,.02,.04,.05,.1,.3,.5]}

# Initialize the classifier
clf = SVC(random_state=2)

# Perform grid search on the classifier using the default scoring method (accuracy)
grid_obj = GridSearchCV(clf, parameters)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a training accuracy score of {:.4f}.".format(predict_labels_accuracy(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))
print ("Tuned model has a testing accuracy score of {:.4f}.".format(predict_labels_accuracy(clf, X_test, y_test)))

print("The best parameters are {}".format(grid_obj.best_params_))

Made predictions in 0.0973 seconds.
Tuned model has a training F1 score of 0.7331.
Tuned model has a training accuracy score of 0.6388.
Made predictions in 0.0274 seconds.
Tuned model has a testing F1 score of 0.7066.
Tuned model has a testing accuracy score of 0.6430.
The best parameters are {'C': 12, 'gamma': 1e-06, 'kernel': 'linear'}


In [None]:
np.savetxt("data/eval-SVM.csv", clf.predict(X_test), delimiter=",")

with open('data/SVM.model', 'wb') as f:
    pickle.dump(clf, f)

# Optimize Random Forest


Optimize the Random Forest parameters by using grid search cross validation

In [None]:
# Create the parameters list you wish to tune
parameters = {'n_estimators': [10, 50, 100, 150, 200],
              'criterion' : ['gini', 'entropy'],
              'oob_score' : [True, False], 
              'max_features': ['auto', 'sqrt', 'log2']}

# Initialize the classifier
clf = RandomForestClassifier(random_state=0)

# Perform grid search on the classifier using the default scoring method (accuracy)
grid_obj = GridSearchCV(clf, parameters)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a training accuracy score of {:.4f}.".format(predict_labels_accuracy(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))
print ("Tuned model has a testing accuracy score of {:.4f}.".format(predict_labels_accuracy(clf, X_test, y_test)))

print("The best parameters are {}".format(grid_obj.best_params_))

importances = clf.feature_importances_
feature_importance = pd.Series(importances,index=['Days_Since_Open','Break_Coming','Overnight_Return',
                                                  'Overnight_VIX','O2O','1d_Ret','2d_Ret',
                                                  '3d_Ret','4d_Ret','5d_Ret','21d_Ret',
                                                  '63d_Ret','126d_Ret', '189d_Ret', '252d_Ret','1d_VIX',
                                                  '5d_VIX','1d_Rel_Vol','5d_Rel_Vol','1d_PtT',
                                                  '1d_VIX_PtT'])

print('\n')
print("Feature Importance:")
print(feature_importance)

In [None]:
np.savetxt("data/eval-RF.csv", clf.predict(X_test), delimiter=",")

with open('data/RF.model', 'wb') as f:
    pickle.dump(clf, f)