In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams['figure.figsize'] = 10, 8

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib

In [2]:
def binary_class(row):
    """Map a return value to a binary label.

    Returns 0 when ``row`` is strictly negative and 1 otherwise. Zero counts
    as 1, and so does any value for which ``row < 0`` is false (e.g. NaN).
    """
    return 0 if row < 0.00 else 1

In [3]:
def update_return_class(row, neg_cutoff, pos_cutoff):
    """Bucket a return value into one of three classes.

    Returns:
        0 if ``row <= neg_cutoff``,
        2 if ``row >= pos_cutoff``,
        1 if ``row`` lies strictly between the two cutoffs,
        None if none of the comparisons hold (e.g. ``row`` is NaN).
    """
    if row <= neg_cutoff:
        return 0
    if row >= pos_cutoff:
        return 2
    if neg_cutoff < row < pos_cutoff:
        return 1
    # No comparison was true (NaN input): no class is assigned.
    return None

In [4]:
# Load the main price/feature table and discard its 'Date' column.
data = pd.read_csv('data/price_data_68.csv').drop('Date', axis=1)

In [5]:
# Process only the main training data
X = data.loc[:, 'F1':'F68']

# One return Series per horizon column (RET5 ... RET30).
# The Series are selected by column name and used as-is: a Series has no
# `columns` axis, so assigning to `.columns` would only attach a stray
# attribute and would not rename anything.
y5 = data.loc[:, 'RET5']
y10 = data.loc[:, 'RET10']
y15 = data.loc[:, 'RET15']
y20 = data.loc[:, 'RET20']
y25 = data.loc[:, 'RET25']
y30 = data.loc[:, 'RET30']

In [6]:
# Binary labels per horizon: 0 for a negative return, 1 otherwise
# (see binary_class). `.apply` already returns a new Series aligned to the
# source index, so no placeholder Series needs to be created first.
y5_b = y5.apply(binary_class)
y10_b = y10.apply(binary_class)
y15_b = y15.apply(binary_class)
y20_b = y20.apply(binary_class)
y25_b = y25.apply(binary_class)
y30_b = y30.apply(binary_class)

# Three-class labels per horizon (see update_return_class). `args` carries
# the (neg_cutoff, pos_cutoff) pair, which differs by horizon.
y5_t = y5.apply(update_return_class, args=(-0.02, 0.02))
y10_t = y10.apply(update_return_class, args=(-0.03, 0.03))
y15_t = y15.apply(update_return_class, args=(-0.05, 0.05))
y20_t = y20.apply(update_return_class, args=(-0.05, 0.05))
y25_t = y25.apply(update_return_class, args=(-0.05, 0.07))
y30_t = y30.apply(update_return_class, args=(-0.05, 0.08))

In [7]:
# Rebind X to a 20-column subset of the feature columns; when the notebook is
# run top to bottom, the train/test splits that follow are taken from this X.
X = data.loc[:,['F19','F35','F36','F37','F38','F39','F40','F41','F46','F47','F52','F54','F55','F56','F57','F58','F59','F64','F66','F67']]

In [8]:
# Hold-out fraction and the single seed used for every stochastic step.
validation_size = 0.20
seed = 7

# Seed the forest as well as the splits; without random_state the trees are
# built differently on every run and the reported metrics are not repeatable.
et = ExtraTreesClassifier(n_estimators=250, max_depth=25, random_state=seed)
stdsc = StandardScaler()
scale = MinMaxScaler(feature_range=(0,1))

In [9]:
def _holdout_split(y):
    """Split X and the labels ``y`` using the notebook-wide test size and seed.

    Returns the 4-tuple produced by train_test_split:
    (X_train, X_test, y_train, y_test).
    """
    return cross_validation.train_test_split(X, y, test_size=validation_size,
                                             random_state=seed)


# Binary-label splits, one per horizon.
X5b_train, X5b_test, y5b_train, y5b_test = _holdout_split(y5_b)
X10b_train, X10b_test, y10b_train, y10b_test = _holdout_split(y10_b)
X15b_train, X15b_test, y15b_train, y15b_test = _holdout_split(y15_b)
X20b_train, X20b_test, y20b_train, y20b_test = _holdout_split(y20_b)
X25b_train, X25b_test, y25b_train, y25b_test = _holdout_split(y25_b)
X30b_train, X30b_test, y30b_train, y30b_test = _holdout_split(y30_b)

# Three-class-label splits, one per horizon.
X5t_train, X5t_test, y5t_train, y5t_test = _holdout_split(y5_t)
X10t_train, X10t_test, y10t_train, y10t_test = _holdout_split(y10_t)
X15t_train, X15t_test, y15t_train, y15t_test = _holdout_split(y15_t)
X20t_train, X20t_test, y20t_train, y20t_test = _holdout_split(y20_t)
X25t_train, X25t_test, y25t_train, y25t_test = _holdout_split(y25_t)
X30t_train, X30t_test, y30t_train, y30t_test = _holdout_split(y30_t)

In [10]:
def prediction_model(X_train, X_test, y_train, y_test, scale_model, predict_model):
    """Fit a fresh copy of ``predict_model`` and print its hold-out metrics.

    Parameters:
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: labels matching ``X_train`` / ``X_test``.
        scale_model: an already-fitted transformer; only ``transform`` is
            called on it, it is never refitted here.
        predict_model: estimator used as a template. It is cloned, so the
            object passed in is left unfitted/unchanged.

    Returns:
        The newly fitted estimator. Each call returns a distinct object, so
        callers can keep one model per call (e.g. in a dict) without later
        fits overwriting earlier ones.

    Prints accuracy, the confusion matrix and the classification report for
    the predictions on ``X_test``.
    """
    # Local import: clone is part of scikit-learn, which this notebook
    # already depends on.
    from sklearn.base import clone

    X_train_scaled = scale_model.transform(X_train)
    X_test_scaled = scale_model.transform(X_test)

    # Fit a copy rather than the shared estimator: fitting the same object
    # repeatedly would make every previously returned "model" point at the
    # most recent fit.
    fitted_model = clone(predict_model)
    fitted_model.fit(X_train_scaled, y_train)
    y_pred = fitted_model.predict(X_test_scaled)

    print(accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    return fitted_model

In [17]:
def prediction_saved_model(X, y, scale_model, predict_model):
    """Score an already-fitted model on ``X`` / ``y`` and print the metrics.

    ``scale_model`` is accepted but not used: the features are rescaled with
    a new MinMaxScaler fitted on ``X`` itself.
    NOTE(review): fitting the scaler on the evaluation data means the model
    sees differently scaled inputs than during training - confirm whether the
    saved training scaler should be applied instead.

    Prints accuracy, the confusion matrix and the classification report.
    Returns None.
    """
    X_std = MinMaxScaler().fit_transform(X)
    y_pred = predict_model.predict(X_std)

    for metric in (accuracy_score, confusion_matrix, classification_report):
        print(metric(y, y_pred))

In [None]:
# Scale data only once and re-use the scaling transformer for other models.
# All splits were produced from the same X with the same test size and seed,
# so fitting on the 5-day binary training features is expected to cover the
# training rows of every other split as well.
stdsc.fit(X5b_train)
scale.fit(X5b_train)


def _split_entry(descr, X_train, X_test, y_train, y_test):
    """Bundle one horizon's description and its train/test split into a dict."""
    return {
        'descr': descr,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }


# Registry of every (horizon, label type) split, keyed by the short model name
# that is also used for the saved model file.
data_split = {
    '5b': _split_entry('5-day Binary', X5b_train, X5b_test, y5b_train, y5b_test),
    '10b': _split_entry('10-day Binary', X10b_train, X10b_test, y10b_train, y10b_test),
    '15b': _split_entry('15-day Binary', X15b_train, X15b_test, y15b_train, y15b_test),
    '20b': _split_entry('20-day Binary', X20b_train, X20b_test, y20b_train, y20b_test),
    '25b': _split_entry('25-day Binary', X25b_train, X25b_test, y25b_train, y25b_test),
    '30b': _split_entry('30-day Binary', X30b_train, X30b_test, y30b_train, y30b_test),
    # The 5-day trinary entry must use the trinary split (X5t_*/y5t_*), not
    # the binary one, otherwise the "5t" model is trained on binary labels.
    '5t': _split_entry('5-day Trinary', X5t_train, X5t_test, y5t_train, y5t_test),
    '10t': _split_entry('10-day Trinary', X10t_train, X10t_test, y10t_train, y10t_test),
    '15t': _split_entry('15-day Trinary', X15t_train, X15t_test, y15t_train, y15t_test),
    '20t': _split_entry('20-day Trinary', X20t_train, X20t_test, y20t_train, y20t_test),
    '25t': _split_entry('25-day Trinary', X25t_train, X25t_test, y25t_train, y25t_test),
    '30t': _split_entry('30-day Trinary', X30t_train, X30t_test, y30t_train, y30t_test),
}

models = {}

# Train one model per registry entry, save it to its own file and keep it in
# `models` under the same key.
for key, split in data_split.items():
    print('-------------------------------------------------------------')
    print('Prediction for: %s ' % split['descr'])
    predict = prediction_model(split['X_train'], split['X_test'],
                               split['y_train'], split['y_test'],
                               scale, et)
    predict_model_file = 'models/'+key+'.mod'
    joblib.dump(predict,predict_model_file, compress=1)
    models[key] = predict

In [50]:
# Persist the scaler that the training loop actually passed to
# prediction_model (`scale`, the MinMaxScaler) so the saved scaler matches the
# feature scaling the saved models were fitted with.
scaling_model_file = 'models/scale.mod'
joblib.dump(scale,scaling_model_file, compress=1)

# Persist the dictionary of fitted models in a single file.
predict_model_file = 'models/predict.mod'
joblib.dump(models,predict_model_file, compress=1)

['models/predict.mod']

In [None]:
# Reload the object stored at predict_model_file and show what is kept under
# each key.
prediction_models = joblib.load(predict_model_file)

for key, stored_model in prediction_models.items():
    print('Model for: %s' % key)
    print(stored_model)

In [76]:
# Row count of the feature matrix currently bound to X.
len(X)

3713

In [25]:
# Short key -> CSV path for every validation price file.
test_files = dict(
    file1='data/price_data_1.csv',
    file2='data/price_data_2.csv',
    file3='data/price_data_3.csv',
    file4='data/price_data_4.csv',
    file5='data/price_data_5.csv',
    file6='data/price_data_6.csv',
    file7='data/price_data_7.csv',
    file8='data/price_data_8.csv',
)

# Single-file variant with the same key -> path layout.
test_file = dict(file1='data/price_data_1.csv')

In [21]:
# Locations of the persisted scaler and of the combined model dictionary.
scaling_model_file = 'models/scale.mod'
predict_model_file = 'models/predict.mod'

# Model key -> path of that model's own file ('models/<key>.mod').
predict_model_files = {
    model_key: 'models/' + model_key + '.mod'
    for model_key in ('5b', '10b', '15b', '20b', '25b', '30b',
                      '5t', '10t', '15t', '20t', '25t', '30t')
}

In [22]:
def get_features_labels(file_name):
    """Read a price CSV and build the feature matrix plus all label sets.

    Parameters:
        file_name: path of a CSV containing a 'Date' column, the selected
            feature columns and the return columns RET5 ... RET30.

    Returns:
        (X, labels) where ``X`` is the DataFrame of the 20 selected feature
        columns and ``labels`` is::

            {'binary':  {'5b': Series, '10b': ..., ..., '30b': ...},
             'trinary': {'5t': Series, '10t': ..., ..., '30t': ...}}

        Binary labels come from ``binary_class``; three-class labels come
        from ``update_return_class`` with the per-horizon cutoffs below.

    Raises:
        KeyError: if 'Date' or any required column is missing from the file.
    """
    # (horizon in days, negative cutoff, positive cutoff) for the
    # three-class labels; the horizon also selects the RET<days> column.
    horizon_cutoffs = [
        (5, -0.02, 0.02),
        (10, -0.03, 0.03),
        (15, -0.05, 0.05),
        (20, -0.05, 0.05),
        (25, -0.05, 0.07),
        (30, -0.05, 0.08),
    ]

    data = pd.read_csv(file_name)
    del data['Date']

    X = data.loc[:,['F19','F35','F36','F37','F38','F39','F40','F41','F46','F47','F52','F54','F55','F56','F57',
                    'F58','F59','F64','F66','F67']]

    binary_labels = {}
    trinary_labels = {}
    for days, neg_cutoff, pos_cutoff in horizon_cutoffs:
        returns = data.loc[:, 'RET%d' % days]
        # .apply returns a new Series on the same index, so no placeholder
        # Series has to be allocated beforehand.
        binary_labels['%db' % days] = returns.apply(binary_class)
        trinary_labels['%dt' % days] = returns.apply(update_return_class,
                                                     args=(neg_cutoff, pos_cutoff))

    labels = {'binary': binary_labels, 'trinary': trinary_labels}
    return X, labels

In [23]:
def model_validation(file_names, scale_model_file, predict_model_file):
    """Evaluate every saved per-horizon model on each file in ``file_names``.

    Parameters:
        file_names: mapping of a short key to a CSV path.
        scale_model_file: path of the saved scaler; it is loaded once and
            handed to ``prediction_saved_model``.
        predict_model_file: not read by this function. Each model is loaded
            from its own file, looked up in the module-level
            ``predict_model_files`` mapping.

    For every (file, label type, model key) combination a header is printed,
    the model is loaded, and ``prediction_saved_model`` prints its metrics.
    Returns None.
    """
    scaling_model = joblib.load(scale_model_file)

    for file_key, file_path in file_names.items():
        X, labels = get_features_labels(file_path)
        for label_type, labels_by_model in labels.items():
            for model_key, label_data in labels_by_model.items():
                print('-------------------------------------------')
                print('File:FileNames - %s:%s' %(file_key, file_path))
                print('Label Type: %s' %label_type)
                print('Model: %s' %model_key)
                # Resolve and load the model only after the header is out,
                # so a failing lookup/load is preceded by its context.
                model_path = predict_model_files[model_key]
                prediction_models = joblib.load(model_path)
                print('Prediction Model File: %s' %model_path)
                prediction_saved_model(X, label_data, scaling_model, prediction_models)

In [None]:
# The third argument is a placeholder: model_validation never reads its
# predict_model_file parameter and instead loads each model from the
# module-level predict_model_files mapping.
model_validation(test_file, scaling_model_file, '')

In [35]:
# Refit the scaler and the classifier on the whole of one file using the
# 20-day binary labels, then save that model separately.
# Note: this rebinds test_file, X and labels and refits the shared `scale`
# and `et` objects.
train_path = 'data/price_data_68.csv'
test_file = {'file1': train_path}

X, labels = get_features_labels(train_path)
y_20b = labels['binary']['20b']

X_scaled = scale.fit_transform(X)
et.fit(X_scaled, y_20b)
joblib.dump(et, 'models/test.mod', compress=1)

['models/test.mod']

In [44]:
# Load the model saved as models/test.mod and score it against the 20-day
# binary labels of another price file.
model = joblib.load('models/test.mod')

file = 'data/price_data_8.csv'
X, labels = get_features_labels(file)
y_true = labels['binary']['20b']

# NOTE(review): the scaler is fitted on the evaluation file itself rather
# than re-using the scaler fitted at training time - confirm this is intended.
scale1 = MinMaxScaler()
X_scaled = scale1.fit_transform(X)

predict = model.predict(X_scaled)

for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_true, predict))

0.496906107076
[[1060  618]
 [1252  787]]
             precision    recall  f1-score   support

          0       0.46      0.63      0.53      1678
          1       0.56      0.39      0.46      2039

avg / total       0.51      0.50      0.49      3717

