In [11]:
import numpy as np
import pandas as pd
import pandas_datareader as web

from collections import Counter
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%run functions.ipynb

In [12]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, scale
from sklearn.pipeline import Pipeline

#Import models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow import keras

In [13]:
tickers = ['SPY','QQQ','AAPL','MSFT']
start = '2008/01/01'
end = '2018/12/31'

In [14]:
main_df = create_df(tickers, start, end, days_to_predict=5)

In [15]:
main_df.head()

Unnamed: 0_level_0,EMA12_pctchg,RSI,Pct_Chg,MACD_sentiment,Upper_BB_pctchg,Lower_BB_pctchg,EMA_sentiment,Volume_sentiment,bull_target,bear_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008-02-07,1.085848,40.966392,0.661398,1,5.707755,3.218491,0,0,1,0
2008-02-08,1.457182,39.314111,-0.642116,1,5.751686,2.536537,0,1,1,0
2008-02-11,0.80673,41.319177,0.511004,1,4.786109,2.998865,0,1,1,0
2008-02-12,0.095374,44.877554,0.927107,1,3.101548,3.528514,0,0,1,0
2008-02-13,0.943752,48.594457,1.02229,0,1.896489,4.487057,0,1,0,1


In [28]:
alternative_data = pd.read_csv('data/cleaned_alternative_data.csv', index_col='index')
alternative_data.drop('Unnamed: 0', axis=1, inplace=True)

In [29]:
alternative_data.shape

(3528, 9)

In [30]:
full_df = alternative_data.join(main_df)

In [37]:
full_df.head(3)

Unnamed: 0,Returns,Unemployment Rate,Inflation Rate,Misery Index,local_price,Bullish,Neutral,Bearish,Bull-Bear Spread,EMA12_pctchg,RSI,Pct_Chg,MACD_sentiment,Upper_BB_pctchg,Lower_BB_pctchg,EMA_sentiment,Volume_sentiment,bull_target,bear_target
2010-09-30,1.596909,0.0,-0.008696,-0.00093,0.042831,-0.054258,-0.127236,0.244488,-0.442003,-0.618518,59.713065,-0.297025,0.0,1.635451,4.419573,1.0,0.0,1.0,0.0
2010-09-30,1.596909,0.0,-0.008696,-0.00093,0.042831,-0.054258,-0.127236,0.244488,-0.442003,-0.091104,67.16869,-0.44634,0.0,2.98815,8.897915,1.0,0.0,1.0,0.0
2010-09-30,1.596909,0.0,-0.008696,-0.00093,0.042831,-0.054258,-0.127236,0.244488,-0.442003,-0.668003,64.989092,-1.259701,0.0,5.56085,13.683717,1.0,0.0,1.0,0.0


In [38]:
full_df.tail(3)

Unnamed: 0,Returns,Unemployment Rate,Inflation Rate,Misery Index,local_price,Bullish,Neutral,Bearish,Bull-Bear Spread,EMA12_pctchg,RSI,Pct_Chg,MACD_sentiment,Upper_BB_pctchg,Lower_BB_pctchg,EMA_sentiment,Volume_sentiment,bull_target,bear_target
2018-12-31,-1.067037,0.054054,-0.123853,-0.011905,0.043561,0.26876,-0.347836,0.063435,-0.164155,-1.484592,43.037151,0.843298,1.0,11.422266,7.232311,0.0,1.0,0.0,0.0
2018-12-31,-1.067037,0.054054,-0.123853,-0.011905,0.043561,0.26876,-0.347836,0.063435,-0.164155,0.109512,37.553938,0.96653,0.0,14.470893,8.385891,0.0,1.0,0.0,0.0
2018-12-31,-1.067037,0.054054,-0.123853,-0.011905,0.043561,0.26876,-0.347836,0.063435,-0.164155,-1.060629,45.277776,1.175416,1.0,11.00887,6.576955,0.0,1.0,0.0,0.0


In [31]:
full_df.sample(3)

Unnamed: 0,Returns,Unemployment Rate,Inflation Rate,Misery Index,local_price,Bullish,Neutral,Bearish,Bull-Bear Spread,EMA12_pctchg,RSI,Pct_Chg,MACD_sentiment,Upper_BB_pctchg,Lower_BB_pctchg,EMA_sentiment,Volume_sentiment,bull_target,bear_target
2019-11-09,-1.536708,0.028571,0.02924,0.028791,0.028674,0.186203,-0.048472,-0.158411,1.954206,,,,,,,,,,
2010-12-14,-1.341927,0.010309,-0.025641,0.00644,0.042831,0.068264,0.010356,-0.13926,0.300213,0.603928,58.65239,-0.429013,0.0,2.144747,6.137115,1.0,1.0,1.0,0.0
2015-10-07,-1.010613,0.0,-1.2,-0.045283,0.0,-0.124969,-0.184011,0.391189,-4.447827,1.594658,53.263009,0.599997,0.0,2.683925,5.556337,0.0,1.0,1.0,0.0


In [33]:
full_df.replace([np.inf, -np.inf], np.nan, inplace=True)
full_df.dropna(inplace=True)

In [34]:
full_df['EMA12_pctchg'] = np.log(full_df['EMA12_pctchg'])
full_df = full_df[(full_df['EMA12_pctchg'] < 3.0) & (full_df['EMA12_pctchg'] > -3.0)]
full_df = full_df[(full_df['Pct_Chg'] < 10.0) & (full_df['Pct_Chg'] > -10.0)]

In [35]:
full_df.columns

Index(['Returns', 'Unemployment Rate', 'Inflation Rate', 'Misery Index',
       'local_price', 'Bullish', 'Neutral', 'Bearish', 'Bull-Bear Spread',
       'EMA12_pctchg', 'RSI', 'Pct_Chg', 'MACD_sentiment', 'Upper_BB_pctchg',
       'Lower_BB_pctchg', 'EMA_sentiment', 'Volume_sentiment', 'bull_target',
       'bear_target'],
      dtype='object')

In [36]:
full_df['bull_target'].value_counts(1)

1.0    0.589106
0.0    0.410894
Name: bull_target, dtype: float64

In [39]:
X = full_df[['Returns','Unemployment Rate',
       'Inflation Rate', 'Misery Index', 'local_price', 'Bullish', 'Neutral',
       'Bearish', 'Bull-Bear Spread', 'EMA12_pctchg', 'RSI', 'Pct_Chg',
       'MACD_sentiment', 'Upper_BB_pctchg', 'Lower_BB_pctchg', 'EMA_sentiment',
       'Volume_sentiment']]

y = full_df['bull_target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.24, random_state=8)

### Support Vector Machine

In [40]:
%%time
#Buy Signal 
steps = [('scaler', StandardScaler()),
         ('SVM', SVC(probability=True, class_weight='balanced', ))]
pipeline = Pipeline(steps)
parameters = {'SVM__C':[1, 10, 15, 20],
              'SVM__gamma':[10.0, 5.0, 3.0, 1.0]}

svm_cv = GridSearchCV(pipeline, param_grid= parameters, cv=3)
svm_cv.fit(X_train, y_train)
y_pred = svm_cv.predict(X_test)

# Compute and print metrics
best_hyperparams = svm_cv.best_params_
print(svm_cv)
print('Best Hyperparameters:', best_hyperparams)
buy_accuracy = round(svm_cv.score(X_test, y_test)*100, 2)
print("\n")
print(Counter(y_pred))
print("Accuracy: {}".format(svm_cv.score(X_test, y_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('SVM',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200,
                                            class_weight='balanced', coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=True, random_state=None,
                                            shrinking=True, tol=0.001,
                                     

In [41]:
pickling_on = open("models/svm_cv_best.pickle","wb")
pickle.dump(svm_cv.best_estimator_, pickling_on)
pickling_on.close()

### Logistic Regression

In [42]:
#Buy signal
steps = [('scaler', StandardScaler()),
        ('logreg', LogisticRegression(class_weight='balanced',))]

pipeline = Pipeline(steps)
parameters = {'logreg__C':[0.1, 0.3,0.5,1,5,10],
              'logreg__solver':['lbfgs', 'liblinear']
              }

logreg_cv = GridSearchCV(pipeline, param_grid= parameters, cv=3)
logreg_cv.fit(X_train, y_train)
y_pred = logreg_cv.predict(X_test)

# Compute and print metrics
best_hyperparams = logreg_cv.best_params_
print(logreg_cv)
print('Best Hyperparameters:', best_hyperparams)
buy_accuracy = round(logreg_cv.score(X_test, y_test)*100, 2)
print("\n")
print(Counter(y_pred))
print("Accuracy: {}".format(logreg_cv.score(X_test, y_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('logreg',
                                        LogisticRegression(C=1.0,
                                                           class_weight='balanced',
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='auto',
    

In [43]:
pickling_on = open("models/logreg_cv_best.pickle","wb")
pickle.dump(logreg_cv.best_estimator_, pickling_on)
pickling_on.close()

### Random Forests

In [44]:
#Buy Signal
steps = [('scaler', StandardScaler()),
        ('rf', RandomForestClassifier(class_weight='balanced'))]

pipeline = Pipeline(steps)
parameters = {'rf__n_estimators':[50,100,150,200],
              'rf__max_depth':[2,3,4,5]
              }

rf_cv = GridSearchCV(pipeline, param_grid= parameters, cv=3)
rf_cv.fit(X_train, y_train)
y_pred = rf_cv.predict(X_test)

# Compute and print metrics
best_hyperparams = rf_cv.best_params_
print(rf_cv)
print('Best Hyperparameters:', best_hyperparams)
buy_accuracy = round(rf_cv.score(X_test, y_test)*100, 2)
print("\n")
print(Counter(y_pred))
print("Accuracy: {}".format(rf_cv.score(X_test, y_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight='balanced',
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                         

In [45]:
pickling_on = open("models/rf_cv_best.pickle","wb")
pickle.dump(rf_cv.best_estimator_, pickling_on)
pickling_on.close()

### XGBoost

In [46]:
#Buy Signal
steps = [('scaler', StandardScaler()),
        ('xgb', XGBClassifier(learning_rate = 0.05, scale_pos_weight=5366/7438))]

pipeline = Pipeline(steps)
parameters = {'xgb__n_estimators':[50,100,150,200],
              'xgb__max_depth':[2,3,4,5]
              }

xgb_cv = GridSearchCV(pipeline, param_grid= parameters, cv=5)
xgb_cv.fit(X_train, y_train)
y_pred = xgb_cv.predict(X_test)

# Compute and print metrics
best_hyperparams = xgb_cv.best_params_
print(xgb_cv)
print('Best Hyperparameters:', best_hyperparams)
buy_accuracy = round(xgb_cv.score(X_test, y_test)*100, 2)
print("\n")
print(Counter(y_pred))
print("Accuracy: {}".format(xgb_cv.score(X_test, y_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
 

In [47]:
pickling_on = open("models/xgb_cv_best.pickle","wb")
pickle.dump(xgb_cv.best_estimator_, pickling_on)
pickling_on.close()

### Artificial Neural Network

In [61]:
tf.keras.backend.set_floatx('float64')

ann = keras.Sequential([
    keras.layers.Dense(units=24, activation='relu', input_shape= (X_test.shape[1],)),
    keras.layers.Dense(units=12, activation='relu'),
#    keras.layers.Dense(units=6, activation='relu'),
    keras.layers.Dense(units=6, activation='relu'),
    keras.layers.Dense(units=2, activation='softmax')
], )

ann.compile(optimizer='rmsprop',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'],
            )

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

history = ann.fit(X_train.values, y_train.values, 
    class_weight='balanced',
    callbacks=[callback],
    epochs=50, 
    validation_split=0.24
)

y_pred = np.argmax(ann.predict(X_test), axis=-1)
print(Counter(y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Train on 4644 samples, validate on 1467 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Counter({1: 1359, 0: 571})
Accuracy: 0.572020725388601
[[247 502]
 [324 857]]
              precision    recall  f1-score   support

         0.0       0.43      0.33      0.37       749
         1.0       0.63      0.73      0.67      1181

    accuracy                           0.57      1930
   macro avg       0.53      0.53      0.52      1930
weighted avg       0.55

In [62]:
ann.save('models/ann.h5')

In [63]:
X_test.columns

Index(['Returns', 'Unemployment Rate', 'Inflation Rate', 'Misery Index',
       'local_price', 'Bullish', 'Neutral', 'Bearish', 'Bull-Bear Spread',
       'EMA12_pctchg', 'RSI', 'Pct_Chg', 'MACD_sentiment', 'Upper_BB_pctchg',
       'Lower_BB_pctchg', 'EMA_sentiment', 'Volume_sentiment'],
      dtype='object')