In [88]:
from datetime import datetime
import pandas as pd
from pandas import Series
import numpy as np
import matplotlib.pyplot as plt
from iexfinance import get_historical_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score, confusion_matrix, r2_score
from sklearn.svm import SVC
import itertools

## Get Data, Clean, Prepare

In [89]:
def get_iex_data(stock_list, start=datetime(2013,9,1), end=datetime(2018,9,26)):
    """Fetch historical data for every ticker and return one DataFrame per ticker.

    Parameters
    ----------
    stock_list : iterable of str
        Ticker symbols passed one at a time to ``get_historical_data``.
    start, end : datetime
        Date range forwarded unchanged to ``get_historical_data``.

    Returns
    -------
    list of pandas.DataFrame
        Frames in the same order as ``stock_list``. Each frame has its gaps
        filled by ``DataFrame.interpolate()`` and carries a ``'ticker'``
        column holding the symbol it was fetched for.
    """
    def _fetch(symbol):
        # Wrap in DataFrame before interpolating, exactly as the raw payload
        # is handled for every ticker.
        history = get_historical_data(symbol, start, end, output_format='pandas')
        frame = pd.DataFrame(history).interpolate()
        frame['ticker'] = symbol
        return frame

    return [_fetch(symbol) for symbol in stock_list]

In [90]:
# The 100 ETF tickers to download (ten per row, original order preserved).
hun_etfs = [
    'SPY', 'IVV', 'VTI', 'VOO', 'QQQ', 'VEA', 'EFA', 'IEFA', 'VWO', 'AGG',
    'IJH', 'IEMG', 'IWM', 'IJR', 'VTV', 'IWF', 'IWD', 'VUG', 'BND', 'LQD',
    'XLF', 'VNQ', 'VIG', 'EEM', 'GLD', 'VB', 'BSV', 'VO', 'TIP', 'VEU',
    'IVW', 'DIA', 'XLK', 'VYM', 'VGT', 'VCSH', 'MDY', 'IWB', 'VCIT', 'XLV',
    'IWR', 'XLE', 'DVY', 'USMV', 'EWJ', 'VGK', 'PFF', 'SCHF', 'SDY', 'RSP',
    'XLY', 'ITOT', 'IVE', 'SCHX', 'HYG', 'SHV', 'VBR', 'EMB', 'SHY', 'VV',
    'SCHB', 'XLI', 'BIV', 'VT', 'MBB', 'BNDX', 'IWS', 'VXUS', 'FLOT', 'IWO',
    'IXUS', 'MINT', 'SCZ', 'MTUM', 'IWN', 'IAU', 'IGSB', 'TLT', 'JNK', 'AMLP',
    'VOE', 'XLP', 'MUB', 'IWP', 'VBK', 'FDN', 'EZU', 'IBB', 'IEF', 'IWV',
    'ACWI', 'EFAV', 'VHT', 'SCHA', 'VFH', 'SCHD', 'GDX', 'IJK', 'SPLV', 'BKLN',
]
# One DataFrame per ticker, fetched with get_iex_data's default date range.
preprocessed_data = get_iex_data(hun_etfs)

In [91]:
def clean_data(data):
    """Return date-indexed copies of the frames with a next-day direction label.

    The input list and its DataFrames are left untouched; a new list of new
    frames is returned. (Previously the caller's list elements were
    overwritten in place, so the raw download was replaced by whatever the
    downstream feature steps added to it.)

    Parameters
    ----------
    data : list of pandas.DataFrame
        Each frame must have a ``'close'`` column and an index that
        ``reset_index()`` turns into a ``'date'`` column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input, indexed by a datetime ``'date'`` index and with
        an added ``'Clf_Target'`` column: 1.0 when the next row's close is
        higher, 0.0 when it is lower, 0.5 when it is unchanged, and NaN on
        the last row (there is no next close to compare with).
    """
    cleaned = []
    for raw in data:
        # reset_index() already yields a fresh frame, so `raw` is never mutated.
        frame = raw.reset_index()
        frame['date'] = pd.to_datetime(frame['date'])
        frame = frame.set_index('date')
        # diff(periods=-1) is close[t] - close[t+1]; negate to get the move
        # from today to the next row.
        next_move = -frame['close'].diff(periods=-1)
        frame['Clf_Target'] = (np.sign(next_move) + 1) / 2
        cleaned.append(frame)
    return cleaned

def add_past(etf_list, window):
    """Add lagged close-price differences to every frame, in place.

    For each lag ``n`` from 1 to ``window`` a column ``'<n>day_change'`` is
    added holding the close ``n`` rows earlier minus the current close (the
    negated ``diff``). The first ``n`` rows of that column are NaN.

    Parameters
    ----------
    etf_list : list of pandas.DataFrame
        Frames with a ``'close'`` column; they are modified in place.
    window : int
        Largest lag to generate.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in.
    """
    lags = range(1, window + 1)
    for frame in etf_list:
        close = frame['close']
        for lag in lags:
            column = '{}day_change'.format(lag)
            frame[column] = -close.diff(periods=lag)
    return etf_list

def RSI(series, period):
    """Compute a relative-strength index from a price series.

    Gains and losses are taken from the first differences of ``series``. The
    value at position ``period - 1`` of each is replaced by the mean of the
    first ``period`` values, the earlier positions are dropped, and both are
    smoothed with an exponentially weighted mean (``com=period-1``,
    ``adjust=False``).

    Parameters
    ----------
    series : pandas.Series
        Price series.
    period : int
        Look-back length used for the seed average and the smoothing.

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + avg_gain / avg_loss)``, indexed by the labels of
        ``series`` that remain after the first ``period`` observations are
        consumed by the diff and the warm-up.
    """
    delta = series.diff().dropna()

    # Start from zero-filled series sharing delta's index, then copy in the
    # positive moves (gains) and the magnitudes of negative moves (losses).
    gains = delta * 0
    losses = gains.copy()
    rising = delta > 0
    falling = delta < 0
    gains[rising] = delta[rising]
    losses[falling] = -delta[falling]

    # Seed the smoothing with a simple average over the first `period` moves,
    # then discard the rows that precede the seed.
    seed_label = delta.index[period-1]
    warmup_labels = delta.index[:(period-1)]

    gains[seed_label] = np.mean( gains[:period] )
    gains = gains.drop(warmup_labels)
    losses[seed_label] = np.mean( losses[:period] )
    losses = losses.drop(warmup_labels)

    avg_gain = gains.ewm(com=period-1, adjust=False).mean()
    avg_loss = losses.ewm(com=period-1, adjust=False).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - 100 / (1 + relative_strength)

def add_indicators(etf_list):
    """Add moving-average, MACD and RSI columns to every frame, in place.

    Columns added, all derived from ``'close'``:

    * ``'ewma7'``, ``'ewma50'``, ``'ewma200'`` - exponentially weighted means
      with spans 7, 50 and 200.
    * ``'MACD'`` - span-12 exponentially weighted mean minus the span-26 one.
    * ``'RSI'`` - ``RSI(close, 14)``.

    Parameters
    ----------
    etf_list : list of pandas.DataFrame
        Frames with a ``'close'`` column; they are modified in place.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in.
    """
    def _ewma(prices, span):
        # Single place for the shared ewm settings used by every indicator.
        return prices.ewm(span=span, min_periods=0, adjust=True, ignore_na=False).mean()

    moving_averages = (('ewma7', 7), ('ewma50', 50), ('ewma200', 200))
    for frame in etf_list:
        close = frame['close']
        for column, span in moving_averages:
            frame[column] = _ewma(close, span)
        frame['MACD'] = _ewma(close, 12) - _ewma(close, 26)
        frame['RSI'] = RSI(close, 14)
    return etf_list

In [92]:
def save_metrics(y_true, y_pred, ticker):
    """Score one ticker's predictions and bundle the metrics in a dict.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed unchanged to each scikit-learn
        metric function.
    ticker : str
        Identifier stored under the ``'Ticker'`` key.

    Returns
    -------
    dict
        Keys ``'Ticker'``, ``'Accuracy'``, ``'Precision'``, ``'F1'``,
        ``'ROC AUC'``, ``'R2'`` and ``'conf'`` (the confusion matrix).
    """
    # Dict values are evaluated top to bottom, matching the metric order.
    return {
        'Ticker': ticker,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
        'conf': confusion_matrix(y_true, y_pred),
    }

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Two-dimensional confusion matrix (rows are shown as the true label,
        columns as the predicted label).
    classes : sequence
        Tick labels used for both axes.
    normalize : bool
        When true, each row is divided by its row sum and cells are printed
        with two decimals; otherwise cells are printed as integers.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Cells darker than the midpoint get white text so the number stays legible.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

def produce_confs(results):
    """Show one confusion-matrix figure per entry of ``results``.

    Parameters
    ----------
    results : iterable of dict
        Each entry must provide ``'conf'`` (the confusion matrix) and
        ``'Ticker'`` (used as the figure title), as produced by
        ``save_metrics``.

    Notes
    -----
    The loop iterates over the ``results`` argument. It previously read the
    notebook-level variable ``rf`` instead, so the argument was ignored.
    """
    for result in results:
        # A fresh figure per ticker keeps each matrix on its own canvas.
        plt.figure()
        plot_confusion_matrix(result['conf'], classes=['sell', 'buy'], title=result['Ticker'])
        plt.show()

In [93]:
def time_test_split(X, y, date):
    """Split features and labels chronologically at ``date``.

    Rows whose ``'date'`` is strictly before ``date`` form the training set;
    rows on or after it form the test set. The ``'date'`` column is dropped
    from every returned piece.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a ``'date'`` column.
    y : pandas.DataFrame
        Label frame containing a ``'date'`` column plus the label column(s).
    date : value comparable with the ``'date'`` columns
        Cut-off separating train from test.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X parts are
        DataFrames and the y parts are flat integer NumPy arrays (values are
        truncated by ``astype('int')``).
    """
    def _features(frame, keep):
        return frame[keep].drop(columns='date')

    def _labels(frame, keep):
        remaining = frame[keep].drop(columns='date')
        return np.array(remaining).ravel().astype('int')

    X_before, X_from = X['date'] < date, X['date'] >= date
    y_before, y_from = y['date'] < date, y['date'] >= date

    return (
        _features(X, X_before),
        _features(X, X_from),
        _labels(y, y_before),
        _labels(y, y_from),
    )

# Leading rows skipped before modelling. add_indicators computes RSI with a
# period of 14, which leaves the first rows of each frame without an RSI value.
_WARMUP_ROWS = 15


def _fit_and_score_each(df_list, split_date, make_classifier):
    """Fit a fresh classifier on every frame and collect its test-set metrics.

    ``make_classifier`` is a zero-argument callable returning an unfitted
    estimator with ``fit`` and ``predict``. Prints the mean accuracy,
    precision and R2 over all frames (skipped when there are no frames) and
    returns the list of ``save_metrics`` dicts.
    """
    results = []
    for frame in df_list:
        ticker = frame['ticker'].iloc[0]
        # Drop the warm-up rows and the last row, whose Clf_Target is NaN
        # because there is no following close to compare against.
        usable = frame.iloc[_WARMUP_ROWS:-1]
        X = usable.reset_index().drop(columns=['ticker', 'open', 'Clf_Target'])
        y = usable['Clf_Target'].reset_index()
        X_train, X_test, y_train, y_test = time_test_split(X, y, split_date)
        clf = make_classifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append(save_metrics(y_test, y_pred, ticker))

    if results:
        summary = (('Mean Accuracy: {}', 'Accuracy'),
                   ('Mean Precision: {}', 'Precision'),
                   ('Mean R2: {}', 'R2'))
        for template, key in summary:
            print (template.format(np.mean([entry[key] for entry in results])))
    return results


def plant_forests(df_list, split_date, random_state=None):
    """Train and evaluate a 100-tree random forest per frame.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Feature frames containing at least ``'ticker'``, ``'open'`` and
        ``'Clf_Target'`` columns, with an index that ``reset_index()`` turns
        into the ``'date'`` column expected by ``time_test_split``.
    split_date : value comparable with the dates
        Rows before it train the model; rows on or after it test it.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``. Pass an integer for
        reproducible results; the default ``None`` leaves the forest unseeded.

    Returns
    -------
    list of dict
        One ``save_metrics`` result per frame, in input order.
    """
    return _fit_and_score_each(
        df_list, split_date,
        lambda: RandomForestClassifier(n_estimators=100, random_state=random_state))


def construct_SVMS(df_list, split_date):
    """Train and evaluate a default ``SVC`` per frame.

    Takes the same ``df_list`` and ``split_date`` as ``plant_forests`` and
    returns one ``save_metrics`` result per frame, in input order.
    """
    return _fit_and_score_each(df_list, split_date, lambda: SVC())

In [94]:
def run_models(window, split_date='2018-02'):
    """Build features from the downloaded data and evaluate both model types.

    The pipeline runs on copies of ``preprocessed_data``. The cleaning and
    feature steps modify what they are given, so working on the global list
    directly would leave lagged columns from an earlier call (with a larger
    ``window``) in the data used by later calls.

    Parameters
    ----------
    window : int
        Number of lagged ``'<n>day_change'`` features to create.
    split_date : value comparable with the dates, optional
        Train/test cut-off forwarded to both model runners.

    Returns
    -------
    tuple
        ``(rf_results, svm_results)`` as returned by ``plant_forests`` and
        ``construct_SVMS``.
    """
    # New list AND new frames: nothing downstream can reach the raw download.
    working_frames = [frame.copy() for frame in preprocessed_data]
    data = clean_data(working_frames)
    data = add_past(data, window)
    data = add_indicators(data)
    rf_results = plant_forests(data, split_date)
    svm_results = construct_SVMS(data, split_date)
    return rf_results, svm_results

In [95]:
# Build features with a 1-day look-back window and evaluate both model types;
# the summaries printed below come from plant_forests (first) and
# construct_SVMS (second).
rf, svm = run_models(1)

Mean Accuracy: 0.5215243902439025
Mean Precision: 0.5513413821929483
Mean R2: -0.9372791699322457


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Mean Accuracy: 0.54
Mean Precision: 0.5035365853658537
Mean R2: -0.8616067382541058


In [None]:
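# TODO: finish the SVC grid search. The draft below is disabled and untested;
# note that it reads `data`, which is only a local variable inside run_models().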
# X = data[10][15:-1].reset_index().drop(columns=['ticker','open', 'Clf_Target'])
# y = data[10][15:-1]['Clf_Target'].reset_index()
# X_train, X_test, y_train, y_test = time_test_split(X, y, '2018-02')
# parameters = {'kernel': ['linear', 'rbf'], 'C':[1, 35, 55, 78, 100]}
# svc = SVC()
# clf = GridSearchCV(svc, parameters)
# clf.fit(X_train, y_train)
# clf.best_params_