In [2]:
#import libraries
import numpy as np
import pandas as pd
import pandas_datareader as pdr
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow.compat.v1 as tf
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
#write function for getting data from yahoo finance given a ticker, start date, and end date
def get_ticker_returns_df(ticker, start_date, end_date, lookback_days = 31):
    """Download adjusted closes for one ticker and derive daily return features.

    Parameters
    ----------
    ticker : str
        Symbol passed to the Yahoo reader; it is also embedded in the names of
        the generated columns.
    start_date, end_date : str or datetime-like
        Inclusive bounds of the rows that are returned.
    lookback_days : int, default 31
        Calendar days of extra history requested before ``start_date``. The
        lagged and rolling features need earlier prices to be defined on the
        first returned rows. The default reproduces the window that used to be
        hard-coded (2017-12-01 for a 2018-01-01 start).

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``CurrDay_<ticker>_Return``, ``PrevDay_<ticker>_Return``,
        ``5PrevDays_<ticker>_Return`` and ``10PrevDays_<ticker>_Return``, on a
        fresh 0..n-1 index, with every row containing a NaN removed.
    """
    # The download window used to be fixed to 2017-12-01 .. 2020-01-31 no matter
    # which dates the caller asked for; derive it from the arguments instead.
    # Every feature looks backwards only, so nothing after end_date is needed.
    fetch_start = pd.to_datetime(start_date) - pd.Timedelta(days = lookback_days)

    ticker_data = pdr.DataReader(ticker, start = fetch_start, end = end_date, data_source = 'yahoo')

    ticker_close_df = ticker_data['Adj Close'].to_frame(name = 'Adj_Close')

    current_return_column = 'CurrDay_' + ticker + '_Return'
    previous_return_column = 'PrevDay_' + ticker + '_Return'

    ticker_close_df[current_return_column] = ticker_close_df['Adj_Close'].pct_change()
    ticker_close_df['PrevDay_Adj_Close'] = ticker_close_df['Adj_Close'].shift(1)
    ticker_close_df[previous_return_column] = ticker_close_df['PrevDay_Adj_Close'].pct_change()

    # Moving averages are taken over the previous-day return, so they never
    # include the current day's return.
    for window in (5, 10):
        ticker_close_df[str(window) + 'PrevDays_' + ticker + '_Return'] = (
            ticker_close_df[previous_return_column].rolling(window = window, min_periods = window).mean()
        )

    # Move Date out of the index, then keep only the requested, fully populated rows.
    ticker_close_df = ticker_close_df.reset_index()
    in_requested_range = (ticker_close_df.Date >= start_date) & (ticker_close_df.Date <= end_date)
    ticker_close_df = ticker_close_df[in_requested_range].dropna()

    ticker_returns_df = (
        ticker_close_df
        .reset_index(drop = True)
        .drop(columns = ['Adj_Close', 'PrevDay_Adj_Close'])
    )

    return ticker_returns_df

In [None]:
#scrape table data off of wikipedia to get sector classifications of S&P500 stocks
sp500_qualitative_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

# Select the two columns the rest of the notebook reads (Symbol and the GICS
# sector) from the first table on the page. Dropping a fixed list of the other
# headings raised KeyError as soon as any one of them was missing or renamed.
sp500_qualitative_df = (
    sp500_qualitative_data[0][['Symbol', 'GICS Sector']]
    .rename(columns = {'GICS Sector': 'Sector'})
)
sp500_qualitative_df.head()

In [None]:
#collect previous day returns and 5/10 day moving average previous day returns for each symbol for the time period
# Frames are gathered in a list and concatenated once: DataFrame.append copied
# the whole accumulated frame on every iteration and no longer exists in pandas 2.x.
ticker_return_frames = []

for ticker in sp500_qualitative_df.Symbol:

    try:

        ticker_returns_df = get_ticker_returns_df(ticker,'2018-01-01','2019-12-31')
        # Strip the ticker from the column names so every symbol shares one schema.
        ticker_returns_df.columns = [column.replace('_'+ticker+'_', '_') for column in ticker_returns_df.columns]
        ticker_returns_df['Symbol'] = ticker

        # Attach the Sector of this symbol; the left frame's columns (Symbol, Sector) come first.
        new_df = pd.merge(sp500_qualitative_df, ticker_returns_df, how='inner', on=['Symbol'])

        ticker_return_frames.append(new_df)

        print('Got returns for: ' + ticker)

    except Exception as error:

        # Best effort: a symbol that cannot be processed is reported and skipped.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print('Cannot get returns for: ' + ticker + ' (' + repr(error) + ')')

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was collected.
if ticker_return_frames:
    all_ticker_returns_df = pd.concat(ticker_return_frames, ignore_index = True)
else:
    all_ticker_returns_df = pd.DataFrame()

all_ticker_returns_df.head()

In [None]:
#aggregate data on sector and date to get sector previous day return and 5/10 day moving average previous day returns
# Per-symbol column -> name of its sector-level average, in the order the new columns are appended.
sector_return_names = {
    'PrevDay_Return': 'PrevDay_Sector_Return',
    '5PrevDays_Return': '5PrevDays_Sector_Return',
    '10PrevDays_Return': '10PrevDays_Sector_Return',
}

# One mean per (Sector, Date) pair for each of the three return columns.
sector_mean_returns_df = (
    all_ticker_returns_df
    .groupby(['Sector', 'Date'], as_index = False)[list(sector_return_names)]
    .mean()
    .rename(columns = sector_return_names)
)

# Broadcast the sector averages back onto every symbol row of that sector and date.
all_ticker_sector_returns_df = pd.merge(all_ticker_returns_df, sector_mean_returns_df, how='inner', on=['Sector','Date'])
all_ticker_sector_returns_df.head()

In [None]:
#get S&P500 previous day return and 5/10 day moving average previous day returns for the time frame
# The index's same-day return is dropped so that only lagged values remain.
SP500_returns_df = (
    get_ticker_returns_df('^GSPC','2018-01-01','2019-12-31')
    .drop(columns = ['CurrDay_^GSPC_Return'])
)
SP500_returns_df.head()

In [None]:
#get VIX previous day return and 5/10 day moving average previous day returns for the time frame
# The VIX same-day return is dropped so that only lagged values remain.
VIX_returns_df = (
    get_ticker_returns_df('^VIX','2018-01-01','2019-12-31')
    .drop(columns = ['CurrDay_^VIX_Return'])
)
VIX_returns_df.head()

In [None]:
#merge all data into one dataframe to prepare for dimensionality reduction
# Both joins are inner joins on Date: symbol/sector rows first, then the
# S&P 500 columns, then the VIX columns.
all_data_df = (
    all_ticker_sector_returns_df
    .merge(SP500_returns_df, how='inner', on=['Date'])
    .merge(VIX_returns_df, how='inner', on=['Date'])
)
all_data_df.head()

In [None]:
#scale all data and apply pca for four principal components for each symbol and store results in dictionaries
# Rows dated before 2019-10-01 are used for fitting; rows from that date on are held out.
train_df = all_data_df[all_data_df.Date < '2019-10-01']
test_df = all_data_df[all_data_df.Date >= '2019-10-01']

train_dict, test_dict, explained_variance_ratio_dict = {}, {}, {}

pc_columns = ['PC1', 'PC2', 'PC3', 'PC4']

for symbol in test_df.Symbol.unique():

    ticker_train_df = train_df[train_df.Symbol == symbol].reset_index(drop = True)
    ticker_test_df = test_df[test_df.Symbol == symbol].reset_index(drop = True)

    # Positional layout: the first four columns are kept as reference/target
    # columns, the next twelve (4..15) are the explanatory variables.
    train_explanatory_variables_df = ticker_train_df.iloc[:, 4:16]
    test_explanatory_variables_df = ticker_test_df.iloc[:, 4:16]

    # Scaler and PCA are fitted on the training rows only and then applied to both splits.
    scaler = StandardScaler().fit(train_explanatory_variables_df)
    scaled_train = scaler.transform(train_explanatory_variables_df)
    scaled_test = scaler.transform(test_explanatory_variables_df)

    pca = PCA(n_components = 4).fit(scaled_train)

    # Share of the variance retained by the four components for this symbol.
    explained_variance_ratio_dict[symbol] = pca.explained_variance_ratio_.sum()

    splits = (
        (ticker_train_df, scaled_train, train_dict),
        (ticker_test_df, scaled_test, test_dict),
    )

    for split_df, scaled_values, destination in splits:

        components_df = pd.DataFrame(data = pca.transform(scaled_values), columns = pc_columns)

        # Both frames carry a fresh 0..n-1 index, so the index join pairs them row by row.
        destination[symbol] = pd.merge(split_df.iloc[:, 0:4], components_df, left_index=True, right_index=True)

In [None]:
# Symbols whose four principal components retain the most / the least variance.
key_max = max(explained_variance_ratio_dict, key = explained_variance_ratio_dict.get)
key_min = min(explained_variance_ratio_dict, key = explained_variance_ratio_dict.get)

for description, key in (('Maximum Explained Variance - ', key_max), ('Minimum Explained Variance - ', key_min)):
    print(description, key, ': ', explained_variance_ratio_dict[key])

In [None]:
# Spot-check: last rows of the MSFT training frame (reference columns plus PC1-PC4).
train_dict['MSFT'].tail()

In [None]:
# Spot-check: last rows of the MSFT hold-out frame (reference columns plus PC1-PC4).
test_dict['MSFT'].tail()

In [None]:
def _add_return_labels(returns_df, threshold):
    """Label each row by its same-day return and add one-hot label columns, in place.

    ``True_Label`` is '1' when ``CurrDay_Return`` is above ``threshold``, '-1'
    when it is below ``-threshold`` and '0' otherwise. The three
    ``OneHot_True_Label_*`` columns hold '1' in the column that matches the
    label and '0' elsewhere. All values are strings.

    Returns the same frame that was passed in.
    """
    returns_df['True_Label'] = '0'
    returns_df.loc[returns_df.CurrDay_Return > threshold, 'True_Label'] = '1'
    returns_df.loc[returns_df.CurrDay_Return < -1 * threshold, 'True_Label'] = '-1'

    # Column order (PosOne, Zero, NegOne) must not change: a later cell slices
    # these columns by position and the model's argmax is decoded in this order.
    one_hot_columns = (
        ('OneHot_True_Label_PosOne', '1'),
        ('OneHot_True_Label_Zero', '0'),
        ('OneHot_True_Label_NegOne', '-1'),
    )

    for column, label in one_hot_columns:
        returns_df[column] = '0'
        returns_df.loc[returns_df.True_Label == label, column] = '1'

    return returns_df


for symbol in test_df.Symbol.unique():

    iterating_train_df = train_dict[symbol]
    iterating_test_df = test_dict[symbol]

    # One standard deviation of the TRAINING returns defines a "large" move.
    # The threshold used to come from the hold-out frame, which let test-period
    # information shape the training labels (look-ahead leakage).
    threshold = iterating_train_df['CurrDay_Return'].std()

    train_dict[symbol] = _add_return_labels(iterating_train_df, threshold)
    test_dict[symbol] = _add_return_labels(iterating_test_df, threshold)

In [None]:
# Spot-check: last rows of the MSFT training frame, now including True_Label and the one-hot columns.
train_dict['MSFT'].tail()

In [None]:
# Spot-check: last rows of the MSFT hold-out frame, now including True_Label and the one-hot columns.
test_dict['MSFT'].tail()

In [None]:
symbols = test_df.Symbol.unique()

# Positional slices, given the column order built by the earlier cells:
# columns 4..7 are PC1-PC4 (model inputs) and columns 9..11 are the three
# one-hot label columns (model targets). Column 8, True_Label, is skipped.
train_input_variables = {symbol: train_dict[symbol].iloc[:, 4:8] for symbol in symbols}
test_input_variables = {symbol: test_dict[symbol].iloc[:, 4:8] for symbol in symbols}

train_output_variables = {symbol: train_dict[symbol].iloc[:, 9:12] for symbol in symbols}
test_output_variables = {symbol: test_dict[symbol].iloc[:, 9:12] for symbol in symbols}

In [None]:
# Spot-check: first rows of the MSFT model inputs (columns 4..7 of the training frame).
train_input_variables['MSFT'].head()

In [None]:
# Spot-check: first rows of the MSFT model targets (columns 9..11 of the training frame).
train_output_variables['MSFT'].head()

In [None]:
def run_train(session, train_x, train_y):
    """Re-initialise the model variables and train them on ``train_x`` / ``train_y``.

    Reads the notebook globals ``init_op``, ``training_op``, ``x``, ``y``,
    ``n_epochs`` and ``batch_size``. Each epoch feeds consecutive slices of
    ``batch_size`` rows; rows left over after the last whole batch are not used.
    """
    # Running the initializer first means every call starts from fresh weights.
    session.run(init_op)

    for _ in range(n_epochs):

        whole_batches = train_x.shape[0] // batch_size

        for start in range(0, whole_batches * batch_size, batch_size):

            stop = start + batch_size
            feed = {x: train_x[start:stop], y: train_y[start:stop]}

            session.run(training_op, feed_dict=feed)

In [None]:
def cross_validate(session, split_size = 10):
    """Run K-fold cross-validation on the current symbol's training data.

    Reads the notebook globals ``train_input_variables_df``,
    ``train_output_variables_df``, ``accuracy``, ``x`` and ``y``. For each fold
    the model is retrained through ``run_train`` on the training part and the
    ``accuracy`` tensor is evaluated on the validation part.

    Parameters
    ----------
    session : TensorFlow session used for training and evaluation.
    split_size : int, default 10
        Number of folds.

    Returns
    -------
    list
        One validation accuracy per fold, in fold order.
    """
    results = []
    kf = KFold(n_splits = split_size)

    for train_idx, val_idx in kf.split(train_input_variables_df, train_output_variables_df):

        # KFold yields row POSITIONS. Select with .iloc rather than matching
        # them against index labels, which only agrees when the frames carry a
        # plain 0..n-1 index.
        train_x = train_input_variables_df.iloc[train_idx]
        train_y = train_output_variables_df.iloc[train_idx]

        val_x = train_input_variables_df.iloc[val_idx]
        val_y = train_output_variables_df.iloc[val_idx]

        run_train(session, train_x, train_y)

        results.append(session.run(accuracy, feed_dict={x: val_x, y: val_y}))

    return results

In [None]:
# Train one small softmax classifier per symbol and collect accuracies and predictions.
# Results are keyed by symbol.
cross_validation_accuracies = {}
train_accuracies = {}
test_accuracies = {}
train_dict_with_predictions = {}
test_dict_with_predictions = {}

# argmax position -> label text, following the one-hot column order (PosOne, Zero, NegOne).
PREDICTED_LABEL_BY_POSITION = {0: '1', 1: '0', 2: '-1'}


def _predicted_label_frame(class_probs):
    """Turn per-row class probabilities into a one-column frame of label strings."""
    positions = pd.Series(np.argmax(class_probs, axis = 1))
    return pd.DataFrame({'Predicted_Label': positions.map(PREDICTED_LABEL_BY_POSITION).fillna('')})


def _with_prediction_flags(labeled_df, predicted_label_df):
    """Join predictions onto a labeled frame by index and add '0'/'1' comparison flags.

    ``Matching_Prediction_Flag`` is '1' where the prediction equals the true
    label; ``Opposite_Prediction_Flag`` is '1' where one is '1' and the other '-1'.
    """
    merged_df = pd.merge(labeled_df, predicted_label_df, left_index=True, right_index=True)

    is_match = merged_df.True_Label == merged_df.Predicted_Label
    is_opposite = (
        ((merged_df.True_Label == '1') & (merged_df.Predicted_Label == '-1'))
        | ((merged_df.True_Label == '-1') & (merged_df.Predicted_Label == '1'))
    )

    merged_df['Matching_Prediction_Flag'] = '0'
    merged_df.loc[is_match, 'Matching_Prediction_Flag'] = '1'

    merged_df['Opposite_Prediction_Flag'] = '0'
    merged_df.loc[is_opposite, 'Opposite_Prediction_Flag'] = '1'

    return merged_df


tf.disable_eager_execution()

# Hyper-parameters shared by every symbol. n_epochs is read as a global by run_train.
n_inputs = 4
n_hidden_1 = 60
n_outputs = 3
n_epochs = 2000
n_batches = 10

for symbol in test_df.Symbol.unique():

    try:

        # These names (and batch_size, x, y, init_op, training_op, accuracy below)
        # are read as globals by run_train / cross_validate, so they must stay
        # module-level and keep these exact names.
        train_input_variables_df = train_input_variables[symbol]
        test_input_variables_df = test_input_variables[symbol]

        train_output_variables_df = train_output_variables[symbol]
        test_output_variables_df = test_output_variables[symbol]

        batch_size = int(train_input_variables_df.shape[0] / n_batches)

        # A fresh graph per symbol keeps the models independent of each other.
        with tf.Graph().as_default() as graph:

            with tf.name_scope("Inputs"):
                x = tf.placeholder(tf.float32, shape=(None, n_inputs), name="x")
                y = tf.placeholder(tf.float32, shape=(None, n_outputs), name="y")

            with tf.name_scope("output"):
                fc1    = tf.layers.dense(x, n_hidden_1, activation = tf.nn.relu, name="fc1")
                logits = tf.layers.dense(fc1, n_outputs, name="output")
                Y_prob = tf.nn.softmax(logits, name="Y_prob")

            with tf.name_scope("train"):
                xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y, name="xentropy")
                loss = tf.reduce_mean(xentropy, name='loss')
                optimizer = tf.train.AdamOptimizer()
                training_op = optimizer.minimize(loss)

            with tf.name_scope("eval"):
                correct = tf.equal(tf.argmax(logits,axis=1), tf.argmax(y,axis=1))
                accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

            with tf.name_scope("init"):
                init_op = tf.global_variables_initializer()

            with tf.Session() as session:

                cross_validation_results = cross_validate(session)
                cross_validation_accuracy = sum(cross_validation_results) / len(cross_validation_results)

                # cross_validate leaves the session holding the model of its LAST
                # fold, which never saw that fold's validation rows. Refit on the
                # full training set so the numbers and predictions below describe
                # a model trained on all of the training data.
                run_train(session, train_input_variables_df, train_output_variables_df)

                train_accuracy = session.run(accuracy, feed_dict={x: train_input_variables_df, y: train_output_variables_df})
                test_accuracy = session.run(accuracy, feed_dict={x: test_input_variables_df, y: test_output_variables_df})

                train_probs = session.run(Y_prob, feed_dict={x: train_input_variables_df, y: train_output_variables_df})
                test_probs = session.run(Y_prob, feed_dict={x: test_input_variables_df, y: test_output_variables_df})

        cross_validation_accuracies[symbol] = cross_validation_accuracy
        train_accuracies[symbol] = train_accuracy
        test_accuracies[symbol] = test_accuracy

        train_dict_with_predictions[symbol] = _with_prediction_flags(train_dict[symbol], _predicted_label_frame(train_probs))
        test_dict_with_predictions[symbol] = _with_prediction_flags(test_dict[symbol], _predicted_label_frame(test_probs))

        # Report the symbol being modeled. The message used to print `ticker`,
        # a stale loop variable left over from the download cell.
        print('Successfully Modeled: ' + symbol)

    except Exception as error:

        # Best effort: report the failing symbol with the reason and move on.
        # Catching Exception (not a bare except) keeps kernel interrupts working.
        print('UNSECCESSFULLY Modeled: ' + symbol + ' (' + repr(error) + ')')