In [100]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
from numpy.random import seed
seed(1)
import pandas as pd
import collections

In [101]:
data_df = pd.read_csv("../db/stock_data_with_forward_perf.csv")
data_df.columns

Index(['name', 'ticker', 'Quarter end', 'Shares', 'Shares split adjusted',
       'Split factor', 'Assets', 'Current Assets', 'Liabilities',
       'Current Liabilities', 'Shareholders equity',
       'Non-controlling interest', 'Preferred equity',
       'Goodwill & intangibles', 'Long-term debt', 'Revenue', 'Earnings',
       'Earnings available for common stockholders', 'EPS basic',
       'EPS diluted', 'Dividend per share', 'Cash from operating activities',
       'Cash from investing activities', 'Cash from financing activities',
       'Cash change during period', 'Cash at end of period',
       'Capital expenditures', 'Price', 'Price high', 'Price low', 'ROE',
       'ROA', 'Book value of equity per share', 'P/B ratio', 'P/E ratio',
       'Cumulative dividends per share', 'Dividend payout ratio',
       'Long-term debt to equity ratio', 'Equity to assets ratio',
       'Net margin', 'Asset turnover', 'Free cash flow per share',
       'Current ratio', 'mktcap_revenue_value', '

In [102]:
data_df.head()

Unnamed: 0,name,ticker,Quarter end,Shares,Shares split adjusted,Split factor,Assets,Current Assets,Liabilities,Current Liabilities,...,two_quarter_return,one_year_return,two_year_return,three_year_return,four_year_return,five_year_return,seven_year_return,ten_year_return,twelve_year_return,fifteen_year_return
0,Alcoa Corporation,AA,2018-03-31,482832111,482832111,1,18219000000,5895000000,12937000000,2802000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alcoa Corporation,AA,2017-12-31,482772252,482772252,1,18718000000,6378000000,13794000000,2824000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alcoa Corporation,AA,2017-09-30,481324177,481324177,1,19237000000,6148000000,13276000000,2677000000,...,9.329564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alcoa Corporation,AA,2017-06-30,441030999,441030999,1,19106000000,6033000000,13353000000,2658000000,...,2.460317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alcoa Corporation,AA,2017-03-31,440826482,440826482,1,20157000000,6710000000,14662000000,2587000000,...,2.392539,10.948905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
import collections

#get features data to be used. Change for comparing for selected parameters
parameters_columns = ['EPS basic','ROE','ROA', 'P/B ratio', 'P/E ratio', 'Dividend payout ratio',
                  'Long-term debt to equity ratio', 'Equity to assets ratio', 'Free cash flow per share',
                  'Current ratio', 'mktcap_revenue_value', 'mktcap_free_cash_flow_value',
                  'mktcap_cash_value', 'cash_oper_gt_earnings_value',
                  'entvalue_earnings_value', 'marketcap_bookvalue_value']

#set target values
target_values = ['no','yes']


#change for selected different investment period
selected_target_column = 'five_year_return'

#set calculated values whether a stock is a buy (1) or not (2). Standard  used is 5% for each year, so 5% for a year, 15% for 3 years and 25 for 5 years
data_df.loc[data_df[selected_target_column] >= 40.0, 'calc_buystock'] = 1
data_df.loc[data_df[selected_target_column] < 40.0, 'calc_buystock'] = 0


#separate data into training and training.
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
#using 1st qtr 2016, change the ending train date according to investment period (selected_target_column).
#It should that much time period behind the target_start_date to avoid using future data
target_start_date = '2009-01-01'
target_end_date = '2009-03-31'
ending_train_date = '2007-12-31'

test_data_df = data_df.loc[(data_df['Quarter end'] >= target_start_date)&(data_df['Quarter end']<=target_end_date)]
y_test = test_data_df['calc_buystock'].values

print(f"Counts for calculated buy for test data: {collections.Counter(y_test)}")
X_test = test_data_df[parameters_columns].values

train_data_df = data_df.loc[data_df['Quarter end'] <= ending_train_date]
y_train = train_data_df['calc_buystock'].values
X_train = train_data_df[parameters_columns].values
print(f"Counts for calculated buy for train data: {collections.Counter(y_train)}")

Counts for calculated buy for test data: Counter({0.0: 342, 1.0: 320})
Counts for calculated buy for train data: Counter({1.0: 15882, 0.0: 15652})


In [104]:
 from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)
X_scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [105]:
#Transform the training and the testing data using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [106]:
!pip install keras



In [107]:
from keras.utils import to_categorical

# One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [108]:
# first, create a normal neural network with 2 inputs, 6 hidden nodes, and 2 outputs
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=16))
model.add(Dense(units=2, activation='softmax'))

In [109]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 6)                 102       
_________________________________________________________________
dense_29 (Dense)             (None, 2)                 14        
Total params: 116
Trainable params: 116
Non-trainable params: 0
_________________________________________________________________


In [110]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [111]:
 # Fit the model to the training data
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)


Epoch 1/100
 - 2s - loss: 0.6756 - acc: 0.5986
Epoch 2/100
 - 2s - loss: 0.6562 - acc: 0.6094
Epoch 3/100
 - 2s - loss: 0.6497 - acc: 0.6115
Epoch 4/100
 - 2s - loss: 0.6473 - acc: 0.6157
Epoch 5/100
 - 2s - loss: 0.6452 - acc: 0.6181
Epoch 6/100
 - 2s - loss: 0.6436 - acc: 0.6209
Epoch 7/100
 - 2s - loss: 0.6422 - acc: 0.6213
Epoch 8/100
 - 2s - loss: 0.6408 - acc: 0.6224
Epoch 9/100
 - 2s - loss: 0.6396 - acc: 0.6245
Epoch 10/100
 - 2s - loss: 0.6388 - acc: 0.6244
Epoch 11/100
 - 2s - loss: 0.6381 - acc: 0.6251
Epoch 12/100
 - 2s - loss: 0.6375 - acc: 0.6260
Epoch 13/100
 - 2s - loss: 0.6370 - acc: 0.6269
Epoch 14/100
 - 2s - loss: 0.6367 - acc: 0.6262
Epoch 15/100
 - 2s - loss: 0.6361 - acc: 0.6266
Epoch 16/100
 - 2s - loss: 0.6358 - acc: 0.6268
Epoch 17/100
 - 2s - loss: 0.6356 - acc: 0.6282
Epoch 18/100
 - 2s - loss: 0.6355 - acc: 0.6270
Epoch 19/100
 - 2s - loss: 0.6355 - acc: 0.6283
Epoch 20/100
 - 2s - loss: 0.6351 - acc: 0.6281
Epoch 21/100
 - 2s - loss: 0.6349 - acc: 0.6270
E

<keras.callbacks.History at 0x1a1fdedc50>

In [112]:
deep_model = Sequential()
deep_model.add(Dense(units=6, activation='relu', input_dim=16))
deep_model.add(Dense(units=6, activation='relu'))
deep_model.add(Dense(units=2, activation='softmax'))

In [113]:
deep_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 6)                 102       
_________________________________________________________________
dense_31 (Dense)             (None, 6)                 42        
_________________________________________________________________
dense_32 (Dense)             (None, 2)                 14        
Total params: 158
Trainable params: 158
Non-trainable params: 0
_________________________________________________________________


In [114]:
deep_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 3s - loss: 0.6670 - acc: 0.6014
Epoch 2/100
 - 2s - loss: 0.6512 - acc: 0.6117
Epoch 3/100
 - 2s - loss: 0.6470 - acc: 0.6177
Epoch 4/100
 - 2s - loss: 0.6437 - acc: 0.6196
Epoch 5/100
 - 2s - loss: 0.6407 - acc: 0.6244
Epoch 6/100
 - 2s - loss: 0.6384 - acc: 0.6253
Epoch 7/100
 - 2s - loss: 0.6367 - acc: 0.6272
Epoch 8/100
 - 2s - loss: 0.6354 - acc: 0.6302
Epoch 9/100
 - 2s - loss: 0.6347 - acc: 0.6306
Epoch 10/100
 - 2s - loss: 0.6339 - acc: 0.6301
Epoch 11/100
 - 2s - loss: 0.6331 - acc: 0.6306
Epoch 12/100
 - 2s - loss: 0.6325 - acc: 0.6319
Epoch 13/100
 - 2s - loss: 0.6317 - acc: 0.6326
Epoch 14/100
 - 2s - loss: 0.6309 - acc: 0.6329
Epoch 15/100
 - 2s - loss: 0.6302 - acc: 0.6325
Epoch 16/100
 - 2s - loss: 0.6292 - acc: 0.6337
Epoch 17/100
 - 2s - loss: 0.6289 - acc: 0.6325
Epoch 18/100
 - 2s - loss: 0.6285 - acc: 0.6328
Epoch 19/100
 - 2s - loss: 0.6276 - acc: 0.6350
Epoch 20/100
 - 2s - loss: 0.6274 - acc: 0.6336
Epoch 21/100
 - 2s - loss: 0.6270 - acc: 0.6344
E

<keras.callbacks.History at 0x1a1100bb38>

In [115]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}") 

Normal Neural Network - Loss: 0.7562458124405668, Accuracy: 0.5468277942017846


In [116]:
model_loss, model_accuracy = deep_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Deep Neural Network - Loss: 0.7590607845891278, Accuracy: 0.5634441086712921
