In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

2023-07-28 18:29:18.637354: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-28 18:29:18.638823: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-28 18:29:18.669428: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-28 18:29:18.669985: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
results = pd.DataFrame(columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])

### Data

In [3]:
df = pd.read_excel('./data.xlsx')
df = df.drop(columns=['Company'])
df = df.replace('-', np.nan)
print(df.shape)
df.head()

(899, 828)


Unnamed: 0,Custom-Return_1390,EPS_MRQ_LC_1390,EPS_TTM_1390,EPS_MRQ_1390,EPS_FY_1390,Rev_TTM_1390,Rev_MRQ_1390,Rev_FY_1390,NI_TTM_1390,NI_MRQ_1390,...,ETR_1401,CFOToRev_1401,CFOToDebt_1401,FCFFToRev_1401,FCFFToNI_1401,FCFEToRev_1401,FCFEToNI_1401,CAPEXToRev_1401,CAPEXToNI_1401,DivYield_1401
0,,11.375,83.0,11.0,24.0,1917455000000.0,839988000000.0,1685499000000.0,82667000000.0,11375000000.0,...,0.0041,-0.081,-0.0616,-0.0684,-1.7557,0.022,0.0689,-0.0123,-0.3158,0.0021
1,0.208274,117.4437,295.0,117.0,295.0,9046040000000.0,2474342000000.0,9046040000000.0,795173000000.0,317098000000.0,...,0.137,0.0267,0.0746,0.0082,0.093,0.0305,0.3474,0.0187,0.2122,0.0073
2,,88.7262,406.0,89.0,384.0,6988662000000.0,1702216000000.0,6914857000000.0,913165000000.0,199634000000.0,...,0.1034,0.0433,0.1469,0.0697,0.5587,0.0757,0.6061,0.005,0.0402,0.0048
3,-0.481625,-125.1184,-113.0,-125.0,-113.0,29911000000.0,717000000.0,29911000000.0,-25709000000.0,-28527000000.0,...,,-0.5435,-0.0327,-0.6303,,-0.6316,,0.0869,-0.3459,
4,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# convert all columns to float
df = df.astype(float)
df.head()

Unnamed: 0,Custom-Return_1390,EPS_MRQ_LC_1390,EPS_TTM_1390,EPS_MRQ_1390,EPS_FY_1390,Rev_TTM_1390,Rev_MRQ_1390,Rev_FY_1390,NI_TTM_1390,NI_MRQ_1390,...,ETR_1401,CFOToRev_1401,CFOToDebt_1401,FCFFToRev_1401,FCFFToNI_1401,FCFEToRev_1401,FCFEToNI_1401,CAPEXToRev_1401,CAPEXToNI_1401,DivYield_1401
0,,11.375,83.0,11.0,24.0,1917455000000.0,839988000000.0,1685499000000.0,82667000000.0,11375000000.0,...,0.0041,-0.081,-0.0616,-0.0684,-1.7557,0.022,0.0689,-0.0123,-0.3158,0.0021
1,0.208274,117.4437,295.0,117.0,295.0,9046040000000.0,2474342000000.0,9046040000000.0,795173000000.0,317098000000.0,...,0.137,0.0267,0.0746,0.0082,0.093,0.0305,0.3474,0.0187,0.2122,0.0073
2,,88.7262,406.0,89.0,384.0,6988662000000.0,1702216000000.0,6914857000000.0,913165000000.0,199634000000.0,...,0.1034,0.0433,0.1469,0.0697,0.5587,0.0757,0.6061,0.005,0.0402,0.0048
3,-0.481625,-125.1184,-113.0,-125.0,-113.0,29911000000.0,717000000.0,29911000000.0,-25709000000.0,-28527000000.0,...,,-0.5435,-0.0327,-0.6303,,-0.6316,,0.0869,-0.3459,
4,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# drop rows which has no value in TSE_FY or RE_FY column
col_tse_fy = []
col_re_fy = []
for col in df.columns:
    if ('TSE_FY' in col):
        col_tse_fy.append(col)
    if ('RE_FY' in col):
        col_re_fy.append(col)
print(len(col_tse_fy), len(col_re_fy))
print(df.shape)
df = df.dropna(subset=col_tse_fy + col_re_fy, how='all')
print(df.shape)

12 12
(899, 828)
(830, 828)


In [6]:
# mean of all col_tse_fy
df['mean_TSE_FY'] = df[col_tse_fy].mean(axis=1)
df['mean_RE_FY'] = df[col_re_fy].mean(axis=1)

# create a y column RE_FY / TSE_FY > .5 = 1 else 0
df['y'] = np.where(df['mean_RE_FY'] / df['mean_TSE_FY'] > .5, 1, 0)
print(df.shape)
print('--------')
print(df[df['y'] >= .5].shape)
print(df[df['y'] < .5].shape)

(830, 831)
--------
(370, 831)
(460, 831)


  df['mean_TSE_FY'] = df[col_tse_fy].mean(axis=1)
  df['mean_RE_FY'] = df[col_re_fy].mean(axis=1)
  df['y'] = np.where(df['mean_RE_FY'] / df['mean_TSE_FY'] > .5, 1, 0)


In [7]:
# print nan proportion
print(df.isna().sum() / df.shape[0])

Custom-Return_1390    0.597590
EPS_MRQ_LC_1390       0.121687
EPS_TTM_1390          0.019277
EPS_MRQ_1390          0.020482
EPS_FY_1390           0.010843
                        ...   
CAPEXToNI_1401        0.081928
DivYield_1401         0.279518
mean_TSE_FY           0.000000
mean_RE_FY            0.006024
y                     0.000000
Length: 831, dtype: float64


In [8]:
# df = df.fillna(df.mean())
# df = df.fillna(df.median())
# df = df.fillna(df.mode().iloc[0])

# fill NaN with 0
df = df.fillna(0)

print(df.isna().sum() / df.shape[0])

Custom-Return_1390    0.0
EPS_MRQ_LC_1390       0.0
EPS_TTM_1390          0.0
EPS_MRQ_1390          0.0
EPS_FY_1390           0.0
                     ... 
CAPEXToNI_1401        0.0
DivYield_1401         0.0
mean_TSE_FY           0.0
mean_RE_FY            0.0
y                     0.0
Length: 831, dtype: float64


In [9]:
# Scaler
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df.head()

Unnamed: 0,Custom-Return_1390,EPS_MRQ_LC_1390,EPS_TTM_1390,EPS_MRQ_1390,EPS_FY_1390,Rev_TTM_1390,Rev_MRQ_1390,Rev_FY_1390,NI_TTM_1390,NI_MRQ_1390,...,FCFFToRev_1401,FCFFToNI_1401,FCFEToRev_1401,FCFEToNI_1401,CAPEXToRev_1401,CAPEXToNI_1401,DivYield_1401,mean_TSE_FY,mean_RE_FY,y
0,0.042743,0.31685,2.24983e-10,0.999976,0.001849,0.04501,0.055193,0.044945,0.319484,0.49555,...,0.999684,0.893868,0.907533,0.440197,0.483342,0.6537,0.003082,0.36088,0.611768,0.0
1,0.058074,0.320473,2.267615e-10,0.999976,0.001849,0.047014,0.05698,0.047014,0.320198,0.496023,...,0.999685,0.895969,0.907542,0.442539,0.483445,0.657271,0.010713,0.36169,0.612231,0.0
2,0.042743,0.319492,2.276927e-10,0.999976,0.001849,0.046436,0.056136,0.046415,0.320316,0.495841,...,0.999685,0.896498,0.907587,0.444715,0.483399,0.656108,0.007044,0.361677,0.612518,0.0
3,0.007289,0.312188,2.233387e-10,0.999975,0.001849,0.044479,0.054275,0.044479,0.319375,0.495488,...,0.999683,0.895863,0.906876,0.439617,0.483671,0.653497,0.0,0.360505,0.611575,1.0
4,0.042743,0.330508,2.380534e-10,0.999976,0.00185,0.052995,0.063764,0.052135,0.325979,0.49808,...,0.999685,0.896048,0.907555,0.441255,0.48343,0.656267,0.08145,0.363931,0.615068,1.0


In [10]:
# # duplicate data
# DUPLICATE = 8
# x_train = np.repeat(x_train, DUPLICATE, axis=0)
# y_train = np.repeat(y_train, DUPLICATE, axis=0)
# print(x_train.shape)
# print(y_train.shape)

In [11]:
x_cols = [col for col in df.columns if col != 'y']
x_train, x_test, y_train, y_test = train_test_split(df[x_cols], df['y'], test_size=0.2, random_state=42)

print('x_train.shape:', x_train.shape)
print('x_test.shape:', x_test.shape)
print('y_train.shape:', y_train.shape)
print('y_test.shape:', y_test.shape)

x_train.shape: (664, 830)
x_test.shape: (166, 830)
y_train.shape: (664,)
y_test.shape: (166,)


### Logistic Regression

In [12]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)

In [13]:
# evaluate the model
y_pred = logistic_model.predict(x_test)

results.loc[len(results)] = [
    'Logistic Regression', 
    round(accuracy_score(y_test, y_pred), 4), 
    round(precision_score(y_test, y_pred), 4), 
    round(recall_score(y_test, y_pred), 4), 
    round(f1_score(y_test, y_pred), 4), 
    round(roc_auc_score(y_test, y_pred), 4)
]
print(results)

                 model  accuracy  precision  recall      f1  roc_auc
0  Logistic Regression    0.7952     0.7869  0.6957  0.7385   0.7808


### SVM

In [14]:
svm_model = SVC(kernel='linear', C=1, random_state=42)
svm_model.fit(x_train, y_train)

In [15]:
y_pred = svm_model.predict(x_test)

results.loc[len(results)] = [
    'SVM',
    round(accuracy_score(y_test, y_pred), 4),
    round(precision_score(y_test, y_pred), 4),
    round(recall_score(y_test, y_pred), 4),
    round(f1_score(y_test, y_pred), 4),
    round(roc_auc_score(y_test, y_pred), 4)
]
print(results)

                 model  accuracy  precision  recall      f1  roc_auc
0  Logistic Regression    0.7952     0.7869  0.6957  0.7385   0.7808
1                  SVM    0.8133     0.8393  0.6812  0.7520   0.7942


### Random Forest

In [16]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42)
rf_model.fit(x_train, y_train)

In [17]:
y_pred = rf_model.predict(x_test)

results.loc[len(results)] = [
    'Random Forest',
    round(accuracy_score(y_test, y_pred), 4),
    round(precision_score(y_test, y_pred), 4),
    round(recall_score(y_test, y_pred), 4),
    round(f1_score(y_test, y_pred), 4),
    round(roc_auc_score(y_test, y_pred), 4)
]
print(results)

                 model  accuracy  precision  recall      f1  roc_auc
0  Logistic Regression    0.7952     0.7869  0.6957  0.7385   0.7808
1                  SVM    0.8133     0.8393  0.6812  0.7520   0.7942
2        Random Forest    0.9940     0.9857  1.0000  0.9928   0.9948


### Sequential Data

In [18]:
# split data
x = df.drop(columns=['y'])
y = df['y']
x = np.array(x)
y = np.array(y)
# [samples, timesteps, features]
x = x.reshape(x.shape[0], 1, x.shape[1])

x_train = x[:int(x.shape[0] * .7)]
y_train = y[:int(y.shape[0] * .7)]
x_valid = x[int(x.shape[0] * .7):int(x.shape[0] * .85)]
y_valid = y[int(y.shape[0] * .7):int(y.shape[0] * .85)]
x_test = x[int(x.shape[0] * .85):]
y_test = y[int(y.shape[0] * .85):]

print(x_train.shape)
print(y_train.shape)
print("------")
print(x_valid.shape)
print(y_valid.shape)
print("------")
print(x_test.shape)
print(y_test.shape)

(581, 1, 830)
(581,)
------
(124, 1, 830)
(124,)
------
(125, 1, 830)
(125,)


### LSTM

In [19]:
# LSTM
lstm = keras.Sequential()
lstm.add(keras.layers.LSTM(256, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
lstm.add(keras.layers.Dropout(0.1))
lstm.add(keras.layers.LSTM(256, return_sequences=False))
lstm.add(keras.layers.Dropout(0.1))
lstm.add(keras.layers.Dense(1, activation='sigmoid'))
lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 256)            1113088   
                                                                 
 dropout (Dropout)           (None, 1, 256)            0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense (Dense)               (None, 1)                 257       
                                                                 
Total params: 1638657 (6.25 MB)
Trainable params: 1638657 (6.25 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# train
lstm_history = lstm.fit(x_train, y_train, epochs=100, batch_size=64, validation_data=(x_valid, y_valid), shuffle=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [21]:
# evaluate the model
y_pred = lstm.predict(x_test)



In [22]:
# calc metrics and add a row to results
results.loc[len(results)] = [
    'LSTM',
    round(accuracy_score(y_test, y_pred.round()), 4),
    round(precision_score(y_test, y_pred.round()), 4),
    round(recall_score(y_test, y_pred.round()), 4),
    round(f1_score(y_test, y_pred.round()), 4),
    round(roc_auc_score(y_test, y_pred.round()), 4),
]
print(results)

                 model  accuracy  precision  recall      f1  roc_auc
0  Logistic Regression    0.7952     0.7869  0.6957  0.7385   0.7808
1                  SVM    0.8133     0.8393  0.6812  0.7520   0.7942
2        Random Forest    0.9940     0.9857  1.0000  0.9928   0.9948
3                 LSTM    0.7040     0.6386  0.8833  0.7413   0.7109


In [23]:
# plot loss
fig = go.Figure()
fig.add_trace(go.Scatter(y=lstm_history.history['loss'], mode='lines', name='loss'))
fig.add_trace(go.Scatter(y=lstm_history.history['val_loss'], mode='lines', name='val_loss'))
fig.update_layout(title='Loss', xaxis_title='Epoch', yaxis_title='Loss')
fig.show()

In [24]:
# plot accuracy
fig = go.Figure()
fig.add_trace(go.Scatter(y=lstm_history.history['accuracy'], mode='lines', name='accuracy'))
fig.add_trace(go.Scatter(y=lstm_history.history['val_accuracy'], mode='lines', name='val_accuracy'))
fig.update_layout(title='Accuracy', xaxis_title='Epoch', yaxis_title='Accuracy')
fig.show()

### RNN

In [25]:
rnn_model = keras.Sequential()
rnn_model.add(keras.layers.SimpleRNN(256, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
rnn_model.add(keras.layers.Dropout(0.1))
rnn_model.add(keras.layers.SimpleRNN(256, return_sequences=False))
rnn_model.add(keras.layers.Dropout(0.1))
rnn_model.add(keras.layers.Dense(1, activation='sigmoid'))
rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 1, 256)            278272    
                                                                 
 dropout_2 (Dropout)         (None, 1, 256)            0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 256)               131328    
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 409857 (1.56 MB)
Trainable params: 409857 (1.56 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
# train
rnn_history = rnn_model.fit(x_train, y_train, epochs=100, batch_size=64, validation_data=(x_valid, y_valid), shuffle=False)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [27]:
# evaluate the model
y_pred = rnn_model.predict(x_test)



In [28]:
# calc metrics and add a row to results
results.loc[len(results)] = [
    'RNN',
    round(accuracy_score(y_test, y_pred.round()), 4),
    round(precision_score(y_test, y_pred.round()), 4),
    round(recall_score(y_test, y_pred.round()), 4),
    round(f1_score(y_test, y_pred.round()), 4),
    round(roc_auc_score(y_test, y_pred.round()), 4),
]
print(results)

                 model  accuracy  precision  recall      f1  roc_auc
0  Logistic Regression    0.7952     0.7869  0.6957  0.7385   0.7808
1                  SVM    0.8133     0.8393  0.6812  0.7520   0.7942
2        Random Forest    0.9940     0.9857  1.0000  0.9928   0.9948
3                 LSTM    0.7040     0.6386  0.8833  0.7413   0.7109
4                  RNN    0.6800     0.6163  0.8833  0.7260   0.6878


In [29]:
# plot loss
fig = go.Figure()
fig.add_trace(go.Scatter(y=lstm_history.history['loss'], mode='lines', name='loss'))
fig.add_trace(go.Scatter(y=lstm_history.history['val_loss'], mode='lines', name='val_loss'))
fig.update_layout(title='Loss', xaxis_title='Epoch', yaxis_title='Loss')
fig.show()

In [30]:
# plot accuracy
fig = go.Figure()
fig.add_trace(go.Scatter(y=lstm_history.history['accuracy'], mode='lines', name='accuracy'))
fig.add_trace(go.Scatter(y=lstm_history.history['val_accuracy'], mode='lines', name='val_accuracy'))
fig.update_layout(title='Accuracy', xaxis_title='Epoch', yaxis_title='Accuracy')
fig.show()

### Results

In [31]:
print(results)

                 model  accuracy  precision  recall      f1  roc_auc
0  Logistic Regression    0.7952     0.7869  0.6957  0.7385   0.7808
1                  SVM    0.8133     0.8393  0.6812  0.7520   0.7942
2        Random Forest    0.9940     0.9857  1.0000  0.9928   0.9948
3                 LSTM    0.7040     0.6386  0.8833  0.7413   0.7109
4                  RNN    0.6800     0.6163  0.8833  0.7260   0.6878


In [None]:
# plot comparative metrics for different models



In [35]:
# plot roc_auc by models
# x, y from 0 to 1

fig = go.Figure()
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                    mode='lines',
                    name='baseline'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0.5, 0.5],
                    mode='lines',
                    name='baseline'))
fig.add_trace(go.Scatter(x=[0, 1], y=[1, 1],
                    mode='lines',
                    name='baseline'))

# add roc_auc from results df
for model in results['model'].unique():
    df = results[results['model'] == model]
    fig.add_trace(go.Scatter(x=results['roc_auc'], y=results['roc_auc'],
                    mode='lines',
                    name=model))

fig.update_layout(title='ROC-AUC',
                     xaxis_title='False Positive Rate',
                     yaxis_title='True Positive Rate')
fig.show()
