In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn.model_selection import GridSearchCV
from sklearn import svm, metrics
import pickle
from sklearn.model_selection import train_test_split

# environment settings
cwd = os.getcwd()
data = os.path.join(cwd, 'data', '1.0-ag-data-exploration.csv')

# read in data
df = pd.read_csv(data)

# drop not needed columns
df = df.drop('Data Year - Fiscal', axis = 1)
df = df.drop('CompanyID', axis = 1)
df = df.drop('Return on Equity', axis = 1)

min = 0
max = len(df)

X = np.array(df.iloc[min:, df.columns != "BK"])
y = df.iloc[min:, df.columns == "BK"].values.reshape(-1,)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [177]:
# get the head
df.head()

Unnamed: 0,Tobin's Q,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth,BK
0,0.98,1.58,0.36,0.18,0.13,1.33,1.77,0.07,2.22,0.126,0.014,0.04,0
1,0.98,1.41,0.36,0.19,0.12,1.31,1.59,0.07,2.41,0.126,0.014,0.04,0
2,0.87,0.31,0.32,0.13,0.08,1.03,1.55,0.05,2.56,0.368,0.328,0.567,0
3,1.13,0.71,0.28,0.14,0.08,0.8,1.39,0.06,5.28,-0.021,-0.119,-0.096,0
4,1.26,0.75,0.41,0.13,0.08,0.2,1.3,0.06,8.68,0.233,0.147,0.053,0


In [178]:
# get the tail
df.tail()

Unnamed: 0,Tobin's Q,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth,BK
89805,1.521,-1.488,-0.015,-0.759,-0.057,-1042.105,0.174,-0.327,-1.847,-0.073,-0.557,-0.077,0
89806,1.988,-1.808,0.094,-1.205,-0.121,-4.53,0.216,-0.561,-2.475,-0.202,-0.011,-0.208,0
89807,0.924,-0.016,0.039,0.0,-0.082,0.745,0.254,-0.324,3274.506,-0.168,-0.02,-0.105,0
89808,0.788,-0.133,0.054,-0.029,0.001,0.575,0.196,0.005,36.475,0.077,-0.171,-0.059,0
89809,0.885,-0.648,-0.037,-0.22,-0.145,0.693,0.222,-0.651,55.624,-0.064,0.065,0.063,1


In [179]:
df.shape

(89810, 13)

In [180]:
# confirm there is no null
df.isnull().any()

Tobin's Q             False
EPS                   False
Liquidity             False
Profitability         False
Productivity          False
Leverage Ratio        False
Asset Turnover        False
Operational Margin    False
Market Book Ratio     False
Assets Growth         False
Sales Growth          False
Employee Growth       False
BK                    False
dtype: bool

In [181]:
print('Training data: ', X_train)

Training data:  [[ 4.22   0.27  -0.19  ...  0.031  0.032  0.024]
 [ 1.4    2.11   0.23  ...  0.015  0.065 -0.012]
 [ 1.01   1.25  -0.04  ...  0.803  0.781  0.   ]
 ...
 [ 1.32   3.59   0.33  ... -0.038 -0.046 -0.031]
 [ 1.47  -1.07   0.13  ... -0.097  0.042  0.063]
 [ 2.14   1.88   0.12  ...  0.173  0.198  0.417]]


In [182]:
print('Training target: ', y_train)

Training target:  [0 0 0 ... 0 0 0]


In [183]:
print('Testing data: ', X_test)

Testing data:  [[ 1.1500e+00  3.5800e+00  1.8000e-01 ...  6.7000e-02  1.4400e-01
   0.0000e+00]
 [ 1.8200e+00  1.5800e+00  2.7000e-01 ...  1.2000e-01  2.0600e-01
   6.2000e-02]
 [ 1.3500e+00 -1.0800e+00  6.0000e-02 ...  3.7000e-02 -5.5000e-02
  -1.4600e-01]
 ...
 [ 1.8677e+02 -1.1000e-01 -1.3840e+02 ... -8.3000e-01 -5.0500e-01
  -9.7200e-01]
 [ 1.1100e+00 -1.8000e-01 -6.4000e-01 ... -6.3000e-02  9.3000e-01
   2.1300e-01]
 [ 1.9400e+00  2.2000e-01  2.0000e-02 ... -1.0600e-01  2.1200e-01
   2.1500e-01]]


In [184]:
print('Testing target: ', y_test)

Testing target:  [0 0 0 ... 0 0 0]


In [198]:
# read the classifier from the file
with open(os.path.join(cwd, 'classifiers', '3.0-sh-svm.pkl'), 'rb') as f:
    clf = pickle.load(f)

# get the predicted output
predicted = clf.predict(X_test)

In [199]:
cm = metrics.confusion_matrix(y_test, predicted)
print(cm)

[[17854     0]
 [   97    11]]


In [200]:
pd.crosstab(y_test, predicted, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,17854,0,17854
1,97,11,108
All,17951,11,17962


In [188]:
tp = cm[1][1]
print('True positive: ', tp)

True positive:  11


In [189]:
fp = cm[0][1]
print('False positive: ', fp)

False positive:  0


In [190]:
tn = cm[0][0]
print('True negative: ', tn)

True negative:  17854


In [191]:
fn = cm[1][0]
print('False negative: ', fn)

False negative:  97


In [192]:
accuracy = (tp + tn) / (tp + tn + fp + fn)
print('Accuracy: ', accuracy)

Accuracy:  0.9945997104999443


In [193]:
# how good is it at IDENTIFYING positives
precision = tp/(tp+fp)
print('Precision: ', precision)

Precision:  1.0


In [194]:
# how good is the classifier at RETRIEVING positives
sensitivity = tp/(tp+fn)
print('Sensitivity: ', sensitivity)

Sensitivity:  0.10185185185185185


In [195]:
# how good is the classifier at RETRIEVING negatives
specificity = tn/(tn+fp)
print('Specificity: ', specificity)

Specificity:  1.0


In [196]:
f1 = 2 * ((precision * recall) / (precision + recall))
print('F1 Score: ', f1)

F1 Score:  1.0


In [197]:
print("Summary:")
col_names =  ['Classifier', 'TP', 'TN', 'FP', 'FN', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1']
table = pd.DataFrame(columns = col_names)
table.loc[0] = ['SVM - RBF', tp, tn, fp, fn, accuracy, precision, sensitivity, specificity, f1]
table

Summary:


Unnamed: 0,Classifier,TP,TN,FP,FN,Accuracy,Precision,Sensitivity,Specificity,F1
0,SVM - RBF,11,17854,0,97,0.9946,1.0,0.101852,1.0,1.0
