In [1]:
import sys
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV


In [4]:
dataset = pd.read_csv("heart.csv")
print("Dataset :")
print(dataset.head())

print("Dimensions of the dataset : ", dataset.shape)
print("Features of the dataset :")
print(dataset.describe(include = 'all'))

Dataset :
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
Dimensions of the dataset :  (918, 12)
Features of the dataset :
               Age  Sex ChestPainType   RestingBP  Cholesterol   FastingBS  \
count   918.000000  918           918  918.000000   91

In [16]:
# drop non-numeric values
X = dataset.drop(['HeartDisease', 'Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], axis = 1)
y = dataset['HeartDisease']

# normalize data
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)
print("Pre-processed data :")
print(X)

categories = [['0', '1']]
encoder = OneHotEncoder(categories=categories, sparse=False)
y = encoder.fit_transform(y.values.reshape(-1,1))
print("Pre-processed class :")
print(y)

Pre-processed data :
          Age  RestingBP  Cholesterol  FastingBS     MaxHR   Oldpeak
0    0.244898       0.70     0.479270        0.0  0.788732  0.295455
1    0.428571       0.80     0.298507        0.0  0.676056  0.409091
2    0.183673       0.65     0.469320        0.0  0.267606  0.295455
3    0.408163       0.69     0.354892        0.0  0.338028  0.465909
4    0.530612       0.75     0.323383        0.0  0.436620  0.295455
..        ...        ...          ...        ...       ...       ...
913  0.346939       0.55     0.437811        0.0  0.507042  0.431818
914  0.816327       0.72     0.320066        1.0  0.570423  0.681818
915  0.591837       0.65     0.217247        0.0  0.387324  0.431818
916  0.591837       0.65     0.391376        0.0  0.802817  0.295455
917  0.204082       0.69     0.290216        0.0  0.795775  0.295455

[918 rows x 6 columns]
Pre-processed class :
[[1. 0.]
 [0. 1.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]




In [24]:
data_train, data_test, class_train, class_test = train_test_split(X, y, test_size=0.2)

# Number of nodes in each hidden layer should be (10, 2)
# Learning rate should be 0.4
# Number of epochs should be 600
mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (23,17,13), max_iter = 500)
mlp

In [25]:
mlp.fit(data_train, class_train)

pred = mlp.predict(data_test)
pred

array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1,

In [26]:
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

print(pred[:2])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred))

Accuracy :  0.7608695652173914
Mean Square Error :  0.2391304347826087
[[0 1]
 [0 1]]
Confusion Matrix for each label : 
[[[82 20]
  [24 58]]

 [[58 24]
  [20 82]]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.74      0.71      0.72        82
           1       0.77      0.80      0.79       102

   micro avg       0.76      0.76      0.76       184
   macro avg       0.76      0.76      0.76       184
weighted avg       0.76      0.76      0.76       184
 samples avg       0.76      0.76      0.76       184



For each 2x2 confusion matrix class:
- true positive: top left value
- false positive: top right value
- true negative: bottom right value
- false negative: bottom left value

In [34]:
CV = cross_validate(mlp, X, y, cv=8, scoring=['accuracy', 'neg_mean_squared_error'])
print('Accuracy')
print(CV['test_accuracy'])
print('MSE')
print(-1*CV['test_neg_mean_squared_error'])

print('Average Accuracy = ', sum(CV['test_accuracy']) / len(CV['test_accuracy']))
print('Average MSE = ', sum(-1 * CV['test_neg_mean_squared_error']) / len(CV['test_neg_mean_squared_error']))

Accuracy
[0.34782609 0.37391304 0.75652174 0.88695652 0.25217391 0.53913043
 0.73684211 0.70175439]
MSE
[0.65217391 0.62608696 0.24347826 0.11304348 0.74782609 0.46086957
 0.26315789 0.29824561]
Average Accuracy =  0.5743897787948131
Average MSE =  0.4256102212051869


In [33]:
# To find list of accuracy and MSE values
# Without using the sklearn function cross_validate()

n_splits=8
# step 1: randomize the dataset and create k equal size partitions
kf = KFold(n_splits=n_splits)

acc = 0
mse = 0

i = 0 #keep track of batch number
# step 5: iterate k times with a different testing subset
for train_indices, test_indices in kf.split(X):

    # step 2-3: use k-1/k^th partition for the training/testing model
    start_train, stop_train = train_indices[0], train_indices[-1]+1
    start_test, stop_test = test_indices[0], test_indices[-1]+1
    
    # perform the training similar to Q1
    #this was based on the requirements in Q1
    mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (10, 2), max_iter = 600)
    mlp.fit(X[start_train:stop_train], y[start_train:stop_train])
    pred = mlp.predict(X[start_test:stop_test])
    
    # step 4: record the evaluating scores
    i+=1
    acc += accuracy_score(y[start_test:stop_test], pred)
    mse += mean_squared_error(y[start_test:stop_test], pred)
    
    print("\nAccuracy for batch ", i, " : ", accuracy_score(y[start_test:stop_test], pred))
    print("Mean Square Error for batch ", i, " : ", mean_squared_error(y[start_test:stop_test], pred))

# step 6: find the average and select the batch with highest evaluation scores
print('\nAverage Accuracy = ', acc / n_splits)
print('Average MSE = ', mse / n_splits)


Accuracy for batch  1  :  0.34782608695652173
Mean Square Error for batch  1  :  0.6521739130434783

Accuracy for batch  2  :  0.7478260869565218
Mean Square Error for batch  2  :  0.25217391304347825

Accuracy for batch  3  :  0.8434782608695652
Mean Square Error for batch  3  :  0.15217391304347827

Accuracy for batch  4  :  0.8608695652173913
Mean Square Error for batch  4  :  0.1391304347826087

Accuracy for batch  5  :  0.7739130434782608
Mean Square Error for batch  5  :  0.21739130434782608

Accuracy for batch  6  :  0.7391304347826086
Mean Square Error for batch  6  :  0.2608695652173913

Accuracy for batch  7  :  0.7368421052631579
Mean Square Error for batch  7  :  0.2631578947368421

Accuracy for batch  8  :  0.7017543859649122
Mean Square Error for batch  8  :  0.2982456140350877

Average Accuracy =  0.7189549961861175
Average MSE =  0.2794145690312738


In [36]:
y = dataset['HeartDisease']
set_of_classes = y.value_counts().index.tolist()
set_of_classes= pd.DataFrame({'Class': set_of_classes})
y = pd.get_dummies(y)

In [37]:
max_iterations = [500, 750, 1000, 1250, 1500]
hidden_layer_siz = [(5, 7), (7, 13), (13, 10)]
learning_rates = 0.15 * np.arange(1, 3)

param_grid = dict(learning_rate_init = learning_rates, hidden_layer_sizes = hidden_layer_siz, max_iter = max_iterations)
# set model
# mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (23,17,13), max_iter = 500)

# For Grid Search
# grid = GridSearchCV(estimator = mlp, param_grid = param_grid)

# For Random Search
grid = RandomizedSearchCV(estimator = mlp, param_distributions = param_grid, n_iter = 20)

grid.fit(X,y)

print("Optimal Hyper-parameters : ", grid.best_params_)
print("Optimal Accuracy : ", grid.best_score_)

Optimal Hyper-parameters :  {'max_iter': 1250, 'learning_rate_init': 0.3, 'hidden_layer_sizes': (5, 7)}
Optimal Accuracy :  0.7700582086006178


In [38]:
# try again w/better params
mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.3, batch_size = 100, hidden_layer_sizes = (5, 7), max_iter = 1250)
mlp.fit(data_train, class_train)

pred = mlp.predict(data_test)

print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

print(pred[:2])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred))

Accuracy :  0.75
Mean Square Error :  0.24456521739130435
[[0 1]
 [0 1]]
Confusion Matrix for each label : 
[[[72 30]
  [16 66]]

 [[66 16]
  [28 74]]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.69      0.80      0.74        82
           1       0.82      0.73      0.77       102

   micro avg       0.75      0.76      0.76       184
   macro avg       0.75      0.77      0.76       184
weighted avg       0.76      0.76      0.76       184
 samples avg       0.76      0.76      0.76       184

