# Mounting Google Drive and Uploading the Dataset

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /gdrive so files stored there are reachable from this runtime.
drive.mount('/gdrive')

# Name of the project folder inside "My Drive"; interpolated into the %cd magic below.
FOLDERNAME = "CS682_Project"
# Switch the working directory to the project folder so relative paths resolve there.
%cd /gdrive/My\ Drive/$FOLDERNAME

Mounted at /gdrive
/gdrive/My Drive/CS682_Project


In [None]:
# Load the dataset from heart.csv, resolved relative to the current working directory.
df = pd.read_csv("heart.csv")

# Preprocessing the Data

In [None]:
# Preview the first rows to check the column names and the kind of values each holds.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.compose import ColumnTransformer

# Positional indices of the string-valued columns to one-hot encode:
# Sex, ChestPainType, RestingECG, ExerciseAngina and ST_Slope.
categorical_idx = [1, 2, 6, 8, 10]

# Every column except the last one (the HeartDisease label) is a feature.
feature_matrix = np.array(df)[:, :-1]

# One-hot encode the categorical columns (dropping the first level of each);
# all remaining columns are passed through untouched.
column_encoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto', drop='first'), categorical_idx)],
    remainder='passthrough',
    n_jobs=-1,
)
X = column_encoder.fit_transform(feature_matrix)

# Binary target vector.
y = np.array(df['HeartDisease'])

In [None]:
# Number of samples whose HeartDisease label equals 1.
np.count_nonzero(np.array(df['HeartDisease']) == 1)

508

In [None]:
# Preview the encoded feature matrix (wrapped in a DataFrame only for nicer display).
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,1,0,0,1,0,0,0,1,40,140,289,0,172,0.0
1,0,0,1,0,1,0,0,1,0,49,160,180,0,156,1.0
2,1,1,0,0,0,1,0,0,1,37,130,283,0,98,0.0
3,0,0,0,0,1,0,1,1,0,48,138,214,0,108,1.5
4,1,0,1,0,1,0,0,0,1,54,150,195,0,122,0.0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; the fixed random_state keeps the split reproducible.
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Sanity check: feature and label counts should match within the train and test splits.
len(X_trn), len(X_tst), len(y_trn), len(y_tst)

(642, 276, 642, 276)

# Recall and Precision

In [None]:
def recall_precision(test_data, pred):
  """Compute recall and precision for the positive class (label 1).

  Args:
    test_data: Sequence of ground-truth labels; entries equal to 1 are positives.
    pred: Sequence of predicted labels, aligned index-by-index with test_data.

  Returns:
    Tuple (recall, precision), where
      recall    = TP / (TP + FN)
      precision = TP / (TP + FP)
    A ratio whose denominator is zero is reported as 0.0.

  Raises:
    ValueError: If test_data and pred differ in length.
  """
  if len(test_data) != len(pred):
    raise ValueError("test_data and pred must have the same length")

  tp = 0  # true positives:  actual 1, predicted 1
  fn = 0  # false negatives: actual 1, predicted something else
  fp = 0  # false positives: actual not 1, predicted 1

  for actual, predicted in zip(test_data, pred):
    if actual == 1:
      if predicted == 1:
        tp += 1
      else:
        # A positive sample that the model missed is a false NEGATIVE.
        fn += 1
    elif predicted == 1:
      # A negative sample flagged as positive is a false POSITIVE.
      fp += 1

  # Guard against empty denominators (no actual positives / no predicted positives).
  recall = tp / (tp + fn) if (tp + fn) else 0.0
  precision = tp / (tp + fp) if (tp + fp) else 0.0

  return (recall, precision)

# K-Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold

def Kfold_cv_accuracy(X_train, Y_train, model):
    """Estimate a model's accuracy with shuffled 5-fold cross-validation.

    The same `model` object is re-fitted on every fold, so on return it is
    left fitted on the last fold's training portion.

    Args:
        X_train: Feature array; must support indexing with integer index arrays.
        Y_train: Label array aligned with X_train, indexable the same way.
        model: Estimator exposing fit(X, y) and predict(X).

    Returns:
        Mean of the five per-fold validation accuracies.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = []
    for fit_idx, val_idx in splitter.split(X_train):
        model.fit(X_train[fit_idx], Y_train[fit_idx])
        guesses = model.predict(X_train[val_idx])
        truth = Y_train[val_idx]
        # 1 for every correct prediction, 0 otherwise; the mean is the fold accuracy.
        hits = [int(guess == label) for guess, label in zip(guesses, truth)]
        fold_scores.append(np.mean(hits))
    return np.mean(fold_scores)

# Performing K-Nearest Neighbours (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def best_trained_KNN(X_train, Y_train):
    """Choose n_neighbors by cross-validated accuracy and return the refitted KNN.

    Args:
        X_train: Training feature array.
        Y_train: Training labels aligned with X_train.

    Returns:
        A KNeighborsClassifier fitted on all of (X_train, Y_train) using the
        candidate K with the highest Kfold_cv_accuracy score (the first such
        candidate on ties).
    """
    cv_score = {}
    for k in (4, 8, 12, 16, 20, 24):
        candidate = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        cv_score[k] = Kfold_cv_accuracy(X_train, Y_train, candidate)

    best_k = max(cv_score, key=cv_score.get)
    final_model = KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1)
    return final_model.fit(X_train, Y_train)

In [None]:
# Tune K on the training split and refit on all of it.
KNN_model = best_trained_KNN(X_trn, y_trn)

# Accuracy on the training split and on the held-out test split.
knn_trn_acc = KNN_model.score(X_trn, y_trn)
knn_tst_acc = KNN_model.score(X_tst, y_tst)
print(knn_trn_acc, knn_tst_acc)

# recall_precision() tuples for the same two splits.
knn_trn_rp = recall_precision(y_trn, KNN_model.predict(X_trn))
knn_tst_rp = recall_precision(y_tst, KNN_model.predict(X_tst))
print(knn_trn_rp, knn_tst_rp)

0.7367601246105919 0.7065217391304348
(0.7603550295857988, 0.744927536231884) (0.7808219178082192, 0.6993865030674846)


# Performing Support Vector Machine (SVM)

In [None]:
from sklearn.svm import LinearSVC

def best_trained_SVM(X_train, Y_train):
    """Choose LinearSVC's regularization constant C by cross-validation and refit.

    The estimator evaluated during cross-validation is the same type as the
    one returned, so the selected C is actually tuned for the final model.

    Args:
        X_train: Training feature array.
        Y_train: Training labels aligned with X_train.

    Returns:
        A LinearSVC fitted on all of (X_train, Y_train) using the candidate C
        with the highest Kfold_cv_accuracy score (the first such candidate on
        ties).
    """
    accuracy = dict()
    for C in [0.0001, 0.01, 0.1, 1, 10, 100]:
        # Cross-validate LinearSVC itself; scoring a different estimator here
        # would pick a C that says nothing about the model returned below.
        accuracy[C] = Kfold_cv_accuracy(X_train, Y_train, LinearSVC(C=C, random_state=0))
    best_C = max(accuracy, key=accuracy.get)
    return LinearSVC(C=best_C, random_state=0).fit(X_train, Y_train)

In [None]:
# Tune C on the training split and refit on all of it.
SVM_model = best_trained_SVM(X_trn, y_trn)

# Accuracy on the training split and on the held-out test split.
svm_trn_acc = SVM_model.score(X_trn, y_trn)
svm_tst_acc = SVM_model.score(X_tst, y_tst)
print(svm_trn_acc, svm_tst_acc)

# recall_precision() tuples for the same two splits.
svm_trn_rp = recall_precision(y_trn, SVM_model.predict(X_trn))
svm_tst_rp = recall_precision(y_tst, SVM_model.predict(X_tst))
print(svm_trn_rp, svm_tst_rp)

0.8613707165109035 0.8297101449275363
(0.826530612244898, 0.9391304347826087) (0.8118279569892473, 0.9263803680981595)




# Performing Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

def best_trained_LR(X_train, Y_train):
    """Choose LogisticRegression's C by cross-validated accuracy and refit.

    Args:
        X_train: Training feature array.
        Y_train: Training labels aligned with X_train.

    Returns:
        A LogisticRegression fitted on all of (X_train, Y_train) using the
        candidate C with the highest Kfold_cv_accuracy score (the first such
        candidate on ties).
    """
    def make_lr(c_value):
        # Single place for the shared hyper-parameters of every candidate and the final model.
        return LogisticRegression(C=c_value, random_state=0, max_iter=1000, multi_class='ovr', n_jobs=-1)

    cv_score = {}
    for c_value in (0.0001, 0.01, 0.1, 1, 10, 100):
        cv_score[c_value] = Kfold_cv_accuracy(X_train, Y_train, make_lr(c_value))

    best_c = max(cv_score, key=cv_score.get)
    return make_lr(best_c).fit(X_train, Y_train)

In [None]:
# Tune C on the training split and refit on all of it.
LR_model = best_trained_LR(X_trn, y_trn)

# Accuracy on the training split and on the held-out test split.
lr_trn_acc = LR_model.score(X_trn, y_trn)
lr_tst_acc = LR_model.score(X_tst, y_tst)
print(lr_trn_acc, lr_tst_acc)

# recall_precision() tuples for the same two splits.
lr_trn_rp = recall_precision(y_trn, LR_model.predict(X_trn))
lr_tst_rp = recall_precision(y_tst, LR_model.predict(X_tst))
print(lr_trn_rp, lr_tst_rp)

0.8800623052959502 0.8333333333333334
(0.8828571428571429, 0.8956521739130435) (0.8461538461538461, 0.8773006134969326)


# Performing Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes has no hyper-parameter tuned here, so fit it directly.
NB_model = GaussianNB()
NB_model.fit(X_trn, y_trn)

# Accuracy on the training split and on the held-out test split.
nb_trn_acc = NB_model.score(X_trn, y_trn)
nb_tst_acc = NB_model.score(X_tst, y_tst)
print(nb_trn_acc, nb_tst_acc)

# recall_precision() tuples for the same two splits.
nb_trn_rp = recall_precision(y_trn, NB_model.predict(X_trn))
nb_tst_rp = recall_precision(y_tst, NB_model.predict(X_tst))
print(nb_trn_rp, nb_tst_rp)

0.8753894080996885 0.8514492753623188
(0.8774928774928775, 0.8927536231884058) (0.8630952380952381, 0.8895705521472392)


# Performing Neural Networks: Multi-layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier
# Two hidden layers of 10 ReLU units each; seeded so the fit is repeatable.
MLP_model = MLPClassifier(
    hidden_layer_sizes=(10, 10),
    activation="relu",
    random_state=0,
    max_iter=500,
)
MLP_model.fit(X_trn, y_trn)

# Accuracy on the training split and on the held-out test split.
mlp_trn_acc = MLP_model.score(X_trn, y_trn)
mlp_tst_acc = MLP_model.score(X_tst, y_tst)
print(mlp_trn_acc, mlp_tst_acc)

# recall_precision() tuples for the same two splits.
mlp_trn_rp = recall_precision(y_trn, MLP_model.predict(X_trn))
mlp_tst_rp = recall_precision(y_tst, MLP_model.predict(X_tst))
print(mlp_trn_rp, mlp_tst_rp)

0.8753894080996885 0.855072463768116
(0.8552278820375335, 0.9246376811594202) (0.8514285714285714, 0.9141104294478528)


# Performing 1-Dimensional Convolutional Neural Network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Single 1-D convolution layer: one input channel, one output channel, width-5 kernel, no bias.
net = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5, bias=False)
# Adam optimizer over the convolution's parameters with learning rate 0.01.
optimizer=optim.Adam(net.parameters(), lr=0.01)

# Convert the training split to float tensors.
# NOTE(review): `y` rebinds the module-level label array `y` built during preprocessing;
# re-running an earlier cell that reads `y` (e.g. the train/test split) after this cell
# would pick up this tensor instead — consider a distinct name.
# NOTE(review): nn.Conv1d expects an explicit channel dimension; `x` presumably needs an
# extra axis before being passed to `net` — confirm against the training loop.
x = torch.tensor(list(X_trn), dtype = torch.float)
y = torch.tensor(list(y_trn), dtype = torch.float)

In [None]:
def best_trained_DT(X_trn, Y_trn):
    """Choose a decision tree's max_depth by cross-validated accuracy and refit.

    Args:
        X_trn: Training feature array.
        Y_trn: Training labels aligned with X_trn.

    Returns:
        A DecisionTreeClassifier fitted on all of (X_trn, Y_trn) using the
        candidate depth with the highest Kfold_cv_accuracy score (the first
        such candidate on ties).
    """
    # Imported here so the function works on a fresh kernel even when no
    # earlier cell has imported the class.
    from sklearn.tree import DecisionTreeClassifier

    accuracy = dict()
    for depth in [1, 3, 6, 9, 12, 15, 18, 21]:
        # Fixed seed: tree construction breaks ties between equally good splits
        # at random, so without it the selected depth can vary between runs.
        accuracy[depth] = Kfold_cv_accuracy(
            X_trn, Y_trn, DecisionTreeClassifier(max_depth=depth, random_state=0)
        )
    best_depth = max(accuracy, key=accuracy.get)
    return DecisionTreeClassifier(max_depth=best_depth, random_state=0).fit(X_trn, Y_trn)
