In [None]:
# for data analysis
import pandas as pd 
import numpy as np

# for data visuals
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline 

# for machine learning 
from sklearn.ensemble import RandomForestClassifier as RFclassifier
from sklearn.neighbors import KNeighborsClassifier as KNclassifier 
from sklearn.svm import SVC as SVMclassifier
from sklearn.naive_bayes import GaussianNB as NBclassifier
from sklearn.preprocessing import MinMaxScaler

### Step 1: Reading data


In [None]:
# Load the pre-split train and test CSVs (paths are relative to the working directory).
trainPath = 'processed_cleveland_data_train.csv'
testPath = 'processed_cleveland_data_test.csv'
trainData, testData = pd.read_csv(trainPath), pd.read_csv(testPath)

# Collapse every 'num' value greater than 1 down to 1, because this is a
# binary classification problem.
def replace_predict(df):
    """Binarise the ``num`` target column of ``df`` in place.

    Every value greater than 1 is overwritten with 1. Values that are not
    greater than 1 (0, 1, missing values) are left untouched. The earlier
    implementation only mapped the literals 1-6, so any larger code would
    have slipped through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``num`` column. It is modified in place.

    Returns
    -------
    None
    """
    # Boolean-mask assignment through .loc writes into df itself (no copy).
    df.loc[df['num'] > 1, 'num'] = 1


    
# Binarise the target in both splits before separating features from labels.
replace_predict(trainData)
replace_predict(testData)

# Separate the feature columns from the 'num' target.
Xtrain = trainData.drop(['num'], axis=1)
Ytrain = trainData['num']

Xtest = testData.drop(['num'], axis=1)
Ytest = testData['num']

# Rescale every feature to [0, 1]. The scaler learns its min/max from the
# training split only and is then applied unchanged to the test split.
# Refitting on the test data (as this cell used to do) puts the two splits on
# different scales and leaks test-set statistics into the evaluation.
scaler = MinMaxScaler(feature_range=(0, 1))
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)





### Step 2: Define the model
 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
# Create all the models with their default hyper-parameters.
# A random forest is stochastic, so its seed is pinned to keep the reported
# metrics reproducible across Restart & Run All.
RANDOM_STATE = 42

rfModel = RFclassifier(random_state=RANDOM_STATE)
knModel = KNclassifier()
svmModel = SVMclassifier()
nbModel = NBclassifier()
    



### Step 3: Fit the Model

In [None]:
# Train every classifier on the same scaled training split.
for clf in (rfModel, knModel, svmModel):
    clf.fit(Xtrain, Ytrain)
# Kept as the cell's final expression so the notebook still echoes the fitted estimator.
nbModel.fit(Xtrain, Ytrain)



### Step 4a: Evaluate model using accuracy score

In [None]:
from sklearn.metrics import confusion_matrix
# Collect test-set predictions and accuracy for each fitted classifier.
# The order of `models` (rf, knn, svm, nb) is relied on by later cells that
# index into predictionsAr.
finalaccu = []
predictionsAr = []
probabilities = []
models = [rfModel, knModel, svmModel, nbModel]
for clf in models:
    predictions = clf.predict(Xtest)
    print(clf)
    print(predictions)
    accuracy = accuracy_score(Ytest, predictions)
    finalaccu.append(accuracy)
    predictionsAr.append(predictions)
print(finalaccu)



In [None]:
methods = ["Random Forest", "KNN", "SVM", "Naive Bayes"]
# Derive the percentages from the scores computed in the evaluation cell
# (finalaccu follows the same rf, knn, svm, nb order) instead of pasting
# numbers from an earlier run, which silently go stale.
accuracy = [round(100 * score, 2) for score in finalaccu]
colors = ["purple", "green", "orange", "#CFC60E"]

sns.set_style("whitegrid")
plt.figure(figsize=(8, 5))
ax = sns.barplot(x=methods, y=accuracy, palette=colors)
ax.set_yticks(np.arange(0, 101, 10))  # stop at 101 so the 100 % tick is included
ax.set_ylim(0, 100)
ax.set_ylabel("Accuracy Percent")
ax.set_xlabel("Algorithms")
ax.set_title("Test-set accuracy by algorithm")
plt.show()

### Step 4b : Evaluate model using confusion matrix 


In [None]:
from sklearn.metrics import confusion_matrix

# One confusion matrix per classifier; predictionsAr is ordered rf, knn, svm, nb.
cm_rf, cm_knn, cm_svm, cm_nb = (
    confusion_matrix(Ytest, modelPredictions) for modelPredictions in predictionsAr
)

In [None]:
# Draw the four confusion matrices on a compact 2x2 grid. The previous layout
# placed them in slots 2, 3, 4 and 6 of a 2x3 grid, leaving two blank panels.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle("Confusion Matrixes", fontsize=24)
fig.subplots_adjust(wspace=0.4, hspace=0.4)

panels = [
    ("K Nearest Neighbors Confusion Matrix", cm_knn),
    ("Support Vector Machine Confusion Matrix", cm_svm),
    ("Naive Bayes Confusion Matrix", cm_nb),
    ("Random Forest Confusion Matrix", cm_rf),
]
for ax, (panelTitle, matrix) in zip(axes.ravel(), panels):
    sns.heatmap(matrix, annot=True, cmap="Blues", fmt="d", cbar=False, ax=ax)
    ax.set_title(panelTitle)
    # sklearn's confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
plt.show()

### Step 4c: Evaluate model using ROC/AUC

In [None]:
# roc curve
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from matplotlib import pyplot
from sklearn.metrics import roc_auc_score


def _plot_roc(model, name, X, y):
    """Plot the ROC curve of a fitted classifier and return its ROC AUC.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    name : str
        Label used in the plot title and legend.
    X : array-like
        Features to score.
    y : array-like
        True binary labels for ``X``.

    Returns
    -------
    float
        Area under the ROC curve, computed from the positive-class probabilities.
    """
    # keep probabilities for the positive outcome only
    positiveProbs = model.predict_proba(X)[:, 1]
    # calculate roc curve (the thresholds are not needed for the plot)
    fpr, tpr, _ = roc_curve(y, positiveProbs)
    # plot the no-skill diagonal, then the model's curve
    pyplot.plot([0, 1], [0, 1], linestyle='--', label='No skill')
    pyplot.plot(fpr, tpr, label=name)
    pyplot.title("ROC curve - " + name)
    pyplot.xlabel("False positive rate")
    pyplot.ylabel("True positive rate")
    pyplot.legend()
    pyplot.show()
    return roc_auc_score(y, positiveProbs)


# Same three models as before, driven by one helper instead of three pasted copies.
# NOTE(review): SVM is still left out, presumably because SVC() built without
# probability=True offers no predict_proba. Please confirm.
for rocName, rocModel in [("Random Forest", rfModel), ("KNN", knModel), ("Naive Bayes", nbModel)]:
    print(rocName, "ROC AUC:", _plot_roc(rocModel, rocName, Xtest, Ytest))



### Step 4d: Evaluate model using confidence and probability

In [None]:
from sklearn.metrics import log_loss
finalconf = []
# Only estimators that expose predict_proba can be scored with log loss.
# The previous list referenced adModel, bgModel, etModel and gbModel, which are
# not defined anywhere in the visible notebook, so the cell raised NameError.
# NOTE(review): svmModel is omitted on the assumption that SVC() without
# probability=True has no predict_proba. Please confirm.
probaModels = [rfModel, knModel, nbModel]
for clf in probaModels:
    confidence = clf.predict_proba(Xtest)
    probs = confidence[:, 1]  # probability of the positive class
    loss = log_loss(Ytest.values, probs)
    print(type(clf).__name__, loss)
    finalconf.append(loss)

### Step 4e: Evaluate model using Matthews Correlation Coefficient

In [None]:
from sklearn.metrics import matthews_corrcoef
# Print the Matthews correlation coefficient for each of the four stored prediction arrays.
for idx in range(4):
    matthewsCoeff = matthews_corrcoef(Ytest, predictionsAr[idx])
    print(matthewsCoeff)


### Step 5: Feature Importance Graph

In [None]:
# Feature names, presumably in the same column order as the training features.
# TODO: confirm against the CSV header.
features = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]

importances = rfModel.feature_importances_
print(importances)
# argsort is ascending, so the most important feature ends up as the top bar
indices = np.argsort(importances)
print(indices)

barPositions = range(len(indices))
plt.figure(1)
plt.title('Feature Importances')
plt.barh(barPositions, importances[indices], color='b', align='center')
plt.yticks(barPositions, [features[i] for i in indices])
plt.xlabel('Relative Importance')

In [None]:
methods = ["Random Forest", "KNN", "SVM", "Naive Bayes"]
# Compute the coefficients from the stored predictions (predictionsAr is in
# rf, knn, svm, nb order) rather than hard-coding numbers from an old run.
MCC = [round(matthews_corrcoef(Ytest, preds), 2) for preds in predictionsAr]
colors = ["purple", "green", "blue", "#CFC60E"]

sns.set_style("whitegrid")
plt.figure(figsize=(8, 5))
ax = sns.barplot(x=methods, y=MCC, palette=colors)
# linspace yields 21 evenly spaced ticks from -1 to 1 inclusive;
# arange(-1, 1, 0.1) stopped short of the 1.0 tick.
ax.set_yticks(np.linspace(-1, 1, 21))
ax.set_ylim(-1, 1)
ax.set_ylabel("Matthews Correlation Coefficient")
ax.set_xlabel("Algorithms")
ax.set_title("Test-set Matthews correlation coefficient by algorithm")
plt.show()

### Step 6: Cross Validation

In [None]:

# Cross Validation with KFold
from sklearn.model_selection import KFold

# Load the full (unsplit) dataset and binarise its target for cross-validation.
alldata = pd.read_csv('processed_cleveland_data.csv')
replace_predict(alldata)

# Target first, then every remaining column as features.
Yall = alldata['num']
Xall = alldata.drop(columns=['num'])

def run_kfold(model, n_splits=5):
    """Estimate ``model``'s accuracy with K-fold cross-validation on Xall / Yall.

    The module-level ``Xall`` (features) and ``Yall`` (target) are split into
    ``n_splits`` consecutive folds. For each fold an unfitted copy of
    ``model`` is trained on the remaining folds and scored on the held-out
    one. Per-fold accuracy is printed, followed by the mean over all folds.

    Parameters
    ----------
    model : scikit-learn estimator
        Classifier to evaluate. It is cloned for every fold, so the object
        passed in is never refitted.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Mean accuracy across the folds.
    """
    # Local import keeps this cell self-contained; sklearn is already a dependency.
    from sklearn.base import clone

    kf = KFold(n_splits=n_splits)
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(Xall), start=1):
        foldXtrain, foldXtest = Xall.values[train_index], Xall.values[test_index]
        foldYtrain, foldYtest = Yall.values[train_index], Yall.values[test_index]
        # Fit a fresh copy so the caller's (possibly already fitted) estimator
        # is not overwritten by the cross-validation fits.
        foldModel = clone(model)
        foldModel.fit(foldXtrain, foldYtrain)
        accuracy = accuracy_score(foldYtest, foldModel.predict(foldXtest))
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    # Report the mean once, after all folds, instead of a running mean per fold.
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))
    return mean_outcome
        
# `model` is not defined anywhere in the visible notebook, so the previous
# call raised NameError. Cross-validate a fresh, unfitted instance of each
# classifier compared above instead; new instances leave the already-fitted
# rfModel / knModel / svmModel / nbModel untouched.
for cvName, cvModel in [
    ("Random Forest", RFclassifier()),
    ("KNN", KNclassifier()),
    ("SVM", SVMclassifier()),
    ("Naive Bayes", NBclassifier()),
]:
    print(cvName)
    run_kfold(cvModel)
    