> # Introduction

In this study, I will first analyze and visualize the Heart Disease UCI dataset. Then I will try to predict whether or not a patient has heart disease by applying different ML algorithms to the data. Finally, I will compare the results obtained from the different algorithms I used.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# one path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> # Read Data

In [None]:
# Load the heart-disease table from the attached Kaggle dataset, then preview
# its first rows (the trailing expression is what the notebook cell displays).
heart_csv = '../input/heart-disease-uci/heart.csv'
data = pd.read_csv(heart_csv)
data.head()

In [None]:
# Preview the last rows of the table.
data.tail()

In [None]:
# Print a summary of the table: column names, dtypes, non-null counts and memory usage.
data.info()

**Attribute Information in Data:**
*     **age:** age of the person
*     **sex:** 0:female and 1:male
*     **cp:** chest pain type (4 values)
*     **trestbps:** resting blood pressure
*     **chol:** serum cholesterol in mg/dl
*     **fbs:** fasting blood sugar > 120 mg/dl
*     **restecg:** resting electrocardiographic results (values 0,1,2)
*     **thalach:** maximum heart rate achieved
*     **exang:** exercise induced angina
*     **oldpeak:** ST depression induced by exercise relative to rest
*     **slope:** the slope of the peak exercise ST segment
*     **ca:** number of major vessels (0-3) colored by fluoroscopy
*     **thal:** 3 = normal; 6 = fixed defect; 7 = reversible defect

> # Data Preprocessing

In [None]:
# Separate the label column from the predictors.
y_data = data["target"]                 # class label
x_data = data.drop(columns=["target"])  # every remaining column is a feature

In [None]:
# Scaling: standardize every feature column to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# fit_transform() returns a bare array, so the original column labels and row
# index are re-attached here. pd.concat(axis=1) aligns on the index; a frame
# with a fresh RangeIndex would only line up with `y_data` when `data` happens
# to use the default index.
x_data_scaled = pd.DataFrame(
    sc.fit_transform(x_data),
    columns=x_data.columns,
    index=x_data.index,
)

# The features already carry their real names, so no positional relabelling is
# needed; selecting `data.columns` only restores the original column order.
scaled_data = pd.concat([x_data_scaled, y_data], axis=1)[data.columns]

# NOTE(review): the scaler is fitted on all rows, including the ones that are
# later held out as the test set. Consider fitting it on the training split only.

> # Data Exploration

In [None]:
import plotly as py
from plotly.offline import init_notebook_mode, iplot
# Initialise Plotly's offline notebook mode before any iplot() call below.
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px


**0-) Correlation Map:**

In [None]:
# Heatmap of the pairwise column correlations, with the coefficient written in
# every cell.
plt.figure(figsize=(8, 8))
crr = data.corr()
sns.heatmap(data=crr, annot=True)
plt.show()

**1-) Total Distribution by Gender**

In [None]:
# Pie chart of how many records belong to each sex.
#
# value_counts() orders its result by frequency, not by category code, so the
# slice labels are looked up from the codes it returns. A fixed label list
# attaches the wrong name to each slice whenever the group listed first is not
# the most frequent one.
sex_names = {0: "Female", 1: "Male"}  # coding from the attribute list: 0 = female, 1 = male
sex_counts = data.sex.value_counts()

gender_pie = {
    "data": [
        {
            "values": sex_counts.tolist(),
            # Unknown codes fall back to their raw value instead of raising.
            "labels": [sex_names.get(code, str(code)) for code in sex_counts.index],
            "domain": {"x": [0, .8], "y": [0, .8]},
            "name": "Total Distribution by Gender",
            "hoverinfo": "label+percent+name+value",
            "hole": .1,
            "type": "pie",
        },
    ],
    "layout": {
        # The chart title is drawn as an annotation above the pie.
        "annotations": [
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Total Distribution by Gender",
                "x": 0.4,
                "y": 1,
            },
        ],
    },
}

iplot(gender_pie)

**2-) Age Histogram**

In [None]:
# Histogram of the "age" column, with the x axis limited to 29-79.
fig = px.histogram(data_frame=data, x="age", nbins=10, range_x=[29, 79])
fig.show()

**3-) Heart Disease vs. No Heart Disease**

In [None]:
# Pie chart of records with and without heart disease.
#
# value_counts() orders its result by frequency, so the labels are derived from
# the class codes it returns rather than from a fixed list that is only right
# when the diseased class happens to be the larger one.
# This notebook treats target == 1 as "has heart disease" (see the
# `data[data.target==1]` patient subset).
target_names = {1: "Have heart disease", 0: "No heart disease"}
target_counts = data.target.value_counts()

disease_pie = {
    "data": [
        {
            "values": target_counts.tolist(),
            # Unknown codes fall back to their raw value instead of raising.
            "labels": [target_names.get(code, str(code)) for code in target_counts.index],
            "domain": {"x": [0, .8], "y": [0, .8]},
            "name": " Have Disease / not Have",
            "hoverinfo": "label+percent+name+value",
            "hole": .1,
            "type": "pie",
        },
    ],
    "layout": {
        # The chart title is drawn as an annotation above the pie.
        "annotations": [
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": " Have Disease / not Have",
                "x": 0.3,
                "y": 1,
            },
        ],
    },
}

iplot(disease_pie)

**4-) Gender Distribution of those with Heart Disease**

In [None]:
# Subset of records flagged as having heart disease (target == 1); it is reused
# by the age histogram of patients.
data_patient = data[data.target == 1]

# Pie chart of the sex split among those patients.
# value_counts() orders its result by frequency, so labels are looked up from
# the codes it returns instead of assuming which sex is more common.
patient_sex_names = {0: "Female", 1: "Male"}  # coding from the attribute list: 0 = female, 1 = male
patient_sex_counts = data_patient.sex.value_counts()

patient_gender_pie = {
    "data": [
        {
            "values": patient_sex_counts.tolist(),
            # Unknown codes fall back to their raw value instead of raising.
            "labels": [
                patient_sex_names.get(code, str(code))
                for code in patient_sex_counts.index
            ],
            "domain": {"x": [0, .8], "y": [0, .8]},
            "name": "Gender Distribution of those with Heart Disease",
            "hoverinfo": "label+percent+name+value",
            "hole": .1,
            "type": "pie",
        },
    ],
    "layout": {
        # The chart title is drawn as an annotation above the pie.
        "annotations": [
            {
                "font": {"size": 20},
                "showarrow": False,
                "text": "Gender Distribution of those with Heart Disease",
                "x": 0.4,
                "y": 1,
            },
        ],
    },
}

iplot(patient_gender_pie)

**5-) Age Histogram of Those with Heart Disease**

In [None]:
# Histogram of the "age" column for the target == 1 subset, with the x axis
# limited to 29-79.
fig = px.histogram(data_frame=data_patient, x="age", range_x=[29, 79])
fig.show()

**6-) Cholesterol Values Swarmplot (Distribution)**

In [None]:
# Swarm plot of the cholesterol column, one column of points per target class.
plt.figure(figsize=(10, 7))
sns.swarmplot(data=data, x='target', y='chol', size=7)
plt.xticks(rotation=45)
plt.ylabel('CHOLESTEROL')
plt.xlabel('DISEASES STATUS (Target)')
plt.show()

> # Train Test Split

Let's split the data into two for training and testing:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; the fixed seed makes the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data_scaled,
    y_data,
    random_state=0,
    test_size=0.2,
)


> # Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the training split. fit() returns
# the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(random_state=0).fit(x_train, y_train)

# Class predictions for the held-out rows; reused by the confusion matrix and
# the model comparison.
y_pred_lr = log_reg.predict(x_test)

# Mean accuracy on the test split, as a percentage.
acc_log_reg = 100 * log_reg.score(x_test, y_test)
print("Test Accuracy = {:.3f} %".format(acc_log_reg))

*Confusion Matrix*

In [None]:
# Confusion matrix of the logistic-regression predictions
# (rows: true class, columns: predicted class).
from sklearn.metrics import confusion_matrix

lr_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

plt.figure(figsize=(3, 3))
sns.heatmap(lr_cm, annot=True, cbar=False)
plt.title("Confusion Matrix Map")
plt.show()

> # Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# Train an RBF-kernel support vector classifier on the training split.
svc = SVC(kernel='rbf', random_state=0).fit(x_train, y_train)

# Class predictions for the held-out rows; reused by the confusion matrix and
# the model comparison.
y_pred_svm = svc.predict(x_test)

# Mean accuracy on the test split, as a percentage.
acc_svc = 100 * svc.score(x_test, y_test)
print("Test Accuracy = {:.3f} %".format(acc_svc))

*Confusion Matrix*

In [None]:
# Confusion matrix of the SVM predictions
# (rows: true class, columns: predicted class).
svm_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)

plt.figure(figsize=(3, 3))
sns.heatmap(svm_cm, annot=True, cbar=False)
plt.title("Confusion Matrix Map")
plt.show()

> # Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Train a Bernoulli naive Bayes classifier on the training split.
# NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0), so on
# the standardized features each value is reduced to "above / not above the
# column mean" - confirm this is the intended Naive Bayes variant.
bnb = BernoulliNB().fit(x_train, y_train)

# Class predictions for the held-out rows; reused by the confusion matrix and
# the model comparison.
y_pred_bnb = bnb.predict(x_test)

# Mean accuracy on the test split, as a percentage.
acc_bnb = 100 * bnb.score(x_test, y_test)
print("Test Accuracy = {:.3f} %".format(acc_bnb))

*Confusion Matrix*

In [None]:
# Confusion matrix of the Naive Bayes predictions
# (rows: true class, columns: predicted class).
# Named `bnb_cm` to follow the `<model>_cm` convention used by the other
# confusion-matrix cells.
bnb_cm = confusion_matrix(y_test, y_pred_bnb)
svm_bnb = bnb_cm  # previous (copy-pasted) name, kept so existing references still work

plt.figure(figsize=(3, 3))
plt.title("Confusion Matrix Map")
sns.heatmap(bnb_cm, annot=True, cbar=False)
plt.show()

> # K-Nearest Neighbors (K-NN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train an 8-nearest-neighbours classifier (Minkowski distance) on the training split.
knn = KNeighborsClassifier(metric="minkowski", n_neighbors=8).fit(x_train, y_train)

# Class predictions for the held-out rows; reused by the model comparison.
y_pred_knn = knn.predict(x_test)

# Mean accuracy on the test split, as a percentage.
acc_knn = 100 * knn.score(x_test, y_test)
print("Test Accuracy = {:.3f} %".format(acc_knn))

*Confusion Matrix*

In [None]:
# Confusion matrix of the K-NN predictions
# (rows: true class, columns: predicted class).
# It is built from `y_pred_knn`; using the logistic-regression predictions here
# would merely redraw the logistic-regression matrix under the K-NN heading.
knn_cm = confusion_matrix(y_test, y_pred_knn)

plt.figure(figsize=(3, 3))
plt.title("Confusion Matrix Map")
sns.heatmap(knn_cm, annot=True, cbar=False)
plt.show()

> # Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree (entropy criterion, random splitter, fixed seed) on the
# training split.
dtc = DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    random_state=0,
).fit(x_train, y_train)

# Class predictions for the held-out rows; reused by the confusion matrix and
# the model comparison.
y_pred_dtc = dtc.predict(x_test)

# Mean accuracy on the test split, as a percentage.
acc_dtc = 100 * dtc.score(x_test, y_test)
print("Test Accuracy = {:.3f} %".format(acc_dtc))

*Confusion Matrix*

In [None]:
# Confusion matrix of the decision-tree predictions
# (rows: true class, columns: predicted class).
dtc_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_dtc)

plt.figure(figsize=(3, 3))
sns.heatmap(dtc_cm, annot=True, cbar=False)
plt.title("Confusion Matrix Map")
plt.show()

> # Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 10-tree random forest (fixed seed) on the training split.
rf = RandomForestClassifier(random_state=0, n_estimators=10).fit(x_train, y_train)

# Class predictions for the held-out rows; reused by the confusion matrix and
# the model comparison.
y_pred_rf = rf.predict(x_test)

# Mean accuracy on the test split, as a percentage.
acc_rf = 100 * rf.score(x_test, y_test)
print("Test Accuracy = {:.3f} %".format(acc_rf))

*Confusion Matrix*

In [None]:
# Confusion matrix of the random-forest predictions
# (rows: true class, columns: predicted class).
# Named `rf_cm` to follow the `<model>_cm` convention used by the other
# confusion-matrix cells.
rf_cm = confusion_matrix(y_test, y_pred_rf)
svm_rf = rf_cm  # previous (copy-pasted) name, kept so existing references still work

plt.figure(figsize=(3, 3))
plt.title("Confusion Matrix Map")
sns.heatmap(rf_cm, annot=True, cbar=False)
plt.show()

> # Comparing ML Models

In [None]:
# Collect, per model, the ROC curve points and the AUC (as a percentage), plus
# the display names and test accuracies used by the comparison charts.
from sklearn.metrics import roc_curve, auc

model_list=["Logistic Regression", "Support Vector Machine", "Naive Bayes","K-NN","Decision Tree","Random Forest"]
accuracy_values=[acc_log_reg,acc_svc,acc_bnb,acc_knn,acc_dtc,acc_rf]

# Hard class predictions, in the same order as `model_list` (kept for reference).
y_preds=[y_pred_lr,y_pred_svm,y_pred_bnb,y_pred_knn,y_pred_dtc,y_pred_rf]

# Fitted estimators, in the same order as `model_list`.
fitted_models=[log_reg,svc,bnb,knn,dtc,rf]


def _positive_class_scores(model, features):
    """Return a continuous score for the positive class (label 1) per row.

    Uses the probability of class 1 when the estimator exposes
    ``predict_proba``; otherwise falls back to ``decision_function`` (the case
    for an SVC that was not fitted with ``probability=True``).
    """
    if hasattr(model, "predict_proba"):
        # Columns follow the sorted class labels, so index 1 is class 1.
        return model.predict_proba(features)[:, 1]
    return model.decision_function(features)


fp_rate=[]
tp_rate=[]
thresholds=[]
roc_auc=[]
# roc_curve() sweeps a threshold over the scores it is given. Feeding it hard
# 0/1 predictions leaves a single operating point per model, so the curves and
# AUC values must be computed from continuous scores instead.
for model in fitted_models:
    fp,tp,th = roc_curve(y_test, _positive_class_scores(model, x_test))
    fp_rate.append(fp)
    tp_rate.append(tp)
    thresholds.append(th)
    roc_auc.append(100*auc(fp,tp))

**ROC (Receiver Operating Characteristic) Curves**

In [None]:
# Draw one ROC curve per model on a shared figure.
fig = go.Figure()

# Diagonal reference line: the ROC curve of a classifier that guesses at random.
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                         mode='lines',
                         name='Chance',
                         line=dict(color='royalblue', width=4, dash='dash')))

# The three lists are parallel, so iterate over them in lockstep rather than
# by index.
for model_name, fpr, tpr in zip(model_list, fp_rate, tp_rate):
    fig.add_trace(go.Scatter(x=fpr, y=tpr,
                             mode='lines+markers',
                             name=model_name))

fig.update_layout(xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate')
fig.show()

**Accuracy and AUC (Area Under Curve) Scores**

In [None]:
# Grouped bar chart comparing each model's test accuracy with its AUC.
figbar = go.Figure()
figbar.add_trace(go.Bar(name='Accuracy', x=model_list, y=accuracy_values))
figbar.add_trace(go.Bar(name='AUC', x=model_list, y=roc_auc))

# Place the two bars of every model side by side.
figbar.update_layout(barmode='group')
figbar.show()