In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For data visualization
import matplotlib.pyplot as plt
import matplotlib as mpl

import seaborn as sns; sns.set()

from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# For statistics, preprocessing and ML
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# `data` keeps the frame exactly as read from disk; `df` is the working copy
# used by the cells that follow.
data = pd.read_csv(
    "/kaggle/input/heart-disease-cleveland-uci/heart_cleveland_upload.csv"
)
df = data.copy()

# Purpose of the study

### We would like to find out the best model that would predict whether a patient has heart disease or not based on the given independent variables. 
### We will use the main classification approaches since we have a categorical dependent variable and compare them based on their accuracy scores. 

### For this purpose, the steps below will be followed in this study;

- Have a general idea about the data set.
- Use necessary methods to clean and prepare the data for analysis.
- Conduct Exploratory Data Analysis (EDA) and visualization.
- Pre-process the data.
- Conduct the main classification analyses, check the accuracy scores and compare them.

# General Information

__Context__

- This dataset is already presented in https://www.kaggle.com/ronitf/heart-disease-uci. However, there are some descriptions that may lead to misconceptions and inconsistencies regarding a couple of observations compared to the original data presented in https://archive.ics.uci.edu/ml/datasets/Heart+Disease.
- These discussions can be found in https://www.kaggle.com/ronitf/heart-disease-uci/discussion/105877. 
- In this framework, I have decided to use the revised dataset presented in https://www.kaggle.com/cherngs/heart-disease-cleveland-uci. More detailed explanation of the features is also given along with this revised dataset which is also stated below.

__Content__
There are 13 attributes and our target variable as condition;
1. __age__: age in years
1. __sex__: sex (1 = male; 0 = female)
1. __cp__: chest pain type
    - Value 0: typical angina
    - Value 1: atypical angina
    - Value 2: non-anginal pain
    - Value 3: asymptomatic
1. __trestbps__: resting blood pressure (in mm Hg on admission to the hospital)
1. __chol__: serum cholesterol in mg/dl
1. __fbs__: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
1. __restecg__: resting electrocardiographic results
    - Value 0: normal
    - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
1. __thalach__: maximum heart rate achieved
1. __exang__: exercise induced angina (1 = yes; 0 = no)
1. __oldpeak__: ST depression induced by exercise relative to rest
1. __slope__: the slope of the peak exercise ST segment
    - Value 0: upsloping
    - Value 1: flat
    - Value 2: downsloping
1. __ca__: number of major vessels (0-3) colored by fluoroscopy
1. __thal__: 0 = normal; 1 = fixed defect; 2 = reversible defect
1. __condition__: 0 = no disease, 1 = disease

In [None]:
# Show the first and the last five rows, one table after the other.
display(df.head(), df.tail())

### We have 297 observations and 14 features.

In [None]:
# Print the column names, dtypes and non-null counts of the working frame.
df.info()

### We do not have any nan values.

In [None]:
# Number of missing values in each column.
df.isna().sum()

### We check the general statistical info of the data.

In [None]:
# Summary statistics, transposed so that each feature is a row.
# Uses the working copy `df` like every other exploration cell (the raw `data`
# frame holds the same values at this point).
df.describe().T

### We check the number of unique values in each feature.

In [None]:
# Number of distinct values per column.
df.nunique()

### We check the correlation between the variables. When we look at the correlation between the independent variables and the target variable, only thal has a score greater than 0.5.

In [None]:
# Pairwise correlation matrix of all columns (pandas default method).
df.corr()

### We display the countplots of discrete variables. 
- We have more males (1) in our observations.
- We have asymptomatic (3) as the most frequent chest pain type, non-anginal pain (2) comes the second. 
- We have more observations with fasting blood sugar < 120 mg/dl (0).
- We have normal (0) as the most frequent resting electrocardiographic results, showing probable or definite left ventricular hypertrophy by Estes' criteria (2) comes the second. 
- We have more observations with no exercise induced angina (0).
- We have upsloping (0) as the most frequent slope, flat (1) comes the second.
- We have 0 as the most frequent number of major vessels colored by fluoroscopy, 1 comes the second.
- We have normal (0) as the most frequent category in thal, reversible defect (2) comes the second. 
- Finally, we have more patients without a heart condition.

In [None]:
# Keep only the discrete columns by dropping the continuous ones, then draw
# one count plot per remaining column, each in its own numbered figure.
continuous_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
df1 = df.drop(columns=continuous_columns)

for figure_number, column in enumerate(df1.columns):
    plt.figure(figure_number)
    sns.countplot(x=column, data=df1)
    plt.title(column, color='blue', fontsize=15)

### Countplots with heart condition as the second factor. 
- Males have more heart condition than females.
- It is interesting to observe that patients who have asymptomatic chest pain type have higher heart condition compared to the other cp types. Then, chest pain type may not be a good indicator of heart condition.
- According to the resting electrocardiographic results, patient with the value of 1 and 2 tend to have higher heart condition.
- Patients who have exercise induced angina tend to have more heart condition.
- Regarding the slope of the peak exercise ST segment, patients with flat slope tend to have higher heart condition.
- It can be seen that the proportion of heart condition increases as the number of major vessels (0-3) colored by fluoroscopy increases.
- Regarding thal, patients with reversable defect tend to have higher heart condition.

In [None]:
# (column, title) pairs: one count plot per discrete feature, with the bars
# split by the target `condition`.
discrete_feature_titles = [
    ("sex", 'Sex and Heart Condition'),
    ("cp", 'Chest Pain Type and Heart Condition'),
    ("fbs", 'Fasting Blood Sugar and Heart Condition'),
    ("restecg", 'Resting Electrocardiographic Results and Heart Condition'),
    ("exang", 'Exercise Induced Angina and Heart Condition'),
    ("slope", 'Slope of the Peak Exercise ST Segment and Heart Condition'),
    ("ca", 'Number of Major Vessels Colored by Flourosopy and Heart Condition'),
    ("thal", 'Thal and Heart Condition'),
]

for feature, heading in discrete_feature_titles:
    sns.catplot(x=feature, hue="condition", kind="count",
                palette="pastel", edgecolor=".6",
                data=df)
    plt.title(heading, color='blue', fontsize=15)
    plt.show()

### Displaying continuous variables through barplots and boxplots together with heart condition as the second factor. 
- The patients with heart condition are older, have higher resting blood pressure, slightly higher cholesterol, lower maximum heart rate achieved, higher ST depression.

In [None]:
# One bar chart per continuous feature, showing its value per heart-condition
# class. The five previously copy-pasted cells are expressed as a single loop;
# figures, labels and titles are unchanged.
barplot_titles = {
    'age': 'Average Age by Heart Condition',
    'trestbps': 'Average Resting Blood Pressure by Heart Condition',
    'chol': 'Average Cholesterol by Heart Condition',
    'thalach': 'Average Maximum Heart Rate Achieved by Heart Condition',
    'oldpeak': 'Average ST depression by Heart Condition',
}

for feature, heading in barplot_titles.items():
    plt.figure(figsize=(10, 5))
    sns.barplot(x=df['condition'], y=df[feature])
    plt.xticks(rotation=0)
    plt.xlabel('condition', fontsize=14)
    plt.ylabel(feature, fontsize=14)
    plt.title(heading, color='blue', fontsize=15)
    plt.show()

In [None]:
# Box plot of every continuous feature, grouped by the target `condition`.
boxplot_titles = [
    ("age", 'Age by Heart Condition'),
    ("trestbps", 'Resting Blood Pressure by Heart Condition'),
    ("chol", 'Cholesterol by Heart Condition'),
    ("thalach", 'Maximum Heart Rate Achieved by Heart Condition'),
    ("oldpeak", 'ST Depression by Heart Condition'),
]

for feature, heading in boxplot_titles:
    sns.boxplot(x="condition", y=feature, data=df, palette="PRGn")
    plt.title(heading, color='blue', fontsize=15)
    plt.show()

### When we add the categorical and discrete variables (exercise induced angina, the slope of the peak exercise ST segment, number of major vessels (0-3) colored by fluoroscopy, and thal) as the third factor in the swarm plots, we can see that these variables are useful to differentiate a patient with heart condition.

In [None]:
# Swarm plots of each continuous feature against the target, coloured by a
# third discrete factor. The outer loop walks the hue variables in the same
# order as the four original cells (exang, slope, ca, thal); the inner loop
# walks the continuous features.
swarm_titles = [
    ("age", 'Age by Heart Condition'),
    ("trestbps", 'Resting Blood Pressure by Heart Condition'),
    ("chol", 'Cholesterol by Heart Condition'),
    ("thalach", 'Maximum Heart Rate Achieved by Heart Condition'),
    ("oldpeak", 'ST depression by Heart Condition'),
]

for third_factor in ('exang', 'slope', 'ca', 'thal'):
    for feature, heading in swarm_titles:
        sns.swarmplot(x="condition", y=feature, hue=third_factor, data=df)
        plt.title(heading, color='blue', fontsize=15)
        plt.show()

### As mentioned above, when we look at the correlation between the independent variables and the target variable, only thal has a score greater than 0.5.

In [None]:
# Annotated heatmap of the correlation matrix on a 15x10 inch canvas.
correlations = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlations, ax=ax, annot=True, fmt='.2f',
            linewidths=0.5, linecolor="red")
plt.show()

In [None]:
# Pairwise regression plots of the continuous features, coloured by target.
continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
sns.pairplot(df, vars=continuous_features, hue='condition', kind='reg')
plt.show()

# Data Preprocessing
- First, we need to create dummy variables for cp, restecg, thal and slope features through one-hot encoding in order to prevent any misinterpretation by algorithms as having some sort of order in the categorical values of these variables.
- Secondly, when we look at the continuous variables, we can see that they have different scales and we need to use a scaling method. In this study, I have decided to use MinMax scaling. Other scaling methods can be used additionally for further comparison and improvement of the prediction models. 

In [None]:
# One-hot encode the nominal features in a single, re-runnable step.
#
# The previous version concatenated dummy frames onto `df` and then dropped the
# source columns in separate cells; running any of those cells a second time
# either duplicated the dummy columns or raised KeyError because `df` had
# already been overwritten. Building the encoded frame from the untouched
# `data` frame makes this cell idempotent.
#
# Column names and order are the same as before: the remaining original
# columns first, then cp_*, restecg_*, slope_*, thal_*.
nominal_features = ['cp', 'restecg', 'slope', 'thal']
df = pd.get_dummies(data, columns=nominal_features)
df.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Target vector and feature matrix (every column except the target).
y = df["condition"]
X = df.drop(columns=["condition"])

In [None]:
# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows of X, including rows that are
# later held out as test data.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

### Train-Test Splitting
- I have decided to split the data as 0.7/0.3 since we have a relatively small number of observations. In this respect, it is a better idea to keep more observations as test data. 

In [None]:
# Split FIRST, then fit the scaler on the training rows only.
# Fitting MinMaxScaler on the full data set before splitting lets the test
# rows influence the min/max used for scaling (data leakage) and makes the
# test scores optimistic. The same random_state on the same number of rows
# selects the same train/test rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

scaler = MinMaxScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Keep `X_scaled` consistent with the scaler the models are trained with;
# later cells still reference it.
X_scaled = scaler.transform(X)

# Classification Models

## Logistic Regression

### Model

In [None]:
log_reg = LogisticRegression().fit(X_train,y_train)
log_reg

In [None]:
log_reg.intercept_

In [None]:
log_reg.coef_

### Prediction

In [None]:
y_pred = log_reg.predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
log_reg.predict_proba(X_test)[0:10]

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and AUC of the logistic regression on the held-out test split.
#
# Two fixes compared with the previous version:
#  * the AUC is computed from predicted probabilities, not from hard 0/1
#    labels, so the number in the legend matches the curve that is drawn;
#  * the evaluation uses the test rows only, instead of the full data set,
#    which included the rows the model was trained on.
test_proba = log_reg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_proba)

fpr, tpr, thresholds = roc_curve(y_test, test_proba)

plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Ratio')
plt.ylabel('True Positive Ratio')
plt.title('ROC')
plt.legend(loc='lower right')  # without this the AUC label is never shown
plt.show()

### Cross Validation

In [None]:
# Cross-validate on the TRAINING split. Cross-validating on (X_test, y_test)
# refits clones of the model on slices of the test data, so it neither
# evaluates the fitted `log_reg` nor stays comparable with the other models.
log_reg_cv = cross_val_score(log_reg, X_train, y_train, cv = 10).mean()
print("10-fold CV accuracy on the training split: {:.4f}".format(log_reg_cv))

# Score used in the model comparison: accuracy of the fitted model on the
# held-out test split, the same metric as every other `*_final` value.
log_reg_final = accuracy_score(y_test, log_reg.predict(X_test))
log_reg_final

## Gaussian Naive Bayes

In [None]:
nb = GaussianNB()
nb_model = nb.fit(X_train, y_train)
nb_model

In [None]:
nb_model.predict(X_test)[0:10]

In [None]:
nb_model.predict_proba(X_test)[0:10]

In [None]:
y_pred = nb_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

### Cross Validation

In [None]:
# Cross-validate on the TRAINING split. Cross-validating on (X_test, y_test)
# refits clones of the model on slices of the test data, so it neither
# evaluates the fitted `nb_model` nor stays comparable with the other models.
nb_cv = cross_val_score(nb_model, X_train, y_train, cv = 10).mean()
print("10-fold CV accuracy on the training split: {:.4f}".format(nb_cv))

# Score used in the model comparison: accuracy of the fitted model on the
# held-out test split, the same metric as every other `*_final` value.
nb_final = accuracy_score(y_test, nb_model.predict(X_test))
nb_final

## KNN

In [None]:
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train, y_train)
knn_model

In [None]:
y_pred = knn_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

### Model Tuning

In [None]:
knn_params = {"n_neighbors": np.arange(1,50)}

In [None]:
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, knn_params, cv=10)
knn_cv.fit(X_train, y_train)

In [None]:
print("Best score:" + str(knn_cv.best_score_))
print("Best parameters: " + str(knn_cv.best_params_))

In [None]:
knn = KNeighborsClassifier(n_neighbors = 5)
knn_tuned = knn.fit(X_train, y_train)

In [None]:
y_pred = knn_tuned.predict(X_test)

In [None]:
knn_final = accuracy_score(y_test, y_pred)
knn_final

## SVC

### 1- Linear

In [None]:
svm_model = SVC(kernel = "linear").fit(X_train, y_train)
svm_model

In [None]:
y_pred = svm_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
svc_params = {"C": [0.0001, 0.001, 0.1, 1, 5, 10 ,50 ,100]}

svc = SVC(kernel = "linear")

svc_cv_model = GridSearchCV(svc,svc_params, 
                            cv = 10, 
                            n_jobs = -1, 
                            verbose = 2 )

svc_cv_model.fit(X_train, y_train)

In [None]:
print("Best parameters: " + str(svc_cv_model.best_params_))

In [None]:
svc_tuned = SVC(kernel = "linear", C = 1).fit(X_train, y_train)

In [None]:
y_pred = svc_tuned.predict(X_test)

In [None]:
svc_linear_final = accuracy_score(y_test, y_pred)
svc_linear_final

### 2- Rbf

In [None]:
svc_model = SVC(kernel = "rbf").fit(X_train, y_train)

In [None]:
y_pred = svc_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
svc_params = {"C": [0.00001, 0.0001, 0.001, 0.1, 1, 5, 10 ,50 ,100],
             "gamma": [0.0001, 0.001, 0.1, 1, 5, 10 ,50 ,100]}

In [None]:
svc = SVC(kernel = "rbf")
svc_cv_model = GridSearchCV(svc, svc_params, 
                         cv = 10, 
                         n_jobs = -1,
                         verbose = 2)

svc_cv_model.fit(X_train, y_train)

In [None]:
print("Best parameters: " + str(svc_cv_model.best_params_))

In [None]:
svc_tuned = SVC(kernel = "rbf", C = 5, gamma = 0.1).fit(X_train, y_train)

In [None]:
y_pred = svc_tuned.predict(X_test)
svc_rbf_final = accuracy_score(y_test, y_pred)
svc_rbf_final

### 3- Poly

In [None]:
svc_model = SVC(kernel = "poly").fit(X_train, y_train)

In [None]:
y_pred = svc_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
svc_params = {"C": [0.00001, 0.0001, 0.001, 0.1, 1, 5, 10 ,50 ,100],
             "gamma": [0.0001, 0.001, 0.1, 1, 5, 10 ,50 ,100]}

In [None]:
svc = SVC(kernel = "poly")
svc_cv_model = GridSearchCV(svc, svc_params, 
                         cv = 10, 
                         n_jobs = -1,
                         verbose = 2)

svc_cv_model.fit(X_train, y_train)

In [None]:
print("Best parameters: " + str(svc_cv_model.best_params_))

In [None]:
svc_tuned = SVC(kernel = "poly", C = 0.001, gamma = 1).fit(X_train, y_train)

In [None]:
y_pred = svc_tuned.predict(X_test)
svc_poly_final = accuracy_score(y_test, y_pred)
svc_poly_final

## CART

In [None]:
cart = DecisionTreeClassifier()
cart_model = cart.fit(X_train, y_train)

In [None]:
y_pred = cart_model.predict(X_test)
accuracy_score(y_test, y_pred)

### Model Tuning

In [None]:
cart_grid = {"max_depth": range(1,10),
            "min_samples_split" : list(range(2,50)) }

In [None]:
cart = tree.DecisionTreeClassifier()
cart_cv = GridSearchCV(cart, cart_grid, cv = 10, n_jobs = -1, verbose = 2)
cart_cv_model = cart_cv.fit(X_train, y_train)

In [None]:
print("Best Parameters: " + str(cart_cv_model.best_params_))

In [None]:
cart = tree.DecisionTreeClassifier(max_depth = 3, min_samples_split = 18)
cart_tuned = cart.fit(X_train, y_train)

In [None]:
y_pred = cart_tuned.predict(X_test)
cart_final = accuracy_score(y_test, y_pred)
cart_final

## Random Forest

In [None]:
rf_model = RandomForestClassifier().fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

### Model Tuning

In [None]:
rf_params = {"max_depth": [2,5,8,10],
            "max_features": [2,3,5,7],
            "n_estimators": [10,100,200,500,1000],
            "min_samples_split": [2,5,10]}

In [None]:
rf_model = RandomForestClassifier()

rf_cv_model = GridSearchCV(rf_model, 
                           rf_params, 
                           cv = 10, 
                           n_jobs = -1, 
                           verbose = 2) 

In [None]:
rf_cv_model.fit(X_train, y_train)

In [None]:
print("Best Parameters: " + str(rf_cv_model.best_params_))

In [None]:
rf_tuned = RandomForestClassifier(max_depth = 5, 
                                  max_features = 2, 
                                  min_samples_split = 2,
                                  n_estimators = 200)

rf_tuned.fit(X_train, y_train)

In [None]:
y_pred = rf_tuned.predict(X_test)
rf_final = accuracy_score(y_test, y_pred)
rf_final

In [None]:
pd.DataFrame(X_train).head()

In [None]:
X_train_pd = pd.DataFrame(X_train)
df_x = df.drop(['condition'], axis=1)
X_train_pd.columns = df_x.columns[:22]
X_train_pd.head()

In [None]:
Importance = pd.DataFrame({"Importance": rf_tuned.feature_importances_*100},
                         index = X_train_pd.columns)

In [None]:
Importance.sort_values(by = "Importance", 
                       axis = 0, 
                       ascending = True).plot(kind ="barh", color = "r")

plt.xlabel("Importance Levels of Independent Variables");

## Gradient Boosting Machines

In [None]:
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

In [None]:
y_pred = gbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

### Model Tuning

In [None]:
gbm_params = {"learning_rate" : [0.001, 0.01, 0.05, 0.1],
             "n_estimators": [100,500,1000],
             "max_depth": [3,5,10],
             "min_samples_split": [2,5,10]}

In [None]:
gbm = GradientBoostingClassifier()

gbm_cv = GridSearchCV(gbm, gbm_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
gbm_cv.fit(X_train, y_train)

In [None]:
print("Best parameters: " + str(gbm_cv.best_params_))

In [None]:
gbm = GradientBoostingClassifier(learning_rate = 0.05, 
                                 max_depth = 3,
                                min_samples_split = 5,
                                n_estimators = 1000)

In [None]:
gbm_tuned =  gbm.fit(X_train,y_train)

In [None]:
y_pred = gbm_tuned.predict(X_test)
gbm_final = accuracy_score(y_test, y_pred)
gbm_final

## XGBoost

In [None]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

### Model Tuning

In [None]:
xgb_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05]}

In [None]:
xgb = XGBClassifier()

xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
xgb_cv_model.fit(X_train, y_train)

In [None]:
print("Best parameters: " + str(xgb_cv_model.best_params_))

In [None]:
xgb = XGBClassifier(learning_rate = 0.1, 
                    max_depth = 3,
                    n_estimators = 100,
                    subsample = 1.0)

In [None]:
xgb_tuned =  xgb.fit(X_train,y_train)

In [None]:
y_pred = xgb_tuned.predict(X_test)
xgb_final = accuracy_score(y_test, y_pred)
xgb_final

## Light GBM

In [None]:
lgbm_model = LGBMClassifier().fit(X_train, y_train)

In [None]:
y_pred = lgbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

### Model Tuning

In [None]:
lgbm_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_child_samples": [5,10,20]}

In [None]:
lgbm = LGBMClassifier()

lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, 
                             cv = 10, 
                             n_jobs = -1, 
                             verbose = 2)

In [None]:
lgbm_cv_model.fit(X_train, y_train)

In [None]:
lgbm_cv_model.best_params_

In [None]:
lgbm = LGBMClassifier(learning_rate = 0.01, 
                       max_depth = 3,
                       min_child_samples = 10,
                       n_estimators = 500,
                       subsample = 0.6,
                       )

In [None]:
lgbm_tuned = lgbm.fit(X_train,y_train)

In [None]:
y_pred = lgbm_tuned.predict(X_test)
lgbm_final = accuracy_score(y_test, y_pred)
lgbm_final

## Cat Boost

In [None]:
cat_model = CatBoostClassifier().fit(X_train, y_train)

In [None]:
y_pred = cat_model.predict(X_test)
accuracy_score(y_test, y_pred)

### Model Tuning

In [None]:
catb_params = {
    'iterations': [200,500],
    'learning_rate': [0.01,0.05, 0.1],
    'depth': [3,5,8] }

In [None]:
catb = CatBoostClassifier()
catb_cv_model = GridSearchCV(catb, catb_params, cv=5, n_jobs = -1, verbose = 2)
catb_cv_model.fit(X_train, y_train)

In [None]:
catb_cv_model.best_params_

In [None]:
catb = CatBoostClassifier(iterations = 200, 
                          learning_rate = 0.01, 
                          depth = 3)

catb_tuned = catb.fit(X_train, y_train)
y_pred = catb_tuned.predict(X_test)

In [None]:
y_pred = catb_tuned.predict(X_test)
catb_final = accuracy_score(y_test, y_pred)
catb_final

# Model Comparison
### Different approaches are used and compared in this study to predict whether a patient has a heart condition or not. 
### According to accuracy scores of different methods, CatBoost has the best score.

In [None]:
# Final score of every model, keyed by the name of the variable holding it.
models = dict(
    log_reg_final=log_reg_final,
    nb_final=nb_final,
    knn_final=knn_final,
    svc_linear_final=svc_linear_final,
    svc_rbf_final=svc_rbf_final,
    svc_poly_final=svc_poly_final,
    cart_final=cart_final,
    rf_final=rf_final,
    gbm_final=gbm_final,
    xgb_final=xgb_final,
    lgbm_final=lgbm_final,
    catb_final=catb_final,
)

# Print a separator, the model name and its accuracy as a percentage.
for name, accuracy in models.items():
    print("-"*28, name + ":", "Accuracy: {:.4%}".format(accuracy), sep="\n")

In [None]:
# Short display labels and the matching final scores, in the same order.
indexes = [
    "Log", "NB", "KNN",
    "SVC_Lin", "SVC_Rbf", "SVC_Poly",
    "CART", "RF", "GBM",
    "XGB", "LGBM", "CATB",
]
scores = [
    log_reg_final, nb_final, knn_final,
    svc_linear_final, svc_rbf_final, svc_poly_final,
    cart_final, rf_final, gbm_final,
    xgb_final, lgbm_final, catb_final,
]

# Bar chart comparing the models' scores side by side.
plt.figure(figsize=(12, 8))
sns.barplot(x=indexes, y=scores)
plt.xticks()
plt.title('Model Comparision', color='orange', fontsize=20);