In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Context and Variable Details

This data set dates from 1988 and consists of four databases: Cleveland, Hungary, Switzerland, and Long Beach V. It contains 76 attributes, including the predicted attribute, but all published experiments refer to using a subset of 14 of them. The "target" field refers to the presence of heart disease in the patient. It is integer valued 0 = no disease and 1 = disease.

* age
* sex
* chest pain type (4 values)
* resting blood pressure
* serum cholesterol in mg/dl
* fasting blood sugar > 120 mg/dl
* resting electrocardiographic results (values 0,1,2)
* maximum heart rate achieved
* exercise induced angina
* oldpeak = ST depression induced by exercise relative to rest
* the slope of the peak exercise ST segment
* number of major vessels (0-3) colored by fluoroscopy
* thal: 0 = normal; 1 = fixed defect; 2 = reversible defect
* The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.

**Our goal is to predict whether a person has heart disease or not**

# Data Inspection

Import libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
warnings.filterwarnings("ignore")
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


Read data

In [None]:
# Path of the dataset CSV, relative to the notebook's working directory.
csv_path = '../input/heart-disease-dataset/Heart Disease Dataset.csv'
heartdf = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
heartdf.head()

check shape of data

In [None]:
heartdf.shape

check info

In [None]:
heartdf.info()

Null value check

In [None]:
heartdf.isnull().sum()

No null values in the dataframe

check statistical description of data

In [None]:
heartdf.describe()

check name of all variables

In [None]:
heartdf.columns

Create the list of numerical variables (every column except the last one and `sex`)

In [None]:
# Start from every column except the last one (presumably `target` - confirm
# against the CSV), then drop 'sex' so it is not drawn as a box plot.
num_cols = heartdf.columns[:-1].tolist()
num_cols.remove('sex')

# EDA

plot numerical variables

In [None]:
# Box plot of every column in num_cols, laid out four panels per row.
panels_per_row = 4
# Ceiling division so the grid always has room for every column.
n_rows = max(1, -(-len(num_cols) // panels_per_row))
fig, axes = plt.subplots(n_rows, panels_per_row, figsize=(30, 15), squeeze=False)

for ax, col in zip(axes.ravel(), num_cols):
    # Pass the series by keyword: newer seaborn releases read a lone
    # positional argument as `data`, which changes the box orientation and
    # no longer matches the x-axis label set below.
    sns.boxplot(x=heartdf[col], ax=ax)
    ax.set_xlabel(col, fontsize=20)

# Hide any panels left over when the column count is not a multiple of four.
for unused_ax in axes.ravel()[len(num_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

check for unique values in every variable

In [None]:
heartdf.nunique(axis=0)

plot distribution of numerical variables divided by targets

In [None]:
# Density of each continuous feature, drawn separately for every target class.
# One panel per feature; the same plotting code is reused for all of them.
kde_cols = ["chol", "trestbps", "thalach", "oldpeak"]
fig, axes = plt.subplots(1, len(kde_cols), figsize=(25, 8), squeeze=False)

for ax, col in zip(axes.ravel(), kde_cols):
    # One filled curve per target value so the classes can be compared.
    for target_value in heartdf["target"].unique():
        values = heartdf.loc[heartdf["target"] == target_value, col]
        # `fill` is the replacement for kdeplot's deprecated `shade` argument.
        sns.kdeplot(x=values, fill=True, label="{} target".format(target_value), ax=ax)

    ax.set_title("Density Plot of {} by target".format(col))
    # The per-class labels are only visible once a legend is drawn.
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Split violin plots of chol and trestbps for each sex; the two halves of every
# violin correspond to the two target classes.
fig, axes = plt.subplots(1, 2, figsize=(25, 8), dpi=80)

for panel, feature in zip(axes, ("chol", "trestbps")):
    sns.violinplot(x="sex", y=feature, hue="target", split=True, data=heartdf, ax=panel)
    panel.set_title('Distribution of {} for different target by sex'.format(feature), fontsize=15)

plt.tight_layout()
plt.show()

In [None]:
# Split violin plots of thalach and oldpeak for each sex; the two halves of
# every violin correspond to the two target classes.
fig, axes = plt.subplots(1, 2, figsize=(25, 8), dpi=80)

for panel, feature in zip(axes, ("thalach", "oldpeak")):
    sns.violinplot(x="sex", y=feature, hue="target", split=True, data=heartdf, ax=panel)
    panel.set_title('Distribution of {} for different target by sex'.format(feature), fontsize=15)

plt.tight_layout()
plt.show()

**comments**

* Most patients who have heart disease have chol around 200-300
* Most patients who don't have heart disease have chol in roughly the same range
* Most patients who have heart disease have trestbps around 120-140
* Most patients who don't have heart disease have trestbps in roughly the same range
* Those who have heart disease have higher thalach
* Those who don't have heart disease have higher oldpeak
* Diseased females have higher chol
* Most diseased males have lower chol
* Non-diseased females have higher trestbps than males
* Diseased males have slightly higher thalach than females
* Diseased males and females have lower oldpeak than those without the disease


created list of categorical variables

In [None]:
# Columns treated as categorical in the plots that follow.
cat_cols = [
    'sex', 'cp', 'fbs', 'restecg',
    'exang', 'slope', 'ca', 'thal',
]

Create separate dataframes for diseased (target = 1) and non-diseased (target = 0) patients

In [None]:
# Split the rows by label: target == 1 means disease, target == 0 means no disease.
heart_des = heartdf.loc[heartdf['target'].eq(1)]
heart_notdes = heartdf.loc[heartdf['target'].eq(0)]

Plot the distribution of every unique value of the categorical columns for patients who have heart disease

In [None]:
# Relative frequency of every level of each categorical column among the
# target == 1 rows (heart_des), one horizontal bar chart per column.
fig, axes = plt.subplots(2, 4, figsize=(30, 15))

for ax, col in zip(axes.ravel(), cat_cols):
    heart_des[col].value_counts(normalize=True).plot.barh(ax=ax)
    # target == 1 marks the presence of heart disease, not death, so the
    # panel is labelled "Diseased" rather than "Deceased".
    ax.set_title("Diseased showing by " + col, fontsize=15)

plt.show()

**comments:**
* Among diseased patients, males outnumber females
* Diseased patients tend to have higher cp
* Diseased patients tend to have lower fbs
* Most diseased patients have restecg of 1
* Diseased patients tend to have lower exang
* Diseased patients tend to have higher slope
* Diseased patients tend to have lower ca
* Diseased patients tend to have higher thal

Plot the distribution of every unique value of the categorical columns for patients who do not have heart disease

In [None]:
# Relative frequency of every level of each categorical column among the
# target == 0 rows (heart_notdes), one horizontal bar chart per column.
fig, axes = plt.subplots(2, 4, figsize=(30, 15))

for ax, col in zip(axes.ravel(), cat_cols):
    heart_notdes[col].value_counts(normalize=True).plot.barh(ax=ax)
    # target == 0 marks the absence of heart disease, not survival, so the
    # panel is labelled "Not diseased" rather than "Not deceased".
    ax.set_title("Not diseased showing by " + col, fontsize=15)

plt.show()

**comments:**
* Among non-diseased patients, males outnumber females
* Non-diseased patients tend to have lower cp
* Non-diseased patients tend to have lower fbs
* Non-diseased patients tend to have lower restecg
* Non-diseased patients tend to have higher exang
* Non-diseased patients tend to have higher slope
* Most non-diseased patients have ca of 1
* Non-diseased patients tend to have higher thal

# Model Building

divided train and test set

In [None]:
# 70% of the rows go to training and the rest to testing; the fixed
# random_state makes the split repeatable.
df_train, df_test = train_test_split(
    heartdf,
    train_size=0.7,
    random_state=50,
)

divided x and y of train data

In [None]:
# Separate the label from the features without mutating df_train.
# DataFrame.pop removes the column in place, so re-running a cell that uses it
# fails with KeyError; selecting and dropping keeps the cell re-runnable.
y_train = df_train['target']
X_train = df_train.drop(columns='target')

divided x and y of test data

In [None]:
# Separate the label from the features without mutating df_test.
# DataFrame.pop removes the column in place, so re-running a cell that uses it
# fails with KeyError; selecting and dropping keeps the cell re-runnable.
y_test = df_test['target']
X_test = df_test.drop(columns='target')

In [None]:
def check_model(dt):
    """Print the confusion matrix and accuracy of a fitted model on both splits.

    Parameters
    ----------
    dt : object
        A fitted estimator exposing ``predict``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the results and returns ``None``.
    """
    # Predict once per split and reuse the result for both metrics.
    train_pred = dt.predict(X_train)
    print("train confusion matrix : ", confusion_matrix(y_train, train_pred))
    print("train accuracy score : ", accuracy_score(y_train, train_pred))
    print("__" * 50)

    test_pred = dt.predict(X_test)
    print("test confusion matrix : ", confusion_matrix(y_test, test_pred))
    print("test accuracy score : ", accuracy_score(y_test, test_pred))
    

In [None]:
# Baseline model: a decision tree with default settings and a fixed seed.
dt_default = DecisionTreeClassifier(random_state=0)

# fit() trains the estimator and returns that same estimator, so dt_res and
# dt_default refer to one fitted object.
dt_default.fit(X_train, y_train)
dt_res = dt_default

check our default model

In [None]:
check_model(dt_res)

In [None]:
def tree_graph(dt):
    """Draw a fitted decision tree on a 25x20 inch figure.

    Parameters
    ----------
    dt : object
        A fitted decision tree accepted by ``sklearn.tree.plot_tree``.

    Notes
    -----
    Feature names are taken from the notebook-level ``X_train``. Returns
    ``None``; the figure is left as the current matplotlib figure.
    """
    plt.figure(figsize=(25, 20))
    tree.plot_tree(
        dt,
        # Pass a plain list: scikit-learn releases that validate this
        # argument reject a pandas Index.
        feature_names=list(X_train.columns),
        # Names follow ascending class order: 0 = no disease, 1 = disease.
        # The target encodes presence of disease, not death.
        class_names=['No Disease', 'Disease'],
        filled=True,
    )

check graph of our default model

In [None]:
tree_graph(dt_res)

# Hyper parameter Tuning using Grid Search 

**GRID SEARCH**

Grid search is the process of performing hyper parameter tuning in order to determine the optimal values for a given model. This is significant as the performance of the entire model is based on the hyper parameter values specified.

For more information check the following link : https://medium.com/datadriveninvestor/an-introduction-to-grid-search-ff57adcc0998

In [None]:
# Search space for the grid search: tree depths 2 through 10 combined with
# seven candidate values for the minimum samples needed to split a node.
params = {
    'max_depth': list(range(2, 11)),
    'min_samples_split': [5, 10, 25, 50, 75, 100, 150],
}

In [None]:
# Grid search over `params` for the default tree, scored by accuracy;
# n_jobs=-1 asks for all available processors.
grid_search = GridSearchCV(
    estimator=dt_default,
    param_grid=params,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

In [None]:
grid_search.fit(X_train,y_train)

choosing the best estimator from our estimators

In [None]:
grid_search.best_estimator_

In [None]:
best_dt = grid_search.best_estimator_

# Final Model

check the best model with test data

In [None]:
check_model(best_dt)

plotting of our best model

In [None]:
tree_graph(best_dt)