1. [Load and Check Data](#1)
2. [Variable Description](#2)
3. [Univariate Variable Analysis](#3)
4. [Basic Analysis](#4)
5. [Outlier Detection](#5)
6. [Missing Value](#6)
> 6.1 [Finding Missing Value](#6.1)
7. [Visualization](#7)
> 7.1 [Correlation](#7.1)
8. [Feature Engineering](#8)
9. [Modeling](#9)
> 9.1 [Train-test split](#9.1)  
> 9.2 [Simple Logistic Regression](#9.2)  
> 9.3 [Hyperparameter Tuning -- Grid Search -- Cross Validation](#9.3)   
> 9.4 [Ensemble Modeling](#9.4)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever one is installed.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(_whitegrid_style if _whitegrid_style in plt.style.available else 'seaborn-whitegrid')


import seaborn as sns

from collections import Counter

import warnings 
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="1"></a>

# 1. Load and Check Data

In [None]:
df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')


In [None]:
df.columns


In [None]:
df.head()

In [None]:
df.describe().T


<a id="2"></a>

# 2. Variable Description

1. age
2. sex
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholesterol in mg/dl
6. fasting blood sugar > 120 mg/dl
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by fluoroscopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

In [None]:
df.info()

<a id="3"></a>

# 3. Univariate Variable Analysis

In [None]:
def plot_hist(variable):
    """Show a 10-bin histogram of one column of the global ``df``.

    Parameters
    ----------
    variable : str
        Name of the column in ``df`` to plot; it is also used as the x-axis
        label and inside the figure title.
    """
    column_values = df[variable]
    figure_title = "{} distribution with histogram".format(variable)

    plt.figure(figsize=(9, 3))
    plt.hist(column_values, bins=10)
    plt.title(figure_title)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Draw one histogram for each of the 13 listed columns.
numericalVar = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
for column_name in numericalVar:
    plot_hist(column_name)

In [None]:
df.plot(subplots=True,figsize=(18,18))
plt.show()

<a id="4"></a>

# 4. Basic Analysis

Let's analyze the relationship between the target and each of the variables below:  
  
* age
* sex
* cp
* trestbps
* chol
* fbs
* restecg
* thalach
* exang
* oldpeak
* slope
* ca
* thal

In [None]:
# Age - Target
df[['age', 'target']].groupby(['age'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# Sex - Target
df[['sex', 'target']].groupby(['sex'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# Cp - Target
df[['cp', 'target']].groupby(['cp'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# trestbps - Target
df[['trestbps', 'target']].groupby(['trestbps'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# Sex - Target
df[['sex', 'target']].groupby(['sex'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# Chol - Target
df[['chol', 'target']].groupby(['chol'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# Fbs - Target
df[['fbs', 'target']].groupby(['fbs'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# restecg - Target
df[['restecg', 'target']].groupby(['restecg'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# thalach - Target
df[['thalach', 'target']].groupby(['thalach'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# exang - Target
df[['exang', 'target']].groupby(['exang'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# oldpeak - Target
df[['oldpeak', 'target']].groupby(['oldpeak'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# slope - Target
df[['slope', 'target']].groupby(['slope'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# ca - Target
df[['ca', 'target']].groupby(['ca'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# thal - Target
df[['thal', 'target']].groupby(['thal'], as_index = False).mean().sort_values(by = 'target', ascending = False)

In [None]:
# Bucket ages into four ordered groups.
# pd.cut builds right-closed intervals, so without include_lowest=True an age
# equal to the first edge (29) falls outside every bin and gets a NaN group.
bins = [29, 47, 55, 61, 77]
labels = ["Young Adult", "Early Adult", "Adult", "Senior"]
df['age_group'] = pd.cut(df['age'], bins, labels=labels, include_lowest=True)

# barplot's default estimator is the mean, so each bar is the mean of the
# 'sex' column within that age group.
plt.figure(figsize=(20, 5))
sns.barplot(x='age_group', y='sex', data=df)
plt.show()

<a id="5"></a>

# 5. Outlier Detection

In [None]:
def detect_outlier(df, features, min_features=3, iqr_factor=1.5):
    """Return index labels of rows that are IQR outliers in several features.

    For every column in ``features`` a value is an outlier when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR``. A row is
    reported only when it is an outlier in at least ``min_features`` of the
    inspected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    features : iterable of str
        Names of the columns to test.
    min_features : int, optional
        Minimum number of columns in which a row must be an outlier to be
        reported. The default of 3 matches the previous hard-coded rule
        ("more than 2").
    iqr_factor : float, optional
        Multiplier applied to the inter-quartile range (default 1.5).

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.
    """
    outlier_counts = Counter()

    for c in features:
        values = df[c]
        # nanpercentile ignores missing values; plain percentile would return
        # NaN bounds and silently flag nothing for a column containing NaN.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = (q3 - q1) * iqr_factor
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        # Count one hit per row for this column.
        outlier_counts.update(df[is_outlier].index)

    return [idx for idx, hits in outlier_counts.items() if hits >= min_features]


In [None]:
df.loc[detect_outlier(df, ['age', 'sex', 'cp', 'trestbps','chol','fbs','restecg',
                'thalach','exang','oldpeak','slope','ca', 'thal'])]

Perfect! No row is flagged as an outlier in more than two features, so there are no multi-feature outliers in our dataset.

<a id="6"></a>

# 6. Missing Value

In [None]:
df.head()


<a id="6.1"></a>

## 6.1 Finding Missing Value

In [None]:
df.columns[df.isnull().any()]


In [None]:
df.isnull().sum()


<a id="7"></a>

# 7. Visualization

<a id="7.1"></a>

## 7.1 Correlation

In [None]:
# Correlate numeric columns only: by this point df also holds the categorical
# 'age_group' column, which recent pandas versions refuse to correlate
# (older versions dropped non-numeric columns silently).
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot = True, fmt = '.2f' )
plt.show()

We can see from the heatmap above that age is correlated with the variables in the list below:  
  

* age - trestbps 
* age - chol
* age - thalach


In [None]:
# Mean of target per age group, i.e. the share of rows with target == 1.
# seaborn renamed factorplot to catplot and its `size` argument to `height`
# in 0.9; the old names are gone from current releases.
g = sns.catplot(x = 'age_group', y = 'target', data = df, kind = 'bar', height = 7)
g.set_ylabels('Probability')
plt.show()

In [None]:
# Mean chol per age group.
# seaborn renamed factorplot to catplot and its `size` argument to `height`
# in 0.9; the old names are gone from current releases.
g = sns.catplot(x = 'age_group', y = 'chol', data = df, kind = 'bar', height = 7)
# The bars show a mean chol value, not a probability, so label them as such.
g.set_ylabels('Mean chol')
plt.show()

In [None]:
# Mean thalach per age group.
# seaborn renamed factorplot to catplot and its `size` argument to `height`
# in 0.9; the old names are gone from current releases.
g = sns.catplot(x = 'age_group', y = 'thalach', data = df, kind = 'bar', height = 7)
# The bars show a mean thalach value, not a probability, so label them as such.
g.set_ylabels('Mean thalach')
plt.show()

In [None]:
# Per-age averages of three measurements, each indexed by age.
grp = df.groupby("age")
x = grp["chol"].mean()
y = grp["trestbps"].mean()
z = grp["thalach"].mean()

In [None]:
# Mean chol per age as red circles. The colour comes from the 'ro' format
# string alone; repeating it via color= makes Matplotlib warn about a
# redundantly defined colour.
plt.figure(figsize=(16,5))
plt.plot(x, 'ro')
plt.xticks(rotation=90)
plt.title("Age wise Chol")
plt.xlabel("Age")
plt.ylabel("Chol")
plt.show()

In [None]:
# Mean trestbps per age as a blue dashed line. The original passed 'r--'
# together with color='b'; the keyword won, so the line was already drawn in
# blue. Stating 'b--' once removes the contradiction.
plt.figure(figsize=(15,5))
plt.plot(y, 'b--')
plt.xticks(rotation=90)
plt.title("Age wise Trestbps")
plt.xlabel("Age")
plt.ylabel("Trestbps")
plt.show()

In [None]:
# Mean thalach per age as green triangles. The colour comes from the 'g^'
# format string alone, and the title matches the two sibling age-wise plots.
plt.figure(figsize=(16,5))
plt.plot(z, "g^")
plt.xticks(rotation=90)
plt.title("Age wise Thalach")
plt.xlabel("Age")
plt.ylabel("Thalach")
plt.show()

In [None]:
fig=plt.figure(figsize=(20,5))
sns.violinplot(x ='age', y = 'trestbps', data = df)
plt.show()

In [None]:
# Overlay the kernel density estimates of three columns on one set of axes.
ax = df['trestbps'].plot.kde()
for column_name in ('chol', 'thalach'):
    ax = df[column_name].plot.kde()
ax.legend()
plt.show()



In [None]:
# One age distribution panel per distinct target value, each with 25 bins.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; if the
# environment is upgraded, confirm this call still works or switch it to
# sns.histplot.
g = sns.FacetGrid(df, col = 'target')
g.map(sns.distplot, 'age', bins = 25)
plt.show()

* Target means diagnosis of heart disease (angiographic disease status)
 >- Value 0: < 50% diameter narrowing
 >- Value 1: > 50% diameter narrowing  
* age <=~30 has a small heart disease rate  
* a large number of patients between 40 and 60 have heart disease
* most patients are in the 40-60 age range

In [None]:
# Box plot of age for each value of sex.
# seaborn renamed factorplot to catplot in 0.9; the old name is gone from
# current releases.
sns.catplot(x = 'sex', y = 'age', data = df, kind = 'box')
plt.show()

Sex is not informative for predicting age; the age distribution looks the same for both sexes.

In [None]:
# Box plot of chol per sex, split by age group.
# catplot (the successor of factorplot) always creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an extra empty figure.
# height=5 with aspect=4 requests the same 20x5 proportions on the real one.
sns.catplot(x  ='sex', y = 'chol', hue  ='age_group', data = df, kind = 'box',
            height = 5, aspect = 4)
plt.show()

<a id="8"></a>

# 8. Feature Engineering

In [None]:
# One-hot encode 'sex': the column is replaced by one indicator column per
# distinct value (get_dummies encodes any column named in `columns`).
df = pd.get_dummies(df, columns=["sex"])
df.head()


In [None]:
# One-hot encode 'age_group': the column is replaced by one indicator column
# per category.
df = pd.get_dummies(df, columns=["age_group"])
df.head(25)


In [None]:
# One-hot encode 'slope': the column is replaced by one indicator column per
# distinct value.
df = pd.get_dummies(df, columns=["slope"])
df.head(25)


In [None]:
# One-hot encode 'cp': the column is replaced by one indicator column per
# distinct value.
df = pd.get_dummies(df, columns=["cp"])
df.head(25)


In [None]:
# One-hot encode 'fbs': the column is replaced by one indicator column per
# distinct value.
df = pd.get_dummies(df, columns=["fbs"])
df.head()


In [None]:
# One-hot encode 'restecg': the column is replaced by one indicator column
# per distinct value.
df = pd.get_dummies(df, columns=["restecg"])
df.head()


In [None]:
# One-hot encode 'exang': the column is replaced by one indicator column per
# distinct value.
df = pd.get_dummies(df, columns=["exang"])
df.head()

<a id="9"></a>

# 9. Modeling

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

<a id="9.1"></a>

## 9.1 Train-test split

In [None]:
train_df_len  = int(df.shape[0]*0.66)

In [None]:
# Rows after the first train_df_len rows, with the label column removed.
# drop() returns a new frame here; the original dropped in place on a slice
# of df, which triggers pandas' SettingWithCopyWarning.
test = df[train_df_len:].drop(labels = ['target'], axis = 1)

In [None]:
test.head()

In [None]:
# Build the modelling set from the first train_df_len rows of df.
# NOTE(review): this positional slice depends on the row order of the CSV;
# confirm the file is not sorted by target, otherwise the class balance of
# `train` will not reflect the full dataset.
train = df[:train_df_len]
X_all = train.drop(labels = 'target', axis = 1)
y_all = train['target']

# Hold out 33% for evaluation. stratify keeps the class ratio the same in
# both parts, and separate input names avoid reusing X_train/y_train as both
# the input and the output of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size = .33, random_state = 123, stratify = y_all)
print('X_train', len(X_train))
print('X_test', len(X_test))
print('y_train', len(y_train))
print('y_test', len(y_test))

<a id="9.2"></a>

## 9.2 Simple Logistic Regression

In [None]:
# Baseline: logistic regression with default settings, scored as a
# percentage accuracy (two decimals) on both the training and held-out split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
acc_log_train, acc_log_test = (
    round(logreg.score(features, target) * 100, 2)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print("Training Accuracy: % {}".format(acc_log_train))
print("Testing Accuracy: % {}".format(acc_log_test))

<a id="9.3"></a>

## 9.3 Hyperparameter Tuning -- Grid Search -- Cross Validation
  
Compare 5 ML classifiers and evaluate the mean accuracy of each of them using stratified cross-validation.

* Decision Tree
* SVM
* Random Forest
* KNN
* Logistic Regression

In [None]:
# Base estimators to tune, listed in the same order as their parameter grids.
random_state = 1001
classifier = [DecisionTreeClassifier(random_state = random_state),
              SVC(random_state = random_state),
              RandomForestClassifier(random_state = random_state),
              # The grid for this model searches both 'l1' and 'l2' penalties.
              # scikit-learn's default lbfgs solver rejects 'l1', so those
              # candidates would fail to fit; liblinear supports both.
              LogisticRegression(random_state = random_state, solver = 'liblinear'),
              KNeighborsClassifier()]

In [None]:
# Decision-tree search space: split thresholds 10, 30, ..., 490 and odd
# depths 1 through 19.
dt_param_grid = dict(
    min_samples_split=range(10, 500, 20),
    max_depth=range(1, 20, 2),
)

In [None]:
# SVM search space: RBF kernel only, over a grid of gamma and C values.
svc_param_grid = dict(
    kernel=['rbf'],
    gamma=[0.001, 0.01, 0.1, 1],
    C=[1, 10, 50, 100, 200, 300, 1000],
)

In [None]:
# Random-forest search space; bootstrap and criterion are each fixed to a
# single value.
rf_param_grid = dict(
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[False],
    n_estimators=[100, 300],
    criterion=["gini"],
)

In [None]:
# Logistic-regression search space: seven log-spaced C values from 1e-3 to
# 1e3, combined with both penalty types.
logreg_param_grid = dict(
    C=np.logspace(-3, 3, 7),
    penalty=['l1', 'l2'],
)

In [None]:
# KNN search space: odd neighbour counts 1 through 19, both weighting
# schemes and two distance metrics.
knn_param_grid = dict(
    n_neighbors=list(range(1, 20, 2)),
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

In [None]:
# Parameter grids in the same order as the `classifier` list: the grid-search
# loop pairs classifier[i] with classifier_param[i], so the two lists must
# stay aligned (decision tree, SVC, random forest, logistic regression, KNN).
classifier_param = [dt_param_grid,
                    svc_param_grid,
                    rf_param_grid,
                    logreg_param_grid,
                    knn_param_grid]

In [None]:
# Tune every classifier with a 10-fold stratified grid search on the training
# split, keeping each model's best CV accuracy and best fitted estimator.
cv_result = []
best_estimators = []
for estimator, grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid=grid,
        cv=StratifiedKFold(n_splits=10),
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train, y_train)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(cv_result[-1])

In [None]:
# Collect the best cross-validation accuracy of every tuned model; the names
# are listed in the same order as the models were tuned.
cv_results = pd.DataFrame({"Cross Validation Means":cv_result,
                           "ML Models":["DecisionTreeClassifier", "SVM","RandomForestClassifier","LogisticRegression","KNeighborsClassifier"]})

# Pass x and y by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises a TypeError there.
g = sns.barplot(x = "Cross Validation Means", y = "ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores")

<a id="9.4"></a>

## 9.4 Ensemble Modeling

In [None]:
# Soft-voting ensemble of the tuned decision tree, random forest and logistic
# regression (entries 0, 2 and 3 of best_estimators).
votingC = VotingClassifier(estimators = [("dt",best_estimators[0]),
                                         ("rfc",best_estimators[2]),
                                         ("lr",best_estimators[3])],
                                         voting = "soft", n_jobs = -1)
votingC = votingC.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the
# printed value does not change, but the documented order keeps this line
# correct if it is later swapped for an asymmetric metric.
print(accuracy_score(y_test, votingC.predict(X_test)))