In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and echo the full path of every
# file found, so the available datasets are visible in the cell output.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend to emit figures in both 'png' and 'retina' formats.
%config InlineBackend.figure_formats = {'png', 'retina'}
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# shorthand color codes (e.g. "b", "g") to the seaborn palette.
sns.set(color_codes=True)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score

import warnings
# Silence every Python warning for the rest of the session to keep cell
# output compact.
# NOTE(review): this also hides deprecation notices from the plotting and
# modelling libraries used below -- consider narrowing the filter.
warnings.simplefilter('ignore')

# Load and Explore Dataset

In [None]:
# Read the heart-attack CSV from the Kaggle input mount, report its
# dimensions, and preview the first rows as the cell output.
csv_path = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
df = pd.read_csv(csv_path)
print("Data Shape:", df.shape)
df.head()

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null
# counts (a quick check for missing values before modelling).
df.info()

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for
# the numeric columns.
df.describe()

## Univariate Analysis

In [None]:
# Frequency of each category, one panel per categorical column.
# Columns are listed in row-major order of the 3x3 grid.
categorical_cols = ['sex', 'cp', 'fbs',
                    'restecg', 'exng', 'slp',
                    'caa', 'thall', 'output']

fig, ax = plt.subplots(nrows=3, ncols=3, sharey=True, figsize=(10,8))
# ax.flat walks the 3x3 axes array row by row, matching the column order above.
for column, panel in zip(categorical_cols, ax.flat):
    sns.countplot(x=column, data=df, palette='winter', ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Visualize the probability density of continuous variables.
#
# sns.distplot is deprecated and scheduled for removal from seaborn; its
# axes-level replacement is sns.histplot. kde=True overlays the density
# curve, stat='density' keeps the y-axis on a density scale, and
# kde_kws={'cut': 3} extends the curve past the data limits the way
# distplot did.
continuous_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

fig, ax = plt.subplots(nrows=1, ncols=5, sharey=False, figsize=(14,3))
for column, panel in zip(continuous_cols, ax):
    sns.histplot(data=df, x=column, kde=True, stat='density',
                 kde_kws={'cut': 3}, color='royalblue', ax=panel)
plt.tight_layout()
plt.show()

## Segmented Univariate Analysis

In [None]:
# Visualize the frequency of categorical values, split by the target
# column 'output' (one bar per category/output combination).
# Columns are listed in row-major order of the 3x3 grid.
hue_cols = ['sex', 'cp', 'fbs',
            'restecg', 'exng', 'slp',
            'caa', 'thall']

fig, ax = plt.subplots(nrows=3, ncols=3, sharey=True, figsize=(10,8))
panels = list(ax.flat)
for column, panel in zip(hue_cols, panels):
    sns.countplot(x=column, data=df, palette='winter', hue='output', ax=panel)

# Only eight variables are drawn on the nine-panel grid; hide the leftover
# axes so the figure does not end with an empty framed plot.
for unused_panel in panels[len(hue_cols):]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of each continuous variable, grouped by the 'output' column:
# one boxen plot per variable, laid out side by side.
boxen_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

fig, ax = plt.subplots(nrows=1, ncols=5, sharey=False, figsize=(15,4))
for column, panel in zip(boxen_cols, ax):
    sns.boxenplot(x='output', y=column, data=df, palette='winter',
                  linewidth=0.1, ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Visualize the pairwise correlation between variables

# Single square canvas for the correlation matrix
f, ax = plt.subplots(figsize=(8, 8))

# Pairwise correlations of the columns of df (pandas default method)
corr_matrix = df.corr()

# Diverging colour map centred on 0 with the colour scale capped at 0.3;
# cell annotations are switched off, so `fmt` has no visible effect.
sns.heatmap(corr_matrix, cmap="coolwarm", vmax=.3, center=0, square=True,
            linewidths=2.5, cbar_kws={"shrink": .5}, annot=False, fmt="1.1f")

# Data Preprocessing

In [None]:
# Separate the target column ('output') from the predictor columns.
# df itself is left untouched; X is a new frame without the target.
y = df['output']
X = df.drop(columns='output')

In [None]:
# Hold out a test set. test_size=0.25 is scikit-learn's default when neither
# size argument is given; it is spelled out here so the split ratio is
# visible. random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Standardization: learn the scaling parameters from the training split
# only, then apply that same fitted scaler to both splits so no test-set
# statistics leak into training.
# Note: X_train / X_test are rebound to the scaler's transformed output.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Make Prediction

In [None]:
# Create a model dictionary
#
# Keys are display labels padded to a common width so the metric reports
# printed later line up in columns; values are unfitted estimators.
#
# Every estimator that draws random numbers during fitting is given an
# explicit seed so that a fresh "Restart & Run All" reproduces the same
# scores. The seed matches the one used for the train/test split.
RANDOM_STATE = 1

models = {"Logistic Regression   ": LogisticRegression(),
          "K-Nearest Neighbors   ": KNeighborsClassifier(),
          # probability=True enables predict_proba, which the AUC report needs
          "Support Vector Machine": SVC(probability=True, random_state=RANDOM_STATE),
          "Decision Tree         ": DecisionTreeClassifier(random_state=RANDOM_STATE),
          "Random Forest         ": RandomForestClassifier(random_state=RANDOM_STATE),
          "Ada Boost             ": AdaBoostClassifier(random_state=RANDOM_STATE),
          "XGBoost               ": XGBClassifier(random_state=RANDOM_STATE),
          "LightGBM              ": LGBMClassifier(random_state=RANDOM_STATE),
          # verbose=0 silences CatBoost's per-iteration training log
          "CatBoost              ": CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE),
          "Multilayer Perceptron ": MLPClassifier(random_state=RANDOM_STATE)
         }

In [None]:
# Train every estimator in the dictionary on the training split.
# Estimators are fitted in place, so `models` holds the trained objects
# afterwards; the labels are not needed here.
for estimator in models.values():
    estimator.fit(X_train, y_train)

# Model Evaluation

In [None]:
# Test-set accuracy for each fitted model, one line per model.
print("Accuracy")
for label, clf in models.items():
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))
    print(label + ": {:.3f}".format(test_accuracy))

In [None]:
# Test-set ROC AUC for each fitted model, one line per model.
print("AUC Score")
for label, clf in models.items():
    # Second column of predict_proba is used as the score for the ROC curve
    positive_scores = clf.predict_proba(X_test)[:, 1]
    print(label + ": {:.3f}".format(roc_auc_score(y_test, positive_scores)))

In [None]:
# Confusion matrix for a freshly fitted logistic regression.
# `log_reg` is kept as a notebook-level name because the feature-importance
# cell reads its coefficients.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Transform to df for easier plotting (rows = true label, columns = predicted)
# NOTE(review): the Negative/Positive names assume the two classes sort as
# (negative, positive) -- confirm against the encoding of 'output'.
cm_df = pd.DataFrame(cm,
                     index = ['Negative','Positive'],
                     columns = ['Negative','Positive'])

plt.figure(figsize=(5.5,4))
# fmt='d' prints the cell counts as plain integers; the default '.2g'
# format would render counts of 100 or more in scientific notation.
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.title('Logistic Regression \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, y_pred)))
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

## Feature Importance

In [None]:
# Use the fitted logistic-regression coefficients as a feature-importance
# proxy: first row of coef_, one value per input column.
# NOTE(review): presumably coef_ has a single row because the target is
# binary -- confirm.
importance = log_reg.coef_[0]

# Pair each coefficient with its column name and draw a horizontal bar chart
df_importance = pd.DataFrame({
    'feature_importance': importance,
    'features': X.columns,
})
sns.barplot(x='feature_importance', y='features', data=df_importance)