In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the heart-attack dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'

# Load the raw table and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for the columns of the loaded frame.
df.describe()

In [None]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Display every row that exactly repeats an earlier row.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

In [None]:
# Remove exact duplicate rows and report the frame shape before and after.
print('Shape before drop duplicates {}'.format(df.shape))
# Rebind rather than mutate in place, and renumber the rows: the feature
# frames built later are re-indexed from 0, so leaving a gap here would make
# the target's row labels disagree with theirs.
df = df.drop_duplicates().reset_index(drop=True)
print('Shape after drop duplicates {}'.format(df.shape))

In [None]:
# Frequency of each code in the 'thall' column.
thall_codes = df['thall']
thall_codes.value_counts()

## Exploratory Data Analysis

1. age - age in years

2. sex - sex (1 = male; 0 = female)

3. cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)

4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)

5. chol - serum cholesterol in mg/dl

6. fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

7. restecg - resting electrocardiographic results (0 = normal; 1 = having ST-T; 2 = hypertrophy)

8. thalach - maximum heart rate achieved

9. exang - exercise induced angina (1 = yes; 0 = no)

10. oldpeak - ST depression induced by exercise relative to rest

11. slope - the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)

12. ca - number of major vessels (0-3) colored by fluoroscopy

13. thal - 2 = normal; 1 = fixed defect; 3 = reversible defect

14. num - the predicted attribute (stored in the `output` column of this dataset) - diagnosis of heart disease (angiographic disease status) (Value 0 = < 50% diameter narrowing; Value 1 = > 50% diameter narrowing)

In [None]:
# Columns treated as continuous measurements.
num_feature = [
    'age',
    'trtbps',
    'chol',
    'thalachh',
    'oldpeak',
]

# Columns treated as categorical codes.
cat_feature = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exng',
    'slp',
    'caa',
    'thall',
]

In [None]:
# 2x3 grid of violin plots: distribution of the target ('output') within each
# level of six categorical columns, drawn in the listed order.
violin_columns = ['sex', 'thall', 'exng', 'restecg', 'cp', 'fbs']

plt.figure(figsize=(13, 13))
for panel, column in enumerate(violin_columns, start=1):
    plt.subplot(2, 3, panel)
    sns.violinplot(x=column, y='output', data=df)
    if column == 'cp':
        # Only the chest-pain panel gets smaller, rotated tick labels.
        plt.xticks(fontsize=9, rotation=45)
plt.show()

In [None]:
# One bar chart of level frequencies per categorical feature.
# The loop index was never used, so iterate over the names directly, and give
# each figure a title so it can be read on its own.
for feature in cat_feature:
    ax = sns.countplot(data=df, x=feature)
    ax.set_title('Count of {}'.format(feature))
    plt.show()

In [None]:
# Correlation heatmap restricted to the numeric features.
df_corr = df[num_feature].corr().T
# Mask the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(df_corr))

fig = plt.figure(figsize=(10, 10))
sns.heatmap(
    df_corr,
    annot=True,
    fmt=".1f",
    cmap='YlGnBu',
    mask=mask,
)
plt.show()

In [None]:
# Correlation heatmap restricted to the categorical (integer-coded) features.
df_corr = df[cat_feature].corr().T
# Mask the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(df_corr))

fig = plt.figure(figsize=(10, 10))
sns.heatmap(
    df_corr,
    annot=True,
    fmt=".1f",
    cmap='YlGnBu',
    mask=mask,
)
plt.show()

In [None]:
# Correlation heatmap over every column of the frame, target included.
df_corr = df.corr().T
# Mask the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(df_corr))

fig = plt.figure(figsize=(10, 10))
sns.heatmap(
    df_corr,
    annot=True,
    fmt=".1f",
    cmap='YlGnBu',
    mask=mask,
)
plt.show()

In [None]:
# One distribution plot (histogram with a KDE curve) per numeric feature.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement for its default histogram-plus-KDE output.
for feature in num_feature:
    sns.histplot(df[feature], kde=True, stat="density")
    plt.show()

## Data preprocessing and Modeling

In [None]:
# Separate the target column from the predictors.
y = df['output']
X = df.drop(columns=['output'])

In [None]:
# Numeric predictors, and categorical predictors without 'fbs'.
df_num = X[num_feature]
df_cat = X[cat_feature].drop(columns=['fbs'])

# Show the three shapes side by side as a quick sanity check.
df_cat.shape, df_num.shape, y.shape

In [None]:
# Preview the first rows of the categorical subset.
df_cat.head()

In [None]:
# Preview the first rows of the numeric subset.
df_num.head()

In [None]:
# Integer code -> readable label for each categorical column. These are used
# to relabel the categorical frame before one-hot encoding, so the labels end
# up in the dummy column names.
sex = {1: 'Male', 0: 'Female'}
cp = {0: 'typical angina', 1: 'atypical angina', 2: 'non-anginal pain', 3: 'asymptomatic'}
restecg = {0: 'Normal_restecg', 1: 'Having ST-T', 2: 'Hypertrophy'}
exng = {0: 'exang_no', 1: 'exang_yes'}
slope = {0: 'upsloping', 1: 'flat', 2: 'downsloping'}
# The data dictionary gives the vessel count as 0-3, so code 3 needs a label
# too; without it that code stays a raw integer beside the string labels.
# NOTE(review): any other code present in the data still passes through
# unmapped - confirm against df['caa'].value_counts().
ca = {0: 'ca_level1', 1: 'ca_level2', 2: 'ca_level3', 3: 'ca_level4'}
# NOTE(review): code 0 is mapped to None, presumably to mark it as missing -
# confirm that is the intended handling.
thal = {0: None, 1: 'fixed defect', 2: 'Normal', 3: 'reversable defect'}

In [None]:
# Swap the integer codes of each categorical column for readable labels,
# one column at a time and in a fixed order.
column_labels = [
    ('sex', sex),
    ('cp', cp),
    ('restecg', restecg),
    ('exng', exng),
    ('slp', slope),
    ('caa', ca),
    ('thall', thal),
]
for column, labels in column_labels:
    df_cat = df_cat.replace({column: labels})

# Renumber the rows from 0 and discard the old index.
df_cat = df_cat.reset_index(drop=True)
df_cat

In [None]:
# One-hot encode every labelled categorical column.
dummy_columns = ['sex', 'cp', 'restecg', 'exng', 'slp', 'caa', 'thall']
df_cat = pd.get_dummies(data=df_cat, columns=dummy_columns)
df_cat.shape

In [None]:
## Standardize the numeric features
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fit on all rows before the train/test split, so
# the held-out rows contribute to the mean/variance used for scaling.
std_scale = StandardScaler()
scaled_values = std_scale.fit_transform(df_num)

# Wrap the scaled array back into a frame carrying the original column names.
df_num_scale = pd.DataFrame(scaled_values, columns=df_num.columns)
df_num_scale

In [None]:
# Place the scaled numeric columns and the one-hot columns side by side.
feature_parts = [df_num_scale, df_cat]
df_ex = pd.concat(feature_parts, axis=1)
df_ex

In [None]:
# Compare the size of the raw frame with the engineered feature matrix.
# Each shape is a (rows, columns) pair, unpacked straight into the template.
print('Shape before feature engineering has Rows = {0} and Columns = {1}'.format(*df.shape))
print('Shape before Modeling has Rows = {0} and Columns = {1}'.format(*df_ex.shape))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df_ex,
    y,
    test_size=0.2,
    random_state=0,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-penalized logistic regression fitted on the training split.
clf1 = LogisticRegression(penalty='l2', random_state=0)
clf1.fit(X_train, y_train)

# Displayed as (test accuracy, train accuracy).
clf1.score(X_test, y_test), clf1.score(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 depth-2 trees using the entropy split criterion.
clf2 = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    max_depth=2,
    random_state=0,
)
clf2.fit(X_train, y_train)

# Displayed as (test accuracy, train accuracy).
clf2.score(X_test, y_test), clf2.score(X_train, y_train)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# 100 boosted depth-1 trees with a learning rate of 0.05.
clf3 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=1,
    random_state=0,
)
clf3.fit(X_train, y_train)

# Displayed as (test accuracy, train accuracy).
clf3.score(X_test, y_test), clf3.score(X_train, y_train)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
clf4 = GaussianNB()
clf4.fit(X_train, y_train)

# Displayed as (test accuracy, train accuracy).
clf4.score(X_test, y_test), clf4.score(X_train, y_train)

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting (majority class) ensemble over the four classifiers above.
base_estimators = [
    ('lr', clf1),
    ('rf', clf2),
    ('gbc', clf3),
    ('gnb', clf4),
]
eclf = VotingClassifier(estimators=base_estimators, voting='hard')

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the training split for each model,
# printed as mean and standard deviation across folds.
labelled_models = [
    ('Logistic Regression', clf1),
    ('Random Forest', clf2),
    ('GradientBoostingClassifier', clf3),
    ('naive Bayes', clf4),
    ('Ensemble', eclf),
]
for label, clf in labelled_models:
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))