# Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into the DataFrame `df`.
df = pd.read_csv("../input/heart-disease-uci/heart.csv")

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Dimensions of the data as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column summary: column name, non-null count and dtype.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Column labels present in the dataset.
df.columns

# Analyse the target variable (Univariate Analysis)

In [None]:
# Distinct values that occur in the target column.
df['target'].unique()

In [None]:
# Number of rows for each distinct target value.
df['target'].value_counts()

In [None]:
# Bar chart of how many rows fall into each target class.
f, ax = plt.subplots(figsize=(6, 4))
# Draw onto the axes created above by passing them explicitly.
ax = sns.countplot(data=df, x="target", ax=ax)
plt.show()

# Data Profiling

In [None]:
# Build a pandas-profiling report for the whole DataFrame.
profile = pandas_profiling.ProfileReport(df)
# Leaving `profile` as the cell's last expression makes the notebook display it.
profile

The profiling report shows that the data contains no missing values, so no missing-value treatment is needed.

# Bivariate Analysis

In [None]:
# Annotated heatmap of the pairwise correlations, used to see how each feature relates to `target`.
plt.rcParams.update({'figure.figsize': (20, 15)})
plt.style.use('ggplot')

# Pairwise correlation matrix of the DataFrame's columns; reused by the next cell.
corrmat = df.corr()
sns.heatmap(data=corrmat, annot=True, cmap='Wistia')
plt.show()


In [None]:
# Rank the columns by their correlation with `target`, largest value first,
# keeping at most the top 20 rows, and present the result as a one-column frame.
Num = (
    corrmat['target']
    .sort_values(ascending=False)
    .head(20)
    .to_frame()
)

Num

**Interpretation of correlation coefficient**

* The correlation coefficient ranges from -1 to +1.

* When it is close to +1, this signifies that there is a strong positive correlation. So, we can see that there is no variable which has strong positive correlation with target variable.

* When it is close to -1, it means that there is a strong negative correlation. So, we can see that there is no variable which has strong negative correlation with target variable.

* When it is close to 0, it means that there is no correlation. So, there is no correlation between target and fbs.

We can see that the cp and thalach variables are mildly positively correlated with the target variable,
while exang, oldpeak, ca and thal are negatively correlated with it. Next, I will analyze the interaction between the positively correlated features (cp and thalach) and the target variable.


In [None]:
# Count of each cp value, split by target class.
f, ax = plt.subplots(figsize=(8, 6))
# Draw onto the axes created above by passing them explicitly.
ax = sns.countplot(data=df, x="cp", hue="target", ax=ax)
plt.show()

In [None]:
# Strip plot of the individual thalach values for each target class.
f, ax = plt.subplots(figsize=(8, 6))
# Draw onto the axes created above by passing them explicitly.
sns.stripplot(data=df, x="target", y="thalach", ax=ax)
plt.show()

Let's rename the columns to make them easier to understand.

In [None]:
# Replace the original headers with descriptive names.
# The assignment is positional: the i-th name below is given to the i-th existing column.
df.columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]

# Show the new column labels.
df.columns

In [None]:
# Decode the integer-coded categorical columns into readable labels.
#
# Each column is rewritten with a single Series.replace call instead of chained
# indexing (df[col][mask] = value). Chained assignment raises
# SettingWithCopyWarning and has no effect at all once pandas copy-on-write is
# enabled; it also writes strings into an integer column element by element.
# Codes that have no entry in a mapping are left unchanged, as before.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thalassemia': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column, labels in category_labels.items():
    # replace() returns a new Series, so assigning it back updates df itself.
    df[column] = df[column].replace(labels)

In [None]:
# Store every decoded categorical column with the generic object dtype.
for column in ('sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg',
               'exercise_induced_angina', 'st_slope', 'thalassemia'):
    df[column] = df[column].astype('object')

Dummy Variable Creation

In [None]:
# One-hot encode the non-numeric columns; drop_first=True omits the first level of
# each encoded column so the remaining indicators are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

# Splitting the Data

In [None]:
# Separate the feature matrix (every column except the label) from the label column.
x = df.drop(columns='target')
y = df['target']

# Report the dimensions of both pieces.
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)

In [None]:
# splitting the sets into training and test sets

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the dimensions of each partition.
print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_test :", y_test.shape)

# Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score 

1. Logistic Regression

In [None]:
# Logistic-regression baseline with scikit-learn's default settings.
# fit() returns the fitted estimator, so construction and training are chained.
model1 = LogisticRegression().fit(x_train, y_train)
# Mean accuracy on the training data itself (not on the held-out test set).
model1.score(x_train, y_train)

2. Random Forest Classifier

In [None]:
# Random forest of 50 trees, each limited to depth 5.
# The forest's bootstrapping and feature sampling are random, so the seed is fixed
# to make the fit (and the score below) reproducible across runs; 0 matches the
# seed already used for train_test_split.
model2 = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=0)
model2.fit(x_train, y_train)
# Mean accuracy on the training data itself (not on the held-out test set).
model2.score(x_train, y_train)

3. XGBoost

In [None]:
# Gradient-boosted trees with hand-set hyperparameters and a fixed seed.
# `n_jobs` replaces `nthread`, the deprecated alias for the thread-count setting;
# the value -1 is carried over unchanged from the original configuration.
model3 = xgb.XGBClassifier(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    random_state=7,
    n_jobs=-1,
)
model3.fit(x_train, y_train)
# Mean accuracy on the training data itself (not on the held-out test set).
model3.score(x_train, y_train)

# Voting

In [None]:
from sklearn.ensemble import VotingClassifier 
# Base learners for the voting ensemble, as (name, estimator) pairs.
# These are fresh, unfitted estimators; the ensemble trains them itself.
estimator = [
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # confirm the installed version still accepts it.
    ('LR', LogisticRegression(solver='lbfgs',
                              multi_class='multinomial',
                              max_iter=200)),
    # Seeded so the ensemble's predictions are reproducible across runs
    # (0 matches the seed already used for train_test_split).
    ('RFC', RandomForestClassifier(random_state=0)),
    ('XGB', XGBClassifier()),
]

In [None]:
# Soft voting: the ensemble combines the class probabilities predicted by the base estimators.
vot_soft = VotingClassifier(voting='soft', estimators=estimator)
# fit() returns the fitted ensemble, so the test-set prediction is chained onto it.
y_pred = vot_soft.fit(x_train, y_train).predict(x_test)
# Display the predicted labels for the test set.
y_pred

In [None]:
# Accuracy of the soft-voting ensemble on the held-out test set.
# accuracy_score returns a fraction between 0 and 1. The original "% d" conversion
# truncated it to an integer, printing 0 for anything below 100%, so the score is
# formatted as a float instead.
score = accuracy_score(y_test, y_pred)
print("Soft Voting Score %.4f" % score)