In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### My first dataset in kaggle

In [None]:
# Importing the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session; this also hides
# deprecation notices raised by the plotting and modelling calls below.
warnings.filterwarnings('ignore')

In [None]:
# Load the heart-attack CSV from the Kaggle input directory into a DataFrame
HEART_CSV = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(HEART_CSV)

In [None]:
# Preview the first five rows to see the columns and the kind of values they hold
df.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Overview of the dataset: column names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

We can see that there are no null values in our dataset.

In [None]:
# Number of entries per column where a missing value is encoded as the string '?'
(df == '?').sum()

We can see that there are no question marks as well.

In [None]:
# Coded categorical predictors; the target column `output` is deliberately left out
categorical_features = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
df_cat = df[categorical_features]

In [None]:
# Re-type the coded categorical variables (and the target) as `object`
for coded_col in ('sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall', 'output'):
    df[coded_col] = df[coded_col].astype('object')

In [None]:
# Continuous (numerical) predictors of the dataset
numerical_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
df_num = df[numerical_features]

In [None]:
# Class balance of the target: absolute counts (bar) next to percentage share (pie)
plt.rcParams['figure.figsize'] = (8, 4)
output_counts = df['output'].value_counts()
fig, (bar_ax, pie_ax) = plt.subplots(1, 2)
output_counts.plot(kind='bar', rot=0, cmap='summer', ax=bar_ax)
output_counts.plot(kind='pie', cmap='icefire', ax=pie_ax, autopct='%.1f%%')
plt.show()

We can see that there is no imbalance in our dataset.

The target variable shows that a slight majority of people, around 54%, have a high chance of having a heart attack.
Let's see how the other features affect it.

In [None]:
# Summary statistics of the numerical columns, including the five-point summary
# (min, quartiles, max)
df_num.describe()

We can see that age varies between 29 and 77. The mean and median are more or less equal, so age is not noticeably skewed.

The bp level varies between 94 and 200. We can observe that few people have very high bp, but the mean is normal.

We can see that cholesterol varies between 126 and 564. The mean and median are different,
so we can tell that the chol distribution is skewed. The standard deviation is also very high,
so there may be outliers in this column.

The thalachh column varies between 71 and 202. We can observe that a few people
have very high heart rates, which could be risky. The mean and median are also quite different, so the column is skewed.
The mean is a little higher than usual.

Old peak is between 0 and 6.2 , there is not much info given about this feature.
But we can tell from mean and median that , it is skewed.



**Univariate Analysis**

In [None]:
# Distribution of each numerical column: box plot, histogram with density curve, violin plot
plt.rcParams['figure.figsize'] = 16, 6
for col in df_num:
    fig, ax = plt.subplots(1, 3)
    # The values are passed as `x=` explicitly: a positional vector is deprecated in
    # seaborn 0.11 and is interpreted as `data` by later releases.
    sns.boxplot(x=df_num[col], ax=ax[0], palette='Greens_r')
    # NOTE(review): distplot is deprecated in recent seaborn releases; histplot(..., kde=True)
    # is the suggested replacement once the minimum seaborn version is confirmed.
    sns.distplot(df_num[col], ax=ax[1], color='green')
    sns.violinplot(x=df_num[col], ax=ax[2], palette='Greens_r')
    plt.show()

We can see in the Age column that, there are more number of people in the age group of
50-60.

We can see in the trtbps column that, most people's bp is in the range between 120 and 140
which is the normal range, but there are also plenty of people with high bp value

Most people have cholesterol in the range 200-300, which is a little higher than the normal cholesterol value.
But since most of the data is about people aged 50-70, this is the trend we are getting.

The thalachh column is having a peak between the range 150-175
which is high for people of the age group 50-70

People with value 0 of old peak is more compared to other values.

We can see that the numerical columns trtbps, chol, thalachh and oldpeak have outliers. It would not be wise to drop the outliers, so we can transform these columns instead.

In [None]:
# Univariate analysis of categorical features: counts (bar) and share (pie) per level
plt.rcParams['figure.figsize'] = 12, 6
for col in df_cat:
    print('\n', col, ':')
    # Count the levels once and reuse the result for both charts
    level_counts = df_cat[col].value_counts()
    fig, ax = plt.subplots(1, 2)
    level_counts.plot(kind='bar', ax=ax[0], rot=0, cmap='Set2')
    level_counts.plot(kind='pie', autopct='%.1f%%', ax=ax[1], cmap='Set3')
    # Each figure is shown inside the loop, so no further show() call is needed afterwards
    plt.show()

In the sex column, we do not know which code is male and which is female.
We have more data for gender type 1 than for gender type 0;
gender type 1 has more than double the records of gender type 0.

In the exercise induced angina column, we can see that only a small share of people,
about 32%, get chest pain due to physical activity.

In the caa column, we can see 0th value of caa is taking more than 50%

In the cp column, we see that around 50% of people have typical angina

The fasting blood sugar column says, around 85% of people's blood sugar is normal
and only 15% of people have fasting blood sugar higher than normal value

From restecg, we see that, around 50% of the people's ecg is normal

In slp column , the values 1 and 2 are contributing more

The thallium stress test tells that the value 2 is more compared to other values

#### Bivariate Analysis

In [None]:
# Numerical columns against the target: box and violin plot per output class
for col in df_num:
    fig, ax = plt.subplots(1, 2)
    # x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
    # so the positional form raises a TypeError there.
    sns.boxplot(x=df['output'], y=df_num[col], ax=ax[0], palette='spring')
    sns.violinplot(x=df['output'], y=df_num[col], ax=ax[1], palette='spring')
    plt.show()

When comparing age with the output we can see an anomaly that
People between age 50-65 have less chances of getting a heart attack
Whereas, for having chances of getting heart attack, it is uniformly distributed
for all the ages between 40-70.

With trtbps with output, we can see that people with high bp have less
chances of getting heart attack, whereas, those with bp between 120 and
140 are having higher chances.

From the distribution of cholesterol against output we can see that people
with very high cholesterol, that is more than 400, are surely at risk
of getting a heart attack

When comparing thalachh with output we can see that, people with less heart rates
have very less chances of getting heart attack, and similarly the people with higher
heart rates are having more chances of getting heart attack

From old peak with output we can see that, the people with old peak value 0 are
having more chances of getting heart attack, also we can see that, as old peak value
increases, the chance of getting heart attack is reduced.

In [None]:
# Categorical columns with Output: class counts per level (left) and the
# percentage split of the output classes within each level (right)
plt.rcParams['figure.figsize'] = 10, 4
for col in df_cat:
    print(col, 'Vs', 'output:')
    fig, ax = plt.subplots(1, 2)
    # The vector is passed as `x=`; a positional vector is read as `data` by newer seaborn
    sns.countplot(x=df_cat[col], hue=df['output'], ax=ax[0], palette='autumn')
    # Row-wise percentages: each level of the feature sums to 100
    row_pct = pd.crosstab(df_cat[col], df['output'], normalize='index') * 100
    ax_1 = row_pct.plot(kind='bar', stacked=True, ax=ax[1], rot=0, cmap='summer')
    # Write the percentage in the middle of every stacked segment
    for rec in ax_1.patches:
        height = rec.get_height()
        ax_1.text(rec.get_x() + rec.get_width() / 2,
                  rec.get_y() + height / 2,
                  "{:.0f}%".format(height),
                  ha='center',
                  va='bottom')
    plt.show()

When comparing sex with the output, we can see that, the gender 0 is having 
very high chance of getting heart attack, while the gender 1 is having around 50% 
chance of getting the heart attack

Comparing exng with output, we can see an anomaly: the people who do
not get pain due to physical activity have a higher chance of getting a heart attack

The caa with output is also having an anomaly, when the number of vessels is very less
that is when 0, the chances of getting heart attack are around 75% and also
when the number of major vessels is 4, there is 80% chance for getting heart attack
The intermediate number of vessels are having less percentage of people getting heart attack

We can see from cp that, the atypical angina is having more than 80% of people getting heart attack
Followed by people with non-anginal and asymptomatic, where both of them are having
more than 70% chance of getting heart attack

Fasting blood sugar does not seem to have an effect on output, as
people with high fasting blood sugar and people with normal fasting blood sugar have
almost equal chances of getting a heart attack

When comparing restecg with output, we can see that, people with normal ecg 
is also having around 46% chances of getting heart attack. Also, when there is abnormality
in the restecg, there is around 63% chance of getting heart attack.

While comparing slp with output, we can see that, the slp value 2 is having very high
chance of getting heart attack, the other two slope values too have some significant
chance of getting heart attack

From thallium stress test result we can see that, the value 2 is having 78% chance
of getting heart attack and also value 0 is having 50% chance, the other two values
are having less chances.

In [None]:
# Correlation plot of the continuous features.
# The coded categorical columns were re-typed to `object` earlier, and whether
# DataFrame.corr() drops or converts such columns depends on the pandas version,
# so the numerical frame is used explicitly.
sns.heatmap(df_num.corr(), annot=True, cmap='coolwarm')
plt.show()

There is no multicollinearity between the features.


### Hypothesis testings

#### Categorical Features
Null hypothesis: The feature does not have effect on Output

Alternate hypothesis: The feature has an effect on Output

In [None]:
from scipy.stats import chi2_contingency
# Chi-square test of independence between every categorical feature and the target
for feature in df_cat:
    print(feature, 'Vs', 'Output')
    contingency_table = pd.crosstab(df_cat[feature], df['output'])
    print(chi2_contingency(contingency_table))
    print('\n')

So, fbs column does not have an effect on the output column, so we can drop it.

In [None]:
# Remove the fasting-blood-sugar column from the modelling frame
df = df.drop(columns='fbs')

Normality test

Null hypothesis: The data is normally distributed

Alternate hypothesis: The data is not normally distributed

In [None]:
# Shapiro-Wilk test on every numerical feature, run separately for each output class
import scipy.stats as st
from scipy.stats import shapiro
for feature in df_num:
    print(feature, 'Vs', 'output')
    for label in (0, 1):
        print(shapiro(df.loc[df['output'] == label, feature]))
    print('\n')

Numerical Columns

Null hypothesis: The feature does not have an effect on Output

Alternate hypothesis: The feature has an effect on Output

In [None]:
from scipy.stats import mannwhitneyu
# Mann-Whitney U test per numerical feature: compares the feature's values between
# the two output classes.
for i in df_num:
    z = df[df['output'] == 1][i]
    w = df[df['output'] == 0][i]
    # The stated hypothesis ("has an effect") is two-sided, so the alternative is set
    # explicitly instead of relying on the SciPy-version-dependent default.
    p_value = mannwhitneyu(z, w, alternative='two-sided')[1]
    print('Mann-Whitney U test for %s with output pvalue is:' % i, p_value)
    print('\n')

So, other than fbs, all the other columns have an effect on the target variable, and we can build the model.

In [None]:
# Transformation: apply a power transform to every numeric column, one column at a time
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
numeric_columns = df.select_dtypes(np.number)
# NOTE(review): the transformer is fitted on the full dataset, i.e. before the
# train/test split further down - confirm this is intended.
for column in numeric_columns:
    df[column] = pt.fit_transform(df[[column]])

### Model Building

In [None]:
# Convert only the coded categorical columns (stored as `object` for the analysis above)
# back to integers. Casting every column would also truncate the power-transformed
# continuous features to whole numbers and throw away most of their information.
for i in df.select_dtypes(include='object').columns:
    df[i] = df[i].astype('int64')

In [None]:
# Separate the target column from the predictors
y = df['output']
X = df.drop(columns='output')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.3)
xtrain, xtest, ytrain, ytest = split_parts

In [None]:
# Shapes of the four parts produced by the split
tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

In [None]:
# Scaling: the scaler learns mean and standard deviation from the training set only,
# and the same statistics are then applied to the test set. Re-fitting on the test set
# would scale both sets differently and leak test information into the evaluation.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Rebuild the frames so column names and row index are kept and no assignment is made
# on a slice of X.
xtrain = pd.DataFrame(sc.fit_transform(xtrain), columns=xtrain.columns, index=xtrain.index)
xtest = pd.DataFrame(sc.transform(xtest), columns=xtest.columns, index=xtest.index)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, recall_score, f1_score, accuracy_score, precision_score

In [None]:
def models(model,xtrain,xtest,ytrain,ytest):
    """Fit `model` on the training data and print its evaluation.

    Prints the classification report, train and test accuracy, the AUC score
    computed from the second column of `predict_proba`, and the confusion matrix.

    Parameters
    ----------
    model : estimator exposing fit, predict, predict_proba and score
    xtrain, ytrain : features and labels used for fitting
    xtest, ytest : features and labels used for evaluation

    Returns
    -------
    None - the results are only printed.
    """
    fitted = model.fit(xtrain, ytrain)
    predicted_labels = fitted.predict(xtest)
    positive_scores = fitted.predict_proba(xtest)[:, 1]

    print(classification_report(ytest, predicted_labels))
    train_accuracy = fitted.score(xtrain, ytrain)
    test_accuracy = fitted.score(xtest, ytest)
    print('Train Accuracy:', train_accuracy, '\nTest Accuracy', test_accuracy)
    auc = roc_auc_score(ytest, positive_scores)
    print('AUC Score:', auc)
    matrix = confusion_matrix(ytest, predicted_labels)
    print('Confusion Matrix:\n', matrix)

In [None]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# The 'sag' solver shuffles the samples, so the seed is fixed to make the fit repeatable
lr = LogisticRegression(solver='sag', random_state=42)
models(lr,xtrain,xtest,ytrain,ytest)

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid search over penalty type and regularisation strength for logistic regression
param_grid = [
    {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']}]
gs= GridSearchCV(LogisticRegression(),
                      param_grid = param_grid,
                      cv=3,
                      n_jobs=-1,
                      verbose=3)
gs.fit(xtrain,ytrain)
# The label names the estimator that was actually searched
print('Best parameters for Logistic Regression: ', gs.best_params_, '\n')

In [None]:
# Build the tuned logistic regression from the parameters selected by the grid search
# in the previous cell, instead of values copied by hand from an earlier run.
lr_t = LogisticRegression(**gs.best_params_)
models(lr_t,xtrain,xtest,ytrain,ytest)

In [None]:
# K-nearest-neighbours classifier with the library's default settings
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
models(model=knn, xtrain=xtrain, xtest=xtest, ytrain=ytrain, ytest=ytest)

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid search over the number of neighbours (2 to 19)
g= {'n_neighbors' : range(2,20)}
gs= GridSearchCV(KNeighborsClassifier(),
                      param_grid =g,
                      cv=3,
                      n_jobs=-1,
                      verbose=3)
gs.fit(xtrain,ytrain)
# The label names the estimator that was actually searched
print('Best parameters for KNeighbors Classifier: ', gs.best_params_, '\n')

In [None]:
# Build the tuned KNN model from the parameters selected by the grid search in the
# previous cell, instead of a value copied by hand from an earlier run.
knn_t = KNeighborsClassifier(**gs.best_params_)
models(knn_t,xtrain,xtest,ytrain,ytest)

In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
# Fixed seed so the reported scores are reproducible between runs
dt = DecisionTreeClassifier(random_state=42)
models(dt,xtrain,xtest,ytrain,ytest)

In [None]:
# Finding the best parameters with an exhaustive grid search to avoid overfitting
from sklearn.model_selection import GridSearchCV
g= {'criterion': ['entropy', 'gini'],
                     'max_depth': range(2, 20),
                     'min_samples_split': range(2,10)}
# The tree is seeded so the same parameter combination wins on every run
gs= GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid =g,
                      cv=3,
                      n_jobs=-1,
                      verbose=3)
gs.fit(xtrain,ytrain)
print('Best parameters for Decision Tree Classifier: ', gs.best_params_, '\n')

In [None]:
# Build the tuned tree from the parameters selected by the grid search in the previous
# cell, instead of values copied by hand from an earlier run; seeded for reproducibility.
dt_t = DecisionTreeClassifier(random_state=42, **gs.best_params_)
models(dt_t,xtrain,xtest,ytrain,ytest)

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the bootstrap samples, and therefore the scores, are reproducible
rf = RandomForestClassifier(random_state=42)
models(rf,xtrain,xtest,ytrain,ytest)

In [None]:
# Ada Boost Classifier
from sklearn.ensemble import AdaBoostClassifier
# Fixed seed so the scores and the feature importances are reproducible
adb = AdaBoostClassifier(random_state=42)
models(adb,xtrain,xtest,ytrain,ytest)

In [None]:
# Comparison table of the fitted models on the test set.
# Every metric is derived from one set of predictions per model, and the F1 column
# uses f1_score for every row (the decision tree row previously reported its
# precision there).
# NOTE(review): the random forest (`rf`) is not part of this table - confirm intended.
compared_models = {'Logistic Regression': lr_t,
                   'KNeighborsClassifier': knn_t,
                   'AdaBoostClassifier': adb,
                   'DecisionTreeClassifier': dt_t}
summary_rows = []
for model_name, clf in compared_models.items():
    test_pred = clf.predict(xtest)
    test_proba = clf.predict_proba(xtest)[:, 1]
    summary_rows.append({'Models': model_name,
                         'Accuracy': accuracy_score(ytest, test_pred),
                         'AUC Score': roc_auc_score(ytest, test_proba),
                         'Recall Score': recall_score(ytest, test_pred),
                         'Precision Score': precision_score(ytest, test_pred),
                         'F1 Score': f1_score(ytest, test_pred)})
final = pd.DataFrame(summary_rows)
final

In [None]:
# Feature importances of the tuned decision tree, one row per feature.
# The dict is the frame's data; passing it as `columns=` does not load the values.
feat_imp = pd.DataFrame({'Features': X.columns, 'Importance': dt_t.feature_importances_})

In [None]:
# AdaBoost feature importances, most influential feature first
important_features = pd.DataFrame(
    {'Features': xtrain.columns, 'Importance': adb.feature_importances_}
).sort_values(by='Importance', ascending=False)

# display the dataframe
important_features

In [None]:
# Horizontal bar chart of the feature importances
importance_ax = sns.barplot(data=important_features, x='Importance', y='Features')
importance_ax.set_title('Feature Importance', fontsize=15)
importance_ax.set_xlabel('Importance', fontsize=15)
importance_ax.set_ylabel('Features', fontsize=15)
plt.show()

We can see from the feature importance graph that, the caa feature is having more influence in the output column followed by cp and slp, sex and thalachh features. Even old peak,thall, age,restecg, exng seems to have some effect on Output. The trtbps and chol are not influencing the output feature much. 

We saw from bivariate that when the number of major vessels(caa value) is too small and when too large it was having more chances of getting heart attack. Also the chest pain type is having high influence on Output, so whether the person will get heart attack or not is mainly determined by the number of major vessels and the type of chest pain that person gets.

The slp value also has a very high influence. An interesting feature is sex, where we saw from the bivariate analysis that gender type 0 specifically has a higher chance of getting a heart attack.

The normal heart rate is also influencing very much, we saw that when that person is having higher heart rate then that person has more chances of getting heart attack and we also saw that when the heart rate is very less, then that person does not have a chance of getting heart attack at all.

So, any health-care organisation should focus on these five features (caa, cp, slp, sex and thalachh) to determine whether a person is likely to get a heart attack.