In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import warnings
warnings.filterwarnings('ignore')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://cdn-images-1.medium.com/max/800/1*tGeiO5zee6exueRC8iBuaQ.jpeg)

**Problem statement: Predict the Heart Attack based on the features provided.**

**Objective: The target feature `output` is binary: 0 means no heart attack and 1 means heart attack. Because this is medical data, our objective is to minimize false negatives (the model predicting "no" for a patient who actually has a heart attack).**

Please review and provide your feedback

# Read Data

In [None]:
# Load the heart-attack dataset into `df`; every later cell reads from this frame.
df= pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')


In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

All features are either float or int, which is convenient for modeling; however, some of the features are discrete.

In [None]:
df.isnull().sum()

No Null values, seems the dataset is well managed

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Five-colour palette shared by every seaborn plot below.
colors = ['#003f5c','#58508d','#bc5090','#ff6361','#ffa600']
# Global seaborn theme: serif font, 'white' style, and the same 'whitesmoke'
# background for both the axes and the figure.
sns.set(palette=colors, font='Serif', style='white', rc={'axes.facecolor':'whitesmoke', 'figure.facecolor':'whitesmoke'})

In [None]:
df.describe()

In [None]:
# Univariate overview: one panel per column of `df`.
# Columns with more than 5 distinct values get a density plot, the rest a count plot.
n_grid_cols = 4
# Ceiling division so the grid always has room for every column
# (14 columns -> 4 rows, the same 4x4 grid as before).
n_grid_rows = max(1, -(-len(df.columns) // n_grid_cols))
fig, ax = plt.subplots(ncols=n_grid_cols, nrows=n_grid_rows, figsize=(15,8), constrained_layout=True)
ax = ax.flatten()
plt.suptitle("Univariated Analysis", size=20, weight='bold')
for i, a in enumerate(df.columns):
    if df[a].nunique() > 5:
        sns.kdeplot(x=df[a], ax=ax[i], fill=True)
    else:
        sns.countplot(data=df, x=a, ax=ax[i])
    for s in ['left','right','top','bottom']:
        ax[i].spines[s].set_visible(False)

# Blank out whatever panels are left over, once, after the loop. This replaces
# the hard-coded ax[14]/ax[15] clean-up that was repeated on every iteration
# and only matched a 14-column frame.
for unused in ax[len(df.columns):]:
    for s in ['left','right','top','bottom']:
        unused.spines[s].set_visible(False)
    unused.set_xticks([])
    unused.set_yticks([])

In [None]:
# Numeric (continuous) features: columns with more than 5 distinct values.
# `num_cols` is reused by the outlier-removal and hypothesis-testing cells below.
num_cols = df.loc[:, df.nunique() > 5].columns
# One row per numeric feature (box plot on the left, density on the right).
# Sizing the grid from `num_cols` avoids an IndexError when the number of numeric
# features is not exactly five; squeeze=False keeps `ax` two-dimensional.
fig, ax = plt.subplots(ncols=2, nrows=len(num_cols), figsize=(15,10), constrained_layout=True, squeeze=False)
plt.suptitle("Bivariated Analysis (comparing numeric features with target feature)", size=20, weight='bold')
# No per-column nunique() check is needed here: `num_cols` is already filtered by it.
for (box_ax, kde_ax), a in zip(ax, num_cols):
    sns.boxplot(y=df[a], ax=box_ax, x=df['output'])
    sns.kdeplot(x=df[a], ax=kde_ax, hue=df['output'], fill=True, linewidth=2)
    
        
   

In [None]:
def clean_outliers(df1, features, factor=1.5):
    """Drop rows whose value in any of ``features`` falls outside the IQR fences.

    For each feature, in the order given, the fences are
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``; only rows strictly inside
    both fences are kept (a value exactly on a fence is dropped). The quartiles
    of each feature are computed on the rows that survived the filtering of the
    previous features, so the result can depend on the order of ``features``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    features : iterable of str
        Column names to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df1`` that lie inside the fences of every feature.
        The computed fences of each feature are printed as a side effect.
    """
    for i in features:
        q1 = df1[i].quantile(0.25)
        q3 = df1[i].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr * factor
        upper = q3 + iqr * factor
        print("Feature {} has min value: {} max value: {}".format(i, lower, upper))
        # Boolean indexing returns a new frame, so the caller's frame is left untouched.
        df1 = df1[(df1[i] > lower) & (df1[i] < upper)]
    return df1

In [None]:
# Remove IQR outliers on every numeric column; `df` itself is left unchanged.
df_clean=clean_outliers(df, num_cols)

In [None]:
df_clean.shape

In [None]:
# Same box-plot / density comparison as before, now on the outlier-free frame.
# One row per numeric feature; sizing the grid from `num_cols` avoids an IndexError
# when the number of numeric features is not exactly five.
fig, ax = plt.subplots(ncols=2, nrows=len(num_cols), figsize=(15,10), constrained_layout=True, squeeze=False)
plt.suptitle("Bivariated Analysis after outlier removal (comparing numeric features with target feature)", size=20, weight='bold')
# `num_cols` already holds only the numeric features, so no extra nunique() check
# (which previously looked at `df` instead of `df_clean`) is required.
for (box_ax, kde_ax), a in zip(ax, num_cols):
    sns.boxplot(y=df_clean[a], ax=box_ax, x=df_clean['output'])
    sns.kdeplot(x=df_clean[a], ax=kde_ax, hue=df_clean['output'], fill=True, linewidth=2)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric features
# of the outlier-free frame.
sns.heatmap(df_clean[num_cols].corr(), annot=True)

# Hypothesis Testing

### Statistical hypothesis testing for continuous features

In [None]:
from scipy.stats import f_oneway, ttest_ind
from statsmodels.stats.multicomp import pairwise_tukeyhsd 


# Two-sample t-test per numeric feature: does its mean differ between the
# output == 1 and output == 0 groups of the outlier-free frame?
had_attack = df_clean['output'] == 1
no_attack = df_clean['output'] == 0

for i in num_cols:
    yes = df_clean.loc[had_attack, i]
    no = df_clean.loc[no_attack, i]
    t_stat, p_value = ttest_ind(yes, no)

    # Report against the 5% significance level.
    message = (
        f"Feature {i} has significant difference in Output feature with p_value {np.round(p_value,3)}"
        if p_value < 0.05
        else f"Feature {i} has no significant difference in Output feature with p_value {np.round(p_value,3)}"
    )
    print(message)

### Statistical hypothesis testing for categorical variables

In [None]:
from scipy.stats import f_oneway, ttest_ind, chi2_contingency

# Categorical predictors are the complement of the numeric ones (nunique() > 5),
# hence `<= 5`: with `< 5` a column with exactly five levels was covered by neither test.
# The target itself is excluded, since testing `output` against `output` says nothing.
cat_cols = [c for c in df.columns if df[c].nunique() <= 5 and c != 'output']

# Chi-square test of independence between each categorical feature and the target.
for i in cat_cols:
    # Both sides come from the outlier-free frame so the rows always match.
    crosstab = pd.crosstab(df_clean['output'], df_clean[i])
    stat, p_value, _, _ = chi2_contingency(crosstab)

    if p_value < 0.05:
        print(f"Feature {i} has significant difference in Output feature with p_value {np.round(p_value,3)}")
    else:
        print(f"Feature {i} has no significant difference in Output feature with p_value {np.round(p_value,3)}")

# Model Creation

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

scale = StandardScaler()  # not applied below: the candidate models are all tree-based
X = df_clean.drop(['output'], axis=1)
y = df_clean['output']

from sklearn.model_selection import cross_val_score, KFold, GridSearchCV,train_test_split
# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Seed every estimator: without random_state the chosen hyper-parameters and the
# scores change from run to run, so the comparison could not be reproduced.
model = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
]
# The same grid is searched for every estimator, so build it once.
params = {'max_leaf_nodes': list(range(2, 20))}

for estimator in model:
    # Tune max_leaf_nodes by cross-validated recall, because false negatives
    # are the costly error for this problem.
    score = GridSearchCV(estimator, param_grid=params, scoring='recall')
    score.fit(X_train, y_train)
    print(score.best_params_)
    print(score.best_estimator_)
    print(f"Recall score: {score.best_score_}")
    pred = score.predict(X_test)
    print(f"Recall score of test data: {recall_score(y_test, pred)}")
    print(classification_report(y_test, pred))
    sns.heatmap(confusion_matrix(y_test, pred), annot=True)
    # Label the matrix so the three consecutive figures can be told apart.
    plt.title(type(estimator).__name__)
    plt.show()


**Observation:**
From the models trained above, the decision tree with max_leaf_nodes=4 performs best on recall compared to the other models. Our objective here is to minimise false negatives rather than false positives, so let us train that model on the full training set.


## Training DecisionTree with full trainset

In [None]:
from sklearn.tree import plot_tree

# Final model: decision tree with the leaf budget picked by the grid search.
# Seeded so the fitted tree and the reported scores are reproducible.
model = DecisionTreeClassifier(max_leaf_nodes=4, random_state=42)
model.fit(X_train, y_train)

# model.score() returns accuracy, so the train recall has to be computed
# explicitly to match what the label says.
train_pred = model.predict(X_train)
print(f"Recall score of Train Data: {recall_score(y_train, train_pred)}")
pred = model.predict(X_test)
print(f"Recall score of Test Data: {recall_score(y_test, pred)}")
print(f"Accuracy Score: {accuracy_score(y_test,pred)}")
print(classification_report(y_test, pred))

# Give the confusion matrix and the tree diagram their own axes; with no `ax`
# argument both calls draw into the same current axes.
fig, (cm_ax, tree_ax) = plt.subplots(ncols=2, figsize=(15, 6))
sns.heatmap(confusion_matrix(y_test, pred), annot=True, ax=cm_ax)
plot_tree(model, feature_names=list(X_train.columns), ax=tree_ax)
plt.show()

# Feature importances of the fitted tree, most important first.
feat_imp = pd.DataFrame({'feature': X_train.columns, 'value': model.feature_importances_})
feat_imp.sort_values(by=['value'], ascending=False)

**Observation:**
Accuracy may be lower (78%); however, for medical data the model should prioritise avoiding false negatives over overall accuracy, so I have used recall as the metric for my model. Here the recall is 91%, which means the model misses only about 9% of the actual heart-attack cases, i.e. there are few false negatives.  
**So, may I consider the efficiency of my model to be about 90%?** :)

There is room for further improvement, but I hope this is a good starting point.  

Please provide your valuable feedback. 