In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt


import tensorflow as tf
from tensorflow import keras
# Layers for our neural networks
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
# A pretrained model for transfer learning
from keras.models import Model
from keras.applications import vgg19

# Our normal python data science stack you've come to know and love


import sys

import warnings
warnings.filterwarnings("ignore")



# Helper fuctions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

from sklearn import tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV

import statsmodels.api as sm

import xgboost as xgb
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two CSV files of the heart-attack analysis dataset into DataFrames.
# Both paths are relative to the notebook's working directory.
df_o2sat = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/o2Saturation.csv')
df_heart = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
# Report the (rows, columns) shape of both frames.
print(f'Shape of o2 Saturation dataset {df_o2sat.shape}')
print(f'Shape of heart attack dataset {df_heart.shape}')
# Preview only the first rows instead of dumping the whole frame.
df_heart.head()

In [None]:
# Display the heart-attack DataFrame as this cell's output.
df_heart

In [None]:
# Bar chart of how many rows carry each value of the `output` label.
label_counts = df_heart['output'].value_counts()
ax = label_counts.plot.bar()
ax.set_title('Heart attack frequency')

***Assuming 1 is a heart attack and 0 is no heart attack, there are 165 heart attacks in the dataset.***

In [None]:
# Number of missing entries per column -- no missing values.
df_heart.isnull().sum()

In [None]:
# Annotated heat map of the pairwise correlations between the columns.
corr_matrix = df_heart.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Plot ')

In [None]:
# Column groups used by the plotting and modelling cells.

# Features treated as categorical (8 columns).
categorical_cols = [
    'sex', 'cp', 'fbs', 'restecg',
    'exng', 'slp', 'caa', 'thall',
]

# Features treated as continuous (5 columns).
continous_cols = [
    'age', 'trtbps', 'chol',
    'thalachh', 'oldpeak',
]

# Name of the target column.
label_col = ['output']

In [None]:
# Histogram with a KDE overlay for every continuous feature,
# laid out `max_in_row` panels per figure.
max_in_row = 3
for idx, col in enumerate(continous_cols):
    # The figure number advances every `max_in_row` columns; the subplot
    # slot cycles through 1..max_in_row inside each figure.
    plt.figure(idx // max_in_row, figsize=(40, 4))
    plt.subplot(1, max_in_row, idx % max_in_row + 1)
    plt.title(col)
    # `kde` is an on/off flag, so pass a boolean rather than a number.
    sns.histplot(df_heart[col], bins=50, kde=True);

# Keep the counter at the value the manual loop used to leave behind,
# in case another cell still reads `cnt`.
cnt = len(continous_cols)

In [None]:
# KDE of every continuous feature, split by the `output` label.
# The panel index is local to this loop, so the layout always starts at
# figure 0 / slot 1 and the cell does not need a counter from another cell.
max_in_row = 3
for idx, col in enumerate(continous_cols):
    plt.figure(idx // max_in_row, figsize=(25, 4))
    plt.subplot(1, max_in_row, idx % max_in_row + 1)
    plt.title(col)
    sns.kdeplot(data=df_heart, x=col, hue="output", fill=True,
                common_norm=True, alpha=.5, linewidth=0);

In [None]:
# KDE of every categorical feature, split by the `output` label
# (each class is normalised on its own: common_norm=False).
# The panel index is local to this loop, so the layout always starts at
# figure 0 / slot 1 and the cell does not need a counter from another cell.
max_in_row = 3
for idx, col in enumerate(categorical_cols):
    plt.figure(idx // max_in_row, figsize=(25, 4))
    plt.subplot(1, max_in_row, idx % max_in_row + 1)
    plt.title(col)
    sns.kdeplot(data=df_heart, x=col, hue="output", fill=True,
                common_norm=False, alpha=.5, linewidth=0);

# Some statistical analysis

In [None]:
#some statistical libraries
import statsmodels.api as sm
from scipy.stats import shapiro
import scipy.stats as stats
from scipy.stats import anderson
from scipy.stats import norm, skew

### Are any of the continuous variables in the dataset normally distributed?
Let's use graphs and normality tests to check for normality.

In [None]:
# One box plot per continuous feature, `max_in_row` panels per figure.
cnt = 0
max_in_row = 3
for feature in continous_cols:
    # Quotient selects the figure, remainder the slot inside it.
    fig_no, slot = divmod(cnt, max_in_row)
    plt.figure(fig_no, figsize=(25, 4))
    plt.subplot(1, max_in_row, slot + 1)
    plt.title(feature)
    sns.boxplot(df_heart[feature], orient='v')
    cnt = cnt + 1

## Let's remove the outliers from thalachh, oldpeak, chol, trtbps & test for normality

In [None]:
# Drop thalachh outliers with the 1.5 * IQR rule, then check the remaining
# values for normality (box plot, Shapiro-Wilk test, probability plot).
Q1 = df_heart.thalachh.quantile(.25)
Q3 = df_heart.thalachh.quantile(.75)
IQR = Q3 - Q1  # the 50% between .25 & .75
# NOTE: `filter` shadows the Python builtin; the name is left unchanged in
# case another cell still reads it.
filter = (df_heart.thalachh >= Q1 - 1.5 * IQR) & (df_heart.thalachh <= Q3 + 1.5 * IQR)
thalachh_trimmed = df_heart.loc[filter].thalachh

sns.boxplot(thalachh_trimmed, orient='v')
plt.title('Thalachh boxplot after removing outliers ')

# Print the test result: as a bare expression in the middle of the cell it
# was computed but never displayed.
print(shapiro(thalachh_trimmed))

fig = plt.figure()
res = stats.probplot(thalachh_trimmed, plot=plt)
plt.show()


### Thalachh: the p-value is not greater than .05, so we reject the null hypothesis of normality; the data also does not follow the reference line in the QQ plot

In [None]:
# Drop oldpeak outliers with the 1.5 * IQR rule, then check the remaining
# values for normality (box plot, Shapiro-Wilk test, probability plot).
Q1 = df_heart.oldpeak.quantile(.25)
Q3 = df_heart.oldpeak.quantile(.75)
IQR = Q3 - Q1  # the 50% between .25 & .75
# NOTE: `filter` shadows the Python builtin; the name is left unchanged in
# case another cell still reads it.
filter = (df_heart.oldpeak >= Q1 - 1.5 * IQR) & (df_heart.oldpeak <= Q3 + 1.5 * IQR)
oldpeak_trimmed = df_heart.loc[filter].oldpeak

sns.boxplot(oldpeak_trimmed, orient='v')
plt.title('Oldpeak boxplot after removing outliers ')

# Print the test result: as a bare expression in the middle of the cell it
# was computed but never displayed.
print(shapiro(oldpeak_trimmed))

fig = plt.figure()
res = stats.probplot(oldpeak_trimmed, plot=plt)
plt.show()

### Oldpeak: the p-value is not greater than .05, so we reject the null hypothesis of normality; the data also does not follow the reference line in the QQ plot

In [None]:
# Cholesterol: keep the values inside the 1.5 * IQR fences, then inspect the
# trimmed sample with a box plot, a Shapiro test and a probability plot.
Q1, Q3 = df_heart.chol.quantile(.25), df_heart.chol.quantile(.75)
IQR = Q3 - Q1  # interquartile range
filter = df_heart.chol.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
chol_kept = df_heart.loc[filter].chol

sns.boxplot(chol_kept, orient='v')
plt.title('Chol boxplot after removing outliers ')
print(shapiro(chol_kept))

# Probability plot on a fresh figure.
fig = plt.figure()
res = stats.probplot(chol_kept, plot=plt)
plt.show()

### Cholesterol looks normal: the p-value is > .05, so we fail to reject the null hypothesis of normality

In [None]:
# Resting blood pressure: keep the values inside the 1.5 * IQR fences, then
# inspect the trimmed sample with a box plot, a Shapiro test and a probability plot.
Q1, Q3 = df_heart.trtbps.quantile(.25), df_heart.trtbps.quantile(.75)
IQR = Q3 - Q1  # interquartile range
filter = df_heart.trtbps.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
trtbps_kept = df_heart.loc[filter].trtbps

sns.boxplot(trtbps_kept, orient='v')
plt.title('Trtbps boxplot after removing outliers ')
print(shapiro(trtbps_kept))

# Probability plot on a fresh figure.
fig = plt.figure()
res = stats.probplot(trtbps_kept, plot=plt)
plt.show()

### Trtbps: the p-value is not greater than .05, so we reject the null hypothesis of normality; the data also does not follow the reference line in the QQ plot

## Cholesterol is normal so let's analyze it a bit more

### What is the average cholesterol for people that are more likely to have heart attack?

In [None]:
# z-based confidence interval for the mean cholesterol of the rows with
# output == 1, after removing cholesterol outliers (1.5 * IQR rule).
Q1 = df_heart.chol.quantile(.25)
Q3 = df_heart.chol.quantile(.75)
IQR = Q3 - Q1  # the 50% between .25 & .75
filter0 = (df_heart.chol >= Q1 - 1.5 * IQR) & (df_heart.chol <= Q3 + 1.5 * IQR) & (df_heart.output == 1)
# Index with `filter0` (built above). Using the leftover `filter` from an
# earlier cell would ignore the `output == 1` condition.
sm.stats.DescrStatsW(df_heart.loc[filter0, 'chol']).zconfint_mean()

### What does this mean? It means that the population mean cholesterol of people who are more likely to have a heart attack lies between 231 and 245.

## What about the population average of cholesterol for people that are less likely to have a heart attack?

In [None]:
# z-based confidence interval for the mean cholesterol of the rows with
# output == 0, after removing cholesterol outliers (1.5 * IQR rule).
Q1 = df_heart.chol.quantile(.25)
Q3 = df_heart.chol.quantile(.75)
IQR = Q3 - Q1  # the 50% between .25 & .75
filter1 = (df_heart.chol >= Q1 - 1.5 * IQR) & (df_heart.chol <= Q3 + 1.5 * IQR) & (df_heart.output == 0)
# Index with `filter1` (built above). Using the leftover `filter` from an
# earlier cell would ignore the `output == 0` condition.
sm.stats.DescrStatsW(df_heart.loc[filter1, 'chol']).zconfint_mean()

### What does this mean? It means that the population mean cholesterol of people who are less likely to have a heart attack lies between 241 and 256.

## These population averages are rather similar. Is there a statistically significant difference between the population that is more likely to have a heart attack and the population that is less likely to have a heart attack?
* Null Hypothesis - There is no statistically significant difference between the two populations.
* Alternate Hypothesis - There is a statistically significant difference between the two populations.
## We will use a t-test to see if there is a difference.
### Do the 2 populations share the same variance? Levene's test will tell us.

*  Null hypothesis - the two populations have the same variance
*  Alternate hypothesis- the two populations have different variances

In [None]:
# Levene test on the cholesterol values of the two outlier-free groups.
chol_group_0 = df_heart.loc[filter0, 'chol']
chol_group_1 = df_heart.loc[filter1, 'chol']
leveneTest = stats.levene(chol_group_0, chol_group_1)
leveneTest

### The p-value is greater than .05, so we fail to reject the null hypothesis of equal variances and can proceed with a t-test

In [None]:
# Independent two-sample t-test on cholesterol, assuming equal variances.
ttest = stats.ttest_ind(
    a=df_heart.loc[filter0, 'chol'],
    b=df_heart.loc[filter1, 'chol'],
    equal_var=True,
)
ttest


## Since the p-value is less than .05, we can infer that the population of people who have heart attacks comes from a different distribution than the population that does not have heart attacks.

# Start of machine learning models

In [None]:
# One-hot encode the categorical features into a NEW frame, so `df_heart`
# keeps its original columns and this cell (and the cells above) can be re-run.
df_encoded = pd.get_dummies(df_heart, columns=categorical_cols, drop_first=True)

# Feature matrix (unscaled) and target vector.
X = df_encoded.drop(['output'], axis=1)
y = df_encoded['output']

# Split BEFORE scaling so the test rows never influence the scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the min-max scaler on the training rows only, then apply the same
# transformation to both splits. Results are wrapped back into DataFrames
# so column names and row indices are preserved.
scalerX = MinMaxScaler(feature_range=(0, 1))
X_train = pd.DataFrame(scalerX.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scalerX.transform(X_test), columns=X_test.columns, index=X_test.index)


# Logistic Regression Model

In [None]:
# Logistic-regression classifier with the iteration cap set to 550,
# fitted on the training split.
model = LogisticRegression(max_iter=550)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test features with the fitted model.
y_pred = model.predict(X_test)

In [None]:


# Score the predictions against the held-out labels:
# accuracy, precision, recall and F1, in that order.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print every score next to its label.
for template, value in (
    ("Accuracy Score: %f", accuracy),
    ("Precision Score: %f", precision),
    ("Recall Score: %f", recall),
    ('F1 Score %f', f1),
):
    print(template % value)

# Logistic Regression model gives accuracy of 90%

# Neural Network Model

In [None]:
# Small fully connected network: one hidden ReLU layer (256 units) and a
# single sigmoid output unit, trained with binary cross-entropy.

# Seed TensorFlow's global random generator so repeated runs start from the
# same state (same seed value as the train/test split).
tf.random.set_seed(42)

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['acc'])

epochs = 100
# 10% of the training rows are held back for validation; training is silent.
model.fit(X_train, y_train, epochs=epochs, validation_split=0.1, verbose=0)

# Evaluate on the test split; the returned values are shown as the cell output.
model.evaluate(X_test, y_test)

## Neural Network accuracy: 87%

In [None]:
# Grid search over decision-tree hyper-parameters, scored by accuracy.
size_grid = [2, 4, 8, 16, 32, 64, 128]
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(size_grid),
    'min_samples_split': list(size_grid),
    'min_samples_leaf': list(size_grid),
}
grid_search_cv = GridSearchCV(
    # Fixed seed so the search gives the same result on every run
    # (same seed value as the train/test split).
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    scoring='accuracy',
)
grid_search_cv.fit(X_train, y_train)


In [None]:
# Use the estimator selected by the grid search directly.
# Passing it to the DecisionTreeClassifier constructor would hand a fitted
# estimator over as a constructor argument instead of reusing the tuned model.
model = grid_search_cv.best_estimator_

In [None]:
# Predict labels for the test features through the fitted grid-search object.
y_pred = grid_search_cv.predict(X_test)

In [None]:
# Accuracy of the current predictions on the test labels.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % (accuracy,))

## Decision Tree Classifier not so good

# Random Forest 82% accuracy

In [None]:

# Random forest with default settings.
# Fixed seed so the reported accuracy is the same on every run
# (same seed value as the train/test split).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

pred = model.predict(X_test)
# Keyword arguments make the (true, predicted) roles explicit.
print(accuracy_score(y_true=y_test, y_pred=pred))

# K Neighbors classifier 89% accuracy

In [None]:
# k-nearest-neighbours classifier with k = 9, fitted on the training split.
model = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
y_pred = model.predict(X_test)
# Accuracy on the test split.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

# Support Vector Machine Classifier 85%

In [None]:
# Grid search over SVC settings, scored by accuracy.
# NOTE(review): `max_iter` is searched like a hyper-parameter, with very small
# iteration caps -- confirm this is intended rather than tuning e.g. `C`.
params = dict(
    max_iter=[5, 7, 9, 10, 12, -1],
    degree=[2, 3, 4, 5, 6],
    kernel=['poly', 'sigmoid', 'rbf', 'linear'],
    gamma=['scale', 'auto'],
)
grid_search_cv = GridSearchCV(SVC(), params, scoring='accuracy')
grid_search_cv.fit(X_train, y_train)

# Predict with the fitted search object, show the selected estimator,
# and leave the test accuracy as the cell output.
y_pred = grid_search_cv.predict(X_test)
print(grid_search_cv.best_estimator_)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Polynomial-kernel SVC (degree 3) capped at 10 solver iterations.
model = SVC(kernel='poly', degree=3, max_iter=10)
model.fit(X_train, y_train)

# Accuracy on the test split.
y_pred = model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))