In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import tensorflow as tf 
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import  Counter
import nltk
nltk.download('stopwords')
import re

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from scipy import stats
from scipy.stats import norm, skew 
from scipy.special import boxcox1p
from sklearn.preprocessing import RobustScaler

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Main table used for all plots and models in this notebook.
data = pd.read_csv(
    filepath_or_buffer='../input/heart-attack-analysis-prediction-dataset/heart.csv'
)
# Companion file from the same dataset folder.
o2 = pd.read_csv(
    filepath_or_buffer='../input/heart-attack-analysis-prediction-dataset/o2Saturation.csv'
)

In [None]:
# Report the table size, then show a small preview rather than rendering
# the whole frame as the cell output.
print(f"{data.shape[0]} rows x {data.shape[1]} columns")
data.head()

In [None]:
# Per-column missing-value summary: absolute count and share of rows (in %).
null_counts = data.isnull().sum()
missing_value = null_counts.sort_values(ascending=False)
missing_perc = (null_counts * 100 / data.shape[0]).sort_values(ascending=False)

# Side-by-side table; rows are aligned on the column name.
value = pd.concat(
    [missing_value, missing_perc],
    axis=1,
    keys=['Count', '%'],
)

# Render the first 20 rows with a red gradient applied down each column.
styled = value.head(20).style.background_gradient(cmap='Reds', axis=0)
display(styled)

In [None]:
# Author's note on the missing-value table above: there are no missing values.
# Summary statistics for every column; include='all' asks pandas to describe
# all columns rather than only the numeric ones.
data.describe(include='all')

In [None]:
# Count plots for a few discrete variables, one panel per column.
# A single-row layout is used so the figure carries no empty grid rows
# (the previous 3x3 grid on an 18x15 canvas only ever filled its top row).
discrete_to_plot = ['sex', 'exng', 'restecg']
fig, axes = plt.subplots(1, len(discrete_to_plot), figsize=(18, 5))
for ax, column in zip(axes, discrete_to_plot):
    sns.countplot(ax=ax, data=data, x=column)
    ax.set_title(f"Count of {column}")
fig.tight_layout()
# Explicit show() keeps the Axes repr out of the cell output.
plt.show()

In [None]:
# Boxen plots for a few continuous variables, one panel per column.
# A single-row layout is used so the figure carries no empty grid rows
# (the previous 3x3 grid on an 18x15 canvas only ever filled its top row).
continuous_to_plot = ['thalachh', 'trtbps', 'age']
fig, axes = plt.subplots(1, len(continuous_to_plot), figsize=(18, 5))
for ax, column in zip(axes, continuous_to_plot):
    sns.boxenplot(ax=ax, data=data, y=column)
    ax.set_title(f"Distribution of {column}")
fig.tight_layout()
# Explicit show() keeps the Axes repr out of the cell output.
plt.show()

In [None]:
# Plot of the target variable
ax = sns.countplot(data=data, x='output')
# Tick labels must be set on the Axes object; pyplot itself has no
# set_xticklabels, which is why the earlier attempt was left commented out.
# Ticks are pinned first so each label is attached to a fixed position.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Low chances of attack(0)", "High chances of attack(1)"])
ax.set_title("Target distribution")
plt.show()


In [None]:
# Column groups. Only con_cols is used in this cell.
cat_cols = [
    'sex', 'exng', 'caa', 'cp',
    'fbs', 'restecg', 'slp', 'thall',
]
con_cols = [
    "age", "trtbps", "chol",
    "thalachh", "oldpeak",
]

# Pairwise correlations between the con_cols columns, drawn as an
# annotated heatmap with one decimal per cell.
df_corr = data.loc[:, con_cols].corr()
sns.heatmap(data=df_corr, annot=True, fmt=".1f", cmap='YlGnBu')

In [None]:
# Density of a few continuous variables, split by the target column.
# A single-row layout is used so the figure carries no empty grid rows
# (the previous 3x3 grid on an 18x15 canvas only ever filled its top row).
kde_columns = ['thalachh', 'trtbps', 'age']
fig, axes = plt.subplots(1, len(kde_columns), figsize=(18, 5))
for ax, column in zip(axes, kde_columns):
    sns.kdeplot(ax=ax, data=data, x=column, hue='output', fill=True)
    ax.set_title(f"{column} density by output")
fig.tight_layout()
# Explicit show() keeps the Axes repr out of the cell output.
plt.show()

In [None]:
# Build the feature matrix X and target y used by the model cells.
d1 = data  # alias of the loaded frame (not a copy)
cat_cols = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
con_cols = ["age","trtbps","chol","thalachh","oldpeak"]

# drop() returns a new frame, so the scaling below does not touch `data`.
X = d1.drop(columns=['output'])
# Keep the target 1-D. A single-column DataFrame (d1[['output']]) makes
# scikit-learn estimators warn that a column vector was passed where a
# 1-D array was expected.
y = d1['output']

scaler = RobustScaler()

# Scale only the con_cols columns; the remaining columns are left as-is.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split, so validation rows influence the scaling statistics. Consider
# fitting on the training split only.
X[con_cols] = scaler.fit_transform(X[con_cols])


In [None]:
# Hold out 30% of the rows for validation.
# random_state fixes the shuffle so every reported accuracy is reproducible
# across runs (same seed value as the XGBoost cell); stratify keeps the
# class proportions of `y` the same in both splits.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Gaussian Naive Bayes: fit on the training split, score on validation.
gaussian = GaussianNB()
# np.ravel hands the estimator a 1-D target whether y_train is a Series or
# a single-column frame, avoiding the column-vector warning.
gaussian.fit(X_train, np.ravel(y_train))
y_pred = gaussian.predict(X_valid)
# accuracy_score takes (y_true, y_pred); accuracy in percent, 2 decimals.
acc_gaussian = round(accuracy_score(y_valid, y_pred) * 100, 2)
print(acc_gaussian)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: fit on the training split, score on validation.
# random_state makes the forest (and therefore the score) reproducible;
# the value matches the seed used in the XGBoost cell.
randomforest = RandomForestClassifier(random_state=1)
# np.ravel hands the estimator a 1-D target whether y_train is a Series or
# a single-column frame, avoiding the column-vector warning.
randomforest.fit(X_train, np.ravel(y_train))
y_pred = randomforest.predict(X_valid)
# accuracy_score takes (y_true, y_pred); accuracy in percent, 2 decimals.
acc_randomforest = round(accuracy_score(y_valid, y_pred) * 100, 2)
print(acc_randomforest)

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier: fit on the training split, score on validation.
model = XGBClassifier(random_state=1, n_estimators = 90, learning_rate = 0.01)
# np.ravel gives a 1-D target whether y_train is a Series or a
# single-column frame.
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_valid)
# Store and round the score like the other model cells so the results can
# be compared side by side (accuracy in percent, 2 decimals).
acc_xgb = round(accuracy_score(y_valid, y_pred) * 100, 2)
print(acc_xgb)


In [None]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# random_state makes the fitted ensemble (and therefore the score)
# reproducible; the value matches the seed used in the XGBoost cell.
gbk = GradientBoostingClassifier(random_state=1)
# np.ravel hands the estimator a 1-D target whether y_train is a Series or
# a single-column frame, avoiding the column-vector warning.
gbk.fit(X_train, np.ravel(y_train))
y_pred = gbk.predict(X_valid)
# accuracy_score takes (y_true, y_pred); accuracy in percent, 2 decimals.
acc_gbk = round(accuracy_score(y_valid, y_pred) * 100, 2)
print(acc_gbk)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression: fit on the training split, score on validation.
logreg = LogisticRegression()
# np.ravel hands the estimator a 1-D target whether y_train is a Series or
# a single-column frame, avoiding the column-vector warning.
# `lo` is kept as a name for the fitted estimator returned by fit().
lo = logreg.fit(X_train, np.ravel(y_train))
y_pred = lo.predict(X_valid)
# accuracy_score takes (y_true, y_pred); accuracy in percent, 2 decimals.
acc_log = round(accuracy_score(y_valid, y_pred) * 100, 2)
print(acc_log)