In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from scipy import stats
from scipy.stats import norm, skew
from sklearn.metrics import accuracy_score
from numpy.random import seed
from tensorflow.random import set_seed 
from tensorflow import keras
from tensorflow.keras import layers
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the heart-attack dataset, then separate the target column from the predictors.
data = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')
y = data['output']
# drop() returns a new frame, so `data` itself keeps the `output` column for the EDA cells.
X = data.drop(columns=['output'])
data.head()

In [None]:
# Column overview, then split the predictors into categorical vs. numeric by cardinality.
data.info()
print("*"*100)
# Fewer than 10 distinct values -> treated as categorical.
categorical_col=[cols for cols in X.columns if X[cols].nunique() < 10]
print("Categorical features of our dataset : ",categorical_col)
print("\n")
# `>= 10` (previously `> 10`): a column with exactly 10 distinct values used to
# fall into neither list and silently disappear from the analysis.
numeric_col=[cols for cols in X.columns if X[cols].nunique() >= 10]
print("Numerical features of our dataset : ",numeric_col)

#t = {}
#for i in list(X.columns):
  #  t[i] = X[i].value_counts().shape[0]
#pd.DataFrame(t,index=["unique value count"])

In [None]:
def label_function(val):
    """Format a pie-wedge label as '<row count>\\n<percent>%'.

    `val` is the wedge's percentage (0-100). The row count is recovered from
    it using the length of the module-level `data` object.
    """
    total_rows = len(data)
    wedge_rows = val / 100 * total_rows
    return f'{wedge_rows:.0f}\n{val:.0f}%'
# Pie chart of the class balance; each wedge is annotated by label_function
# with its row count and percentage.
labels = ["0(low risk)","1(high risk)"]
class_sizes = data.groupby('output').size()
class_sizes.plot(
    kind='pie',
    labels=labels,
    colors=['#00FF00','#FF0000'],
    autopct=label_function,
    textprops={'fontsize': 15},
)
plt.show()
#print("*"*80)


# Bar chart of the class counts; the largest bar is drawn in the darker orange.
counts = data["output"].value_counts().sort_values()

diag_cols = ["0(low_risk)", "1(high_risk)"]
# Looked up by class value (0, then 1), so the bar order matches diag_cols.
diag_counts = [counts[cls] for cls in (0, 1)]

largest = max(diag_counts)
clrs = ['#FF4500' if x >= largest else '#FFA500' for x in diag_counts]

plt.figure(figsize=(8, 6))
sns.barplot(x=diag_cols, y=diag_counts, palette=clrs)
plt.show()

In [None]:
# Mean of `output` for each value of `sex` (with a 0/1 target this is the share of 1s).
s = data.groupby('sex', as_index=False)['output'].mean()
print(s)

In [None]:
# Mean of `output` for each value of `cp`.
print(data.groupby('cp', as_index=False)['output'].mean())

In [None]:
# Mean of `output` for each value of `fbs`.
print(data.groupby('fbs', as_index=False)['output'].mean())

In [None]:
# Mean of `output` for each value of `exng`.
print(data.groupby('exng', as_index=False)['output'].mean())

In [None]:
# Mean of `output` for each value of `restecg`.
# This cell previously repeated the `exng` breakdown verbatim, while `restecg`
# (one of the categorical features used for modelling) was never inspected.
print (data[["restecg", "output"]].groupby(['restecg'], as_index=False).mean())

In [None]:
# Mean of `output` for each value of `slp`.
print(data.groupby('slp', as_index=False)['output'].mean())

In [None]:
# Mean of `output` for each value of `caa`.
print(data.groupby('caa', as_index=False)['output'].mean())

In [None]:
# Mean of `output` for each value of `thall`.
print(data.groupby('thall', as_index=False)['output'].mean())

In [None]:
# Bin `age` into 5 equal-width intervals and report the mean `output` per bin.
data['cat_age'] = pd.cut(data['age'], bins=5)
age_bin_means = data[["cat_age", "output"]].groupby("cat_age", as_index=False).mean()
print(age_bin_means)

In [None]:
# Bin `trtbps` into 5 equal-width intervals and report the mean `output` per bin.
data['cat_trtbps'] = pd.cut(data['trtbps'], bins=5)
trtbps_bin_means = data[["cat_trtbps", "output"]].groupby("cat_trtbps", as_index=False).mean()
print(trtbps_bin_means)

In [None]:
# Bin `thalachh` into 5 equal-width intervals and report the mean `output` per bin.
data['cat_thalachh'] = pd.cut(data['thalachh'], bins=5)
thalachh_bin_means = data[["cat_thalachh", "output"]].groupby("cat_thalachh", as_index=False).mean()
print(thalachh_bin_means)

In [None]:
# Bin `chol` into 5 equal-width intervals and report the mean `output` per bin.
data['cat_chol'] = pd.cut(data['chol'], bins=5)
chol_bin_means = data[["cat_chol", "output"]].groupby("cat_chol", as_index=False).mean()
print(chol_bin_means)

In [None]:
# Combined feature: element-wise sum of `fbs` and `exng`, then the mean `output`
# per combined value, highest first.
data['exng_fbs'] = data['fbs'] + data['exng']
(
    data[['exng_fbs', 'output']]
    .groupby('exng_fbs', as_index=False)
    .mean()
    .sort_values(by='output', ascending=False)
)

In [None]:
# Swarm plot of `thalachh` per `caa` value, coloured by `output`.
sns.catplot(data=data, kind="swarm", x="caa", y="thalachh", hue="output")

In [None]:
# Swarm plot of `thalachh` per `fbs` value, coloured by `output`.
sns.catplot(data=data, kind="swarm", x="fbs", y="thalachh", hue="output")

In [None]:
# Swarm plot of `oldpeak` per `fbs` value, coloured by `output`.
sns.catplot(data=data, kind="swarm", x="fbs", y="oldpeak", hue="output")

In [None]:
# Flag outliers among the numeric predictors: entries whose absolute z-score is >= 3.
# np.asarray() forces a plain ndarray. Depending on the SciPy version, zscore()
# on a DataFrame can hand back a DataFrame, and then z[28][2] would be a
# column-label lookup (KeyError) instead of "row 28, column 2".
z = np.abs(np.asarray(stats.zscore(X[numeric_col])))
threshold = 3
# np.where returns (row positions, column positions) of the flagged entries;
# column positions index into numeric_col.
print(np.where(z >=threshold))
print(("value of z[28][2] = "),(z[28, 2]))

In [None]:
# Box plots of the five numeric predictors: a row of three, then a row of two.
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))
for column, axis in zip(("age", "trtbps", "chol"), (axis1, axis2, axis3)):
    sns.boxplot(x=X[column], ax=axis)

fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 5))
sns.boxplot(x=X["thalachh"], ax=axis1)
sns.boxplot(x=X["oldpeak"], ax=axis2)

In [None]:
# Preprocessing and train/validation split used by the scikit-learn classifiers below.
# Global NumPy seed, kept for estimators that are later created without an
# explicit random_state.
seed(2)
features_num = ["age","trtbps","chol","thalachh","oldpeak"]

features_cat = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
random_state=2

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps transform() from failing on a category that
# did not occur in the training split.
transformer_num = make_pipeline(
    StandardScaler(),
)
transformer_cat = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

# `random_state` was defined above but never passed, so the split depended on
# whatever the global RNG state happened to be when the cell ran. Passing it
# makes the 75/25 split reproducible regardless of execution order.
X_train, X_valid, y_train, y_valid =train_test_split(
    X, y, train_size=0.75, random_state=random_state
)

# Fit the preprocessor on the training rows only, then apply it to the validation rows.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

In [None]:
# Logistic regression with default settings, scored on the validation split.
logreg = LogisticRegression().fit(X_train, y_train)
pred = logreg.predict(X_valid)
logreg_accuracy = accuracy_score(y_valid, pred)
print("test accuracy score of  Logistic Regression = ", logreg_accuracy*100)
a_log = round(logreg_accuracy * 100, 2)

In [None]:
# Support vector classifier with default settings, scored on the validation split.
svc = SVC().fit(X_train, y_train)
pred = svc.predict(X_valid)
svc_accuracy = accuracy_score(y_valid, pred)
print("test accuracy score of Support Vector Machine = ", svc_accuracy*100)
a_svc = round(svc_accuracy * 100, 2)

In [None]:
# Linear classifier trained with stochastic gradient descent (default settings),
# scored on the validation split.
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
pred = sgd.predict(X_valid)
# Message previously read "Descentt".
print("test accuracy score of Stochastic Gradient Descent = ", accuracy_score(y_valid, pred)*100)
a_sgd = round(accuracy_score(y_valid, pred) * 100, 2)

In [None]:
# Decision tree with default settings, scored on the validation split.
decision_tree = DecisionTreeClassifier().fit(X_train, y_train)
pred = decision_tree.predict(X_valid)
decision_tree_accuracy = accuracy_score(y_valid, pred)
print("test accuracy score of Decision Tree = ", decision_tree_accuracy*100)
a_decision_tree = round(decision_tree_accuracy * 100, 2)

In [None]:
# Random forest (100 trees, depth capped at 3, fixed seed), scored on the validation split.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2)
random_forest.fit(X_train, y_train)
pred = random_forest.predict(X_valid)
# The former `random_forest.score(X_train, y_train)` call was dropped: its
# result was neither stored nor displayed, so it only cost an extra pass over
# the training data.
print("test accuracy score of Random Forest = ", accuracy_score(y_valid, pred)*100)
a_random_forest = round(accuracy_score(y_valid, pred) * 100,2)

In [None]:
# Linear support vector classifier with default settings, scored on the validation split.
linear_svc = LinearSVC().fit(X_train, y_train)
pred = linear_svc.predict(X_valid)
linear_svc_accuracy = accuracy_score(y_valid, pred)
print("test accuracy score of  Linear SVC = ", linear_svc_accuracy*100)
a_linear_svc = round(linear_svc_accuracy * 100, 2)

In [None]:
# k-nearest-neighbours classifier with default settings, scored on the validation split.
knn = KNeighborsClassifier().fit(X_train, y_train)
pred = knn.predict(X_valid)
knn_accuracy = accuracy_score(y_valid, pred)
print("test accuracy score of KNN = ", knn_accuracy*100)
a_knn = round(knn_accuracy * 100, 2)

In [None]:
# Gaussian naive Bayes with default settings, scored on the validation split.
gaussian = GaussianNB().fit(X_train, y_train)
pred = gaussian.predict(X_valid)
gaussian_accuracy = accuracy_score(y_valid, pred)
print("test accuracy score of Gaussian Naive Bayes = ", gaussian_accuracy*100)
a_gaussian = round(gaussian_accuracy* 100, 2)

In [None]:
# Data preparation for the neural network: re-seed NumPy and TensorFlow,
# rebuild the preprocessor, and draw a fresh 80/20 split from X and y.
seed(0)
set_seed(0)

features_num = ["age","trtbps","chol","thalachh","oldpeak"]
features_cat = ['sex','exng','caa','cp','fbs','restecg','slp','thall']

# Standardise the numeric columns, one-hot encode the categorical ones.
transformer_num = make_pipeline(StandardScaler())
transformer_cat = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

# stratify - make sure classes are evenly represented across splits
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.80, stratify=y,
)

# Fit on the training rows only, then reuse the fitted transform for validation.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
# Number of columns after preprocessing; becomes the network's input width.
input_shape = [X_train.shape[1]]


# Fully connected binary classifier: two ReLU hidden layers with dropout and a
# single sigmoid output unit.
model = keras.Sequential([
    layers.Dense(1056,activation='relu',input_shape=input_shape),
    layers.Dropout(rate=0.3),
    layers.Dense(512,activation='relu'),
    layers.Dropout(rate=0.3),
    # The output layer needs a sigmoid: the 'binary_crossentropy' loss and the
    # 'binary_accuracy' metric (0.5 threshold), as configured below, both
    # expect probabilities in [0, 1]. The layer was previously linear, so raw
    # unbounded scores were fed to them.
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

# Stop training once the monitored quantity (Keras default) has not improved by
# at least 0.001 for 5 consecutive epochs, and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

# Train for at most 100 epochs, tracking loss/accuracy on the validation split.
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
)

#acc_ann=model.evaluate(X_valid,y_valid)
#ann = round(acc_ann[1]*100,2)

# Learning curves: per-epoch training vs. validation loss, then accuracy.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [None]:
# Score the trained network on the validation split. evaluate() returns the
# loss followed by the compiled metrics, so index 1 is the binary accuracy.
acc_ann = model.evaluate(X_valid, y_valid)
ann_accuracy_pct = acc_ann[1]*100
ann = round(ann_accuracy_pct, 2)

In [None]:
# Side-by-side comparison of validation accuracy (in percent) for every model.
# NOTE(review): the ANN score comes from its own 80/20 stratified split, while
# the scikit-learn models were scored on a 75/25 split, so the ANN row is not
# strictly comparable with the others.
models = pd.DataFrame({
    'model': ['KNN','Logistic Regression',
               'Gaussian Naive Bayes','Random Forest','Linear SVC',
              # Label previously misspelled as "Decent".
              'Support Vector Machines','Stochastic Gradient Descent','Decision Tree','ANN'],
    'score': [a_knn, a_log,  a_gaussian,
              a_random_forest, a_linear_svc,
              a_svc, a_sgd, a_decision_tree, ann]})

sns.barplot(x='score', y='model', data=models)

# Ranked table, best model first (displayed as the cell output).
models.sort_values(by='score', ascending=False)