In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier



from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score



from skopt import BayesSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold

import warnings
warnings.filterwarnings("ignore")

from sqlalchemy import Table, Column, Float, Integer, BigInteger



In [None]:
# Load the heart-attack dataset from its relative input path and preview
# the first rows.
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

### Check for null values
The output below shows that no feature in the data has missing values.

In [None]:
# Number of missing entries in each column.
df.isna().sum()

#### This helps us identify the data type of each feature, i.e. categorical or numerical

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

## Univariate Analysis

#### check feature distribution

#### Check whether the continuous features are normally distributed

In [None]:
# Columns plotted as categorical features in the univariate analysis.
cat_features = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exng',
    'slp',
    'caa',
    'thall',
]

# Columns plotted as continuous features in the univariate analysis.
cont_features = [
    'age',
    'trtbps',
    'chol',
    'thalachh',
]

#### categorical feature distribution

In [None]:
# One count plot per categorical feature: a bar for every category value.
# The theme does not change between iterations, so it is set once up front.
sns.set_theme(style="whitegrid")
for feature in cat_features:
    plt.figure(figsize=(7,4))
    # Name the column through `x=` and pass the frame as `data=`; a bare
    # positional Series is not interpreted as the x variable by current
    # seaborn releases.
    ax = sns.countplot(x=feature, data=df)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# For every categorical feature, draw a violin plot of the target column
# `output` grouped by that feature's values.
for cat_col in cat_features:
    plt.figure(figsize=(7, 4))
    sns.set_theme(style="whitegrid")
    ax = sns.violinplot(data=df, x=cat_col, y='output')
    plt.xticks(rotation=90)
    plt.show()

#### continuous feature distribution

In [None]:
# Histogram with a KDE overlay for every continuous feature.
# `sns.distplot` is deprecated (and removed in recent seaborn releases);
# `histplot(..., kde=True, stat="density")` is its documented replacement and
# keeps the density-normalised y axis that distplot produced.
sns.set_theme(style="whitegrid")
for feature in cont_features:
    plt.figure(figsize=(7,4))
    ax = sns.histplot(data=df, x=feature, kde=True, stat="density")
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Horizontal box plot for every continuous feature, used to spot outliers.
sns.set_theme(style="whitegrid")
for feature in cont_features:
    plt.figure(figsize=(7,4))
    # Pass the column as `x=` explicitly; a bare positional Series is not
    # interpreted as the x variable by current seaborn releases.
    ax = sns.boxplot(x=df[feature])
    plt.xticks(rotation=90)
    plt.show()

From the boxplots above, we can see that there are some outliers in `trtbps` and `chol`.

In [None]:
# Violin plot of every continuous feature, split by the target class.
# The original call passed the feature positionally together with `hue` but
# no `y`, which is not a valid categorical-plot specification. Putting
# `output` on x and the feature on y draws one violin per target value.
sns.set_theme(style="whitegrid")
for feature in cont_features:
    plt.figure(figsize=(7,4))
    ax = sns.violinplot(x='output', y=feature, data=df)
    plt.xticks(rotation=90)
    plt.show()

### Use a count plot to check the distribution of the dependent variable
From the plot below, we can clearly see that the distribution is balanced.

In [None]:
# Count plot of the target column: one bar per value of `output`.
plt.figure(figsize=(8, 5))
sns.set_theme(style="whitegrid")
ax = sns.countplot(data=df, x='output')
plt.xticks(rotation=90)
plt.show()

## Bivariate analysis

### With the help of the correlation matrix, we can see how the features are related to each other

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
plt.figure(figsize=(14, 12))
correlations = df.corr()
sns.heatmap(correlations, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Separate the predictors (every column except `output`) from the target.
X = df.drop(columns='output')
y = df['output']

In [None]:
# Show the dimensions of the feature matrix and the target vector.
X.shape, y.shape

## Splitting the data into train and test

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

In [None]:
# Print the feature/target shapes of the train split, then of the test split.
for part_X, part_y in [(X_train, y_train), (X_test, y_test)]:
    print(part_X.shape, part_y.shape)

## Standardization

In [None]:
# Standardise the features. The scaler is fitted on the training split only;
# later cells reuse `sc` to transform the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)

### Applying PCA

In [None]:
# Project the scaled training data onto as many principal components as
# there are input features, and wrap the scores in a labelled DataFrame.
n_features = X.shape[1]
pc = PCA(n_components=n_features)
X_train_pc = pc.fit_transform(X_train_sc)
pc_names = [f'PC_{k}' for k in range(1, pc.n_components_ + 1)]
PC_df_train = pd.DataFrame(X_train_pc, columns=pc_names)

In [None]:
# Display the principal-component scores of the training split.
PC_df_train

## Scree Plot - PCA Analysis
In multivariate statistics, a scree plot is a line plot of the eigenvalues of factors or principal components in an analysis. The scree plot is used to determine the number of factors to retain in an exploratory factor analysis (FA) or principal components to keep in a principal component analysis (PCA)

#### To select number of principal components elbow method is used

We can clearly see that no proper elbow forms in the graph below, so we keep all the components.

In [None]:
# Line plot of the standard deviation of each principal-component score
# column, in component order.
plt.figure(figsize=(12, 6))
component_std = PC_df_train.std()
plt.plot(component_std)
plt.xlabel('Principal Component')
plt.ylabel('Standard deviation')
plt.title('Scree Plot - PCA components')
plt.show()

## Model building

In [None]:
# Sanity check before modelling: the component matrix and the training
# target should have the same number of rows.
print(PC_df_train.shape)
y_train.shape

In [None]:
# Logistic regression on the PCA-projected training data.
# Reuse `X_train_pc` from the PCA cell instead of calling
# `pc.fit_transform` again: re-fitting the shared `pc` object in every model
# cell is redundant work and mutates state that other cells depend on.
classifier = LogisticRegression()
classifier.fit(X_train_pc, y_train)

# Apply the scaler and PCA that were fitted on the training split to the
# test split, then predict hard class labels.
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_lr = classifier.predict(X_test_pc)

In [None]:
# Evaluate logistic regression on the test split.
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the
# classification report, so the true labels go first.
print('Confusion Matrix \n', confusion_matrix(y_test, y_lr))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_lr))
print()
print('Classification Report \n', classification_report(y_test, y_lr))


### SVC Classifier

In [None]:
# Support-vector classifier on the PCA-projected training data.
# Reuse `X_train_pc` from the PCA cell instead of re-fitting the shared `pc`
# object here; the re-fit is redundant and mutates state other cells use.
classifier = SVC()
classifier.fit(X_train_pc, y_train)

# Transform the test split with the already-fitted scaler and PCA, then
# predict hard class labels.
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_svc = classifier.predict(X_test_pc)

In [None]:
# Evaluate the SVC on the test split.
# scikit-learn metrics take (y_true, y_pred); passing them reversed
# transposes the confusion matrix and swaps precision/recall in the report.
print('Confusion Matrix \n', confusion_matrix(y_test, y_svc))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_svc))
print()
print('Classification Report \n', classification_report(y_test, y_svc))


### RandomForest Classifier

In [None]:
# Random forest with default hyper-parameters on the PCA-projected data.
# A fixed `random_state` makes the forest (and therefore the reported scores)
# reproducible across kernel restarts.
# NOTE: `y_rfc` is reassigned by the later random-forest cell that trains on
# the scaled features, so these predictions are only used by the report cell
# that immediately follows.
classifier = RandomForestClassifier(random_state=34)
# Reuse `X_train_pc` from the PCA cell instead of re-fitting the shared `pc`.
classifier.fit(X_train_pc, y_train)

# Transform the test split with the already-fitted scaler and PCA, then
# predict hard class labels.
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_rfc = classifier.predict(X_test_pc)

In [None]:
# Evaluate the PCA-based random forest on the test split.
# scikit-learn metrics take (y_true, y_pred); passing them reversed
# transposes the confusion matrix and swaps precision/recall in the report.
print('Confusion Matrix \n', confusion_matrix(y_test, y_rfc))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_rfc))
print()
print('Classification Report \n', classification_report(y_test, y_rfc))


### Gradient Boosting Classifier

In [None]:
# Gradient boosting with default hyper-parameters on the PCA-projected data.
# A fixed `random_state` keeps the fitted model reproducible between runs.
classifier = GradientBoostingClassifier(random_state=24)
# Reuse `X_train_pc` from the PCA cell instead of re-fitting the shared `pc`.
classifier.fit(X_train_pc, y_train)

# Transform the test split with the already-fitted scaler and PCA, then
# predict hard class labels.
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_gbc = classifier.predict(X_test_pc)

In [None]:
# Evaluate gradient boosting on the test split.
# scikit-learn metrics take (y_true, y_pred); passing them reversed
# transposes the confusion matrix and swaps precision/recall in the report.
print('Confusion Matrix \n', confusion_matrix(y_test, y_gbc))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_gbc))
print()
print('Classification Report \n', classification_report(y_test, y_gbc))


### Random Forest Classifier (custom hyperparameters, scaled features without PCA)

In [None]:
# Random forest with hand-set hyper-parameters, trained on the standardised
# features directly (no PCA projection in this cell).
# A fixed `random_state` makes the forest reproducible between runs.
# NOTE: this reassigns `y_rfc`, replacing the predictions of the earlier
# PCA-based random forest; the final comparison table reads this value.
classifier = RandomForestClassifier(n_estimators=100,
                                    min_samples_split=5,
                                    min_samples_leaf=1,
                                    max_depth=5,
                                    random_state=34)
classifier.fit(X_train_sc, y_train)

# Scale the test split with the scaler fitted on the training split.
X_test_sc = sc.transform(X_test)
y_rfc = classifier.predict(X_test_sc)

In [None]:
# Evaluate the random forest trained on scaled features on the test split.
# scikit-learn metrics take (y_true, y_pred); passing them reversed
# transposes the confusion matrix and swaps precision/recall in the report.
print('Confusion Matrix \n', confusion_matrix(y_test, y_rfc))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_rfc))
print()
print('Classification Report \n', classification_report(y_test, y_rfc))


### Naive Bayes

In [None]:
# Gaussian naive Bayes on the PCA-projected training data.
# Reuse `X_train_pc` from the PCA cell instead of re-fitting the shared `pc`
# object here; the re-fit is redundant and mutates state other cells use.
classifier = GaussianNB()
classifier.fit(X_train_pc, y_train)

# Transform the test split with the already-fitted scaler and PCA, then
# predict hard class labels. (`y_gb` holds the naive Bayes predictions; the
# name is kept because later cells read it.)
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_gb = classifier.predict(X_test_pc)

In [None]:
# Evaluate naive Bayes on the test split.
# scikit-learn metrics take (y_true, y_pred); passing them reversed
# transposes the confusion matrix and swaps precision/recall in the report.
print('Confusion Matrix \n', confusion_matrix(y_test, y_gb))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_gb))
print()
print('Classification Report \n', classification_report(y_test, y_gb))


### XGBoost Classifier

In [None]:
# Seeded XGBoost estimator and a hyper-parameter space keyed by XGBoost
# parameter names.
# NOTE(review): neither `Xgboost` nor `params` is referenced by the other
# cells visible in this notebook - confirm whether a search over them was
# intended.
Xgboost = XGBClassifier(random_state=28)
params = dict(
    n_estimators=(100, 300),
    learning_rate=(0.01, 0.6),
    subsample=(0.3, 0.9),
    max_depth=(2, 5),
    colsample_bytree=(0.5, 0.9),
    min_child_weight=(1, 5),
)

In [None]:
# Seeded random forest and the hyper-parameter space searched in the next
# cell.
# NOTE(review): when handed to RandomizedSearchCV, each tuple appears to be
# treated as a list of candidate values rather than a (low, high) range -
# confirm this is the intended search space.
rf_classifier = RandomForestClassifier(random_state=34)

Param_rf = dict(
    max_depth=(2, 5),
    min_samples_split=(5, 10),
    n_estimators=(100, 300),
    min_samples_leaf=(1, 3),
)

In [None]:
# Randomised hyper-parameter search for the random forest, scored with a
# seeded 5-fold split on the PCA-projected training data.
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
# Seed the search as well: without `random_state` the sampled candidates, and
# therefore `best_params_`, can differ from run to run.
search = RandomizedSearchCV(rf_classifier, Param_rf, cv=cv, random_state=1)
# Reuse `X_train_pc` from the PCA cell instead of re-fitting the shared `pc`.
search.fit(X_train_pc, y_train)
print(search.best_params_)

In [None]:

# XGBoost with hand-set hyper-parameters on the PCA-projected training data.
classifier = XGBClassifier(subsample= 0.9,
                           n_estimators=300,
                           min_child_weight=5,
                           max_depth=2,
                           learning_rate=0.01,
                           colsample_bytree= 0.9)
# Reuse `X_train_pc` from the PCA cell instead of re-fitting the shared `pc`
# object here; the re-fit is redundant and mutates state other cells use.
classifier.fit(X_train_pc, y_train)

# Transform the test split with the already-fitted scaler and PCA, then
# predict hard class labels.
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_xg = classifier.predict(X_test_pc)

In [None]:
# Evaluate XGBoost on the test split.
# scikit-learn metrics take (y_true, y_pred); passing them reversed
# transposes the confusion matrix and swaps precision/recall in the report.
print('Confusion Matrix \n', confusion_matrix(y_test, y_xg))
print()
print('Accuracy Score \n', accuracy_score(y_test, y_xg))
print()
print('Classification Report \n', classification_report(y_test, y_xg))


In [None]:
# Row labels shared by every per-model score column.
METRIC_NAMES = ["F1", "Accuracy", "Recall", "Precision", "ROC AUC Score"]


def _score_column(label, y_pred):
    """Build a one-column DataFrame of test-set metrics for one model.

    Parameters
    ----------
    label : str
        Column name, i.e. the model's display name in the comparison table.
    y_pred : array-like
        Class labels predicted for the test split.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``METRIC_NAMES`` with a single column named ``label``.
    """
    scores = [
        f1_score(y_test, y_pred),
        accuracy_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        # Computed from the hard predicted labels, not from probabilities.
        roc_auc_score(y_test, y_pred),
    ]
    return pd.DataFrame(data=scores, columns=[label], index=METRIC_NAMES)


lr_df = _score_column('Logistic Regression', y_lr)
rf_df = _score_column('Random Forest Score', y_rfc)
nb_df = _score_column('Naive Bayes', y_gb)
xg_df = _score_column('XG Boost', y_xg)
gbc_df = _score_column('Gradient Boosting', y_gbc)
# The SVC column was previously labelled 'Gradient Boosting', which produced
# two identically named columns and mislabelled the SVC scores.
svc_df = _score_column('SVC', y_svc)

# One column per model, rounded to three decimals.
df_models = pd.concat([lr_df, rf_df, nb_df, gbc_df, xg_df, svc_df], axis=1).round(3)

colors = ["bisque","ivory","sandybrown","steelblue","lightsalmon"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

background_color = "white"

fig = plt.figure(figsize=(18,26)) # create figure
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.5)
ax0 = fig.add_subplot(gs[0, :])  # heatmap spans the full top row

# Transposed so that every row is a model and every column a metric.
sns.heatmap(df_models.T, cmap=colormap,annot=True,fmt=".1%",vmin=0,vmax=0.95, linewidths=2.5,cbar=False,ax=ax0,annot_kws={"fontsize":16})
fig.patch.set_facecolor(background_color) # figure background color
ax0.set_facecolor(background_color)

ax0.text(0,-0.5,'Model Comparison',fontsize=20,fontweight='bold',fontfamily='serif')
plt.show()

# Conclusion
### We can conclude that almost all of the machine learning models perform well. However, Naive Bayes gives the best accuracy of 90.2%