In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory and file name so the printed path can be pasted into pd.read_csv.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# List the entries in the heart-attack dataset folder (displayed as the cell output).
os.listdir('/kaggle/input/heart-attack-analysis-prediction-dataset')

#### data file name and folder location

In [None]:
# Load heart.csv; it contains the 'output' column used as the prediction target further down.
df = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
# Load o2Saturation.csv into its own frame; in this notebook it is only inspected (head/shape), never used for modelling.
sat_df = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/o2Saturation.csv')

In [None]:
# Preview the first two rows of the saturation frame.
sat_df.head(2)

In [None]:
# Preview the first two rows of the heart frame.
df.head(2)

In [None]:
# (rows, columns) of the heart frame.
df.shape

In [None]:
# (rows, columns) of the saturation frame.
sat_df.shape

In [None]:
# Show the first five rows (pandas default) of the saturation frame.
sat_df.head()

In [None]:
# Show the first five rows (pandas default) of the heart frame.
df.head()

In [None]:
# Column labels of the heart frame; the feature lists defined below are taken from these names.
df.columns

In [None]:
# Visual missing-value check: heat map of the boolean "is missing" mask of df, without a colour bar.
sns.heatmap(df.isna(), cbar=False, cmap='viridis')

### No missing values

In [None]:
# Print dtypes, non-null counts and memory usage for every column.
df.info()

In [None]:
# Number of missing values per column (numeric counterpart of the heat map above).
df.isnull().sum()

In [None]:
# Histogram of the whole frame passed as wide-form data, i.e. all columns drawn on one shared axes.
# NOTE(review): if the columns have very different value ranges this combined plot will be hard to read — confirm it is intended.
sns.histplot(data=df)

## Univariate Analysis

### Let's separate our features into categorical features and continuous features

In [None]:
# Column names treated as categorical in the univariate plots below.
cat_feat = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exng',
    'slp',
    'caa',
    'thall',
]

# Column names treated as continuous in the univariate plots below.
cont_feat = [
    'age',
    'trtbps',
    'chol',
    'thalachh',
    'oldpeak',
]

### Categorical Feature Distribution

In [None]:
# One count plot per categorical feature.
# The theme only needs to be set once, so it is hoisted out of the loop.
sns.set_theme(style='whitegrid')
for feature in cat_feat:
    plt.figure(figsize=(7,4))
    # Pass the column via x=/data=: a positional Series is interpreted as `data`
    # by current seaborn releases and is then drawn as a single bar instead of
    # one bar per category.
    ax = sns.countplot(x=feature, data=df)
    plt.xticks(rotation=90)
    ax.set_xlabel(feature)
    plt.show()

In [None]:
# For every categorical feature, draw the distribution of the target ('output')
# within each category as a violin plot.
for feature in cat_feat:
    plt.figure(figsize=(7, 4))
    sns.set_theme(style='whitegrid')
    ax = sns.violinplot(data=df, x=feature, y='output')
    plt.xticks(rotation=90)
    ax.set_xlabel(feature)
    plt.show()
    

## Continuous Feature Distribution

In [None]:
# Histogram with a KDE overlay for each continuous feature.
for feature in cont_feat:
    plt.figure(figsize=(7, 4))
    sns.set_theme(style='whitegrid')
    ax = sns.histplot(data=df, x=feature, kde=True)
    ax.set_xlabel(feature)
    plt.show()
    

In [None]:
# Box plot of each continuous feature, used to spot outliers.
sns.set_theme(style='whitegrid')
for feature in cont_feat:
    plt.figure(figsize=(7,4))
    # Bind the values to x explicitly. A positional Series is read as `data` by
    # current seaborn releases, which flips the box to vertical and leaves the
    # x-axis label below pointing at the wrong axis.
    ax = sns.boxplot(x=df[feature])
    plt.xticks(rotation=90)
    ax.set_xlabel(feature)
    plt.show()

#### From the above Boxplot we can say that we have outliers in trtbps, chol and oldpeak

In [None]:
# Violin plot of each continuous feature (distribution shape plus quartiles).
sns.set_theme(style='whitegrid')
for feature in cont_feat:
    plt.figure(figsize=(7,4))
    # Bind the values to x explicitly so the orientation does not depend on how
    # the installed seaborn version interprets a positional Series.
    ax = sns.violinplot(x=df[feature])
    plt.xticks(rotation=90)
    ax.set_xlabel(feature)
    plt.show()

### Let's check whether our dataset is balanced

In [None]:
# Class balance of the target: number of rows per value of 'output'.
plt.figure(figsize=(7,4))
sns.set_theme(style='whitegrid')
# Use x=/data= so each class gets its own bar (a positional Series is treated
# as `data` by current seaborn releases).
ax = sns.countplot(x='output', data=df)
plt.xticks(rotation=90)
# Label the axis with the plotted column; the previous code reused the stale
# loop variable `feature` left over from the preceding cell.
plt.xlabel('output')
plt.show()

### From this countplot distribution we can say that our dataset is balanced

## Bivariate Analysis

In [None]:
# Annotated heat map of the pairwise correlations between all columns of df.
plt.figure(figsize=(14, 12))
ax = sns.heatmap(data=df.corr(), cmap='coolwarm', annot=True)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Target vector: the 'output' column.
y = df['output']
# Feature matrix: every remaining column.
X = df.drop(columns=['output'])

In [None]:
# Sanity check: X and y must have the same number of rows.
print(X.shape)
y.shape

### Split Data into train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state=101 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=101)

In [None]:
# Confirm that features and labels line up within each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

## Standardization
### Let's scale our data so that every feature is on the same scale

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler shared by all models below: fitted on the training split, then reused to transform the test split.
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split only and standardize it; the test split is later transformed with this same fitted scaler.
X_train_sc = sc.fit_transform(X_train)

### To choose the best features, let's apply PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Keep as many principal components as there are input features (no dimensionality reduction yet).
pc = PCA(n_components=X.shape[1])
# Fit PCA on the standardized training data and project it onto the components.
X_train_pc = pc.fit_transform(X_train_sc)
# Wrap the projection in a DataFrame with columns PC_1 ... PC_n for inspection and plotting.
pc_df_train = pd.DataFrame(
    X_train_pc,
    columns=[f'PC_{idx}' for idx in range(1, pc.n_components_ + 1)],
)

In [None]:
# Display the PCA-projected training data (one column per principal component).
pc_df_train

### Scree Plot - PCA Analysis
#### In multivariate statistics, a scree plot is a line plot of the eigenvalues of factors or principal components in an analysis. The scree plot is used to determine the number of factors to retain in an exploratory factor analysis (FA) or principal components to keep in a principal component analysis (PCA)

### To select the number of principal components, the elbow method is used
#### We can see a proper elbow is not formed in the below graph, so we can select all the components

In [None]:
# Scree-style plot: standard deviation of each principal-component column, in component order.
plt.figure(figsize=(12, 6))
plt.plot(pc_df_train.std())
plt.xlabel('Principal Component')
plt.ylabel('Standard Deviation')
plt.title('Scree Plot (Principal Component Analysis)')
plt.show()

### Model Building

In [None]:
# Check that the projected training features and the training labels have the same number of rows.
print(pc_df_train.shape)
print(y_train.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings; note that `classifier` is rebound for every model in later cells.
classifier = LogisticRegression()

In [None]:
# Train on the PCA projection of the scaled training data. `pc` was already
# fitted on X_train_sc when X_train_pc was created, so it is not refitted here.
classifier.fit(X_train_pc, y_train)
# Apply the *fitted* scaler and PCA to the test split (transform only, no refit).
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
# Predict in the same feature space the model was trained in. The previous code
# predicted on X_test_sc (scaled but not projected), which only ran because PCA
# keeps the same number of columns, and scored the model on the wrong features.
y_lr = classifier.predict(X_test_pc)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print('Confusion Matrix : \n',confusion_matrix(y_test,y_lr))
print('Accuracy Score : ',accuracy_score(y_test,y_lr))
print('Classification Report : \n',classification_report(y_test,y_lr))

## SVC

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings; rebinds `classifier`.
classifier = SVC()

In [None]:
# Train on the PCA projection of the scaled training data. `pc` was already
# fitted on X_train_sc when X_train_pc was created, so it is not refitted here.
classifier.fit(X_train_pc, y_train)
# Apply the *fitted* scaler and PCA to the test split (transform only, no refit).
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
# Predict on the PCA-projected test data, matching the training feature space
# (the previous code predicted on X_test_sc).
y_svc = classifier.predict(X_test_pc)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print('Confusion Matrix : \n',confusion_matrix(y_test,y_svc))
print('Accuracy Score : ',accuracy_score(y_test,y_svc))
print('Classification Report : \n',classification_report(y_test,y_svc))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyperparameters. The seed is fixed (same value as
# `rf_classifier` further down) so the reported scores are reproducible.
classifier = RandomForestClassifier(random_state=34)
# `pc` was already fitted on X_train_sc when X_train_pc was created; reuse the projection.
classifier.fit(X_train_pc, y_train)
# Apply the fitted scaler and PCA to the test split (transform only, no refit).
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_rfc = classifier.predict(X_test_pc)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print('Confusion Matrix : \n',confusion_matrix(y_test,y_rfc))
print('Accuracy Score : ',accuracy_score(y_test,y_rfc))
print('Classification Report : \n',classification_report(y_test,y_rfc))

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with default hyperparameters. The seed is fixed (same value
# as the train/test split) so the reported scores are reproducible.
classifier = GradientBoostingClassifier(random_state=101)
# `pc` was already fitted on X_train_sc when X_train_pc was created; reuse the projection.
classifier.fit(X_train_pc, y_train)
# Apply the fitted scaler and PCA to the test split (transform only, no refit).
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_gbc = classifier.predict(X_test_pc)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print('Confusion Matrix : \n',confusion_matrix(y_test,y_gbc))
print('Accuracy Score : ',accuracy_score(y_test,y_gbc))
print('Classification Report : \n',classification_report(y_test,y_gbc))

### Random Forest with Tuned Hyperparameters

In [None]:
# Random forest with hand-set hyperparameters, trained on the scaled features
# directly (no PCA projection in this cell). The seed is fixed so the reported
# scores are reproducible.
classifier = RandomForestClassifier(n_estimators=100,
                                    min_samples_split=5,
                                    min_samples_leaf=1,
                                    max_depth=5,
                                    random_state=34)
classifier.fit(X_train_sc,y_train)
X_test_sc = sc.transform(X_test)
# NOTE: this overwrites the y_rfc produced by the default random forest, so the
# comparison table at the end reports this tuned model.
y_rfc=classifier.predict(X_test_sc)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print('Confusion Matrix \n',confusion_matrix(y_test,y_rfc))
print()
print('Accuracy Score \n', accuracy_score(y_test,y_rfc))
print()
print('Classification Report \n',classification_report(y_test,y_rfc))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes trained on the PCA projection of the scaled training data
# (fit() returns the estimator itself, so construction and training are chained).
classifier = GaussianNB().fit(pc.fit_transform(X_train_sc), y_train)
# Push the test split through the fitted scaler and PCA, then predict.
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_gb = classifier.predict(X_test_pc)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print('Confusion Matrix \n',confusion_matrix(y_test,y_gb))
print()
print('Accuracy Score \n', accuracy_score(y_test,y_gb))
print()
print('Classification Report \n',classification_report(y_test,y_gb))

## XGBoost with Tuned Hyperparameters

In [None]:
from xgboost import XGBClassifier

In [None]:
# Base XGBoost estimator with a fixed seed.
Xgboost = XGBClassifier(random_state=28)

# Candidate values (two per hyperparameter) for an XGBoost search.
# NOTE(review): neither `Xgboost` nor `params` is referenced by the search cell
# below, which tunes the random forest — confirm whether that is intended.
params = dict(
    n_estimators=(100, 300),
    learning_rate=(0.01, 0.6),
    subsample=(0.3, 0.9),
    max_depth=(2, 5),
    colsample_bytree=(0.5, 0.9),
    min_child_weight=(1, 5),
)

In [None]:
# Base random forest estimator with a fixed seed, used by the randomized search.
rf_classifier = RandomForestClassifier(random_state=34)

# Candidate values (two per hyperparameter) sampled by the randomized search.
Param_rf = dict(
    max_depth=(2, 5),
    min_samples_split=(5, 10),
    n_estimators=(100, 300),
    min_samples_leaf=(1, 3),
)

In [None]:
from sklearn.model_selection import RepeatedKFold,RandomizedSearchCV

In [None]:
# 5-fold cross-validation, one repetition, with a fixed shuffle seed.
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
# RandomizedSearchCV draws n_iter (default 10) combinations from Param_rf.
# Without random_state the sampled combinations, and therefore best_params_,
# change from run to run, so the seed is fixed here.
search = RandomizedSearchCV(rf_classifier, Param_rf, cv=cv, random_state=1)
# `pc` was already fitted on X_train_sc when X_train_pc was created; reuse the projection.
search.fit(X_train_pc, y_train)
print(search.best_params_)

In [None]:
# XGBoost with fixed hyperparameters, each value taken from the candidate
# values listed in `params` above.
classifier = XGBClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=2,
    min_child_weight=5,
    subsample=0.9,
    colsample_bytree=0.9,
)
# Train on the PCA projection of the scaled training data.
classifier.fit(pc.fit_transform(X_train_sc), y_train)
# Push the test split through the fitted scaler and PCA, then predict.
X_test_sc = sc.transform(X_test)
X_test_pc = pc.transform(X_test_sc)
y_xg = classifier.predict(X_test_pc)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print('Confusion Matrix \n',confusion_matrix(y_test,y_xg))
print()
print('Accuracy Score \n', accuracy_score(y_test,y_xg))
print()
print('Classification Report \n',classification_report(y_test,y_xg))

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score
import matplotlib

In [None]:
# Row labels shared by every per-model metrics column.
METRIC_NAMES = ["F1", "Accuracy", "Recall", "Precision", "ROC AUC Score"]


def _metrics_frame(model_name, y_pred):
    """Build a one-column DataFrame of test-set metrics for a single model.

    Parameters
    ----------
    model_name : str
        Column label shown in the comparison heat map.
    y_pred : array-like
        Hard class predictions for the test split (output of ``predict``).

    Returns
    -------
    pandas.DataFrame
        One column named ``model_name``, indexed by ``METRIC_NAMES``.

    Notes
    -----
    The ROC AUC is computed from hard class labels, not from probabilities or
    decision scores, because only ``predict`` outputs are available here.
    """
    return pd.DataFrame(
        data=[
            f1_score(y_test, y_pred),
            accuracy_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            roc_auc_score(y_test, y_pred),
        ],
        columns=[model_name],
        index=METRIC_NAMES,
    )


lr_df = _metrics_frame('Logistic Regression', y_lr)
# y_rfc was last assigned by the tuned random forest cell, so this column
# reports the tuned model.
rf_df = _metrics_frame('Random Forest Score', y_rfc)
nb_df = _metrics_frame('Naive Bayes', y_gb)
xg_df = _metrics_frame('XG Boost', y_xg)
gbc_df = _metrics_frame('Gradient Boosting', y_gbc)
# Fixed copy-paste bug: the SVC column used the XGBoost predictions (y_xg) and
# was labelled 'Gradient Boosting', so the table never showed the SVC results.
svc_df = _metrics_frame('SVC', y_svc)

# One column per model, one row per metric, rounded to three decimals.
df_models = round(pd.concat([lr_df,rf_df,nb_df,gbc_df,xg_df,svc_df], axis=1),3)
colors = ["bisque","ivory","sandybrown","steelblue","lightsalmon"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

background_color = "white"

fig = plt.figure(figsize=(18,26)) # create figure
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.5)
# Only the first grid row (spanning both columns) is used for the heat map.
ax0 = fig.add_subplot(gs[0, :])

# Transpose so that each model is a row and each metric a column; values are
# rendered as percentages.
sns.heatmap(df_models.T, cmap=colormap,annot=True,fmt=".1%",vmin=0,vmax=0.95, linewidths=2.5,cbar=False,ax=ax0,annot_kws={"fontsize":16})
fig.patch.set_facecolor(background_color) # figure background color
ax0.set_facecolor(background_color) 

ax0.text(0,-0.5,'Model Comparison',fontsize=20,fontweight='bold',fontfamily='serif')
plt.show()

## Conclusion
### We can conclude that almost all machine learning models perform well except Logistic Regression. However, Naive Bayes gives the best accuracy of 90.2%.