# About this dataset
## I) Independent Variables (Features)
1. Age : Age of the patient
2. Sex : Sex of the patient
3. exang: exercise induced angina (1 = yes; 0 = no)
4. ca: number of major vessels (0-3)
5. cp : chest pain type
     * Value 1: typical angina
     * Value 2: atypical angina
     * Value 3: non-anginal pain
     * Value 4: asymptomatic
6. trtbps : resting blood pressure (in mm Hg)
7. chol : cholesterol in mg/dl fetched via BMI sensor
8. fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
9. rest_ecg : resting electrocardiographic results
     * Value 0: normal
     * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
     * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
10. thalach : maximum heart rate achieved


## II) Dependent Variable (Target):
target : 
         0= less chance of heart attack 
         1= more chance of heart attack

# Importing Libraries

In [None]:
##Importing Libraries

#Libraries for Dataset Loading and Data Wrangling
import pandas as pd
import numpy as np 

#Data Visualization
import seaborn as sns
sns.set_theme(style="darkgrid")
import plotly.graph_objects as go
import matplotlib.pyplot as plt

#Model Selection & Model Optimization
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.feature_selection import SequentialFeatureSelector 

#Statistical Inference Analysis
import statsmodels.api as sm
import scipy.stats as stats

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


#Validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Silence every warning for the remainder of the notebook
import warnings
warnings.filterwarnings('ignore')

# Print the full path of each file found under the Kaggle input directory
import os
input_files = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Data Exploration

In [None]:
# Location of the CSV inside the Kaggle input directory
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'

# Read the dataset, report its dimensions and preview the first rows
data = pd.read_csv(DATA_PATH)
print(f"Number of Rows: {data.shape[0]}\nNumber of Columns: {data.shape[1]}")
data.head()

In [None]:
# Count the missing values in each column
data.isnull().sum()

In [None]:
# Summary statistics for the columns of `data`
data.describe()

In [None]:
# Column names, non-null counts and dtypes of `data`
data.info()

In [None]:
# Class balance of the target column (`output`), drawn as a donut chart.
# Labels and values are both read from the same value_counts() result so that
# every slice carries the count of its own class: unique() returns classes in
# order of appearance while value_counts() sorts by frequency, so pairing the
# two can silently swap the labels.
output_counts = data['output'].value_counts()
labels = output_counts.index.tolist()
values = output_counts.tolist()
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.5)])
fig.show()

The DataSet is Balanced

In [None]:
# Column names, non-null counts and dtypes of `data`
# NOTE(review): this repeats the data.info() call made earlier in the Data Exploration section.
data.info()

In [None]:
# Summary statistics for the columns of `data`
# NOTE(review): this repeats the data.describe() call made earlier in the Data Exploration section.
data.describe()

In [None]:
# Column labels of the dataset
data.columns

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
sns.set_theme(rc={'figure.figsize': (15, 10)})
correlations = data.corr()
sns.heatmap(
    correlations,
    annot=True,
    cmap='plasma',
    cbar=False,
    linewidths=0.4,
    linecolor='Black',
)

1. Positively Correlated Variables:
* thalachh
* cp
* slp
* restecg
2. Negatively Correlated Variables:
* exng
* oldpeak
* caa
* thall

In [None]:
# Visual check for missing values: one heatmap cell per (row, column) entry
sns.set_theme(rc={'figure.figsize': (15, 10)})
null_mask = data.isnull()
sns.heatmap(null_mask, yticklabels=False, cmap='plasma')

The dataset contains no null values.

# Univariate analysis

In [None]:
# Share of each value of the `sex` column, drawn as a pie chart.
# Labels and values are both read from the same value_counts() result so that
# every slice carries the count of its own category: unique() returns values
# in order of appearance while value_counts() sorts by frequency, so pairing
# the two can silently swap the labels.
sex_counts = data['sex'].value_counts()
labels = sex_counts.index.tolist()
values = sex_counts.tolist()
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

There are more male patients (68.3%) than female patients (31.7%).

In [None]:
# Relative frequency of each chest pain type.
# `cp` is the chest pain type column, so the chart is titled accordingly
# (the number of major vessels is a different column).
data['cp'].value_counts(normalize=True).plot.bar(
    color=['green', 'blue', 'purple', 'red'],
    edgecolor='black',
    title='Chest Pain Type',
)

In [None]:
# Number of patients at each distinct age
age_ax = sns.countplot(x='age', data=data)
age_ax.set_title('Age Count')

In [None]:
# Distribution of patient age grouped into 10 bins
sns.histplot(data=data, x='age', bins=10)

# Splitting the Data

In [None]:
# Target vector: the `output` column
y = data['output']
# Feature matrix: every remaining column
X = data.drop(columns='output')

In [None]:
# Hold out 25% of the rows for testing. The fixed random_state makes the split,
# and therefore every downstream metric, reproducible on Restart & Run All.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=0
)

# Statistical Inference & Feature Selection 

In [None]:
# Logistic regression on all features for statistical inference.
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly; otherwise the model is forced through the origin and the
# coefficient estimates and p-values are distorted.
log_reg = sm.Logit(ytrain, sm.add_constant(Xtrain)).fit()

In [None]:
# Print the summary of the fitted Logit model
print(log_reg.summary())

**Note:**
age, trtbps, chol, fbs and restecg show high p-values, so we cannot reject the null hypothesis that their coefficients are zero (i.e. that they have no significant effect on the dependent variable). These variables are the first candidates for removal.

In [None]:
# Backward sequential feature selection: starting from all features, drop one
# at a time until 7 remain, scoring candidates by 5-fold cross-validated ROC AUC.
# max_iter is raised above the default of 100 because the features are not
# scaled, so the solver may stop before converging and base the selection on
# an under-fitted model.
SFS = SequentialFeatureSelector(
    LogisticRegression(random_state=0, max_iter=1000),
    n_features_to_select=7,
    direction='backward',
    scoring='roc_auc',
    cv=5,
).fit(Xtrain, ytrain)

In [None]:
# One row per feature with a flag telling whether the selector kept it
selection_table = {
    'Variable': Xtrain.columns,
    'Chosen': SFS.get_support(),
}
SFS_results = pd.DataFrame(selection_table)
SFS_results.head(13)

**Notes:**

Through Logistic Regression and Sequential Feature Selection, we can further consolidate our findings and hypotheses from the previous Pearson correlation heatmap.

The outcome indicates that age, trtbps, chol, fbs and restecg are not significant for prediction; the variables not chosen by the selector are treated as insignificant noise and removed accordingly.

In [None]:
# Names of the features kept by the sequential selector (boolean mask on 'Chosen')
SFS_Variable = SFS_results.loc[SFS_results['Chosen'], 'Variable']

# Refit the logistic regression on the selected features only.
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly to avoid forcing the model through the origin.
log_reg = sm.Logit(ytrain, sm.add_constant(Xtrain[SFS_Variable])).fit()

In [None]:
# Print the summary of the Logit model refitted on the selected features
print(log_reg.summary())

# Model Evaluation

In [None]:
# Empty results table that the validation cells add one row per model to.
# The column labels must match the labels of the rows added later
# ('Feature Selection', 'Log Loss', 'ROC'); with different spellings the
# combined frame ends up with duplicated, NaN-filled columns.
# The variable name keeps its existing spelling because the first validation
# cell reads it under that name.
Perfomance_df = pd.DataFrame(columns=['Model', 'Feature Selection', 'Accuracy', 'Log Loss', 'ROC'])
Perfomance_df.head()

> **RandomForestClassifier (using all independent variables):**

In [None]:
# Candidate pruning strengths: the effective alphas along the cost-complexity
# pruning path of a single decision tree fitted on the training data
alphas = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(Xtrain, ytrain)['ccp_alphas']

# Pool of hyperparameters to sample from.
# 'auto' is left out of max_features: for a forest classifier it is the same as
# 'sqrt', and newer scikit-learn releases reject it.
random_parameters = {'n_estimators': [10, 100, 1000],
                     'criterion': ['gini', 'entropy'],
                     'max_depth': [10, 100, 1000],
                     'max_features': ["sqrt", "log2"],
                     'bootstrap': [True, False],
                     'class_weight': ["balanced", "balanced_subsample"],
                     'ccp_alpha': alphas
                    }

# Randomized cross-validated search over the pool above.
# The forest is seeded so that the same candidates always get the same scores,
# and n_jobs=-1 uses whatever cores are available instead of a hard-coded count.
RFC = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                         param_distributions=random_parameters,
                         n_iter=100,
                         scoring='accuracy',
                         n_jobs=-1,
                         cv=3,
                         verbose=2,
                         random_state=0,
                         return_train_score=True)
RFC.fit(Xtrain, ytrain)

In [None]:
# Best hyperparameter combination found by the randomized search
Best_Parameter = RFC.best_params_

# Test the performance of the best parameters: rebuild a forest from them and
# fit it on the full training set. Best_Parameter holds exactly the searched
# keys, so it is unpacked directly; the fixed random_state keeps the fitted
# model (and its test metrics) reproducible.
# NOTE: this rebinds RFC from the search object to the fitted forest.
RFC = RandomForestClassifier(**Best_Parameter, random_state=0)
RFC.fit(Xtrain, ytrain)

In [None]:
#Validation
# Hard class predictions for accuracy and the confusion matrix
pred = RFC.predict(Xtest)
# Probability of the positive class (second column of predict_proba for the
# 0/1 target). Log loss and ROC AUC are computed from these scores: fed hard
# labels, log loss only reflects the clipping constant and ROC AUC collapses
# to a single operating point.
proba = RFC.predict_proba(Xtest)[:, 1]

accuracy = accuracy_score(ytest, pred)
loss = log_loss(ytest, proba)
auc = roc_auc_score(ytest, proba)

# Add this model's row to the results table. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x. The empty table is spelled
# `Perfomance_df` where it is created; the accumulated table is `Performance_df`.
Performance_df = pd.concat(
    [Perfomance_df,
     pd.DataFrame([['RFC', 'Full', accuracy, loss, auc]],
                  columns=['Model', 'Feature Selection', 'Accuracy', 'Log Loss', 'ROC'])],
    sort=False,
    ignore_index=True,
)

print('Accuracy:', accuracy)
print('Log Loss:', loss)
print('ROC AUC:', auc)
print('Confusion Matrix:\n',
       confusion_matrix(ytest, pred))

In [None]:
# ROC curve of the fitted forest on the held-out test set
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2; on newer
# versions RocCurveDisplay.from_estimator is the replacement.
plot_roc_curve(estimator=RFC, X=Xtest, y=ytest)

> **RandomForestClassifier (using the selected independent variables):**

In [None]:
# Candidate pruning strengths: the effective alphas along the cost-complexity
# pruning path of a single decision tree fitted on the selected features
alphas = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(Xtrain[SFS_Variable], ytrain)['ccp_alphas']

# Pool of hyperparameters to sample from.
# 'auto' is left out of max_features: for a forest classifier it is the same as
# 'sqrt', and newer scikit-learn releases reject it.
random_parameters = {'n_estimators': [10, 100, 1000],
                     'criterion': ['gini', 'entropy'],
                     'max_depth': [10, 100, 1000],
                     'max_features': ["sqrt", "log2"],
                     'bootstrap': [True, False],
                     'class_weight': ["balanced", "balanced_subsample"],
                     'ccp_alpha': alphas
                    }

# Randomized cross-validated search over the pool above, on the selected
# features only. The forest is seeded so that the same candidates always get
# the same scores, and n_jobs=-1 uses whatever cores are available instead of
# a hard-coded count.
RFC = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                         param_distributions=random_parameters,
                         n_iter=100,
                         scoring='accuracy',
                         n_jobs=-1,
                         cv=3,
                         verbose=2,
                         random_state=0,
                         return_train_score=True)
RFC.fit(Xtrain[SFS_Variable], ytrain)

In [None]:
# Best hyperparameter combination found by the randomized search
Best_Parameter = RFC.best_params_

# Test the performance of the best parameters: rebuild a forest from them and
# fit it on the selected training features. Best_Parameter holds exactly the
# searched keys, so it is unpacked directly; the fixed random_state keeps the
# fitted model (and its test metrics) reproducible.
# NOTE: this rebinds RFC from the search object to the fitted forest.
RFC = RandomForestClassifier(**Best_Parameter, random_state=0)
RFC.fit(Xtrain[SFS_Variable], ytrain)

In [None]:
#Validation
# Hard class predictions for accuracy and the confusion matrix
pred = RFC.predict(Xtest[SFS_Variable])
# Probability of the positive class (second column of predict_proba for the
# 0/1 target). Log loss and ROC AUC are computed from these scores: fed hard
# labels, log loss only reflects the clipping constant and ROC AUC collapses
# to a single operating point.
proba = RFC.predict_proba(Xtest[SFS_Variable])[:, 1]

accuracy = accuracy_score(ytest, pred)
loss = log_loss(ytest, proba)
auc = roc_auc_score(ytest, proba)

# Add this model's row to the results table. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
Performance_df = pd.concat(
    [Performance_df,
     pd.DataFrame([['RFC', 'Selected', accuracy, loss, auc]],
                  columns=['Model', 'Feature Selection', 'Accuracy', 'Log Loss', 'ROC'])],
    sort=False,
    ignore_index=True,
)

print('Accuracy:', accuracy)
print('Log Loss:', loss)
print('ROC AUC:', auc)
print('Confusion Matrix:\n',
       confusion_matrix(ytest, pred))

In [None]:
# ROC curve of the forest fitted on the selected features, on the held-out test set
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2; on newer
# versions RocCurveDisplay.from_estimator is the replacement.
plot_roc_curve(estimator=RFC, X=Xtest[SFS_Variable], y=ytest)