# Importing Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as mno

In [None]:
# Load the heart-attack dataset into the notebook-wide frame `df`.
# NOTE(review): the relative path looks like a Kaggle input directory —
# adjust it when running the notebook elsewhere.
df = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

In [None]:
# Global seaborn look for every plot below: "darkgrid" style with the axes
# background overridden to light grey (".9"), and the "talk" context preset.
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
sns.set_context("talk")

# Dataset Analysis

In [None]:
df.head()

- It can be seen that a few of the columns contain continuous values.

In [None]:
df.shape

In [None]:
df.info()

- All are numerical columns

In [None]:
df.isnull().sum()

- There are no missing values in any column.

We are going to separate the continuous and non-continuous columns so that each group can be analysed in its own way.

In [None]:
# A column counts as continuous when it holds more than five distinct values
# (NaN, if present, is counted as one value).
continous_cols = [name for name in df.columns if df[name].nunique(dropna=False) > 5]

In [None]:
continous_cols

In [None]:
# A column counts as discrete when it holds at most five distinct values
# (NaN, if present, is counted as one value).
discrete_cols = [name for name in df.columns if df[name].nunique(dropna=False) <= 5]

In [None]:
discrete_cols

## Univariate Analysis

We will create histograms for the continuous columns and count plots for the discrete columns.

In [None]:
def create_hist_for_cont_col():
    """Plot one histogram per column named in ``continous_cols``.

    Reads the notebook-level ``df`` and ``continous_cols``. Panels are laid
    out two per row, filled row by row. The number of rows is derived from
    the number of columns, so any column count is supported, and panels that
    are not needed are hidden rather than drawn as empty axes.

    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    if not continous_cols:
        return  # plt.subplots cannot build a grid with zero rows

    n_per_row = 2
    n_rows = (len(continous_cols) + n_per_row - 1) // n_per_row  # ceiling division
    # 26x20 is the figure size for a three-row grid; scale the height with the rows.
    fig, axes = plt.subplots(
        n_rows, n_per_row, figsize=(26, 20 * n_rows / 3), squeeze=False
    )
    panels = axes.ravel()  # 2-D grid flattened in row-major order

    for panel, col in zip(panels, continous_cols):
        sns.histplot(x=col, data=df, ax=panel)
    for spare in panels[len(continous_cols):]:
        spare.set_visible(False)
    plt.show()

In [None]:
create_hist_for_cont_col()

- __Age:__ It can be seen that the majority of the population is between 50 and 60 years old.
- __trtbps (Resting blood pressure in mm Hg):__ It can be seen that most of the population has a resting blood pressure between 10 and 160.
- __chol (Cholesterol in mg/dl):__ It can be seen that the cholesterol value for the majority of the population ranges between 200 and 300.
- __thalachh (Max heart rate achieved):__ It can be seen that for the majority of the population the maximum heart rate is between 140 and 180.
- __oldpeak:__ It can be seen that for the majority of the population the oldpeak value is almost zero.

- 

In [None]:
def create_bar_for_discrete_col():
    """Plot one count plot per column named in ``discrete_cols``.

    Reads the notebook-level ``df`` and ``discrete_cols``. Panels are laid
    out three per row, filled row by row. The number of rows is derived from
    the number of columns, so any column count is supported, and panels that
    are not needed are hidden rather than drawn as empty axes.

    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    if not discrete_cols:
        return  # plt.subplots cannot build a grid with zero rows

    n_per_row = 3
    n_rows = (len(discrete_cols) + n_per_row - 1) // n_per_row  # ceiling division
    # 36x20 is the figure size for a three-row grid; scale the height with the rows.
    fig, axes = plt.subplots(
        n_rows, n_per_row, figsize=(36, 20 * n_rows / 3), squeeze=False
    )
    panels = axes.ravel()  # 2-D grid flattened in row-major order

    for panel, col in zip(panels, discrete_cols):
        sns.countplot(x=col, data=df, ax=panel)
    for spare in panels[len(discrete_cols):]:
        spare.set_visible(False)
    plt.show()

In [None]:
create_bar_for_discrete_col()

- __sex:__ It can be seen that the majority of the population is in sex category 1 (the dataset does not specify whether 0 is male or female).
- __cp (Chest pain type):__ It can be seen that the majority of the population has chest pain of type 0, i.e., _Typical angina_.
- __fbs (Fasting blood sugar > 120 mg/dl):__ It can be seen that the majority of the population has fasting blood sugar greater than 120.
- __restecg (Resting electrocardiographic results):__ It can be seen that the majority of the population has an _ST-T wave abnormality_. However, almost the same share of the population has a normal result as well.
- __exng (Exercise-induced angina):__ It can be seen that the majority of the population does not have angina induced by exercise.
- __slp (Slope):__ It can be seen that the majority of the population has a slope of either 1 or 2.
- __caa (Number of major vessels):__ It can be seen that the majority of the population has a value of 0.
- __thall (Thall rate):__ It can be seen that the majority of the population has a thall rate of 2. A thall rate of 3 is also significant.
- __output:__ Most of the population had heart attacks.

## Bivariate Analysis

In [None]:
def draw_scatter_plot(x):
    """Scatter-plot column ``x`` against every other continuous column.

    Reads the notebook-level ``df`` and ``continous_cols``. Each panel shows
    ``x`` on the horizontal axis and one of the remaining continuous columns
    on the vertical axis, coloured by the ``output`` column.

    Args:
        x: Name of the column in ``df`` to put on the horizontal axis.

    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    # Exclude x itself by name rather than by position, so the function works
    # for any continuous column and not only the first one in continous_cols.
    y_cols = [col for col in continous_cols if col != x]
    if not y_cols:
        return  # plt.subplots cannot build a grid with zero rows

    n_per_row = 2
    n_rows = (len(y_cols) + n_per_row - 1) // n_per_row  # ceiling division
    # 40x20 is the figure size for a two-row grid; scale the height with the rows.
    fig, axes = plt.subplots(
        n_rows, n_per_row, figsize=(40, 10 * n_rows), squeeze=False
    )
    panels = axes.ravel()  # 2-D grid flattened in row-major order

    for panel, col in zip(panels, y_cols):
        sns.scatterplot(x=x, y=col, data=df, hue='output', ax=panel)
    for spare in panels[len(y_cols):]:
        spare.set_visible(False)
    plt.show()

In [None]:
draw_scatter_plot('age')

#### Age vs trtbps
- With increasing age, Resting Blood Pressure is increasing.
- People between 50 and 60 have high Resting Blood Pressure.
- Both population with low and high resting blood pressure are likely to get heart attack. 

#### Age vs Cholesterol
- With increasing age, Cholesterol is increasing.
- People between 50 and 60 have high Cholesterol.
- Both population with low and high Cholesterol are likely to get heart attack. 

#### Age vs Max Heart Rate Achieved (thalachh)
- With increasing age, thalachh is decreasing.
- People between 50 and 60 have low thalachh.
- Both population with low and high thalachh are likely to get heart attack. 

#### Age vs Oldpeak
- With increasing age, Oldpeak is increasing.
- People between 50 and 60 have high oldpeak.
- Both population with low and high oldpeak are likely to get heart attack. 

In [None]:
def relate_cont_col_with_output():
    """Plot one histogram per continuous column, split by the ``output`` column.

    Reads the notebook-level ``df`` and ``continous_cols``. Panels are laid
    out two per row, filled row by row. The number of rows is derived from
    the number of columns, so any column count is supported, and panels that
    are not needed are hidden rather than drawn as empty axes.

    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    if not continous_cols:
        return  # plt.subplots cannot build a grid with zero rows

    n_per_row = 2
    n_rows = (len(continous_cols) + n_per_row - 1) // n_per_row  # ceiling division
    # 26x20 is the figure size for a three-row grid; scale the height with the rows.
    fig, axes = plt.subplots(
        n_rows, n_per_row, figsize=(26, 20 * n_rows / 3), squeeze=False
    )
    panels = axes.ravel()  # 2-D grid flattened in row-major order

    for panel, col in zip(panels, continous_cols):
        sns.histplot(x=col, data=df, ax=panel, hue='output')
    for spare in panels[len(continous_cols):]:
        spare.set_visible(False)
    plt.show()

In [None]:
relate_cont_col_with_output()

#### Age vs Output
- Everyone in the dataset aged 29, 34, 71, 74 or 76 had a heart attack. This suggests that adults and older people are more likely to get a heart attack.
- Middle-aged people (between 35 and 70) are also likely to get a heart attack, but with a lower probability.

#### Resting Blood Pressure(trtbps) vs Output 
- People with a resting blood pressure between 140 and 160 are more likely to get a heart attack.

#### Cholesterol and Output
- Population having Cholesterol value between 200 and 280 are more likely to get heart attack.

#### Max heart Rate Achieved(thalachh) VS Output.
- Population having heart rate greater than 140 are more likely to get heart attack.

#### Oldpeak vs Output
- Population with Zero oldpeak are more likely to get heart attack.

# Feature Engineering

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_dataset(df):
    """Return the standardised feature columns of ``df``.

    The ``age`` and ``output`` columns are dropped and every remaining column
    is passed through a freshly fitted ``StandardScaler``. The input frame is
    not modified.

    NOTE(review): ``age`` is dropped rather than scaled, so it is absent from
    the returned frame — confirm this is intended.

    Args:
        df: DataFrame containing at least the ``age`` and ``output`` columns.

    Returns:
        A new DataFrame with the scaled values, the same column names as the
        retained features and the same row index as ``df``.
    """
    # drop() returns a new frame, so no defensive copy of the input is needed.
    features = df.drop(columns=['age', 'output'])
    scaled_values = StandardScaler().fit_transform(features)
    # Carry the original row labels over: callers re-attach columns from the
    # source frame by index, which would misalign on a fresh 0..n-1 index.
    return pd.DataFrame(scaled_values, columns=features.columns, index=features.index)

In [None]:
# Re-attach the untouched target column to the scaled features, matching rows
# on the index and keeping only labels present on both sides.
df_scaled = scale_dataset(df).join(df['output'], how='inner')

In [None]:
df_scaled.columns

# Feature Selection

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
# Work on a copy so df_scaled itself stays untouched.
dataset = df_scaled.copy()
# Feature matrix: every column except the target.
X = dataset.drop(columns=['output'])
# Target vector.
Y = dataset['output']

In [None]:
# Fit a Lasso (L1-penalised linear model) on all features against the target
# and wrap it in SelectFromModel, which exposes (via get_support) which
# features the fitted model considers important.
# NOTE(review): the selector is fitted on every row, before any train/test split.
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state = 0))
feature_sel_model.fit(X, Y)

In [None]:
feature_sel_model.get_support()

In [None]:
# Names of the columns the selector kept (boolean mask applied to the column index).
selected_feat = X.columns[feature_sel_model.get_support()]

In [None]:
selected_feat

# Model Evaluation with Stratified KFold Validation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.model_selection import StratifiedKFold
# Shared splitter for every model below: five stratified folds, rows shuffled
# with a fixed seed so that all models are scored on the same splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
def apply_stratified_k_fold_validation(model, x=None, y=None):
    """Score ``model`` on every fold of the shared ``skf`` splitter and print a summary.

    For each train/test split produced by the notebook-level ``skf``, the
    model is fitted on the training rows and ``model.score`` is recorded on
    the held-out rows. The per-fold scores, their maximum, minimum, mean and
    standard deviation are then printed (the last four as percentages).

    Args:
        model: Estimator exposing ``fit(x, y)`` and ``score(x, y)``. It is
            refitted in place on every fold.
        x: Feature DataFrame. Defaults to the notebook-level
            ``X[selected_feat]``.
        y: Target Series. Defaults to the notebook-level ``Y``.

    Returns:
        None. Results are only printed.

    NOTE(review): with the default arguments, the features were scaled and
    selected using all rows before splitting, so the printed scores may be
    optimistic.
    """
    # Resolve the defaults at call time. Binding X[selected_feat] / Y in the
    # signature would freeze whatever they were when this cell ran, and
    # re-running the feature-selection cells would silently be ignored.
    if x is None:
        x = X[selected_feat]
    if y is None:
        y = Y

    stratified_acc = []
    for train_index, test_index in skf.split(x, y):
        # The splitter yields positional integer arrays, which iloc accepts directly.
        model.fit(x.iloc[train_index], y.iloc[train_index])
        stratified_acc.append(model.score(x.iloc[test_index], y.iloc[test_index]))

    print("\n")
    print('List of possible accuracy:', stratified_acc)

    print("\n")
    print('Maximum Accuracy That can be obtained from this model is:', max(stratified_acc)*100, '%')

    print("\n")
    print('Minimum Accuracy:', min(stratified_acc)*100, '%')

    print("\n")
    print('Overall Accuracy:', np.mean(stratified_acc)*100, '%')

    print("\n")
    print('Standard Deviation is:', np.std(stratified_acc)*100, '%')


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings, scored on the shared folds.
model = LogisticRegression()
apply_stratified_k_fold_validation(model)

## K Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using 7 neighbours, scored on the shared folds.
model = KNeighborsClassifier(n_neighbors=7)
apply_stratified_k_fold_validation(model)

## Decision Trees

In [None]:
from sklearn import tree

# Decision tree scored on the shared folds. The seed is fixed so that repeated
# runs print the same scores, as the other seeded models in this notebook do.
model = tree.DecisionTreeClassifier(random_state=0)
apply_stratified_k_fold_validation(model)

## XGBoost Classifier


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with a fixed seed, scored on the shared folds.
# NOTE(review): `use_label_encoder` is believed to be deprecated in newer
# xgboost releases — confirm against the installed version.
model = XGBClassifier(random_state = 42, use_label_encoder =False)
apply_stratified_k_fold_validation(model)

## Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library-default settings, scored on the shared folds.
model = GaussianNB()
apply_stratified_k_fold_validation(model)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed for reproducible scores, scored on the shared folds.
model = RandomForestClassifier(random_state=0)
apply_stratified_k_fold_validation(model)