## 1. Importing Necessary Libraries

In [None]:
import os

# Recursively walk the Kaggle input directory and print the full path of every
# file found, so the dataset location used later can be confirmed.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
%matplotlib inline    
# To make data visualisations display in Jupyter Notebooks 

import numpy as np    # linear algebra 
import pandas as pd    # Data processing, Input & Output load    
import matplotlib.pyplot as plt    # Visualization & plotting
import datetime

import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier    # GBM algorithm
from sklearn.ensemble import RandomForestClassifier    # Random Forest Algorithm
from sklearn.linear_model import LogisticRegression    # Logistic Regression Algorithm

from xgboost.sklearn import XGBClassifier    # Extreme Gradient Boosting
from xgboost import plot_importance    # Plotting Important Variables

import joblib  #Joblib is a set of tools to provide lightweight pipelining in Python (Avoid computing twice the same thing)
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_val_score, GridSearchCV
                                    # GridSearchCV - Implements a “fit” and a “score” method
                                    # train_test_split - Split arrays or matrices into random train and test subsets
                                    # cross_val_score - Evaluate a score by cross-validation     

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, precision_score, accuracy_score, roc_auc_score, recall_score, roc_curve
from sklearn.metrics import make_scorer, confusion_matrix, classification_report   # Differnt metrics to evaluate the model
import pandas_profiling as pp    # simple and fast exploratory data analysis of a Pandas Dataframe

import warnings    # To avoid warning messages in the code run
warnings.filterwarnings('ignore')

## 2. Defining Functions For Plotting ROC_AUC Curve & ROC_Plot

In [None]:
def plot_roc_auc_curve(y_train_actual, train_pred_prob, y_test_actual, test_pred_prob, *args):
    '''
    Print the train/test AUC and draw the matching ROC curves.

    y_train_actual, train_pred_prob : true labels and predicted scores for the train split
    y_test_actual, test_pred_prob   : true labels and predicted scores for the test split
    *args : optional validation pair; when any extra argument is given,
            args[0] is read as the true labels and args[1] as the predicted
            scores, and a validation AUC / curve is reported as well
    '''
    with_validation = len(args) != 0

    # All AUC values are computed before anything is printed.
    auc_train = roc_auc_score(y_train_actual, train_pred_prob)
    auc_test = roc_auc_score(y_test_actual, test_pred_prob)
    auc_valid = roc_auc_score(args[0], args[1]) if with_validation else None

    print("Train AUC = ", auc_train)
    print("Test AUC = ", auc_test)
    if with_validation:
        print("Validation AUC = ", auc_valid)

    # roc_curve also returns the thresholds, which are not needed for plotting.
    fpr_train, tpr_train, _ = roc_curve(y_train_actual, train_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_actual, test_pred_prob)
    curve_points = [fpr_train, tpr_train, fpr_test, tpr_test]

    if with_validation:
        fpr_val, tpr_val, _ = roc_curve(args[0], args[1])
        curve_points += [fpr_val, tpr_val]

    roc_plot(*curve_points)

In [None]:
def roc_plot(fpr_train, tpr_train, fpr_test, tpr_test, *args, model_name = None):
    '''
    Draw the train and test ROC curves (and optionally a validation curve)
    on the current matplotlib figure, then show it.

    fpr_train, tpr_train : false / true positive rates of the train curve
    fpr_test, tpr_test   : false / true positive rates of the test curve
    *args      : optional validation curve; when any extra argument is given,
                 args[0] is plotted as its false positive rate and args[1]
                 as its true positive rate
    model_name : optional keyword; when given, the title reads
                 "ROC curve using <model_name>", otherwise just "ROC curve"
    '''
    plt.plot(fpr_train, tpr_train, label = 'Train')
    plt.plot(fpr_test, tpr_test, label = 'Test')

    # Extra positional arguments are taken as the validation curve.
    if len(args) != 0:
        plt.plot(args[0], args[1], label = 'Validation')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    # Only mention a model in the title when one was actually named, so the
    # title never ends in a dangling "using ".
    if model_name is None:
        plt.title("ROC curve")
    else:
        plt.title("ROC curve using " + str(model_name))
    plt.xlabel("False Positive Rate (1 - Specificity)")
    plt.ylabel("True Positive Rate (Sensitivity)")
    plt.legend(loc = 'lower right')
    plt.grid(True)
    plt.show()

## 3. Importing Dataset

In [None]:
# Read the heart-attack dataset from the Kaggle input directory.
data = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

# Work on a copy (data_new) so the frame loaded into `data` is left untouched.
data_new = data.copy()

print("Data Shape - ", data_new.shape)

# Preview the first rows (rendered as the cell output).
data_new.head()

* The entire dataset contains <b>303</b> rows and <b>14</b> columns.

## 4. Let's Understand Our Data

1. First, let's get the summary of the numerical data

In [None]:
# Summary statistics from describe(), transposed so that each column becomes a row.
data_new.describe().transpose()

In [None]:
# Per-column dtype and non-null count overview.
data_new.info()

## 5. Data Profiling Report

In [None]:
# Build the pandas-profiling report for the working copy; it is rendered as the cell output.
pp.ProfileReport(data_new)

## 6. EDA(Exploratory Data Analysis)

* We shall first do the <b>Univariate Analysis</b> by analysing the data w.r.t our <b>Target Variable - output</b>.

In [None]:
# Name of the target column; later cells reuse `Target`.
Target = 'output'
# Share of each target class (normalize = True) shown under a single 'Normalized' column.
pd.crosstab(data_new[Target], columns = 'Normalized', normalize = True)

* We have approximately <b>46%</b> of 0's and <b>54%</b> of 1's in our data.

* Let's check if there are any null values in the <b>data_new</b> dataset.

In [None]:
# Number of missing values in each column.
data_new.isnull().sum()

## 6.1) Univariate Analysis

## a) Analysis of unique values & their counts for categorical variables of the data_new dataset.

In [None]:
# Cardinality, distinct codes and per-code frequency of `sex`.
print("Unique values sex count: ", data_new['sex'].nunique())
print("sex values: ", data_new['sex'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['sex'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of `cp`.
print("Unique values cp count: ", data_new['cp'].nunique())
print("cp values: ", data_new['cp'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['cp'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of `fbs`.
print("Unique values fbs count: ", data_new['fbs'].nunique())
print("fbs values: ", data_new['fbs'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['fbs'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of `restecg`.
print("Unique values restecg count: ", data_new['restecg'].nunique())
print("restecg values: ", data_new['restecg'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['restecg'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of `exng`.
print("Unique values exng count: ", data_new['exng'].nunique())
print("exng values: ", data_new['exng'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['exng'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of `slp`.
print("Unique values slp count: ", data_new['slp'].nunique())
print("slp values: ", data_new['slp'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['slp'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of `caa`.
print("Unique values caa count: ", data_new['caa'].nunique())
print("caa values: ", data_new['caa'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['caa'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of `thall`.
print("Unique values thall count: ", data_new['thall'].nunique())
print("thall values: ", data_new['thall'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['thall'].value_counts()

In [None]:
# Cardinality, distinct codes and per-code frequency of the target `output`.
print("Unique values output count: ", data_new['output'].nunique())
print("output values: ", data_new['output'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data_new['output'].value_counts()

## b) Analysis of percentage unique values for categorical variables of the data_new dataset.

### a) sex

In [None]:
# Share of each `sex` code. The slice labels are taken from the counts' own
# index, so a label can never be attached to the wrong slice.
sex_counts = data_new['sex'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(sex_counts,
        labels = sex_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of Sex Types', fontsize = 18, fontweight = 'bold')
plt.show()

### b) cp(chestpain)

In [None]:
# Share of each `cp` code. The slice labels are taken from the counts' own
# index, so a label can never be attached to the wrong slice.
cp_counts = data_new['cp'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(cp_counts,
        labels = cp_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of Chestpain Types', fontsize = 18, fontweight = 'bold')
plt.show()

### c) fbs(fasting blood sugar)

In [None]:
# Share of each `fbs` code. The slice labels are taken from the counts' own
# index, so a label can never be attached to the wrong slice.
fbs_counts = data_new['fbs'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(fbs_counts,
        labels = fbs_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of Fasting Blood Sugar Types', fontsize = 18, fontweight = 'bold')
plt.show()

### d) restecg(resting ecg)

In [None]:
# Share of each `restecg` code. The slice labels are taken from the counts'
# own index, so a label can never be attached to the wrong slice.
restecg_counts = data_new['restecg'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(restecg_counts,
        labels = restecg_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of Rest ECG Types', fontsize = 18, fontweight = 'bold')
plt.show()

### e) exng(exercise induced angina)

In [None]:
# Share of each `exng` code. The slice labels are taken from the counts' own
# index, so a label can never be attached to the wrong slice.
exng_counts = data_new['exng'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(exng_counts,
        labels = exng_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of Exercise Induced Angina Types', fontsize = 18, fontweight = 'bold')
plt.show()

### f) slp

In [None]:
# Share of each `slp` code. The slice labels are taken from the counts' own
# index, so a label can never be attached to the wrong slice.
slp_counts = data_new['slp'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(slp_counts,
        labels = slp_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of SLP Types', fontsize = 18, fontweight = 'bold')
plt.show()

### g) caa(number of major vessels)

In [None]:
# Share of each `caa` code. The slice labels are taken from the counts' own
# index, so a label can never be attached to the wrong slice.
caa_counts = data_new['caa'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(caa_counts,
        labels = caa_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of CAA Types', fontsize = 18, fontweight = 'bold')
plt.show()

### h) thall(thallium stress test result)

In [None]:
# Share of each `thall` code. The slice labels are taken from the counts' own
# index, so a label can never be attached to the wrong slice.
thall_counts = data_new['thall'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(thall_counts,
        labels = thall_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
# Titled after the column itself (like the SLP / CAA charts); the heart-rate
# column in this notebook is `thalachh`, not `thall`.
plt.title('Percentage Of Thall Types', fontsize = 18, fontweight = 'bold')
plt.show()

### i) output

In [None]:
# Share of each target (`output`) class. The slice labels are taken from the
# counts' own index, so a label can never be attached to the wrong slice.
output_counts = data_new['output'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(output_counts,
        labels = output_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees, not the string '30'
                     'color' : 'w'})

plt.legend()
plt.title('Percentage Of Output', fontsize = 18, fontweight = 'bold')
plt.show()

### Following are the insights gathered from the data_new dataframe

1. <b>Maximum entries</b> are of <b>Sex 1</b> as compared to <b>Sex 0</b>.
2. <b>47.19%</b> of the total population is suffering from <b>Type 0</b> chestpain(cp).
3. <b>28.71%</b> of the total population is suffering from <b>Type 2</b> chestpain(cp).
4. <b>16.5%</b> of the total population is suffering from <b>Type 1</b> chestpain(cp).
5. <b>7.59%</b> of the total population is suffering from <b>Type 3</b> chestpain(cp).
6. <b>85.15%</b> of the total population has <b>Type 0</b> Fasting Blood Sugar(fbs).
7. <b>14.85%</b> of the total population has <b>Type 1</b> Fasting Blood Sugar(fbs).
8. <b>50.17%</b> of the total population has Restecg of <b>Type 1</b>, which indicates an ST-T wave abnormality.
9. <b>48.51%</b> of the total population has Restecg of <b>Type 0</b> which is normal ecg.
10. <b>1.32%</b> of the total population has Restecg of <b>Type 2</b> which shows probable or definite left ventricular hypertrophy by Estes' criteria.
11. <b>67.33%</b> of the total population doesn't suffer from Exercise Induced Angina(exng). 
12. <b>32.67%</b> of the total population suffers from Exercise Induced Angina(exng).
13. <b>46.86%</b> of the total population suffers from <b>Type 2</b> SLP.
14. <b>46.20%</b> of the total population suffers from <b>Type 1</b> SLP.
15. <b>6.93%</b> of the total population suffers from <b>Type 0</b> SLP.
16. <b>57.76%</b> of the total population doesn't have any major vessel affected in their heart.
17. <b>21.45%</b> of the total population has <b>1</b> major vessel affected in their heart.
18. <b>12.54%</b> of the total population has <b>2</b> major vessels affected in their heart.
19. <b>6.60%</b> of the total population have <b>3</b> major vessels affected in their heart.
20. <b>1.65%</b> of the total population have <b>4</b> major vessels affected in their heart.
21. <b>54.46%</b> of the total population has experienced a heart attack.
22. <b>45.54%</b> of the total population has never experienced a heart attack.

## 6.2) Bivariate Analysis

## 1. Data Categorization

* We would categorize the existing variables of our existing dataframe into <b>numerical</b> and <b>categorical</b> variables.

In [None]:
# Split the frame into its numerical columns and its categorical columns
# (the categorical selection still includes the target 'output' at this point).
num_cols = data_new[['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']]
obj_cols = data_new[['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall', 'output']]

* Let's drop the columns which we won't be using.

In [None]:
# Remove the target so obj_cols holds categorical predictors only.
obj_cols = obj_cols.drop(['output'], axis = 1)

In [None]:
# Print both column subsets (as DataFrames) for a quick visual check.
print('Numeric Columns \n', num_cols)
print('Non-Numeric Columns \n', obj_cols)

## 2. Analysis of each category of the numerical variables of num_cols dataframe w.r.t Target variable - output.

* Let's first plot the boxplot of each numerical variable w.r.t our target variable.

In [None]:
# Numerical columns to compare against the target.
num_cols_viz = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

fig, axes = plt.subplots(1, 1, sharex = False, sharey = False, figsize = (15, 15))
# Boxplots of the listed columns grouped by the target class;
# the trailing ';' suppresses the textual repr of the returned axes.
data_new.loc[:, [Target]+num_cols_viz].boxplot(by = Target, ax = axes, return_type = 'axes');

### Following are the insights gathered from the boxplots

* <b>The "thalachh" boxplot shows that greater the heart beat rate, higher the chance of a person experiencing a heart attack</b>.
* <b>The "trtbps" shows that lower the resting blood pressure, higher the chance of a person experiencing a heart attack</b>.

## 3. Analysis of each category of the categorical variables of obj_cols dataframe w.r.t Target variable - output.

In [None]:
# Categorical predictors only: the target itself is left out, because a
# crosstab of 'output' against 'output' carries no information.
obj_cols_viz = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
fig, axes = plt.subplots(len(obj_cols_viz), sharex = False, sharey = False, figsize = (15, 50))

# One stacked bar chart per predictor: categories on the x-axis, bars split by target class.
for col, ax in zip(obj_cols_viz, axes):
    pd.crosstab(data_new[col], data_new[Target]).plot(kind = 'bar', stacked = True, grid = False, ax = ax)

### Following are the insights gathered from the stacked bar charts

* <b>Sex 0 is more prone to a heart attack as compared to Sex 1</b>.
* <b>Persons having cp(chestpain) of type 1, type 2 & type 3 are more prone to a heart attack </b>.
* <b>Persons having restecg of type 1 are more prone to a heart attack</b>.
* <b>Persons not having exng(exercise induced angina) are more prone to a heart attack</b>.
* <b>Persons having slp of type 2 are more prone to a heart attack</b>.
* <b>Persons having 0 major vessels are more prone to a heart attack</b>.
* <b>Persons having thall(thal rate) of type 2 are more prone to a heart attack</b>.
* <b>So, overall we can say that a person who is of Sex 0 and has a cp of type 1, 2 or 3 and has a restecg of type 1 and doesn't have exng and has an slp of type 2 and has 0 major vessels and has a thall of type 2 is more prone to a heart attack</b>.

## 7. Feature Engineering

## 7.1) Creating Model Dataset

### a) Creating Dummy Variables

In [None]:
# Numerical columns data
data_new_num = data_new[['age', 'trtbps', 'chol', 'thalachh', 'oldpeak', 'output']]

# Categorical columns data
data_new_cat = data_new[['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']]

# Creating dummies
data_new_cat_dummies = pd.get_dummies(data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

### b) Concatenating columns - numeric and dummies

In [None]:
# Place the numerical block (including the target) and the dummy block side by side.
data_new_final = pd.concat([data_new_num, data_new_cat_dummies], axis = 1)
print(data_new_final.shape)
data_new_final.head()

### c) Null value check in the final dataset before model run

In [None]:
# Missing-value count per column of the model dataset.
data_new_final.isnull().sum(axis = 0)

## 7.2) Splitting the newly created model data into train and test data

### a) Separating the target variable - output from the data_new_final dataframe

In [None]:
# X: every column except the target; y: the target column 'output'.
X = data_new_final.drop(['output'], axis = 1)
y = data_new_final['output']

### b) Performing the Train/Test Split

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.3, random_state = 100) 

print('Train Shape: ', X_train.shape)
print('Test Shape: ', X_test.shape)

## 8) Applying Different Models On Train & Test Data

## 8.1 Model 1 - GBM (Gradient Boosting)

### a) Define model parameters to be tuned

In [None]:
# Hyper-parameter grid searched for the gradient boosting model.
# min_samples_leaf uses a fixed list of candidates: a single unseeded random
# draw would change the grid on every run and never actually tune the value.
model_parameters = {'n_estimators': [10, 50, 100, 200, 500, 750, 1000], 'max_depth': [3, 5, 10],
                    'min_samples_leaf': [1, 3, 5], 'max_features': [None, 'sqrt', 'log2']}

### b) Using GridSearch Cross Validation (5-fold, scored by ROC AUC) to find out the best parameters

In [None]:
# Base estimator; random_state fixes the model's own randomness during the search.
model = GradientBoostingClassifier(random_state = 10)
# Exhaustive search over model_parameters with 5-fold cross-validation,
# scored by ROC AUC and run on all available cores (n_jobs = -1).
gscv_GBM = GridSearchCV(estimator = model, 
                        param_grid = model_parameters, 
                        cv = 5, 
                        verbose = 1, 
                        n_jobs = -1,
                        scoring = 'roc_auc')

gscv_GBM.fit(X_train, y_train)

### c) Displaying the best parameters

In [None]:
# Parameter combination with the best cross-validated score.
print('The best parameters are -', gscv_GBM.best_params_)

### d) Refitting the model with best parameters

In [None]:
# Refit on the full training split with the winning parameters. The same
# random_state as the searched estimator is passed so the final model is
# reproducible and matches the configuration that was cross-validated.
final_mod_GBM = GradientBoostingClassifier(random_state = 10, **gscv_GBM.best_params_)
final_mod_GBM.fit(X_train, y_train)

### e) Displaying model prediction and classification report

In [None]:
# Predicted class labels on both splits, used by the classification reports below.
train_pred = final_mod_GBM.predict(X_train)
test_pred = final_mod_GBM.predict(X_test)

In [None]:
# Classification reports on both splits; comparing them shows how much the model overfits.
print('Classification report for train data is : \n',
      classification_report(y_train, train_pred))
print('Classification report for test data is : \n',
      classification_report(y_test, test_pred))

### f) Saving the variables used in the model

In [None]:
# Attach the training column names to the model object as an ad-hoc `variables` attribute.
final_mod_GBM.variables = X_train.columns

### g) Saving the best model

In [None]:
# Serialize the fitted model to 'best_model_GBM.joblib' (relative path).
joblib.dump(final_mod_GBM, 'best_model_GBM.joblib')

### h) Model Evaluation

In [None]:
# New figure for the ROC plot.
plt.subplots(figsize = (10, 5))
# Second predict_proba column for each split, used as the score for the ROC curves.
train_prob = final_mod_GBM.predict_proba(X_train)[:, 1]
test_prob = final_mod_GBM.predict_proba(X_test)[:, 1]

# Prints train/test AUC and draws both ROC curves.
plot_roc_auc_curve(y_train, train_prob, y_test, test_prob)

### i) Making predictions for test data

In [None]:
# Predicted labels for the held-out rows, rounded element-wise into a plain list.
y_pred = final_mod_GBM.predict(X_test)
predictions = list(map(round, y_pred))

### j) Evaluating prediction accuracy for test data

In [None]:
# Fraction of test rows predicted correctly, printed as a percentage with two decimals.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## 8.2) Model 2 - Logistic Regression

### a) Applying logistic regression

In [None]:
# Logistic regression with the liblinear solver (other settings left at their defaults),
# fitted on the training split.
log_reg = LogisticRegression(solver = 'liblinear')
log_reg.fit(X_train, y_train)

### b) Displaying model prediction and classification report

In [None]:
# Predicted class labels on both splits, used by the classification reports below.
train_pred = log_reg.predict(X_train)
test_pred = log_reg.predict(X_test)

In [None]:
# Classification reports on both splits; comparing them shows how much the model overfits.
print('Classification report for train data is : \n',
      classification_report(y_train, train_pred))
print('Classification report for test data is : \n',
      classification_report(y_test, test_pred))

### c) Saving the variables used in the model

In [None]:
# Attach the training column names to the model object as an ad-hoc `variables` attribute.
log_reg.variables = X_train.columns

### d) Saving the best model

In [None]:
# Serialize the fitted model to 'best_model_log_reg.joblib' (relative path).
joblib.dump(log_reg, 'best_model_log_reg.joblib')

### e) Model Evaluation

In [None]:
# New figure for the ROC plot.
plt.subplots(figsize = (10, 5))
# Second predict_proba column for each split, used as the score for the ROC curves.
train_prob = log_reg.predict_proba(X_train)[:, 1]
test_prob = log_reg.predict_proba(X_test)[:, 1]

# Prints train/test AUC and draws both ROC curves.
plot_roc_auc_curve(y_train, train_prob, y_test, test_prob)

### f) Making predictions for test data

In [None]:
# Predicted labels for the held-out rows, rounded element-wise into a plain list.
y_pred = log_reg.predict(X_test)
predictions = list(map(round, y_pred))

### g) Evaluating prediction accuracy for test data

In [None]:
# Fraction of test rows predicted correctly, printed as a percentage with two decimals.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## 8.3) Model 3 - Random Forest Classifier

### a) Define model parameters to be tuned

In [None]:
# Hyper-parameter grid searched for the random forest model.
# min_samples_leaf uses a fixed list of candidates: a single unseeded random
# draw would change the grid on every run and never actually tune the value.
model_parameters = {'n_estimators': [10, 50, 100, 200, 500, 750, 1000], 'max_depth': [3, 5, 10],
                    'min_samples_leaf': [1, 3, 5], 'max_features': [None, 'sqrt', 'log2']}

### b) Using GridSearch Cross Validation (5-fold, scored by ROC AUC) to find out the best parameters

In [None]:
# Base estimator; random_state fixes the model's own randomness during the search.
model = RandomForestClassifier(random_state = 10)
# Exhaustive search over model_parameters with 5-fold cross-validation,
# scored by ROC AUC and run on all available cores (n_jobs = -1).
gscv_randfor = GridSearchCV(estimator = model, 
                        param_grid = model_parameters, 
                        cv = 5, 
                        verbose = 1, 
                        n_jobs = -1,
                        scoring = 'roc_auc')

gscv_randfor.fit(X_train, y_train)

### c) Displaying the best parameters

In [None]:
# Parameter combination with the best cross-validated score.
print('The best parameters are -', gscv_randfor.best_params_)

### d) Refitting the model with best parameters

In [None]:
# Refit a random forest (the estimator that was actually grid-searched in this
# section) with the winning parameters. The same random_state as the searched
# estimator keeps the final model reproducible.
final_mod_randfor = RandomForestClassifier(random_state = 10, **gscv_randfor.best_params_)
final_mod_randfor.fit(X_train, y_train)

### e) Displaying model prediction and classification report

In [None]:
# Predicted class labels on both splits, used by the classification reports below.
train_pred = final_mod_randfor.predict(X_train)
test_pred = final_mod_randfor.predict(X_test)

In [None]:
# Classification reports on both splits; comparing them shows how much the model overfits.
print('Classification report for train data is : \n',
      classification_report(y_train, train_pred))
print('Classification report for test data is : \n',
      classification_report(y_test, test_pred))

### f) Saving the variables used in the model

In [None]:
# Attach the training column names to the model object as an ad-hoc `variables` attribute.
final_mod_randfor.variables = X_train.columns

### g) Saving the best model

In [None]:
# Serialize the fitted model to 'best_model_randfor.joblib' (relative path).
joblib.dump(final_mod_randfor, 'best_model_randfor.joblib')

### h) Model Evaluation

In [None]:
# New figure for the ROC plot.
plt.subplots(figsize = (10, 5))
# Score this section's model (final_mod_randfor), not log_reg: the second
# predict_proba column of each split feeds the ROC curves.
train_prob = final_mod_randfor.predict_proba(X_train)[:, 1]
test_prob = final_mod_randfor.predict_proba(X_test)[:, 1]

# Prints train/test AUC and draws both ROC curves.
plot_roc_auc_curve(y_train, train_prob, y_test, test_prob)

### i) Making predictions for test data

In [None]:
# Predicted labels for the held-out rows from this section's model
# (final_mod_randfor, not log_reg), rounded element-wise into a plain list.
y_pred = final_mod_randfor.predict(X_test)
predictions = [round(value) for value in y_pred]

### j) Evaluating prediction accuracy for test data

In [None]:
# Fraction of test rows predicted correctly, printed as a percentage with two decimals.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## 9) Displaying Best Model

In [None]:
# Rank the three fitted models by accuracy on the held-out test split instead
# of hard-coding the winner, so the conclusion always follows from the results.
model_test_accuracy = {
    'GBM (Gradient Boosting)': accuracy_score(y_test, final_mod_GBM.predict(X_test)),
    'Logistic Regression': accuracy_score(y_test, log_reg.predict(X_test)),
    'Random Forest Classifier': accuracy_score(y_test, final_mod_randfor.predict(X_test)),
}

for model_name, model_accuracy in model_test_accuracy.items():
    print("%s test accuracy: %.2f%%" % (model_name, model_accuracy * 100.0))

# On a tie, max() keeps the first entry in the dictionary's insertion order.
best_model_name = max(model_test_accuracy, key = model_test_accuracy.get)
print('The best model is', best_model_name, 'model')