## 1. Importing Necessary Libraries

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (os.path.join(folder, name)
               for folder, _subdirs, names in os.walk('/kaggle/input')
               for name in names)
for path in input_paths:
    print(path)

In [None]:
%matplotlib inline    
# To make data visualisations display in Jupyter Notebooks 

import numpy as np    # linear algebra 
import pandas as pd    # Data processing, Input & Output load    
import matplotlib.pyplot as plt    # Visualization & plotting
import datetime

import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier    # GBM algorithm
from sklearn.ensemble import RandomForestClassifier    # Random Forest Algorithm
from sklearn.linear_model import LogisticRegression    # Logistic Regression Algorithm

from xgboost.sklearn import XGBClassifier    # Extreme Gradient Boosting
from xgboost import plot_importance    # Plotting Important Variables

import joblib  #Joblib is a set of tools to provide lightweight pipelining in Python (Avoid computing twice the same thing)
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_val_score, GridSearchCV
                                    # GridSearchCV - Implements a “fit” and a “score” method
                                    # train_test_split - Split arrays or matrices into random train and test subsets
                                    # cross_val_score - Evaluate a score by cross-validation     

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, precision_score, accuracy_score, roc_auc_score, recall_score, roc_curve
from sklearn.metrics import make_scorer, confusion_matrix, classification_report   # Differnt metrics to evaluate the model
import pandas_profiling as pp    # simple and fast exploratory data analysis of a Pandas Dataframe

import warnings    # To avoid warning messages in the code run
warnings.filterwarnings('ignore')

## 2. Defining Functions For Plotting ROC_AUC Curve & ROC_Plot 

In [None]:
def plot_roc_auc_curve(y_train_actual, train_pred_prob, y_test_actual, test_pred_prob, *args):
    '''
    Print the train/test AUC and draw the matching ROC curves.

    When extra positional arguments are supplied, args[0] and args[1] are
    read as the validation labels and validation scores; the validation
    AUC is printed and its curve is drawn as well.
    '''
    auc_train = roc_auc_score(y_train_actual, train_pred_prob)
    auc_test = roc_auc_score(y_test_actual, test_pred_prob)

    with_validation = len(args) != 0
    if with_validation:
        auc_valid = roc_auc_score(args[0], args[1])

    print("Train AUC = ", auc_train)
    print("Test AUC = ", auc_test)
    if with_validation:
        print("Validation AUC = ", auc_valid)

    # Only the false/true positive rates are plotted; thresholds are discarded.
    fpr_train, tpr_train, _ = roc_curve(y_train_actual, train_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_actual, test_pred_prob)

    if not with_validation:
        roc_plot(fpr_train, tpr_train, fpr_test, tpr_test)
        return

    fpr_val, tpr_val, _ = roc_curve(args[0], args[1])
    roc_plot(fpr_train, tpr_train, fpr_test, tpr_test, fpr_val, tpr_val)

In [None]:
def roc_plot(fpr_train, tpr_train, fpr_test, tpr_test, *args, model_name = None):
    '''
    Draw train and test ROC curves (and optionally a validation curve).

    Parameters
    ----------
    fpr_train, tpr_train : false/true positive rates of the train split
    fpr_test, tpr_test   : false/true positive rates of the test split
    *args                : optional; args[0] and args[1] are the false/true
                           positive rates of a validation split
    model_name           : optional keyword; when given it is appended to the
                           plot title ("ROC curve using <model_name>"),
                           otherwise the title is just "ROC curve"
    '''
    plt.plot(fpr_train, tpr_train, label = 'Train')
    plt.plot(fpr_test, tpr_test, label = 'Test')

    # The validation curve is the only difference between the two call forms.
    if len(args) != 0:
        plt.plot(args[0], args[1], label = 'Validation')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.title("ROC curve" if model_name is None else f"ROC curve using {model_name}")
    plt.xlabel("False Positive Rate (1 - Specificity)")
    plt.ylabel("True Positive Rate (Sensitivity)")
    plt.legend(loc = 'lower right')
    plt.grid(True)
    plt.show()

## 3. Importing Dataset

In [None]:
# Read the raw stroke dataset; `data` itself is left untouched afterwards.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)

# Working copy used by the following cells
data_new = data.copy()
print("Data Shape - ", data_new.shape)

data_new.head()

* The entire dataset contains <b>5110</b> rows and <b>12</b> columns.

## 4. Let's Understand Our Data

1. First, let's get the summary of the numerical data

In [None]:
# Summary statistics of the working frame using describe()'s default column selection.
data_new.describe()

2. Now, let's get the summary for categorical data 

In [None]:
# Summary (count, unique, top, freq) of the object-typed columns.
# The builtin `object` is used because the `np.object` alias no longer exists in current NumPy.
data_new.describe(include = object)

In [None]:
# Column dtypes and non-null counts of the working frame.
data_new.info()

## 5. Data Profiling Report

In [None]:
# Build the pandas-profiling report for the working frame; being the cell's last expression, it is displayed.
pp.ProfileReport(data_new)

## 6. EDA(Exploratory Data Analysis)

* We shall first do the <b>Univariate Analysis</b> by analysing the data w.r.t our <b>Target Variable - stroke</b>.

In [None]:
# Name of the column to predict; reused by later cells.
Target = 'stroke'

# Share of each target class over the whole frame
pd.crosstab(index = data_new[Target], columns = 'Normalized', normalize = True)

* We have approximately <b>95%</b> of <b>0's</b> and <b>5%</b> of <b>1's</b> in our data.

* Let's check if there are any null (missing) values in the <b>data_new</b> dataset.

In [None]:
# Number of missing entries per column
data_new.isna().sum()

## 6.1) Univariate Analysis

## a) Analysis of unique values & their counts for categorical variables of the data_new dataset.

In [None]:
# Distinct gender categories and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
gender_col = data_new['gender']
print('Unique values gender count: ', gender_col.nunique())
print('gender values: ', gender_col.unique())
gender_col.value_counts()

In [None]:
# Distinct hypertension flags and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
hypertension_col = data_new['hypertension']
print('Unique values hypertension count: ', hypertension_col.nunique())
print('hypertension values: ', hypertension_col.unique())
hypertension_col.value_counts()

In [None]:
# Distinct heart_disease flags and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
heart_disease_col = data_new['heart_disease']
print('Unique values heart_disease count: ', heart_disease_col.nunique())
print('heart_disease values: ', heart_disease_col.unique())
heart_disease_col.value_counts()

In [None]:
# Distinct ever_married categories and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
ever_married_col = data_new['ever_married']
print('Unique values ever_married count: ', ever_married_col.nunique())
print('ever_married values: ', ever_married_col.unique())
ever_married_col.value_counts()

In [None]:
# Distinct work_type categories and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
work_type_col = data_new['work_type']
print('Unique values work_type count: ', work_type_col.nunique())
print('work_type values: ', work_type_col.unique())
work_type_col.value_counts()

In [None]:
# Distinct Residence_type categories and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
residence_col = data_new['Residence_type']
print('Unique values Residence_type count: ', residence_col.nunique())
print('Residence_type values: ', residence_col.unique())
residence_col.value_counts()

In [None]:
# Distinct smoking_status categories and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
smoking_col = data_new['smoking_status']
print('Unique values smoking_status count: ', smoking_col.nunique())
print('smoking_status values: ', smoking_col.unique())
smoking_col.value_counts()

In [None]:
# Distinct stroke (target) values and how often each occurs.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
stroke_col = data_new['stroke']
print('Unique values stroke count: ', stroke_col.nunique())
print('stroke values: ', stroke_col.unique())
stroke_col.value_counts()

## b) Analysis of percentage unique values  for categorical variables of the data_new dataset.

### a) gender

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
gender_counts = data_new['gender'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(gender_counts,
        labels = gender_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of Gender', fontsize = 18, fontweight = 'bold')
plt.show()

### b) hypertension

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
hypertension_counts = data_new['hypertension'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(hypertension_counts,
        labels = hypertension_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of hypertension', fontsize = 18, fontweight = 'bold')
plt.show()

### c) heart_disease

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
heart_disease_counts = data_new['heart_disease'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(heart_disease_counts,
        labels = heart_disease_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of heart disease', fontsize = 18, fontweight = 'bold')
plt.show()

### d) ever_married

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
ever_married_counts = data_new['ever_married'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(ever_married_counts,
        labels = ever_married_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people married', fontsize = 18, fontweight = 'bold')
plt.show()

### e) work_type

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
work_type_counts = data_new['work_type'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(work_type_counts,
        labels = work_type_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people working in different sectors', fontsize = 18, fontweight = 'bold')
plt.show()

### f) Residence_type

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
residence_counts = data_new['Residence_type'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(residence_counts,
        labels = residence_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people staying in different areas', fontsize = 18, fontweight = 'bold')
plt.show()

### g) smoking_status

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
smoking_counts = data_new['smoking_status'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(smoking_counts,
        labels = smoking_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people of different smoking categories', fontsize = 18, fontweight = 'bold')
plt.show()

### h) stroke

In [None]:
# Wedge labels are taken from the counts' own index so each label is
# guaranteed to sit on the wedge it describes.
stroke_counts = data_new['stroke'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(stroke_counts,
        labels = stroke_counts.index.tolist(),
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,    # numeric angle in degrees
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of stroke', fontsize = 18, fontweight = 'bold')
plt.show()

### Following are the insights gathered from the data_new dataframe

1. <b>Maximum entries</b> are of <b>females</b> as compared to <b>males</b>.
2. <b>65.62%</b> of the total population is <b>married</b>.
3. <b>34.38%</b> of the total population is <b>unmarried</b>.
4. <b>90.25%</b> of the total population is free from hypertension.
5. <b>9.75%</b> of the total population suffers from hypertension.
6. <b>94.60%</b> of the total population doesn't have any heart disease.
7. <b>5.40%</b> of the total population has some sort of a heart disease.
8. <b>57.24%</b> people are <b>Private</b> sector employees.
9. <b>16.03%</b> people are <b>Self-employed</b>.
10. <b>13.44%</b> of the total population comprises of <b>children</b>.
11. <b>12.86%</b> people are <b>Government</b> job employees.
12. <b>0.43%</b> of the population have <b>never worked</b> at all.
13. People staying in <b>Urban</b> and <b>Rural</b> areas are <b>approximately same</b>.
14. <b>37.03%</b> people have <b>never smoked</b> in their life.
15. The <b>smoking status</b> of <b>30.22%</b> of the total population is <b>unknown</b>.
16. <b>17.32%</b> people had <b>smoked earlier</b> in their life but then quit it afterwards.
17. <b>15.44%</b> people are <b>currently smoking</b> at least one cigarette a day on average.
18. <b>4.87%</b> of the total population has experienced a stroke.
19. <b>95.13%</b> of the total population has never experienced a stroke.

## 6.2) Bivariate Analysis

## 1. Data Categorization

* We would categorize the existing variables of our existing dataframe into <b>numerical</b> and <b>categorical</b> variables.

In [None]:
# Column names split by dtype: numeric columns versus everything else.
num_cols = list(data_new.select_dtypes(include = [np.number]).columns)
obj_cols = list(data_new.select_dtypes(exclude = [np.number]).columns)

* Let's drop the columns which we won't be using.

In [None]:
# Numeric column names with 'id' and the target 'stroke' left out.
numeric_part = data_new.drop(columns = ['id', 'stroke']).select_dtypes(include = [np.number])
num_cols = list(numeric_part.columns)

In [None]:
# Show which column names ended up in each of the two lists.
print('Numeric Columns \n', num_cols)
print('Non-Numeric Columns \n', obj_cols)

## 2. Analysis of each category of the numerical variables of num_cols dataframe w.r.t Target variable - stroke.

* Let's first plot the boxplot of each numerical variable w.r.t our target variable.

In [None]:
# Only the continuous predictors are plotted; the 'hypertension' and
# 'heart_disease' columns are left out of this list.
num_cols_viz = ['age', 'avg_glucose_level', 'bmi']

fig, axes = plt.subplots(1, 1, sharex = False, sharey = False, figsize = (15, 15))

# One box per target class for each selected column
boxplot_frame = data_new[[Target] + num_cols_viz]
boxplot_frame.boxplot(by = Target, ax = axes, return_type = 'axes');

### Following are the insights gathered from the boxplots

* <b>The "age" boxplot shows that greater the age, higher the chance of a person experiencing a stroke</b>.
* <b>The "avg_glucose_level" boxplot shows that greater the average glucose level, higher the chance of a person experiencing a stroke</b>.
* <b>The "bmi" boxplot shows that greater the bmi, higher the chance of a person experiencing a stroke</b>.

## 3. Analysis of each category of the categorical variables of obj_cols dataframe w.r.t Target variable - stroke.

In [None]:
# Categorical columns plus the two binary flags, one stacked bar chart each.
obj_cols_viz = obj_cols + ['hypertension', 'heart_disease']
fig, axes = plt.subplots(len(obj_cols_viz), sharex = False, sharey = False, figsize = (15, 50))

for column, ax in zip(obj_cols_viz, axes):
    # Rows: category levels; stacked segments: counts per target class
    counts_by_target = pd.crosstab(data_new[column], data_new[Target])
    counts_by_target.plot(kind = 'bar', stacked = True, grid = False, ax = ax)

### Following are the insights gathered from the stacked bar charts

* <b>Females are more prone to a stroke attack as compared to males</b>.
* <b>Married persons are more prone to a stroke attack as compared to unmarried persons</b>.
* <b>Private job persons are more prone to a stroke attack as compared to other work types</b>.
* <b>People living in Urban areas are more prone to a stroke attack as compared to persons living in Rural areas</b>.
* <b>A person who has never smoked is more prone to a stroke attack as compared to person who is smoking or had earlier smoked</b>.
* <b>A person who doesn't have hypertension or a heart disease is more prone to a stroke attack as compared to a person who suffers from any of these diseases</b>.
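* <b>So, overall we can say that a person who is a Female and is married and is a Private sector employee and stays in an Urban area and is a non-smoker and is free from any kind of hypertension or a heart disease is prone to a stroke attack</b>.
* <b>Note</b>: the stacked bars show absolute counts, not stroke rates. Larger groups naturally contain more stroke cases, so the observations above describe where most cases occur rather than the relative risk of each group; comparing the proportion of strokes within each category is needed to judge risk.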

## 6.3) Missing Value Treatment

### a) bmi

* From the Data Profiling report, we got to know that only variable <b>bmi</b> has <b>201</b> missing values.
* Let's find out how much percentage of data is missing.

In [None]:
# Share of rows with a missing bmi, computed from the raw frame so the
# figure always matches the loaded dataset instead of hand-typed counts.
missing_bmi_pct = data['bmi'].isnull().mean() * 100
print("Missing Data Percentage: ", missing_bmi_pct, "%")

* As only about 3.9% of the rows have a missing value, we can drop these rows.

In [None]:
# Keep only the rows that have no missing value in any column.
data_new = data_new.dropna(axis = 'index')
data_new.head()

## 7. Feature Engineering

## 7.1) Dropping Least Important Variable

* As variable <b>id</b> has no correlation with any other variables, we can drop this variable.

In [None]:
# Remove the 'id' column from the working frame.
data_new = data_new.drop(columns = ["id"])
data_new.head()

## 7.2) Creating Model Dataset

### a) Finding unique values of each object variable of data_new dataframe

In [None]:
encoding_list = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

label_encoding_list = []
one_hot_encoding_list = []

# Columns with exactly two distinct values go to the label-encoding list,
# every other column goes to the one-hot list.
for column in encoding_list:
    distinct_values = data_new[column].unique()
    bucket = label_encoding_list if len(distinct_values) == 2 else one_hot_encoding_list
    bucket.append(column)

    print(f'Unique Values for {column}', distinct_values)

### b) Creating Dummy Variables

In [None]:
# Numeric predictors together with the target column
data_new_num = data_new.loc[:, num_cols + ['stroke']]

# Non-numeric predictors
data_new_cat = data_new.loc[:, obj_cols]

# Indicator (dummy) columns for the non-numeric predictors
data_new_cat_dummies = pd.get_dummies(data = data_new_cat)
print(data_new_cat_dummies.shape)
data_new_cat_dummies.head()

### c) Concatenating columns - numeric and dummies

In [None]:
# Place the numeric block and the dummy block side by side.
model_frames = [data_new_num, data_new_cat_dummies]
data_new_final = pd.concat(model_frames, axis = 'columns')
print(data_new_final.shape)
data_new_final.head()

### d) Null value check in the final dataset before model run

In [None]:
# Missing entries per column of the model dataset
data_new_final.isna().sum(axis = 'index')

## 7.3) Splitting the newly created model data into train and test data

### a) Separating the target variable - stroke from the data_new_final dataframe

In [None]:
# Predictors: every column except the target
X = data_new_final.drop(columns = ['stroke'])
# Target column
y = data_new_final.loc[:, 'stroke']

### b) Performing the Train-Test Split

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no `stratify` argument is passed, so the class ratio may differ between the two splits.
X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.3, random_state = 100) 

print('Train Shape: ', X_train.shape)
print('Test Shape: ', X_test.shape)

## 8) Applying Different Models On Train & Test Data

## 8.1) Model 1 - GBM (Gradient Boosting)

### a) Define model parameters to be tuned

In [None]:
# Hyper-parameter grid for the GBM search.
# min_samples_leaf is a single value drawn from a seeded generator, so
# re-running the notebook always searches the same grid.
model_parameters = {'n_estimators': [10, 50, 100, 200, 500, 750, 1000], 'max_depth': [3, 5, 10],
                    'min_samples_leaf': [int(np.random.RandomState(10).randint(1, 10))],
                    'max_features': [None, 'sqrt', 'log2']}

### b) Using GridSearch Cross Validation to find out the best parameters

In [None]:
# 5-fold grid search over model_parameters, ranked by ROC AUC, using all cores.
model = GradientBoostingClassifier(random_state = 10)
gscv_GBM = GridSearchCV(model,
                        model_parameters,
                        scoring = 'roc_auc',
                        n_jobs = -1,
                        cv = 5,
                        verbose = 1)

gscv_GBM.fit(X_train, y_train)

### c) Displaying the best parameters

In [None]:
# Parameter combination that the grid search stored as its best.
print('The best parameters are -', gscv_GBM.best_params_)

### d) Refitting the model with best parameters

In [None]:
# Refit on the full training split with the winning parameters.
# random_state matches the estimator used during the search so the final
# model is reproducible and consistent with what was tuned.
final_mod_GBM = GradientBoostingClassifier(random_state = 10, **gscv_GBM.best_params_)
final_mod_GBM.fit(X_train, y_train)

### e) Displaying model prediction and classification report

In [None]:
# Class predictions of the refitted GBM for the train and test splits
train_pred, test_pred = (final_mod_GBM.predict(part) for part in (X_train, X_test))

In [None]:
# Precision / recall / F1 per class for each split
train_report = classification_report(y_train, train_pred)
test_report = classification_report(y_test, test_pred)

print('Classification report for train data is : \n', train_report)
print('Classification report for test data is : \n', test_report)

### f) Saving the variables used in the model

In [None]:
# Store the training column names on the model object as a `variables` attribute.
final_mod_GBM.variables = X_train.columns

### g) Saving the best model

In [None]:
# Write the fitted GBM to 'best_model_GBM.joblib'.
joblib.dump(value = final_mod_GBM, filename = 'best_model_GBM.joblib')

### h) Model Evaluation

In [None]:
plt.subplots(figsize = (10, 5))

# Second column of predict_proba, used as the score for the ROC curves
train_prob, test_prob = (final_mod_GBM.predict_proba(part)[:, 1] for part in (X_train, X_test))

plot_roc_auc_curve(y_train, train_prob, y_test, test_prob)

### i) Making predictions for test data

In [None]:
# Test-set class predictions, each passed through round() into a plain list
y_pred = final_mod_GBM.predict(X_test)
predictions = list(map(round, y_pred))

### j) Evaluating prediction accuracy for test data

In [None]:
# Accuracy of `predictions` against the test labels, printed as a percentage
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

## 8.2) Model 2 - Logistic Regression

### a) Applying logistic regression

In [None]:
# Logistic regression with the liblinear solver, fitted on the training split
log_reg = LogisticRegression(solver = 'liblinear')
log_reg.fit(X = X_train, y = y_train)

### b) Displaying model prediction and classification report

In [None]:
# Class predictions of the logistic regression for the train and test splits
train_pred, test_pred = (log_reg.predict(part) for part in (X_train, X_test))

In [None]:
# Precision / recall / F1 per class for each split
train_report = classification_report(y_train, train_pred)
test_report = classification_report(y_test, test_pred)

print('Classification report for train data is : \n', train_report)
print('Classification report for test data is : \n', test_report)

### c) Saving the variables used in the model

In [None]:
# Store the training column names on the model object as a `variables` attribute.
log_reg.variables = X_train.columns

### d) Saving the best model

In [None]:
# Write the fitted logistic regression to 'best_model_log_reg.joblib'.
joblib.dump(value = log_reg, filename = 'best_model_log_reg.joblib')

### e) Model Evaluation

In [None]:
plt.subplots(figsize = (10, 5))

# Second column of predict_proba, used as the score for the ROC curves
train_prob, test_prob = (log_reg.predict_proba(part)[:, 1] for part in (X_train, X_test))

plot_roc_auc_curve(y_train, train_prob, y_test, test_prob)

### f) Making predictions for test data

In [None]:
# Test-set class predictions, each passed through round() into a plain list
y_pred = log_reg.predict(X_test)
predictions = list(map(round, y_pred))

### g) Evaluating prediction accuracy for test data

In [None]:
# Accuracy of `predictions` against the test labels, printed as a percentage
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

## 8.3) Model 3 - Random Forest Classifier

### a) Define model parameters to be tuned

In [None]:
# Hyper-parameter grid for the Random Forest search.
# min_samples_leaf is a single value drawn from a seeded generator, so
# re-running the notebook always searches the same grid.
model_parameters = {'n_estimators': [10, 50, 100, 200, 500, 750, 1000], 'max_depth': [3, 5, 10],
                    'min_samples_leaf': [int(np.random.RandomState(10).randint(1, 10))],
                    'max_features': [None, 'sqrt', 'log2']}

### b) Using GridSearch Cross Validation to find out the best parameters

In [None]:
# 5-fold grid search over model_parameters, ranked by ROC AUC, using all cores.
model = RandomForestClassifier(random_state = 10)
gscv_randfor = GridSearchCV(model,
                            model_parameters,
                            scoring = 'roc_auc',
                            n_jobs = -1,
                            cv = 5,
                            verbose = 1)

gscv_randfor.fit(X_train, y_train)

### c) Displaying the best parameters

In [None]:
# Parameter combination that the grid search stored as its best.
print('The best parameters are -', gscv_randfor.best_params_)

### d) Refitting the model with best parameters

In [None]:
# Refit a Random Forest (the estimator that was actually tuned above) with
# the winning parameters; random_state matches the search estimator so the
# final model is reproducible.
final_mod_randfor = RandomForestClassifier(random_state = 10, **gscv_randfor.best_params_)
final_mod_randfor.fit(X_train, y_train)

### e) Displaying model prediction and classification report

In [None]:
# Class predictions of final_mod_randfor for the train and test splits
train_pred, test_pred = (final_mod_randfor.predict(part) for part in (X_train, X_test))

In [None]:
# Precision / recall / F1 per class for each split
train_report = classification_report(y_train, train_pred)
test_report = classification_report(y_test, test_pred)

print('Classification report for train data is : \n', train_report)
print('Classification report for test data is : \n', test_report)

### f) Saving the variables used in the model

In [None]:
# Store the training column names on the model object as a `variables` attribute.
final_mod_randfor.variables = X_train.columns

### g) Saving the best model

In [None]:
# Write the fitted model to 'best_model_randfor.joblib'.
joblib.dump(value = final_mod_randfor, filename = 'best_model_randfor.joblib')

### h) Model Evaluation

In [None]:
plt.subplots(figsize = (10, 5))

# Score this section's own model (final_mod_randfor), not the logistic
# regression from the previous section.
train_prob = final_mod_randfor.predict_proba(X_train)[:, 1]
test_prob = final_mod_randfor.predict_proba(X_test)[:, 1]

plot_roc_auc_curve(y_train, train_prob, y_test, test_prob)

### i) Making predictions for test data

In [None]:
# Test-set class predictions from this section's own model (final_mod_randfor),
# so the accuracy that follows describes it rather than the logistic regression.
y_pred = final_mod_randfor.predict(X_test)
predictions = [round(value) for value in y_pred]

### j) Evaluating prediction accuracy for test data

In [None]:
# Accuracy of `predictions` against the test labels, printed as a percentage
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

## 9) Displaying Best Model

In [None]:
# Pick the winner from the fitted models' test-set ROC AUC (the metric used
# for tuning) instead of hard-coding the conclusion.
test_auc_by_model = {
    'GBM (Gradient Boosting)': roc_auc_score(y_test, final_mod_GBM.predict_proba(X_test)[:, 1]),
    'Logistic Regression': roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1]),
    'Random Forest Classifier': roc_auc_score(y_test, final_mod_randfor.predict_proba(X_test)[:, 1]),
}

for model_name, test_auc in test_auc_by_model.items():
    print(f'{model_name}: Test AUC = {test_auc:.4f}')

best_model_name = max(test_auc_by_model, key = test_auc_by_model.get)
print(f'The best model is {best_model_name} model')