In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Purpose of this analysis:

#### The purpose of this analysis is to find the model that best explains the relationship between:
   #### - Explanatory variables: lifestyle and health
   #### - Dependent variable: suffered / not suffered a stroke

#### In the first section I load the basic libraries as well as the dataset for this exercise. In the second one, I perform an EDA in order to gain insight into the variables that make up the dataset. Finally, I cross-validate several models in order to select the top performer.

## 1. Import of basic libraries and dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sbn
# Apply seaborn's "dark" style to every plot drawn afterwards.
sbn.set_style("dark")

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv(r'/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

## 2. EDA

### 1. Non graphical analysis:

In [None]:
# Print the column names, non-null counts and dtypes to spot missing values.
dataset.info()

#### From the previous output we can see that there are some missing values for the variable bmi. These values will be replaced later on, after creating some variables that will be helpful for that purpose.

#### Second, I check whether there are duplicated rows. `duplicated(keep = "first")` flags every repeated row except its first occurrence, and `unique()` summarises those flags: if the only value returned is False, the dataset contains no duplicated rows.

In [None]:
# Flag repeated rows (all columns compared, first occurrence kept unflagged)
# and show which flag values occur; only False means there are no duplicates.
duplicate_flags = dataset.duplicated()
duplicate_flags.unique()

#### Third, I use the .describe() function to explore the properties of the initial variables:

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

#### From the description of the dataset, we can spot several facts: 
1. We can drop id variable since it only serves as record identifier.
2. Hypertension, heart_disease are actually binary variables, but they are recorded in the dataset as integers (with values 0,1)
3. We have to further analyse age, avg_glucose_level and bmi, since there is an important difference between the 75th percentile and the maximum value. This could indicate the presence of extreme values.
4. Age, avg_glucose_level and bmi are likely normally distributed variables.


In [None]:
# The id column is only a record identifier, so it is removed from the data.
dataset = dataset.drop(columns="id")

#### After checking the numerical properties of the variables, I want to know the different values the categorical variables can take. For that purpose, I write a simple for-loop that skips the numerical columns and shows the different values:

In [None]:
# Columns that hold continuous measurements (or the record id); listing their
# unique values is not informative, so they are skipped.
numeric_cols = {"id", "age", "avg_glucose_level", "bmi"}
for col in dataset.columns:
    # Exact name match. A substring test (`col in "avg_glucose_level"`) would
    # also skip any column whose name merely appears inside one of these names.
    if col in numeric_cols:
        continue
    print("Column {}. Unique values: {}".format(col, dataset[col].unique()))

#### From the previous output we can see the different values the non-numerical variables can take.
#### Regarding smoking_status, there are two values that might behave similarly: formerly smoked and smokes. Nevertheless, this is only an ex-ante impression that should be further investigated.
#### In addition, hypertension, heart_disease and ever_married are binary, but they are encoded differently (0/1 versus Yes/No). I will also standardize them.

### 2. Data cleaning:

In [None]:
# Separate the binary target (stroke) from the explanatory variables.
target_col = "stroke"
X = dataset.drop(columns=target_col)
y = dataset[target_col]

In [None]:
# Recode the 0/1 integer flags as "No"/"Yes" so they match ever_married.
for flag_col in ("hypertension", "heart_disease"):
    X[flag_col] = ["No" if flag == 0 else "Yes" for flag in dataset[flag_col]]

#### Now, we are going to fill the NaN values of the bmi variable. For that purpose, I create some extra features in order to assign the mean value of the most similar subset.
#### Regarding age classification, there are multiple segmentations. For this exercise, I stick with the standard of the World Health Organization:

- 0-2 infant
- 3-11 child
- 12-17 teen
- 18- 64 adult
- 65+ senior adult

Source: https://help.healthycities.org/hc/en-us/articles/219556208-How-are-the-different-age-groups-defined-

#### Once I get a clear definition of the different age segments, I create the variable within the dataset:

In [None]:
# Bin age into the five segments listed above; intervals are right-closed,
# e.g. (0, 2] -> Infant, (2, 11] -> Child, ... , (64, inf) -> Senior Adult.
age_edges = [0, 2, 11, 17, 64, float('Inf')]
age_labels = ['Infant', 'Child', 'Teenager', 'Adult', 'Senior Adult']
X['Age_segment'] = pd.cut(X['age'], bins=age_edges, labels=age_labels)

#### I will then repeat the same exercise with avg_glucose_level. I first plot its distribution to have an idea of its values and frequencies:

In [None]:
# Draw the distribution of avg_glucose_level to see its range and frequencies.
sbn.displot(dataset["avg_glucose_level"])

#### Researching on the internet, I found this measurement system:

- Normal: less than or equal 140
- Prediabetes: 140 - 199
- Diabetes: 200 or higher


Source: https://www.diabetes.org/a1c/diagnosis

#### As done with age, once we have a clear definition of the different glucose segments, I create the variable: 

In [None]:
# Glucose ranges: Normal <= 140, Pre-diabetes above 140 and below 200,
# Diabetes >= 200.  pd.cut builds right-closed intervals, so the edge between
# the last two ranges is placed at the largest float below 200: a reading of
# exactly 200 falls in "Diabetes" while 199.x stays in "Pre-diabetes".
# The top bin is open-ended so that very high readings are never left as NaN.
glucose_edges = [0.0, 140.0, np.nextafter(200.0, 0.0), np.inf]
glucose_labels = ["Normal", "Pre-diabetes", "Diabetes"]
X["Range_glucose"] = pd.cut(X["avg_glucose_level"], bins=glucose_edges, labels=glucose_labels)

#### Now that I have categorized both avg_glucose_level and age, I will fill the nan values of the bmi column. I will set the empty values to the mean bmi of the age segment they belong to.

In [None]:
# Replace each missing bmi with the mean bmi of the row's age segment.
segment_mean_bmi = X.groupby(["Age_segment"])["bmi"].transform("mean")
X["bmi"] = X["bmi"].fillna(segment_mean_bmi)

#### Now that the nan values within the bmi column have been filled and the previous two variables have been categorized, I will do the same with bmi. I have researched on the web how people are categorized according to their bmi values:

* For adults: 

* <18.5: Underweight
* 18.5 - 24.9: Normal weight
* 25 - 29.9: Overweight

Source: https://www.cdc.gov/obesity/adult/defining.html#:~:text=If%20your%20BMI%20is%20less,falls%20within%20the%20obesity%20range.


* For infants, children and teenagers: 

* <= 2 %: Underweight 
* ">"  2% and <= 91%: Normal weight 
* ">" 92 %: Overweight

Source: https://www.nhs.uk/live-well/healthy-weight/bmi-calculator/

#### Now, it is time to categorize the variable according to the segmentation of both age and bmi:

In [None]:
# Classify every row as "Underweight", "Normal weight" or "Overweight".
#   * Infants, children and teenagers are compared against the 2nd and 91st
#     bmi percentiles of their own age segment.
#   * Everybody else is compared against the fixed cut-offs 18.5 and 24.9.
# The whole column is computed at once: this avoids chained assignment
# (`X[col].iloc[i] = ...`, which may write to a temporary copy) and computes
# each segment's percentiles once instead of once per row.
segment_percentiles = ("Infant", "Child", "Teenager")
uses_percentiles = X["Age_segment"].isin(list(segment_percentiles))

# Per-row percentile of the row's own age segment (index-aligned with X).
bmi_by_segment = X.groupby("Age_segment", observed=True)["bmi"]
segment_p02 = bmi_by_segment.transform(lambda s: s.quantile(0.02))
segment_p91 = bmi_by_segment.transform(lambda s: s.quantile(0.91))

# Pick the threshold that applies to each row.
lower_cut = segment_p02.where(uses_percentiles, 18.5)
upper_cut = segment_p91.where(uses_percentiles, 24.9)

# Conditions are evaluated in order, so the second one only applies to rows
# that are above the lower cut-off.
X["Bmi_segment"] = np.select(
    [X["bmi"] <= lower_cut, X["bmi"] <= upper_cut],
    ["Underweight", "Normal weight"],
    default="Overweight",
)

#### In the data cleaning stage, I have checked for duplicated rows, standardized the binary values of different categorical variables and filled the nan values of the bmi column.
#### Now, the dataset is ready to be graphically explored.

### 3. Graphical Analysis:

#### I first separate numerical variables and check the correlation among them looking for uncorrelated or highly (positive or negative) correlations:

In [None]:
# Numeric explanatory variables plus the target, side by side.
numeric_features = X.select_dtypes(exclude=["object", "category"])
data_num = pd.concat([numeric_features, y], axis=1)

In [None]:
# Heatmap of the pairwise Pearson correlations, with the value written in each cell.
sbn.heatmap(data_num.corr(method = "pearson"), annot = True)

#### From the previous heatmap we can see that there are neither uncorrelated nor highly correlated variables.
#### I now plot their distributions:

In [None]:
# One histogram per numeric variable, drawn side by side in a single row.
fig, axes = plt.subplots(1, 3, figsize=(15, 5), squeeze=False)
for ax, column in zip(axes[0], ('age', 'avg_glucose_level', 'bmi')):
    sbn.histplot(x=column, data=data_num, ax=ax)

#### What we can see from the previous plots is that:
* bmi is highly concentrated in the range 20-40, with several extreme values.
* age has a large number of people around 80.
* avg_glucose_level has two modes: approximately between 70 and 100, and between 190 and 220.

#### In order to analyze the impact of these variables on stroke, I will use the segmented version: Age_segment, Range_Glucose and Bmi_segment.

In [None]:
# Categorical explanatory variables plus the target, side by side.
categorical_features = X.select_dtypes(include=["object", "category"])
data_obj = pd.concat([categorical_features, y], axis=1)

In [None]:
# 3x3 grid of bar plots: stroke against each categorical variable.
# The grid is filled row by row, in the order the columns are listed here.
fig, axes = plt.subplots(3, 3, figsize=(20, 10), squeeze=False)
grid_columns = (
    'gender', 'hypertension', 'heart_disease',
    'ever_married', 'work_type', 'Residence_type',
    'smoking_status', 'Bmi_segment', 'Range_glucose',
)
for ax, column in zip(axes.flat, grid_columns):
    sbn.barplot(x=column, y='stroke', data=data_obj, orient='v', ax=ax)

In [None]:
# Bar plot of stroke against Age_segment, drawn on its own figure.
sbn.barplot(x = 'Age_segment', y = 'stroke',data = data_obj, orient = 'v')

#### As we can see in the previous charts, there is a higher concentration of strokes on those who: 
* suffer from hypertension and heart_disease.
* are or have been married. 
* are self-employed workers.
* suffer from overweight and / or diabetes.
* are former smokers.
* are senior adults.

#### I also noticed that the variable gender has a value "Other". I first check that there is only one case and erase it.

In [None]:
# Index labels of the rows whose gender is "Other" (shown as the cell output).
index = X.index[X["gender"] == "Other"]
index

In [None]:
# Remove the rows whose gender is "Other" from both X and y.
# The same index *labels* are dropped from both objects so that features and
# target stay aligned row by row.  Passing the looked-up values of y to
# y.drop() would treat the stroke value (0 or 1) as a row label and delete
# the wrong row.
other_rows = X[X["gender"] == "Other"].index
X = X.drop(other_rows)
y = y.drop(other_rows)

### Summary of EDA: 
* 2 extra features have been created in order to fill nan values of Bmi variable.
* There are extreme values on variable bmi.
* There are neither highly correlated nor uncorrelated numerical variables.
* Variables that appear to have a higher impact on stroke are: hypertension_yes, heart_disease_yes, ever_married_yes, work_type_self_employed, smoking_status_formerly_smoked, Bmi_segment_overweight and Range_glucose_diabetes.

## 3. Baseline Model

### Import of libraries

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer

### Model creation

#### Firstly, I drop the non-original features from the dataset: 

In [None]:
# Baseline feature set: only the original columns, without the engineered segments.
engineered_cols = ["Age_segment", "Range_glucose", "Bmi_segment"]
X_init = X.drop(columns=engineered_cols)

#### Then, I create the pipeline with a transformation for each type of feature: numerical (standardization) and categorical (one-hot encoding into dummy variables). Then, for each fold, the model is fitted and scored with the F1-score, since the data is heavily unbalanced.

In [None]:
# Baseline: logistic regression on the original features only (X_init).
# Categorical columns are one-hot encoded (first level dropped) and numeric
# columns are standardized inside the pipeline, so both transformations are
# re-fitted on the training part of every fold.
cat_idx = X_init.select_dtypes(include=['object', 'category']).columns
num_idx = X_init.select_dtypes(exclude=['object', 'category']).columns
trans_step = [
    ('dummy', OneHotEncoder(drop='first'), cat_idx),
    ('stand', StandardScaler(), num_idx),
]
col_trans = ColumnTransformer(transformers=trans_step)
pipeline = Pipeline(steps=[('trans', col_trans), ('model', LogisticRegression())])
kfold = StratifiedKFold(n_splits=10)
# Score on X_init, the frame the column lists were built from, instead of
# relying on the transformer to silently drop the engineered columns of X.
cv_score = cross_val_score(estimator=pipeline, X=X_init, y=y, cv=kfold, scoring='f1')
print(cv_score)

#### We see a quite poor result. I execute the same pipeline again, but with the newly added features: Age_segment, Range_glucose and Bmi_segment.

In [None]:
# Same logistic-regression pipeline, now on every column of X
# (original features plus Age_segment, Range_glucose and Bmi_segment).
categorical_types = ['object', 'category']
cat_idx = X.select_dtypes(include=categorical_types).columns
num_idx = X.select_dtypes(exclude=categorical_types).columns
trans_step = [
    ('dummy', OneHotEncoder(drop='first'), cat_idx),
    ('stand', StandardScaler(), num_idx),
]
col_trans = ColumnTransformer(transformers=trans_step)
pipeline = Pipeline(steps=[('trans', col_trans), ('model', LogisticRegression())])
kfold = StratifiedKFold(n_splits=10)
# One F1 score per stratified fold.
cv_score = cross_val_score(pipeline, X, y, cv=kfold, scoring='f1')
print(cv_score)

#### We get multiple zeroes across the folds. But we can see how the maximum value for the model that uses the new variables almost doubles the performance. This suggests that the response to stroke is more related to the segmentations than to the exact numerical values.

## 4. Models comparison:

#### Once the baseline has been established, I repeat the same pipeline with different classification algorithms. First, I import them. Second, I create a dictionary with the different options and loop over it, collecting the F1-score of each option:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Candidate classifiers, keyed by display name; all use default hyper-parameters.
dict_models = {
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Decission Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(),
}

In [None]:
# Cross-validate every candidate with the same preprocessing and keep, for
# each one, the best F1 score obtained on any single fold.
# NOTE(review): the maximum over folds is an optimistic summary; the mean of
# the fold scores would be a more conservative basis for comparing models.
max_scores_models = {}
for model_name, model in dict_models.items():
    cat_idx = X.select_dtypes(include=['object', 'category']).columns
    num_idx = X.select_dtypes(exclude=['object', 'category']).columns
    trans_step = [
        ('dummy', OneHotEncoder(drop='first'), cat_idx),
        ('stand', StandardScaler(), num_idx),
    ]
    col_trans = ColumnTransformer(transformers=trans_step)
    pipeline = Pipeline(steps=[('trans', col_trans), ('model', model)])
    kfold = StratifiedKFold(n_splits=10)
    cv_score = cross_val_score(pipeline, X, y, cv=kfold, scoring='f1')
    max_scores_models[model_name] = round(cv_score.max(), 3)
print(max_scores_models)

#### We see how the Decision Tree classifier obtains a higher F1-score than the rest of the classification algorithms, by a large margin.
#### Now, I will train a neural network and see how it performs.

### 5. Neural network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K

#### This dataset is quite unbalanced, meaning that it possesses more information about people who did not suffer a stroke than about people who actually did. For that reason, the F1-score was used for the baseline model and for the comparison of the different algorithms.
#### Now, with the neural network, I need to do the same in order to be able to compare it against the previous models. For that reason, I first need to declare the F1-score function.

In [None]:
def get_f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true`` as a Keras tensor.

    Both inputs are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1.  ``K.epsilon()`` is
    added to every denominator to avoid division by zero.

    NOTE(review): presumably ``y_true`` holds 0/1 labels and ``y_pred`` sigmoid
    outputs for whatever batch Keras passes in -- confirm against the caller.
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = hits / (flagged_pos + eps)
    recall = hits / (actual_pos + eps)
    return 2*(precision*recall)/(precision+recall+eps)

Credits: https://aakashgoel12.medium.com/how-to-add-user-defined-function-get-f1-score-in-keras-metrics-3013f979ce0d


#### We first prepare the dataset prior to using it for training the neural network.

In [None]:
# Build the design matrix for the neural network: standardized numeric
# columns next to dummy-encoded categorical columns.
categorical_types = ["object", "category"]

# Numeric part: standardize and wrap back into a DataFrame with the same column names.
numeric_part = X.select_dtypes(exclude=categorical_types)
ss = StandardScaler()
X_num = pd.DataFrame(data=ss.fit_transform(numeric_part), columns=numeric_part.columns)

# Categorical part: one dummy per level, then drop the listed reference levels.
reference_levels = [
    "gender_Female", "hypertension_No", "heart_disease_No", "ever_married_No",
    "work_type_Govt_job", "Age_segment_Child", "Range_glucose_Normal",
    "Bmi_segment_Underweight",
]
X_cat = pd.get_dummies(X.select_dtypes(include=categorical_types))
X_cat = X_cat.drop(reference_levels, axis=1)

# Give both parts the same 0..n-1 index so that concat pairs rows by position.
X_num = X_num.reset_index(drop=True)
X_cat = X_cat.reset_index(drop=True)
X_trans = pd.concat([X_num, X_cat], axis=1)

In [None]:
# Network 1: one hidden ReLU layer as wide as the input, then a sigmoid output unit.
n_features = X_trans.shape[1]
neural_network = Sequential([
    Dense(units=n_features, input_dim=n_features, activation="relu"),
    Dense(units=1, activation="sigmoid"),
])
neural_network.compile(loss="binary_crossentropy", optimizer="adam", metrics=[get_f1])
# NOTE(review): fit() receives no validation data, so the F1 it reports is
# measured on the training rows and is not a cross-validated score.
neural_network.fit(x=X_trans, y=y, epochs=200, batch_size=15)

#### With a simple neural network with a single hidden layer of the same size as the input layer, we get an F1-score in the range 0.02-0.09.
#### I will create a second model with three additional hidden layers and check whether it produces an improvement in performance:

In [None]:
# Network 2: four hidden ReLU layers, each as wide as the input, then a sigmoid output unit.
n_features = X_trans.shape[1]
layers = [Dense(units=n_features, input_dim=n_features, activation="relu")]
layers += [Dense(units=n_features, activation="relu") for _ in range(3)]
layers.append(Dense(units=1, activation="sigmoid"))
neural_network = Sequential(layers)
neural_network.compile(loss="binary_crossentropy", optimizer="adam", metrics=[get_f1])
# NOTE(review): fit() receives no validation data, so the F1 it reports is
# measured on the training rows and is not a cross-validated score.
neural_network.fit(x=X_trans, y=y, epochs=200, batch_size=15)

#### As we can see, inserting three extra layers improves the model significantly. Now, the neural network easily reaches the range 0.40-0.50.
#### After trying more layers and larger layer sizes, the performance remains constant, meaning that we may have reached the limit for this exercise.

#### We can state now that the neural network, among the tested ones, is the best model to predict the stroke variable.