# Import Packages

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import missingno as mno
import seaborn as sns
import pandas as pd

In [None]:
sns.set_theme(style="darkgrid")

# Dataset Analysis

In [None]:
df = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
 df.isnull().sum()

In [None]:
# Pick numeric columns by dtype family instead of "anything that is not object",
# so string columns stored with a dedicated string dtype are not misclassified.
numerical_cols = df.select_dtypes(include="number").columns

In [None]:
numerical_cols

In [None]:
# Everything that is not numeric is treated as categorical; this also covers
# string columns that pandas stores with a dedicated string dtype rather than object.
categorical_cols = df.select_dtypes(exclude="number").columns

In [None]:
categorical_cols

## Univariate Analysis

We will be performing univariate analysis for numerical and categorical columns separately.

### Numerical columns

In [None]:
def draw_histplot_for_numerical_cols():
    """Draw one histogram per numerical column, laid out in a single row."""
    frame = df.copy()
    _, axes = plt.subplots(1, len(numerical_cols), figsize=(36, 5))
    # Walk subplots and columns in lockstep instead of keeping a manual counter.
    for axis, column in zip(axes, numerical_cols):
        sns.histplot(data=frame, x=column, ax=axis)
    plt.show()

In [None]:
draw_histplot_for_numerical_cols()

The following observations can be made:
- The most common ages are around 40 and around 80.
- The vast majority of people do not have hypertension.
- The vast majority of people do not have heart disease.
- Most of the population has an average glucose level below 180.
- Almost the entire population has a BMI below 60.
- Most people did not have a stroke.

**We have seen that BMI has missing values. Because its distribution is approximately normal (slightly right-skewed), we can use the mean to fill them.**

In [None]:
def plot_grouped_data(feature):
    """Show a bar chart of how many records fall into each value of ``feature``.

    Args:
        feature: Name of the column in the global ``df`` to group by.
    """
    # Count non-null ids per group; reset_index(name=...) yields a flat frame
    # with the columns [feature, 'count'] that seaborn can plot directly.
    counts = df.groupby(feature)['id'].count().reset_index(name='count')
    sns.barplot(x=feature, y='count', data=counts)
    plt.show()
    

In [None]:
# Share (in percent) of all records that fall into each hypertension class.
df.groupby('hypertension')['hypertension'].count() * 100 / len(df)

In [None]:
plot_grouped_data('hypertension')

**About 90% of the population does not have hypertension.**

In [None]:
# Share (in percent) of all records that fall into each heart-disease class.
df.groupby('heart_disease')['heart_disease'].count() * 100 / len(df)

In [None]:
plot_grouped_data('heart_disease')

**About 94% of the population does not have heart disease.**

In [None]:
# Share (in percent) of all records that fall into each stroke class.
df.groupby('stroke')['stroke'].count() * 100 / len(df)

**About 95% of the population did not have a stroke.**

In [None]:
plot_grouped_data('stroke')

**We can also see that the data is highly imbalanced.**

### For Categorical Column

In [None]:
def draw_count_plot_for_categorical_feature():
    """Draw one count plot per categorical column, laid out in a single row."""
    frame = df.copy()
    _, axes = plt.subplots(1, len(categorical_cols), figsize=(36, 5))
    # enumerate supplies the subplot position that was previously tracked by hand.
    for position, column in enumerate(categorical_cols):
        sns.countplot(data=frame, x=column, ax=axes[position])
    plt.show()

In [None]:
draw_count_plot_for_categorical_feature()

The following observations can be made:
- There are more females than males.
- There are more married people than unmarried people.
- Most people work in the private sector.
- There is a roughly equal number of people from rural and urban areas.
- Most people have never smoked.

## BiVariate Analysis

In [None]:
sns.pairplot(df.iloc[: , 1:-1])

### Find out which age group has more heart disease

In [None]:
sns.histplot(x ='age', data=df, bins = 10, hue = 'heart_disease', multiple="stack")

- The population below 40 has almost no heart disease.
- The population between 70 and 80 has the most heart disease.

### Find out which age group has more hypertension

In [None]:
sns.histplot(x ='age', data=df, bins = 10, hue = 'hypertension', multiple="stack")

- The population below 20 does not have hypertension.
- The population between 20 and 30 has only a few people with hypertension.
- The population older than 50 has more people with hypertension.

### Find out which age group has high average glucose level

In [None]:
sns.lineplot(x ='age', data=df, y = 'avg_glucose_level')

- Average glucose level is increasing with age.

### Find out which age group has higher BMI

In [None]:
sns.lineplot(x ='age', data=df, y = 'bmi')

- BMI initially increase with age, then it starts to decrease after 50.

### Relation between age and categorical columns

In [None]:
def relate_age_and_categorical_cols():
    """Plot stacked age histograms, one per categorical column, coloured by that column."""
    frame = df.copy()
    _, axes = plt.subplots(1, len(categorical_cols), figsize=(36, 5))
    # Each categorical column gets its own subplot and acts as the hue.
    for axis, column in zip(axes, categorical_cols):
        sns.histplot(
            data=frame,
            x='age',
            hue=column,
            bins=10,
            multiple="stack",
            ax=axis,
        )
    plt.show()

In [None]:
relate_age_and_categorical_cols()

The following observations can be made:
- There are more females in each age group.
- Married people appear only above age 20, and most married people are older than 40.
- The population above 20 mostly works in the private sector.
- Each age group has a roughly equal number of people from rural and urban areas.
- The population above 20 has a fair number of people who smoke.
- Among people above 20, more have never smoked than have smoked.
- More people have quit smoking after 40.

In [None]:
def is_a_cause_of_stroke(feature):
    """Tabulate stroke outcomes within each value of ``feature``.

    Args:
        feature: Name of a column in the global ``df`` to break the stroke
            counts down by.

    Returns:
        DataFrame with one row per (feature value, stroke value) pair and the
        columns ``[feature, 'stroke', 'count', 'percentage_count']``, where
        ``percentage_count`` is the row's share (0-100) of its feature group.
    """
    grouped_df = (
        df.groupby([feature, 'stroke'])['id']
        .count()
        .reset_index(name='count')
    )
    # transform() returns totals aligned to grouped_df's own index, so the
    # column assignment is safe on every pandas version. groupby.apply()
    # prepends the group key to the index on pandas >= 2.0, which makes the
    # column insert fail with an incompatible-index error.
    group_totals = grouped_df.groupby(feature)['count'].transform('sum')
    grouped_df['percentage_count'] = 100 * grouped_df['count'] / group_totals
    return grouped_df

### Is hypertension a cause of stroke?

In [None]:
is_a_cause_of_stroke('hypertension')

Below observation can be made:
- Population with no hypertension have less chance of getting stroke than population with hypertension.

### Is heart disease a cause of stroke?

In [None]:
is_a_cause_of_stroke('heart_disease')

Below observation can be made:
- Population with no heart disease have less chance of getting stroke than population with heart disease.

### Does Smoking cause stroke?

In [None]:
is_a_cause_of_stroke('smoking_status')

The following observations can be made:
- Among people who used to smoke, having a stroke is less likely than not having one.
- Among people who currently smoke, having a stroke is less likely than not having one.
- Among people who never smoked, having a stroke is less likely than not having one.
- People who used to smoke have a higher chance of stroke than both current smokers and non-smokers (never smoked).

### Which gender has high chance of stroke?

In [None]:
is_a_cause_of_stroke('gender')

The following observation can be made:
- Men have a higher chance of stroke than women.

In [None]:
is_a_cause_of_stroke('work_type')

The following observations can be made:
- Self-employed people have the highest chance of stroke.
- People who never worked did not have a stroke.
- The private-sector population has the second-highest chance of stroke.

### Effect of residence on stroke.

In [None]:
is_a_cause_of_stroke('Residence_type')

Below observation can be made:
- Population living in Urban have more chance of stroke.

### Effect of marriage on stroke.

In [None]:
is_a_cause_of_stroke('ever_married')

Below observation can be made:
- Married people have high chance of stroke.

### Pearson Correlation

In [None]:
# Skip the leading id column, then keep only numeric columns: Pearson
# correlation is undefined for the string-valued categorical columns, and
# pandas >= 2.0 raises on them instead of silently dropping them.
corr = df.iloc[:, 1:].select_dtypes(include="number").corr()
corr

The following observations can be made:
- Age has the strongest correlation with stroke.
- BMI has the weakest correlation with stroke.

# Feature Engineering

## Imputing missing numerical values

In [None]:
df.isnull().sum()

Only bmi has missing values.

In [None]:
sns.histplot(x='bmi', data=df)

It follows an approximately normal (Gaussian) distribution and has no outliers. So, **the mean can be used to impute the missing values.**

In [None]:
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
df.isnull().sum()

## Encoding categorical columns

In [None]:
categorical_cols

In [None]:
def encode_categorical_cols_using_one_hot_encoding():
    """Return a one-hot encoded copy of the global ``df``.

    Every column listed in ``categorical_cols`` is replaced by its dummy
    (indicator) columns; the global ``df`` itself is left untouched.
    """
    working_copy = df.copy()
    encoded = pd.get_dummies(working_copy, columns=categorical_cols)
    return encoded

In [None]:
encoded_df = encode_categorical_cols_using_one_hot_encoding()

In [None]:
encoded_df.columns

In [None]:
encoded_df.head()

We can see that age, average glucose level and BMI have much larger values than the other columns, so we need to scale them.

In [None]:
# Class balance of the target: record count and share (in percent) per stroke value.
grouped_df = df.groupby('stroke')['id'].agg(['count']).reset_index()
grouped_df['percentage'] = grouped_df['count'] * 100 / len(df)
grouped_df

We can see that our dataset is highly imbalanced, as about 95% of the target values are 0 and only about 5% are 1. We can use Random Undersampling, Random Oversampling or SMOTE to create a balanced dataset.

### Scaling Dataset

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_dataset(df):
    """Standardise every feature column of ``df`` with ``StandardScaler``.

    Args:
        df: Frame holding the feature columns plus the 'stroke' target column.
            It is not modified.

    Returns:
        A new DataFrame without the 'stroke' column in which every remaining
        column has been scaled. Column names and the row index of the input
        are preserved, so the result stays aligned with the original rows.

    Raises:
        KeyError: If ``df`` has no 'stroke' column.
    """
    # drop() without inplace already returns a new frame, so no copy is needed.
    features = df.drop(columns=['stroke'])
    scaled_values = StandardScaler().fit_transform(features)
    # fit_transform returns a bare array; re-attach both labels and index.
    return pd.DataFrame(
        scaled_values,
        columns=features.columns,
        index=features.index,
    )

In [None]:
# Scale the one-hot encoded dataset (scale_dataset drops the 'stroke' target before scaling)
scaled_X = scale_dataset(encoded_df)

# Feature Selection

In [None]:
corr = scaled_X.corr().round(3)

In [None]:
corr

In [None]:
plt.figure(figsize=(20,15))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True)

There are no highly correlated columns, so the features carry little redundant information, which reduces the risk of redundant features contributing to overfitting.

### Using Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
scaled_X.columns

In [None]:
# The record id carries no predictive signal. errors='ignore' keeps this cell
# safe to re-run after 'id' has already been removed.
scaled_X = scaled_X.drop(columns='id', errors='ignore')

In [None]:
dataset = scaled_X.copy()
Y = df['stroke']
X = dataset

In [None]:
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state = 0))
feature_sel_model.fit(X, Y)

In [None]:
feature_sel_model.get_support()

In [None]:
selected_features = X.columns[(feature_sel_model.get_support())]

In [None]:
selected_features

# Model Selection with Cross Validation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler

- *This problem is a classification problem with an imbalanced dataset. So we can use Recall (True Positive Rate), Precision, and F1 Score as performance metrics.*
- *We need to focus more on correctly predicting a stroke than on correctly predicting its absence. If a person is predicted to have a stroke but is not actually going to have one, they can still take further tests and medication; but if a person is going to have a stroke and our model predicts NO, the result can be disastrous. Hence we need to reduce the number of False Negatives. __Therefore Recall should be used as a metric.__*
- *Although predicting stroke is more important, there may be a scenario where a person is predicted to have a stroke, the prediction is a false positive, and the person starts taking medication that affects him/her adversely. Therefore we should avoid predicting a stroke if the person is not going to have one. __Hence Precision is also important.__*
- *Since both Recall and Precision are important, __we will also be using F1 Score.__*

#### Based on the above explanation, the following performance metrics will be used
- Recall
- Precision
- F1 Score

In [None]:
# Shared cross-validation splitter and oversampler used by every model below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
smt = RandomOverSampler(random_state=12)


def apply_stratified_k_fold_validation(model, x=X[selected_features], y=Y):
    """Evaluate ``model`` with stratified k-fold cross-validation.

    For every fold produced by the module-level ``skf`` the training part is
    oversampled with ``smt``, the model is fitted on the resampled data and
    then evaluated on the untouched test part. The classification report and
    the confusion-matrix cells are printed, and the predicted-class counts
    are drawn in that fold's own subplot.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        x: Feature frame, indexed positionally via ``.iloc``. The default is
            evaluated once, when this function is defined.
        y: Target series, positionally aligned with ``x``. The default is
            evaluated once, when this function is defined.
    """
    # One subplot per fold, following the splitter instead of a hard-coded 5.
    _, axes = plt.subplots(1, skf.get_n_splits(), figsize=(36, 5))
    for exp, (train_index, test_index) in enumerate(skf.split(x, y)):
        x_train_fold, x_test_fold = x.iloc[train_index], x.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        # Resample the training fold only; the test fold keeps its original
        # class balance.
        X_train_res, y_train_res = smt.fit_resample(x_train_fold, y_train_fold)
        model.fit(X_train_res, y_train_res)
        y_pred_fold = model.predict(x_test_fold)
        print("Experiment:", exp)
        print("Classification Report")
        print(classification_report(y_test_fold, y_pred_fold))
        print("Confusion Matrix")
        # scikit-learn puts true classes on rows and predictions on columns,
        # so for binary labels the flattened order is TN, FP, FN, TP.
        tn, fp, fn, tp = confusion_matrix(y_test_fold, y_pred_fold).ravel()
        print("True Postive:", tp)
        print("True Negative:", tn)
        print("False Postive:", fp)
        print("False Negative:", fn)
        print("\n")
        sns.countplot(x=y_pred_fold, ax=axes[exp])
        # Title this fold's axes; plt.title would only affect the current axes.
        axes[exp].set_title("Experiment " + str(exp))


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
apply_stratified_k_fold_validation(model)

## Decision Tree

In [None]:
from sklearn import tree

model = tree.DecisionTreeClassifier()
apply_stratified_k_fold_validation(model)

## XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
model = XGBClassifier(max_depth=12,random_state = 42, use_label_encoder =False)
apply_stratified_k_fold_validation(model)

## Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
apply_stratified_k_fold_validation(model)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0)
apply_stratified_k_fold_validation(model)

In each experiment the models predicted class __0__ with high accuracy but rarely predicted class __1__ correctly. However, out of all models, Logistic Regression performs best.