# If you like the notebook please upvote it

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Install a blanket "ignore" filter so no warning of any category is shown
# for the rest of the session.
warnings.simplefilter('ignore')

# **LOADING THE DATASET**

In [None]:
# Location of the stroke dataset inside the Kaggle input directory.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
data

**dropping the id column**

In [None]:
# Remove the row identifier, which carries no predictive signal.
# errors='ignore' keeps the cell idempotent: re-running it after 'id' is
# already gone no longer raises KeyError.
data.drop(columns='id', errors='ignore', inplace=True)

# DATA EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

**lets look for some insights**

**first I want to see whether a person has a higher chance of getting a stroke if they have hypertension**

In [None]:
# One count panel per hypertension value. `factorplot` was renamed to
# `catplot` in seaborn 0.9 and is no longer available in current releases.
sns.catplot(x='stroke', col='hypertension', kind='count', data=data)

**a person with no hypertension has a slightly higher chance of having a stroke**

**Now I want to see whether a person has a higher chance of getting a stroke if they have heart disease**

In [None]:
# One count panel per heart_disease value. `factorplot` was renamed to
# `catplot` in seaborn 0.9 and is no longer available in current releases.
sns.catplot(x='stroke', col='heart_disease', kind='count', data=data)

**a person who does not have a heart disease has a slightly higher chance of getting a stroke**

**let's see if marriage affects the chance of a stroke**

In [None]:
# One count panel per ever_married value. `factorplot` was renamed to
# `catplot` in seaborn 0.9 and is no longer available in current releases.
sns.catplot(x='stroke', col='ever_married', kind='count', data=data)

**a person who is married has a higher chance of stroke**

**let's also see if gender affects the chance of a stroke**

In [None]:
# One count panel per gender value. `factorplot` was renamed to
# `catplot` in seaborn 0.9 and is no longer available in current releases.
sns.catplot(x='stroke', col='gender', kind='count', data=data)

**females have a slightly higher chance of getting a stroke compared to male**

**and finally let's see if smoking affects the chance of a stroke**

In [None]:
# One count panel per smoking_status value. `factorplot` was renamed to
# `catplot` in seaborn 0.9 and is no longer available in current releases.
sns.catplot(x='stroke', col='smoking_status', kind='count', data=data)

**a person who has never smoked has a higher chance of getting stroke**

**what is the highest glucose level recorded among the people who had stroke?**

In [None]:
# Glucose readings restricted to the rows labelled as stroke cases (stroke == 1).
avg = data.loc[data['stroke'] == 1, 'avg_glucose_level']
max(avg)

**lets plot the histogram of the data**

In [None]:
# Histogram of every numeric column. A larger canvas plus tight_layout keeps
# the subplot titles and tick labels from overlapping; the trailing semicolon
# hides the array-of-Axes repr that the bare call would echo.
data.hist(figsize=(12, 10))
plt.tight_layout();

# **DATA PROCESSING**

**first lets check for missing values**

In [None]:
# Number of missing entries in each column.
data.isna().sum()

**there are 201 missing values, which is just 3% of the data, so we can drop them**

In [None]:
# Discard, in place, every row that has at least one missing value.
data.dropna(axis=0, how='any', inplace=True)

**lets check for any duplicate values**

In [None]:
# True if any row repeats an earlier row exactly.
data.duplicated(keep='first').any()

**there are no duplicate values**

**Feature Binning**

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

**creating all the binners**

In [None]:
# One ordinal discretizer per continuous feature: 5 bins for age,
# 4 for average glucose level and 5 for BMI.
age_binner, glucode_lvl_binner, bmi_binner = (
    KBinsDiscretizer(n_bins=n_bins, encode='ordinal') for n_bins in (5, 4, 5)
)

**binning the features**

In [None]:
# (new column, source column, discretizer) for each binned feature.
for target_col, source_col, binner in (
    ('age_bins', 'age', age_binner),
    ('avg_glucose_level_bins', 'avg_glucose_level', glucode_lvl_binner),
    ('weight', 'bmi', bmi_binner),
):
    # The discretizer expects a 2-D (n_samples, 1) array.
    column = data[source_col].values.reshape(-1, 1)
    data[target_col] = binner.fit_transform(column).astype('int64')

Encoding categorical data

In [None]:
# Names of the columns stored with object dtype (the string-valued categoricals).
cat_cols = data.select_dtypes(include=object).columns

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Replace each categorical column with integer codes. The single encoder is
# refit for every column, so afterwards it only remembers the last column's classes.
for col in cat_cols:
    column_values = data[col]
    data[col] = le.fit(column_values).transform(column_values)

**scaling continuous data**

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

**robust scaler to reduce the influence of outliers and standard scaler to standardize the data**

In [None]:
# Continuous features that are rescaled in the next cell.
cols_to_scale = [
    'age',
    'avg_glucose_level',
    'bmi',
]

In [None]:
robust = RobustScaler()
standard = StandardScaler()

# Apply the two scalers back to back, overwriting the columns after each pass.
# NOTE(review): both are per-column affine transforms, so the final values
# presumably match what StandardScaler alone would give - confirm whether the
# robust pass is actually needed.
for scaler in (robust, standard):
    data[cols_to_scale] = scaler.fit_transform(data[cols_to_scale])

**Feature Correlation**

In [None]:
# Pairwise correlation of all columns, drawn as an annotated square heatmap.
corr = data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, cmap='coolwarm', annot=True, square=True, fmt='.2f', ax=ax)
plt.show()

**stroke, the target column, has no highly correlated features; if your dataset does contain highly correlated features, you should remove them or reduce their dimensions using PCA**

**Feature Imbalance**

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns='stroke')
# Target vector: the stroke label.
y = data['stroke']

In [None]:
# Class counts of the target before resampling. The vector must be passed as
# `x=`: recent seaborn versions treat the first positional argument as `data`.
sns.countplot(x=y)

**there is a lot of imbalance in the data lets fix this**

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# A fixed seed makes the synthetic minority samples reproducible across runs.
# NOTE(review): resampling happens before the train/test split further down,
# so synthetic rows interpolated from rows that later land in the test set can
# end up in training. Ideally SMOTE would be fit on the training fold only.
smote = SMOTE(random_state=42)

X, y = smote.fit_resample(X, y)

In [None]:
# Class counts of the target after resampling. The vector must be passed as
# `x=`: recent seaborn versions treat the first positional argument as `data`.
sns.countplot(x=y)

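**now our data is balanced** (note: because the oversampling was done before the train/test split, the test set also contains synthetic samples, so the scores reported below are likely optimistic)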

# **SPLITTING DATA INTO TRAINING AND TESTING SETS**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% for the final test. A fixed seed makes the split (and every
# score computed from it) reproducible; stratify keeps the class ratio
# identical in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# SPLITTING TRAINING DATA INTO TRAINING AND VALIDATION SETS FOR MODEL SELECTION

In [None]:
# Carve a validation set (20% of the training data) for model selection.
# A fixed seed makes the comparison between models reproducible; stratify
# keeps the class ratio identical in both parts.
x_train_, x_val, y_train_, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# **MODEL SELECTION**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [None]:
from sklearn.metrics import f1_score, classification_report, mean_squared_error

In [None]:
def model_selection(x_train_, x_val, y_train_, y_val, model, **model_kwargs):
  """Fit one candidate classifier and print a validation summary.

  Parameters
  ----------
  x_train_, y_train_ : features / labels the model is fitted on.
  x_val, y_val : held-out features / labels used for the reported metrics.
  model : estimator *class* (not an instance); it is instantiated here.
  **model_kwargs : optional constructor arguments forwarded to ``model``,
      e.g. ``random_state=0`` for a reproducible comparison.

  Returns
  -------
  The fitted estimator, so callers that assign the result get a usable
  model back instead of ``None``.
  """
  model = model(**model_kwargs)
  model.fit(x_train_, y_train_)

  pred = model.predict(x_val)

  # Root mean squared error between true and predicted labels.
  error = np.sqrt(mean_squared_error(y_val, pred))
  f1 = f1_score(y_val, pred)
  report = classification_report(y_val, pred)
  # `score` is whatever the estimator's own default scoring method returns.
  train_score = model.score(x_train_, y_train_)
  val_score = model.score(x_val, y_val)

  print('Error:', error*100)
  print('\n')
  # This value is the F1 score, so it is labelled as such (it used to be
  # printed under the misleading label 'ACC').
  print('F1 Score:', f1*100)
  print('\n')
  print('Classification report:', report)
  print('\n')
  print('Train Score:', train_score*100)
  print('\n')
  print('Val Score:', val_score*100)
  print('\n')
  # Flagged as overfitting whenever the training score exceeds the validation score.
  print('Is overfitting:', train_score > val_score)
  print('\n')
  print('Overfitting by:',train_score*100-val_score*100)

  return model

In [None]:
# Evaluate an ExtraTreesClassifier on the validation split.
extratrees = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=ExtraTreesClassifier,
)
extratrees

In [None]:
# Evaluate a GradientBoostingClassifier on the validation split.
gradient = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=GradientBoostingClassifier,
)
gradient

In [None]:
# Evaluate a RandomForestClassifier on the validation split.
randomforest = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=RandomForestClassifier,
)
randomforest

In [None]:
# Evaluate an AdaBoostClassifier on the validation split.
ada = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=AdaBoostClassifier,
)
ada

In [None]:
# Evaluate an XGBClassifier on the validation split.
xgb = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=XGBClassifier,
)
xgb

In [None]:
# Evaluate an LGBMClassifier on the validation split.
lgbm = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=LGBMClassifier,
)
lgbm

In [None]:
# Evaluate a CatBoostClassifier on the validation split.
catboost = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=CatBoostClassifier,
)
catboost

In [None]:
# Evaluate a DecisionTreeClassifier on the validation split.
tree = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=DecisionTreeClassifier,
)
tree

In [None]:
# Evaluate a LogisticRegression model on the validation split.
logistic = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=LogisticRegression,
)
logistic

In [None]:
# Evaluate an SGDClassifier on the validation split.
sgd = model_selection(
    x_train_=x_train_,
    x_val=x_val,
    y_train_=y_train_,
    y_val=y_val,
    model=SGDClassifier,
)
sgd

**I will use LGBMClassifier because it gives a good accuracy and the  error and overfitting rate is low**

# **HYPER PARAMETER TUNING**

**if you already get an accuracy of 95% or more, you don't need to do hyperparameter tuning, because it may decrease your accuracy**

**even though our accuracy is excellent i am including this part just to show how hyper parameter tuning works**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [None]:
# Untuned LGBMClassifier with default settings; used as the base estimator
# handed to the randomized hyper-parameter search.
model = LGBMClassifier()

In [None]:
# Search space for the randomized search: scipy distributions are sampled on
# every iteration, plain lists are drawn from as discrete choices.
params = {'num_leaves': sp_randint(6, 50), 
            'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'scale_pos_weight':[1,2,6,12],
             # LightGBM's tree-depth limit is called `max_depth`; the former
             # key `depth` is not a LightGBM parameter, so it was never tuned.
             'max_depth': sp_randint(3,10),
             'learning_rate': sp_uniform()}

In [None]:
# 50 random draws from `params`, each scored by 5-fold cross-validated F1 on
# all available cores. A fixed random_state makes the sampled candidates (and
# therefore the reported best parameters) reproducible between runs.
search = RandomizedSearchCV(
    model, params, n_iter=50, scoring='f1', n_jobs=-1, cv=5, random_state=42
)
search.fit(x_train, y_train)

In [None]:
# Summarise the search outcome; each entry is followed by the same two blank
# lines that a separate print('\n') would produce.
print('Best Score:', search.best_score_, end='\n\n\n')
print('Best Params:', search.best_params_, end='\n\n\n')
print('Best Estimator:', search.best_estimator_)

**We now have our optimal parameters, so let's build the model using them: just copy the best estimator and paste it into another cell**

# **MODEL BUILDING AND TRAINING**

In [None]:
# Final model built from hyper-parameters copied out of the randomized search.
# The depth limit is passed as `max_depth`, LightGBM's name for it; the former
# keyword `depth` is not a LightGBM parameter and had no effect on the trees.
model = LGBMClassifier(colsample_bytree=0.9040713748241764, max_depth=8,
               learning_rate=0.887075684185765, min_child_samples=208,
               min_child_weight=1e-05, num_leaves=35, reg_alpha=0.1,
               reg_lambda=1, scale_pos_weight=2, subsample=0.507606910665195)

# Fit on the full training split (training + validation rows).
model.fit(x_train, y_train)

# **PREDICTIONS**

In [None]:
# Predicted class labels for the held-out test features.
pred = model.predict(X=x_test)
pred

# METRIC CHECK

**Root Mean Squared Error**

In [None]:
# Root of the mean squared error on the test set, expressed as a percentage.
mse = mean_squared_error(y_test, pred)
error = np.sqrt(mse) * 100
error

**F1 Score**

In [None]:
# F1 score on the test set, shown as a percentage.
acc = f1_score(y_true=y_test, y_pred=pred)
acc * 100

**Classification Report**

In [None]:
# The report is a multi-line string: print it so it renders as an aligned
# table instead of a single quoted repr full of literal "\n" escapes.
print(classification_report(y_test, pred))

**OVER FITTING CHECK**

In [None]:
# Gap, in percentage points, between the model's score on the training split
# and its score on the test split.
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
over_fitting_rate = train_accuracy*100 - test_accuracy*100
over_fitting_rate

**the model is overfitting by 4% which is honestly not a lot so we can leave it there**