In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
 
import time
import warnings 
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [None]:
# Read the stroke dataset CSV into a DataFrame.
stroke = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

In [None]:
# Preview the first five rows of the loaded frame.
stroke.head(n=5)

In [None]:
# Count missing values per column.
stroke.isna().sum()

In [None]:
# Show the rows whose bmi value is missing.
stroke.loc[stroke["bmi"].isna()]

In [None]:
# Replace missing bmi values with the median of the observed bmi values.
median_bmi = stroke["bmi"].median()
stroke["bmi"] = stroke["bmi"].where(stroke["bmi"].notna(), median_bmi)

In [None]:
# Re-count missing values per column after the bmi imputation.
stroke.isna().sum()

In [None]:
# One-hot encode each categorical column; the prefix keeps the source column
# name visible in the generated dummy column names.
gender_dummies = pd.get_dummies(stroke.gender, prefix="gender")
ever_married_dummies = pd.get_dummies(stroke.ever_married, prefix="ever_married")
work_type_dummies = pd.get_dummies(stroke.work_type, prefix="work_type")
Residence_type_dummies = pd.get_dummies(stroke.Residence_type, prefix="Residence_type")
smoking_status_dummies = pd.get_dummies(stroke.smoking_status, prefix="smoking_status")

# Concatenate the dummy columns with the main dataset. gender_dummies has to be
# in this list: the raw 'gender' column is dropped below, so leaving its dummies
# out removes gender from the feature set entirely.
stroke_dummies = pd.concat(
    [
        stroke,
        gender_dummies,
        ever_married_dummies,
        work_type_dummies,
        Residence_type_dummies,
        smoking_status_dummies,
    ],
    axis=1,
)

# Drop the original categorical fields now that they are encoded
# (reassignment instead of inplace=True).
stroke_dummies = stroke_dummies.drop(
    ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
    axis=1,
)

stroke_dummies.head()


In [None]:
# Show the dtype of each column of `stroke` (the frame before one-hot encoding, not `stroke_dummies`).
stroke.dtypes

In [None]:
from sklearn.model_selection import train_test_split
 
# models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
# preprocessing
## continuous variables
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
## categorical variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
 
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Hold out 30% of the rows for testing; features are every column except
# 'stroke', which is the target. The seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    stroke_dummies.drop(columns=['stroke']),
    stroke_dummies.loc[:, 'stroke'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Names of the training columns stored as int64 or float64.
cols_numerical = X_train.select_dtypes(
    include=['int64', 'float64'],
).columns

# Numerical transformer: a one-step pipeline whose 'num_trans' step starts out
# as a StandardScaler.
transformer_numerical = Pipeline([('num_trans', StandardScaler())])

In [None]:
# Preprocessor: apply the numerical transformer to the int64/float64 columns and
# pass every other column through unchanged. ColumnTransformer drops unlisted
# columns by default (remainder='drop'), which would discard the one-hot dummy
# columns: pd.get_dummies yields bool/uint8 columns, so they are not part of
# cols_numerical.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', transformer_numerical, cols_numerical)
    ],
    remainder='passthrough',
)

In [None]:
# Candidate models for the benchmark. The estimators that draw random numbers
# during fit/predict are seeded (same seed as the train/test split) so that a
# fresh Restart & Run All reproduces the same scores.
classifiers = [
    DummyClassifier(strategy='stratified', random_state=42),
    LogisticRegression(max_iter=500),  # hyperparameters can be passed here
    # n_neighbors=2; the original rationale was "2 because we have two classes".
    # NOTE(review): k is the neighbour count, unrelated to the number of classes — confirm the choice.
    KNeighborsClassifier(2),
    ExtraTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(),
    XGBClassifier(),
    CatBoostClassifier(silent=True),
    LGBMClassifier(verbose=-1)
]

# Transformers tried for the numerical columns.
scalers = [StandardScaler(), MinMaxScaler(), Normalizer()]


In [None]:
# Benchmark every (classifier, scaler) combination on the same train/test split
# and collect one result row per combination in `models_df`.

# A single pipeline is reused; its scaler and classifier steps are swapped in
# on every iteration via set_params.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', None)
])

# Collect plain dict records and build the DataFrame once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame row by row copies it on every iteration.
results = []

for model in classifiers:
    for num_tr in scalers:
        pipe_params = {
            'preprocessor__numerical__num_trans': num_tr,
            'classifier': model
        }
        pipe.set_params(**pipe_params)

        # Time the fit only; perf_counter is a monotonic clock meant for
        # measuring elapsed intervals.
        start_time = time.perf_counter()
        pipe.fit(X_train, y_train)
        end_time = time.perf_counter()

        # Score on the held-out set using the pipeline's default scorer.
        score = pipe.score(X_test, y_test)

        # One record per run.
        param_dict = {
            'model': model.__class__.__name__,
            'num_trans': num_tr.__class__.__name__,
            'score': score,
            'time_elapsed': end_time - start_time
        }
        results.append(param_dict)

# Explicit columns keep the expected schema even if no combination was run.
models_df = pd.DataFrame(
    results,
    columns=['model', 'num_trans', 'score', 'time_elapsed'],
)

In [None]:
# Ten best-scoring (model, scaler) combinations, highest score first.
models_df.sort_values(by='score', ascending=False).iloc[:10]

In [None]:
# Distribution of scores per model across the scalers tried.
sns.boxplot(
    x='score',
    y='model',
    data=models_df,
)