age
Age of the person

sex
Gender of the person

cp
Chest pain type

trtbps
resting blood pressure (in mm Hg)

chol
Cholesterol in mg/dl, fetched via BMI sensor

fbs
(fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

restecg
resting electrocardiographic results

thalachh
maximum heart rate achieved

exng
exercise induced angina (1 = yes; 0 = no)

oldpeak
Previous peak

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import time
import warnings 
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
 
# models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
# preprocessing
## continuous variables
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
## categorical variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
 
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Load the heart-attack dataset from the notebook's input directory and preview it.
heart_csv = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
heart = pd.read_csv(heart_csv)
heart

In [None]:
# Column dtypes of the loaded frame; the numeric-column selection further down
# filters on these (int64 / float64).
heart.dtypes

In [None]:
# Separate the predictors from the 'output' target, then hold out 30% of the
# rows for testing with a fixed seed so the split is repeatable.
features = heart.drop(columns='output')
target = heart['output']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Names of the training columns stored as 64-bit integers or floats.
numeric_dtypes = ['int64', 'float64']
cols_numerical = X_train.select_dtypes(include=numeric_dtypes).columns

# Single-step pipeline for those columns; the 'num_trans' step starts out as a
# StandardScaler and is addressed by that name when it is swapped later.
transformer_numerical = Pipeline(steps=[('num_trans', StandardScaler())])

In [None]:
# Preprocessor: route the numerical columns through the numerical transformer.
numerical_branch = ('numerical', transformer_numerical, cols_numerical)
preprocessor = ColumnTransformer(transformers=[numerical_branch])

In [None]:
# Seed for the scikit-learn estimators below that draw random numbers, so that
# Restart & Run All reproduces the same scores. Same value as the split seed.
RANDOM_STATE = 42

# Candidate models; each one is fitted once per scaler in the comparison loop.
classifiers = [
    # Baseline that samples predictions according to the training class ratio.
    DummyClassifier(strategy='stratified', random_state=RANDOM_STATE),
    LogisticRegression(max_iter=500),  # hyperparameters can be passed here
    # NOTE(review): the original comment justified n_neighbors=2 with "because
    # we have two classes". n_neighbors is the number of neighbours consulted,
    # not the number of classes, and an even value can tie in a binary vote.
    # The value is kept so results stay comparable; consider an odd k.
    KNeighborsClassifier(2),
    ExtraTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    SVC(),
    XGBClassifier(),
    CatBoostClassifier(silent=True),
    LGBMClassifier(verbose=-1)
]

# Transformers tried in turn for the numerical columns.
scalers = [StandardScaler(), MinMaxScaler(), Normalizer()]

In [None]:
# Fit every (classifier, scaler) combination on the training split and record
# its score on the test split together with the time spent fitting.

# One pipeline is reused for all runs; its scaler and classifier are swapped
# via set_params before each fit.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', None)  # placeholder, replaced inside the loop
])

# Collect one record per run and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole frame on every iteration.
results = []

for model in classifiers:
    for num_tr in scalers:
        pipe_params = {
            'preprocessor__numerical__num_trans': num_tr,
            'classifier': model
        }
        pipe.set_params(**pipe_params)

        # perf_counter is a monotonic clock meant for measuring intervals;
        # only the fit call is timed.
        start_time = time.perf_counter()
        pipe.fit(X_train, y_train)
        end_time = time.perf_counter()

        # score on the held-out split
        score = pipe.score(X_test, y_test)

        # one row of the results table
        param_dict = {
            'model': model.__class__.__name__,
            'num_trans': num_tr.__class__.__name__,
            'score': score,
            'time_elapsed': end_time - start_time
        }
        results.append(param_dict)

# Explicit columns keep the schema stable even if no run was recorded.
models_df = pd.DataFrame(
    results,
    columns=['model', 'num_trans', 'score', 'time_elapsed'],
)

In [None]:
# Ten best runs, highest score first.
ranked_runs = models_df.sort_values(by='score', ascending=False)
ranked_runs.head(10)

In [None]:
# Spread of scores for each model across the scalers that were tried.
sns.boxplot(x='score', y='model', data=models_df)

In [None]:
# Per-model summary of score and fit time, ordered by mean score (best first).
summary_stats = ['mean', 'std', 'min', 'max']
(
    models_df[['model', 'score', 'time_elapsed']]
    .groupby('model')
    .agg({'score': summary_stats, 'time_elapsed': summary_stats})
    .reset_index()
    .sort_values(('score', 'mean'), ascending=False)
)

In [None]:
# Spread of scores for each numerical transformer across all models.
sns.boxplot(x='score', y='num_trans', data=models_df)