In [None]:
from itertools import product
from random import random

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score


# modele
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA, TruncatedSVD, SparsePCA

from sklearn.datasets import load_iris

In [None]:
# Load the iris feature matrix and class labels with a single dataset read
# (the dataset used to be loaded twice, once for 'data' and once for 'target').
X, y = load_iris(return_X_y=True)

# Short column names for the four measurements; the class label is stored
# alongside them in the 'species' column.
iris_df = pd.DataFrame(X, columns=['sl', 'sw', 'pl', 'pw'])
iris_df['species'] = y

In [None]:
# Increase the number of rows: add 5 shifted copies of the original data.
# random() returns one float per iteration, so every value inside a copy is
# offset by the same constant drawn from [0, 0.1).

# Always start from the first len(X) rows (the un-augmented data) so that
# re-running this cell does not keep growing iris_df.
iris_frames = [iris_df.iloc[:len(X)]]

for _ in range(5):
    temp_iris_df = pd.DataFrame(X + random() / 10, columns=['sl', 'sw', 'pl', 'pw'])
    temp_iris_df['species'] = y
    iris_frames.append(temp_iris_df)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# Row labels are kept as they are (no ignore_index), matching what
# DataFrame.append did by default.
iris_df = pd.concat(iris_frames)

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every score derived from it)
# is reproducible across kernel restarts; stratify keeps the class proportions
# equal in the train and test parts.
# NOTE(review): iris_df holds several constant-shifted copies of each original
# row, so near-duplicates of a test row can end up in the training part and the
# reported scores are likely optimistic -- consider splitting before augmenting.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.drop('species', axis=1),
    iris_df['species'],
    test_size=0.2,
    random_state=42,
    stratify=iris_df['species'],
)

In [None]:
# Names of the numerical feature columns (int64 / float64 dtypes).
cols_numerical = X_train.select_dtypes(include=['int64', 'float64']).columns

# Transformer for the numerical columns: scale -> reduce dimensions -> scale.
# The step names ('first_scaler', 'dim_red', 'second_scaler') are the handles
# used later to swap the concrete estimators via set_params / param_grid.
transformer_numerical = Pipeline(
    [
        ('first_scaler', StandardScaler()),
        ('dim_red', PCA()),
        ('second_scaler', StandardScaler()),
    ]
)

In [None]:
# analogously for categorical columns, if needed

In [None]:
# Data preprocessor: routes the numerical columns through the numerical
# transformer under the name 'numerical'.
preprocessor = ColumnTransformer(
    [('numerical', transformer_numerical, cols_numerical)]
)

In [None]:
# Candidate classifiers, all with default hyper-parameters; CatBoost and
# LightGBM are only told to stay quiet while training.
classifiers = [
    ExtraTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(),
    CatBoostClassifier(silent=True),
    LGBMClassifier(verbose=-1),
]

# Candidates for both scaling steps of the numerical pipeline.
scalers = [
    StandardScaler(),
    MinMaxScaler(),
    Normalizer(),
]

# Candidates for the dimensionality-reduction step.
dim_reduction = [
    PCA(),
    TruncatedSVD(),
    SparsePCA(),
]

In [None]:
# Evaluate every (classifier, first scaler, second scaler, dimensionality
# reduction) combination on the hold-out split and keep one record per run.
# product() iterates in the same order as the equivalent nested for-loops.
model_records = []

for model, first_scale, second_scale, dim_red in product(
        classifiers, scalers, scalers, dim_reduction):
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

    # Swap the steps of the numerical sub-pipeline for the combination
    # under test.
    pipe.set_params(**{
        'preprocessor__numerical__first_scaler': first_scale,
        'preprocessor__numerical__second_scaler': second_scale,
        'preprocessor__numerical__dim_red': dim_red,
    })

    pipe.fit(X_train, y_train)

    # The separate pipe.predict() call was dropped: its result was never
    # used and score() computes the predictions it needs itself.
    model_records.append({
        'model': model.__class__.__name__,
        '1st_scaler': first_scale.__class__.__name__,
        '2nd_scaler': second_scale.__class__.__name__,
        'dim_red': dim_red.__class__.__name__,
        'score': pipe.score(X_test, y_test),
    })

# Build the frame once: DataFrame.append was removed in pandas 2.0 and
# appending inside the loop copied the whole frame on every iteration.
# Rows now get a unique 0..n-1 index instead of every row being labelled 0.
models_df = pd.DataFrame(
    model_records,
    columns=['model', '1st_scaler', '2nd_scaler', 'dim_red', 'score'],
)


In [None]:
# All evaluated combinations, highest hold-out score first.
models_df.sort_values(by='score', ascending=False)

In [None]:
# Per-classifier summary of the hold-out scores over all preprocessing
# combinations, best mean score first.
(
    models_df[['model', 'score']]
    .groupby('model')
    .aggregate({'score': ['mean', 'std', 'min', 'max', 'count']})
    .reset_index()
    # Was `ascending=-False`: -False evaluates to the integer 0, which sorted
    # descending only through int-to-bool coercion. Spell out the intent.
    .sort_values(('score', 'mean'), ascending=False)
)

In [None]:
import seaborn as sns

In [None]:
# Distribution of hold-out scores over all evaluated combinations.
# distplot is deprecated (since seaborn 0.11) and slated for removal; histplot
# with kde=True, stat='density' and kde_kws={'cut': 3} is the replacement that
# draws the same histogram + KDE overlay.
# reset_index(drop=True) is a defensive measure: it guarantees unique row
# labels regardless of how models_df was assembled.
sns.histplot(data=models_df.reset_index(drop=True), x='score',
             bins=20, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Spread of hold-out scores for each classifier.
sns.boxplot(x='score', y='model', data=models_df)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the grid search: every scaler for both scaling steps and
# every dimensionality-reduction candidate for the 'dim_red' step.
param_grid = dict(
    preprocessor__numerical__first_scaler=scalers,
    preprocessor__numerical__second_scaler=scalers,
    preprocessor__numerical__dim_red=dim_reduction,
)


In [None]:
# Grid-search template: shared preprocessing followed by a silent CatBoost.
# NOTE(review): the next cell rebinds pipe_scheme to an AdaBoost pipeline, so
# when the notebook is run top to bottom this definition is never used.
pipe_scheme = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', CatBoostClassifier(silent=True)),
    ]
)


In [None]:
# Grid-search template: shared preprocessing followed by AdaBoost with
# default hyper-parameters.
pipe_scheme = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', AdaBoostClassifier()),
    ]
)

In [None]:
# Exhaustive search over param_grid for the pipeline template, single process.
CV = GridSearchCV(estimator=pipe_scheme, param_grid=param_grid, n_jobs=1)

In [None]:
# Run the grid search on the training split only.
CV.fit(X=X_train, y=y_train)

In [None]:
# Report the best parameter combination and its cross-validated score,
# one per line.
print(CV.best_params_, CV.best_score_, sep='\n')

In [None]:
# Hold-out results of the AdaBoost runs, best first, for comparison with the
# grid-search outcome.
is_adaboost = models_df['model'] == "AdaBoostClassifier"
models_df.loc[is_adaboost].sort_values(by='score', ascending=False)