In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [21]:
dataset = pd.read_csv('penguins.csv')
dataset

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,39.1,18.7,181.0,3750.0,0
1,39.5,17.4,186.0,3800.0,0
2,40.3,18.0,195.0,3250.0,0
3,,,,,0
4,36.7,19.3,193.0,3450.0,0
...,...,...,...,...,...
339,55.8,19.8,207.0,4000.0,2
340,43.5,18.1,202.0,3400.0,2
341,49.6,18.2,193.0,3775.0,2
342,50.8,19.0,210.0,4100.0,2


In [22]:
features = ['CulmenLength', 'CulmenDepth', 'FlipperLength', 'BodyMass']

X = dataset[features]
y = dataset['Species'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [23]:
num_preparation = Pipeline(steps=[
    ('fill_missings', SimpleImputer(strategy='mean'))
])

print('Surowy zbiór danych - zbiór treningowy:')
print(X_train[features].isnull().mean())
X_train_trasnformed = num_preparation.fit_transform(X_train[features])
X_train_trasnformed = pd.DataFrame(X_train_trasnformed, columns=features)
print('\nWyjście Pipeline - zbiór treningowy')
print(X_train_trasnformed[features].isnull().mean())

Surowy zbiór danych - zbiór treningowy:
CulmenLength     0.007752
CulmenDepth      0.007752
FlipperLength    0.007752
BodyMass         0.007752
dtype: float64

Wyjście Pipeline - zbiór treningowy
CulmenLength     0.0
CulmenDepth      0.0
FlipperLength    0.0
BodyMass         0.0
dtype: float64


In [24]:
print('Surowy zbiór danych - zbiór testowy:')
print(X_test[features].isnull().mean())
X_test_trasnformed = num_preparation.transform(X_test[features])
X_test_trasnformed = pd.DataFrame(X_test_trasnformed, columns=features)
print('\nWyjście Pipeline - zbiór testowy')
print(X_test_trasnformed[features].isnull().mean())

Surowy zbiór danych - zbiór testowy:
CulmenLength     0.0
CulmenDepth      0.0
FlipperLength    0.0
BodyMass         0.0
dtype: float64

Wyjście Pipeline - zbiór testowy
CulmenLength     0.0
CulmenDepth      0.0
FlipperLength    0.0
BodyMass         0.0
dtype: float64


In [25]:
num_preparation = Pipeline(steps=[
    ('fill_missings', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

data_preparation = ColumnTransformer(transformers=[
    ('numeric_preprocessing', num_preparation, features)
])

model_pipeline_v1 = Pipeline(steps=[('preprocessor', data_preparation),
                                    ('model', GradientBoostingClassifier())])

model_pipeline_v1.fit(X_train, y_train)

In [26]:
predictions_train = model_pipeline_v1.predict(X_train)
predictions = model_pipeline_v1.predict(X_test)

print("F1_score 'all features' train:", f1_score(y_train, predictions_train, average='macro'))
print("F1_score 'all features' test:", f1_score(y_test, predictions, average='macro'))

F1_score 'all features' train: 0.9967490400244999
F1_score 'all features' test: 0.9758360547834233


In [27]:
list(model_pipeline_v1.get_params().keys())

['memory',
 'steps',
 'verbose',
 'preprocessor',
 'model',
 'preprocessor__n_jobs',
 'preprocessor__remainder',
 'preprocessor__sparse_threshold',
 'preprocessor__transformer_weights',
 'preprocessor__transformers',
 'preprocessor__verbose',
 'preprocessor__verbose_feature_names_out',
 'preprocessor__numeric_preprocessing',
 'preprocessor__numeric_preprocessing__memory',
 'preprocessor__numeric_preprocessing__steps',
 'preprocessor__numeric_preprocessing__verbose',
 'preprocessor__numeric_preprocessing__fill_missings',
 'preprocessor__numeric_preprocessing__scaler',
 'preprocessor__numeric_preprocessing__fill_missings__add_indicator',
 'preprocessor__numeric_preprocessing__fill_missings__copy',
 'preprocessor__numeric_preprocessing__fill_missings__fill_value',
 'preprocessor__numeric_preprocessing__fill_missings__keep_empty_features',
 'preprocessor__numeric_preprocessing__fill_missings__missing_values',
 'preprocessor__numeric_preprocessing__fill_missings__strategy',
 'preprocessor__

In [28]:
params = {
    'preprocessor__numeric_preprocessing__fill_missings__strategy': ['mean', 'median'],
    'preprocessor__numeric_preprocessing__scaler__with_mean': [True, False],
    'preprocessor__numeric_preprocessing__scaler__with_std': [True, False], 
    'model__learning_rate': [0.01, 0.1, 0.5, 1],
    'model__max_depth': [2, 3, 4, 5],
    'model__n_estimators': [10, 50, 100, 200],
    'model__min_samples_leaf':[1, 2, 3, 4, 5]
}

grid_search = GridSearchCV(model_pipeline_v1, params, cv=10, n_jobs=-1, verbose=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)
print('Wybrane hiperparametry: ', grid_search.best_params_)
model_v3 = grid_search.best_estimator_

Fitting 10 folds for each of 2560 candidates, totalling 25600 fits
Wybrane hiperparametry:  {'model__learning_rate': 0.5, 'model__max_depth': 5, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessor__numeric_preprocessing__fill_missings__strategy': 'median', 'preprocessor__numeric_preprocessing__scaler__with_mean': True, 'preprocessor__numeric_preprocessing__scaler__with_std': False}


In [29]:
predictions_model_v3 = model_v3.predict(X_test)
print("F1_score 'all features with grid search' test:", f1_score(y_test, predictions_model_v3, average='macro'))

F1_score 'all features with grid search' test: 0.9716202270381838


In [30]:
features_v2 = ['CulmenLength', 'FlipperLength']

X = dataset[features_v2]
y = dataset['Species'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

num_preparation = Pipeline(steps=[
    ('fill_missings', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

data_preparation = ColumnTransformer(transformers=[
    ('numeric_preprocessing', num_preparation, features_v2)
])

model_pipeline_v2 = Pipeline(steps=[('preprocessor', data_preparation),
                                    ('model', GradientBoostingClassifier())])

model_pipeline_v2.fit(X_train, y_train)

In [38]:
predictions_train_v2 = model_pipeline_v2.predict(X_train)
predictions_v2 = model_pipeline_v2.predict(X_test)

print("F1_score'two features' train:", f1_score(y_train, predictions_train_v2, average='macro'))
print("F1_score 'two features' test:", f1_score(y_test, predictions_v2, average='macro'))

F1_score'two features' train: 0.9967490400244999
F1_score 'two features' test: 0.9599780701754387


In [32]:
grid_search_2 = GridSearchCV(model_pipeline_v2, params, cv=10, n_jobs=-1, verbose=10, scoring='f1_macro')
grid_search_2.fit(X_train, y_train)
print('Wybrane hiperparametry: ', grid_search.best_params_)

Fitting 10 folds for each of 2560 candidates, totalling 25600 fits
Wybrane hiperparametry:  {'model__learning_rate': 0.5, 'model__max_depth': 5, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'preprocessor__numeric_preprocessing__fill_missings__strategy': 'median', 'preprocessor__numeric_preprocessing__scaler__with_mean': True, 'preprocessor__numeric_preprocessing__scaler__with_std': False}


In [40]:
model_v4 = grid_search_2.best_estimator_

In [42]:
predictions_model_v4 = model_v4.predict(X_test)
print("F1_score 'two features with grid search' test:", f1_score(y_test, predictions_model_v4, average='macro'))

F1_score 'two features with grid search' test: 0.9750378787878787


In [30]:
print("F1_score 'Logistic Regression' test: 0.9650793650793652\nF1_score 'KNN' test: 0.9828560296123126\nF1_score 'DecissionTree' test: 0.9329501915708813\nF1_score 'SVM' test: 0.9650793650793652\nF1_score 'RandomForest' test: 0.9662835249042147\nF1_score 'Adaboost' test: 0.9650793650793652\nF1_score 'Ensembling' test: 0.9329501915708813")

F1_score 'Logistic Regression' test: 0.9650793650793652
F1_score 'KNN' test: 0.9828560296123126
F1_score 'DecissionTree' test: 0.9329501915708813
F1_score 'SVM' test: 0.9650793650793652
F1_score 'RandomForest' test: 0.9662835249042147
F1_score 'Adaboost' test: 0.9650793650793652
F1_score 'Ensembling' test: 0.9329501915708813
