In [1]:
import pandas as pd

# Load the raw penguins measurements from the shared datasets folder.
penguins = pd.read_csv("../datasets/penguins.csv")

# Name of the label column and of the numeric features used for classification.
target_name = "Species"
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]

# Restrict the frame to the columns of interest, then drop every row that has
# a missing value in any of them.
penguins_non_missing = penguins.loc[:, columns + [target_name]].dropna()

# Split the cleaned frame into the feature matrix and the label vector.
data = penguins_non_missing.loc[:, columns]
target = penguins_non_missing.loc[:, target_name]

In [2]:
# Count the samples per species; the classes are not equally represented
# (see the counts below), which is why balanced accuracy is used for scoring
# in the following cells.
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Two-step pipeline: scale the features, then classify with k-nearest
# neighbors. The step names ("preprocessor", "classifier") are the prefixes
# used to address the steps and their parameters in the parameter grids.
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

In [4]:
# List the parameters of the pipeline. Parameters of a nested step are exposed
# as "<step name>__<parameter>" (e.g. "classifier__n_neighbors" in the output).
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [3]:
from sklearn.model_selection import cross_validate

# Baseline: 10-fold cross-validation of the default pipeline, scored with
# balanced accuracy.
cv_result_bal = cross_validate(
    model,
    data,
    target,
    scoring="balanced_accuracy",
    cv=10,
)
cv_result_bal

{'fit_time': array([0.00419283, 0.00304103, 0.00312972, 0.00299621, 0.00296903,
        0.00298119, 0.00295615, 0.00296974, 0.00296736, 0.00296474]),
 'score_time': array([0.002846  , 0.00255847, 0.00252151, 0.00252938, 0.00250554,
        0.00251818, 0.00250673, 0.00250554, 0.00250649, 0.00249791]),
 'test_score': array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
        0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])}

In [4]:
# Summarize the per-fold test scores as "mean +/- std", three decimals each.
print(
    f"{cv_result_bal['test_score'].mean():.3f} +/- "
    f"{cv_result_bal['test_score'].std():.3f}"
)

0.952 +/- 0.040


In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV

# Candidate values for the "preprocessor" step of the pipeline. `None` means
# the step is skipped, so the classifier is fit on the unscaled features.
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

# Inner loop of the nested cross-validation: pick the preprocessor and the
# number of neighbors. `scoring` is set explicitly so that hyperparameters are
# selected with the same metric the outer loop reports; without it
# GridSearchCV falls back to the estimator's default score (plain accuracy),
# which disagrees with the balanced-accuracy evaluation below.
model_grid_search = GridSearchCV(
    model,
    param_grid={
        "preprocessor": all_preprocessors,
        "classifier__n_neighbors": [5, 51, 101],
    },
    scoring="balanced_accuracy",
)

# Outer loop: 10-fold evaluation of the whole search procedure. The fitted
# search object of each fold is kept (`return_estimator=True`) so the selected
# parameters can be inspected afterwards.
# NOTE: this rebinds `cv_result_bal`, replacing the baseline results computed
# for the plain pipeline in the earlier cell.
cv_result_bal = cross_validate(
    model_grid_search,
    data,
    target,
    cv=10,
    scoring="balanced_accuracy",
    return_estimator=True,
)

In [7]:
# Report, for each outer fold (numbered from 1), the parameter combination
# selected by the fitted search of that fold.
for fold_number, fitted_search in enumerate(cv_result_bal["estimator"], start=1):
    print(f"Best parameter found on fold #{fold_number}")
    print(fitted_search.best_params_)

Best parameter found on fold #1
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #2
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #3
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best parameter found on fold #4
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best parameter found on fold #5
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best parameter found on fold #6
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #7
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #8
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #9
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best parameter found on fold #1

In [None]:
# Display the cross-validation result dictionary as last assigned to
# `cv_result_bal`. NOTE(review): this cell has no execution count and no
# recorded output; when it follows the nested cross-validation cell the
# dictionary also holds the fitted estimators, so the output is verbose.
cv_result_bal

In [8]:
from sklearn.model_selection import GridSearchCV
# Search space: every candidate preprocessor crossed with three neighborhood
# sizes.
param_grid = {
    "preprocessor": all_preprocessors,
    "classifier__n_neighbors": [5, 51, 101],
}

# Exhaustive search over the grid on the full dataset, scored with balanced
# accuracy over 10 folds.
grid_search = GridSearchCV(
    model, param_grid=param_grid, scoring="balanced_accuracy", cv=10
)
grid_search.fit(data, target)

# Raw per-candidate results (timings and scores) as a dictionary of arrays.
grid_search.cv_results_

{'mean_fit_time': array([0.00186415, 0.00289514, 0.00273228, 0.00351985, 0.00524123,
        0.0017622 , 0.00286286, 0.00271435, 0.00350351, 0.00523355,
        0.00176179, 0.00286102, 0.00274799, 0.00356288, 0.0053    ]),
 'std_fit_time': array([1.67665458e-04, 2.27522914e-05, 1.58278336e-05, 3.62036944e-05,
        2.85558642e-04, 7.29950321e-06, 1.96813201e-05, 7.76146775e-06,
        2.68005815e-05, 2.97853850e-04, 7.45320274e-06, 1.14107640e-05,
        3.07303791e-05, 8.43799696e-06, 2.95511376e-04]),
 'mean_score_time': array([0.00246036, 0.00244515, 0.00245159, 0.00256536, 0.00255234,
        0.00252199, 0.00263147, 0.00263143, 0.00278769, 0.00274069,
        0.00267751, 0.00281615, 0.0028367 , 0.00299675, 0.00298202]),
 'std_score_time': array([9.68334205e-05, 1.30987306e-05, 4.56263710e-05, 1.11009134e-05,
        3.05475613e-05, 1.96954527e-05, 2.75762395e-05, 1.04768622e-05,
        6.92088686e-05, 1.50041076e-05, 3.03715794e-05, 2.04517564e-05,
        3.60302461e-05, 2.45

In [13]:
# Turn the raw search results into a table ranked by mean test score (best
# first), keeping only the searched parameters ("param_*" columns) followed by
# the mean and standard deviation of the test score.
results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
    .pipe(
        lambda frame: frame[
            [name for name in frame.columns if name.startswith("param_")]
            + ["mean_test_score", "std_test_score"]
        ]
    )
)

In [14]:
# Show the ranked summary of the grid search (best mean test score first).
results

Unnamed: 0,param_classifier__n_neighbors,param_preprocessor,mean_test_score,std_test_score
1,5,StandardScaler(),0.952198,0.039902
2,5,MinMaxScaler(),0.947778,0.034268
3,5,QuantileTransformer(n_quantiles=100),0.947094,0.033797
4,5,PowerTransformer(method='box-cox'),0.94696,0.047387
6,51,StandardScaler(),0.94188,0.038905
8,51,QuantileTransformer(n_quantiles=100),0.927277,0.043759
9,51,PowerTransformer(method='box-cox'),0.922833,0.047883
7,51,MinMaxScaler(),0.920293,0.045516
11,101,StandardScaler(),0.876642,0.041618
12,101,MinMaxScaler(),0.862357,0.046244
