In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interestes
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [2]:
data.head()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [3]:
target.value_counts(normalize=True)

Adelie Penguin (Pygoscelis adeliae)          0.441520
Gentoo penguin (Pygoscelis papua)            0.359649
Chinstrap penguin (Pygoscelis antarctica)    0.198830
Name: Species, dtype: float64

In [4]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [8]:
from sklearn.model_selection import cross_validate

result_classifier = cross_validate(
    model,
    data,
    target,
    cv=10,
    scoring='balanced_accuracy',
)

In [6]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [9]:
result_classifier['test_score'].mean()

0.9521978021978021

In [11]:
result_classifier2 = cross_validate(
    model.set_params(classifier__n_neighbors=51),
    data,
    target,
    cv=10,
    scoring='balanced_accuracy',
)

result_classifier2['test_score'].mean()

0.9418803418803419

In [12]:
result_classifier3 = cross_validate(
    model.set_params(classifier__n_neighbors=5, preprocessor=None),
    data,
    target,
    cv=10,
    scoring='balanced_accuracy',
)

result_classifier3['test_score'].mean()

0.7398382173382173

In [24]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', None), ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': None,
 'classifier': KNeighborsClassifier(),
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [13]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

model = Pipeline(
    steps=[
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_neighbors': [5, 51, 101],
    'preprocessor': all_preprocessors,
}

model_grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
)

model_grid_search.fit(data, target)

GridSearchCV(estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]})

In [17]:
model_grid_search.best_params_

{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}

In [22]:
model_grid_search.cv_results_

{'mean_fit_time': array([0.00766058, 0.00804648, 0.00320749, 0.00728879, 0.00690727,
        0.00161228, 0.        , 0.0032074 , 0.00640011, 0.00639319,
        0.0080162 , 0.00489054, 0.0032001 , 0.00480132, 0.00638595]),
 'std_fit_time': array([2.51852670e-03, 1.35087942e-04, 3.92845944e-03, 1.41285412e-03,
        2.43948173e-03, 3.22456360e-03, 0.00000000e+00, 3.92856450e-03,
        3.20121336e-03, 3.19692229e-03, 3.05286030e-05, 3.99675019e-03,
        3.91991807e-03, 3.92054779e-03, 3.19337982e-03]),
 'mean_score_time': array([0.00568871, 0.00505395, 0.00478001, 0.00088391, 0.00365086,
        0.00650148, 0.00875392, 0.00479932, 0.00321527, 0.00320868,
        0.        , 0.00319963, 0.0047843 , 0.00478325, 0.00481796]),
 'std_score_time': array([0.00317767, 0.00368885, 0.00390297, 0.00176783, 0.00365535,
        0.00326   , 0.00101437, 0.00391895, 0.00393798, 0.00393005,
        0.        , 0.00391903, 0.0039065 , 0.0039056 , 0.00393392]),
 'param_classifier__n_neighbors': mask

In [20]:
cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False)
# get the parameter names
column_results = [f"param_{name}" for name in param_grid.keys()]
column_results += [
    "mean_test_score", "std_test_score", "rank_test_score"]
cv_results = cv_results[column_results]

In [21]:
cv_results

Unnamed: 0,param_classifier__n_neighbors,param_preprocessor,mean_test_score,std_test_score,rank_test_score
2,5,MinMaxScaler(),0.964876,0.007345,1
3,5,QuantileTransformer(n_quantiles=100),0.96202,0.011533,2
1,5,StandardScaler(),0.959037,0.011065,3
6,51,StandardScaler(),0.956095,0.009438,4
4,5,PowerTransformer(method='box-cox'),0.950341,0.014717,5
9,51,PowerTransformer(method='box-cox'),0.944331,0.025451,6
7,51,MinMaxScaler(),0.938534,0.021744,7
8,51,QuantileTransformer(n_quantiles=100),0.938534,0.014604,7
11,101,StandardScaler(),0.891731,0.015473,9
12,101,MinMaxScaler(),0.888789,0.018143,10
