In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interestes
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [2]:
data.head()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [5]:
target.value_counts(normalize=True)

Adelie Penguin (Pygoscelis adeliae)          0.441520
Gentoo penguin (Pygoscelis papua)            0.359649
Chinstrap penguin (Pygoscelis antarctica)    0.198830
Name: Species, dtype: float64

In [6]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [10]:
from sklearn.model_selection import cross_validate

result_classifier = cross_validate(
    model,
    data,
    target,
    cv=10,
    scoring='balanced_accuracy',
)

In [11]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [18]:
import numpy as np
np.mean(result_classifier['test_score'])

0.9521978021978021

In [21]:
result_classifier2 = cross_validate(
    model.set_params(classifier__n_neighbors=51),
    data,
    target,
    cv=10,
    scoring='balanced_accuracy',
)

np.mean(result_classifier2['test_score'])

0.9418803418803419

In [23]:
result_classifier3 = cross_validate(
    model.set_params(classifier__n_neighbors=5, preprocessor=None),
    data,
    target,
    cv=10,
    scoring='balanced_accuracy',
)

np.mean(result_classifier3['test_score'])

0.7398382173382173

In [24]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', None), ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': None,
 'classifier': KNeighborsClassifier(),
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [25]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [33]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_neighbors': [5, 51, 101],
    'preprocessor': all_preprocessors,
}

model_grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
)

cv_results = cross_validate(
    model_grid_search,
    data,
    target,
    scoring='balanced_accuracy',
    cv=10,
)


In [37]:
scores = cv_results["test_score"]
print(f"Accuracy score by cross-validation combined with hyperparameters "
      f"search:\n{scores.mean():.3f} +/- {scores.std():.3f}")

Accuracy score by cross-validation combined with hyperparameters search:
0.947 +/- 0.036


In [36]:
cv_results

{'fit_time': array([1.21601486, 1.15194297, 1.04002666, 1.11203361, 1.03999496,
        1.04800534, 1.06396699, 1.20799613, 1.23999524, 1.20799351]),
 'score_time': array([0.00803185, 0.00800061, 0.00797033, 0.00799584, 0.        ,
        0.00798845, 0.0080328 , 0.        , 0.00800323, 0.00800872]),
 'test_score': array([0.95238095, 0.92673993, 1.        , 0.94444444, 0.88253968,
        1.        , 0.97777778, 0.93015873, 0.90793651, 0.95238095])}