# Module 3 - Wrap-Up Quiz

## Importing Data

In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

## Checking Target

In [2]:
target.value_counts(normalize=True)

Species
Adelie Penguin (Pygoscelis adeliae)          0.441520
Gentoo penguin (Pygoscelis papua)            0.359649
Chinstrap penguin (Pygoscelis antarctica)    0.198830
Name: proportion, dtype: float64

## Checking Features

In [4]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


## Model

In [32]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [16]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")

scores_5 = cv_results["test_score"]

print(
    "The mean cross-validation test accuracy is: "
    f"{scores_5.mean():.3f} ± {scores_5.std():.3f} "
)

The mean cross-validation test accuracy is: 0.952 ± 0.040 


In [17]:
print(scores_5)

[1.         1.         1.         0.91880342 0.88253968 0.95238095
 0.97777778 0.93015873 0.90793651 0.95238095]


## Manual Parameter Search

### n_neighbors = 51

In [18]:
model.set_params(classifier__n_neighbors=51)

cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")

scores_51 = cv_results["test_score"]

print(
    "The mean cross-validation test accuracy is: "
    f"{scores_51.mean():.3f} ± {scores_51.std():.3f} "
)

The mean cross-validation test accuracy is: 0.942 ± 0.039 


In [30]:
print(scores_51)
# count the cases where the scores for 51 are bigger than 5
(scores_5 > scores_51).sum()

[0.95238095 0.97777778 1.         0.86324786 0.88253968 0.95238095
 0.95555556 0.95238095 0.93015873 0.95238095]


4

### n_neighbors = 101

In [22]:
model.set_params(classifier__n_neighbors=101)

cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")

scores_101 = cv_results["test_score"]

print(
    "The mean cross-validation test accuracy is: "
    f"{scores_101.mean():.3f} ± {scores_101.std():.3f} "
)

The mean cross-validation test accuracy is: 0.877 ± 0.042 


In [29]:
print(scores_101)
# count the cases where the scores for 51 are bigger than 5
# we can also sort before comparing...
(scores_5 > scores_101).sum()

[0.85714286 0.95238095 0.94444444 0.86324786 0.83492063 0.85714286
 0.83492063 0.88253968 0.83492063 0.9047619 ]


10

### Withou StandardScaler

In [26]:
model_2 = KNeighborsClassifier(n_neighbors=5)

cv_results = cross_validate(model_2, data, target, cv=10, scoring="balanced_accuracy")

scores_wss = cv_results["test_score"]

print(
    "The mean cross-validation test accuracy is: "
    f"{scores_wss.mean():.3f} ± {scores_wss.std():.3f} "
)

The mean cross-validation test accuracy is: 0.740 ± 0.087 


In [28]:
print(scores_wss)
(scores_5 > scores_wss).sum()

[0.66468254 0.73601954 0.74102564 0.7042735  0.58412698 0.66984127
 0.83492063 0.74285714 0.88253968 0.83809524]


10

We see the model performs worse without StandardScaler, which is expected.

## Preprocessing Impact

In [31]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [68]:
from sklearn.model_selection import GridSearchCV

param_grid = {"preprocessor": all_preprocessors, "classifier__n_neighbors": [5, 51, 101]}

model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=10, verbose=1, scoring = "balanced_accuracy")

model_grid_search.fit(data, target)

Fitting 10 folds for each of 15 candidates, totalling 150 fits


In [75]:
cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "rank_test_score", ascending=True, ignore_index=True)
# convert the name of the preprocessor for later display. The name is an object by default.
cv_results["param_preprocessor"] = cv_results["param_preprocessor"].apply(
    lambda x: x.__class__.__name__ if x is not None else "None"
)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001639,0.00348,0.005497,0.004669,5,StandardScaler,"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
1,0.003805,0.00174,0.004896,0.00131,5,MinMaxScaler,"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
2,0.004227,0.002767,0.004545,0.002955,5,QuantileTransformer,"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
3,0.010285,0.011328,0.005866,0.002368,5,PowerTransformer,"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
4,0.003356,0.000502,0.004454,0.000681,51,StandardScaler,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5


The best ranked model using a StandardScaler is substantially better (at least 7 of the cross-validations scores are better) than using any other preprocessor?

In [78]:
reference_model = cv_results.iloc[0]
other_models = cv_results.iloc[1:4]
cv_score_columns = cv_results.columns[cv_results.columns.str.startswith("split")]
for idx, other_model in other_models.iterrows():
    score_reference_model = reference_model[cv_score_columns]
    score_other_model = other_model[cv_score_columns]
    print(
        f"{reference_model['param_classifier__n_neighbors']}-NN with "
        f"{reference_model['param_preprocessor']} is strictly better than "
        f"{other_model['param_classifier__n_neighbors']}-NN with "
        f"{other_model['param_preprocessor']} for "
        f"{sum(score_reference_model > score_other_model)} CV iterations "
        f"out of 10."
    )

5-NN with StandardScaler is strictly better than 5-NN with MinMaxScaler for 3 CV iterations out of 10.
5-NN with StandardScaler is strictly better than 5-NN with QuantileTransformer for 2 CV iterations out of 10.
5-NN with StandardScaler is strictly better than 5-NN with PowerTransformer for 3 CV iterations out of 10.


The model with n_neighbors=5 and StandardScaler is substantially better (at least 7 of the cross-validations scores are better) than the model with n_neighbors=51 and StandardScaler?

In [81]:
import numpy as np

reference_model = cv_results.iloc[0][cv_score_columns]
other_model = cv_results.iloc[4][cv_score_columns]
print(
    f"5-NN with StandardScaler is strictly better 51-NN with StandardScaler for "
    f"{np.sum(reference_model.to_numpy() > other_model.to_numpy())} "
    "CV iterations out of 10."
)

5-NN with StandardScaler is strictly better 51-NN with StandardScaler for 4 CV iterations out of 10.


The model with n_neighbors=51 and StandardScaler is substantially better (at least 7 of the cross-validations scores are better) than the model with n_neighbors=101 and StandardScaler?

In [83]:
reference_model = cv_results.iloc[0][cv_score_columns]
other_model = cv_results.iloc[8][cv_score_columns]
print(
    f"51-NN with StandardScaler is strictly better than 101-NN with StandardScaler for "
    f"{np.sum(reference_model.to_numpy() > other_model.to_numpy())} "
    "CV iterations out of 10."
)

51-NN with StandardScaler is strictly better than 101-NN with StandardScaler for 10 CV iterations out of 10.


## Nested Cross-Validation

In [86]:
cv_results = cross_validate(
    model_grid_search, data, target, cv=10, n_jobs=2, return_estimator=True, scoring = "balanced_accuracy"
)

In [87]:
cv_results = pd.DataFrame(cv_results)
cv_test_scores = cv_results["test_score"]
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{cv_test_scores.mean():.3f} ± {cv_test_scores.std():.3f}"
)

Generalization score with hyperparameters tuning:
0.943 ± 0.038


Best estimators

In [88]:
for cv_fold, estimator_in_fold in enumerate(cv_results["estimator"]):
    print(
        f"Best hyperparameters for fold #{cv_fold + 1}:\n"
        f"{estimator_in_fold.best_params_}"
    )

Best hyperparameters for fold #1:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #2:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #3:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #4:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #5:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #6:
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best hyperparameters for fold #7:
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Best hyperparameters for fold #8:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #9:
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best hyperparameters for fold #10:
{'classifier__n_ne