# Preprocessor Tuning

👇 Consider the following dataset as your training set

In [1]:
import pandas as pd

# Load the training set (tumor measurements plus the binary `malignant` target).
# NOTE(review): the file's saved row index is read back as a regular column named
# 'Unnamed: 0' (see the preview below), and it stays in the feature matrix later on.
# Passing index_col=0 here would remove it, but new_data.csv must then be loaded
# the same way so that both frames keep identical columns.
data = pd.read_csv("data.csv")

# Preview the first rows; NaN cells are the missing values to be imputed.
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,malignant
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,,0.205,0.4,0.1625,0.2364,0.07678,1


The dataset describes tumors that are either malignant or benign. The task is to detect as many malignant tumors as possible.

👇 Combine the following steps in a `Pipeline` object named `pipe`:

- Impute missing values with a `KNNImputer`
- Scale all the features with a `MinMaxScaler`
- Model a `LogisticRegression` with default parameters

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn import set_config; set_config(display='diagram')

In [3]:
# Target: the `malignant` label, copied so it is independent of `data`
y = data['malignant'].copy()
# Features: every remaining column
X = data.drop(columns=['malignant'])

In [4]:
# Preprocessor: fill missing values from the nearest neighbours, then rescale
# every feature with a MinMaxScaler.
prep = make_pipeline(KNNImputer(), MinMaxScaler())

# Fit and transform in a single pass and label the result with the input column
# names. With their default settings both steps return one output column per input
# column, so X.columns lines up with the transformed array. This avoids
# monkeypatching KNNImputer.get_feature_names_out on the library class, which would
# silently override the native implementation for the rest of the kernel session.
# NOTE(review): if a column were entirely NaN the imputer may drop it and the
# column count would no longer match - confirm this cannot happen with this data.
pd.DataFrame(prep.fit_transform(X), columns=X.columns)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.703140,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.668310,0.450698,0.601136,0.619292,0.568610,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.289880,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.233590,0.222878
2,0.601496,0.390260,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.483590,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.210090,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.000000,...,0.248310,0.385928,0.184382,0.094008,0.915472,0.814012,0.548642,0.884880,1.000000,0.773711
4,0.629893,0.156578,0.629742,0.489290,0.430351,0.347893,0.463918,0.518390,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.354553,0.172415,0.319489,0.558419,0.157500,0.142595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.690000,0.428813,0.678668,0.566490,0.526948,0.296055,0.571462,0.690358,0.336364,0.132056,...,0.623266,0.383262,0.576174,0.452664,0.461137,0.178527,0.328035,0.761512,0.097575,0.105667
565,0.622320,0.626987,0.604036,0.474019,0.407782,0.257714,0.337395,0.486630,0.349495,0.113100,...,0.560655,0.699094,0.520892,0.379915,0.300007,0.159997,0.256789,0.559450,0.198502,0.074315
566,0.455251,0.621238,0.445788,0.303118,0.288165,0.254340,0.216753,0.263519,0.267677,0.137321,...,0.393099,0.589019,0.379949,0.230731,0.282177,0.273705,0.271805,0.487285,0.128721,0.151909
567,0.644564,0.663510,0.665538,0.475716,0.588336,0.790197,0.823336,0.755467,0.675253,0.425442,...,0.633582,0.730277,0.668310,0.402035,0.619626,0.815758,0.749760,0.910653,0.497142,0.452315


In [5]:
# Add Estimator: chain the preprocessor with a LogisticRegression using default parameters.
# Because `prep` is itself a pipeline, make_pipeline registers it under the step name
# 'pipeline', which is why its hyperparameters appear as 'pipeline__knnimputer__...'.
pipe = make_pipeline(prep, LogisticRegression())

👉 With how many neighbors does the `KNNImputer` produce the optimal pipeline: 2, 5, or 10? Store the answer under `n_best`

❗️ Use the scoring metric relevant for the task



In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
# Inspect the parameters of every pipeline component to find the key to grid search.
# Nested parameters are addressed as '<step>__<substep>__<param>', e.g.
# 'pipeline__knnimputer__n_neighbors' for the imputer's number of neighbors.
pipe.get_params()

{'memory': None,
 'steps': [('pipeline',
   Pipeline(steps=[('knnimputer', KNNImputer()), ('minmaxscaler', MinMaxScaler())])),
  ('logisticregression', LogisticRegression())],
 'verbose': False,
 'pipeline': Pipeline(steps=[('knnimputer', KNNImputer()), ('minmaxscaler', MinMaxScaler())]),
 'logisticregression': LogisticRegression(),
 'pipeline__memory': None,
 'pipeline__steps': [('knnimputer', KNNImputer()),
  ('minmaxscaler', MinMaxScaler())],
 'pipeline__verbose': False,
 'pipeline__knnimputer': KNNImputer(),
 'pipeline__minmaxscaler': MinMaxScaler(),
 'pipeline__knnimputer__add_indicator': False,
 'pipeline__knnimputer__copy': True,
 'pipeline__knnimputer__metric': 'nan_euclidean',
 'pipeline__knnimputer__missing_values': nan,
 'pipeline__knnimputer__n_neighbors': 5,
 'pipeline__knnimputer__weights': 'uniform',
 'pipeline__minmaxscaler__clip': False,
 'pipeline__minmaxscaler__copy': True,
 'pipeline__minmaxscaler__feature_range': (0, 1),
 'logisticregression__C': 1.0,
 'logisticreg

In [8]:
# Instantiate and fit the grid search over the imputer's number of neighbors.
# Recall is the scoring metric because the goal is to detect as many malignant
# tumors as possible.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid={'pipeline__knnimputer__n_neighbors': [2, 5, 10]},
    scoring="recall",
    cv=5,
).fit(X, y)

# Best candidate found by the search
grid_search.best_params_

{'pipeline__knnimputer__n_neighbors': 5}

In [9]:
# Read the winning number of neighbors straight from the fitted grid search instead
# of hard-coding it, so the value cannot drift if the data or the grid changes.
n_best = grid_search.best_params_['pipeline__knnimputer__n_neighbors']

👇 What is the performance of the optimal pipeline? Make sure you cross-validate! Store your result as a `float` in a variable named `cv_score`

In [10]:
# Mean cross-validated recall (5 folds) obtained by the best n_neighbors candidate
grid_search.best_score_

0.9197120708748615

In [11]:
from sklearn.model_selection import cross_val_score

In [14]:
# Cross-validate the tuned pipeline on recall (5 folds), then average the fold scores
fold_recalls = cross_val_score(
    grid_search.best_estimator_,
    X,
    y,
    scoring='recall',
    cv=5,
)
cv_score = fold_recalls.mean()
cv_score

0.9197120708748615

In [15]:
from nbresult import ChallengeResult
# Bundle both answers for the challenge checker, save them, then run the checks
# and print the report.
result = ChallengeResult(
    'solution',
    n_best=n_best,
    cv_score=cv_score,
)
result.write()
print(result.check())

platform darwin -- Python 3.8.12, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 -- /Users/fabienpardo/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/fabienpardo/code/fabienpardo/data-challenges/05-ML/08-Workflow/01-Preprocessor-Tuning
plugins: dash-2.0.0, anyio-3.3.2
[1mcollecting ... [0mcollected 2 items

tests/test_solution.py::TestSolution::test_n_neighbours [32mPASSED[0m[32m           [ 50%][0m
tests/test_solution.py::TestSolution::test_score_good_enough [32mPASSED[0m[32m      [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/solution.pickle

[32mgit[39m commit -m [33m'Completed solution step'[39m

[32mgit[39m push origin master


👇 Using your optimal pipeline, predict whether the following tumor is malignant or not

In [16]:
# Load the single unlabeled observation to classify. It is read the same way as the
# training set, so it carries the same 'Unnamed: 0' column the pipeline was fitted with.
new_data = pd.read_csv("new_data.csv")
new_data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [21]:
# Probability of the second class for the first (and only) row of new_data;
# presumably this is the malignant class (label 1) - confirm via classes_.
malignant_proba = grid_search.best_estimator_.predict_proba(new_data)[0, 1]
f'prob that is malignant is {round(malignant_proba, 2)}'

'prob that is malignant is 0.97'

🏁 Congratulations! Don't forget to add, commit and push your notebook.