## Imports

In [5]:
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
# Checking whether a numerical feature has a normal distribution or not
from statsmodels.graphics.gofplots import qqplot
from sklearn.model_selection import cross_validate, learning_curve, train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor,SGDClassifier,SGDOneClassSVM
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, RobustScaler, OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder, KBinsDiscretizer
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, get_scorer_names, classification_report
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector

In [1]:
import pandas as pd

from scipy.stats import uniform

from sklearn import set_config; set_config(display='diagram')

from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Tuning Pipeline

👇 Consider the following dataset.

In [2]:
data = pd.read_csv("https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/08-Workflow/tuning_pipeline_data.csv")
data.head()

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers,target_5y
0,36.0,27.4,7.4,2.6,7.6,,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,35.0,26.9,,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,58.0,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,48.0,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


- Each observation represents a player
- Each column represents a characteristic of a player's performance

The target defines whether the player lasted less than 5 years (`0`) vs. 5 years or more (`1`) as a professional.

In [10]:
X = data.drop(columns="target_5y")
y = data['target_5y']

## Pipeline

👇 We are giving you the simple pipeline below

In [11]:
# Preprocessing pipe
preprocessor = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaling', MinMaxScaler())
])

# Final pipe
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model_svm', SVC())
])

pipe

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('imputer', SimpleImputer()),
                                 ('scaling', MinMaxScaler())])),
                ('model_svm', SVC())])

## Fine-Tuning

Our task is to assist in the recruitment process of promising young players.  
The model should **limit false alarms as much as possible** to avoid recruiting players that will flop.

❓ **Fine-tune this pipeline to maximize your objective**

- Use the `scoring` metric appropriate for the task
- Do a (randomized) search for the optimal
    - imputing `strategy`
    - `kernel`
    - regularization factor `C`
- Store your randomized search results in a `search` variable

In [12]:
#?SimpleImputer

In [21]:
set_config(display='text')

grid = {
    'preprocessing__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'model_svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'model_svm__C': uniform(0.1, 10)
}

search = RandomizedSearchCV(
    pipe,
    grid,
    scoring='precision',
    cv=5,
    n_jobs=-1
)

search.fit(X, y)
# YOUR CODE BELOW

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessing',
                                              Pipeline(steps=[('imputer',
                                                               SimpleImputer()),
                                                              ('scaling',
                                                               MinMaxScaler())])),
                                             ('model_svm', SVC())]),
                   n_jobs=-1,
                   param_distributions={'model_svm__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7fdc4d6fae10>,
                                        'model_svm__kernel': ['linear', 'poly',
                                                              'rbf',
                                                              'sigmoid'],
                                        'preprocessing__imputer__strategy': ['mean',
                                                      

In [22]:
print(rand_search.best_params_, rand_search.best_score_)

{'model_svm__C': 0.7994232833115676, 'model_svm__kernel': 'poly', 'preprocessing__imputer__strategy': 'median'} 0.7543605009487402


In [23]:
from nbresult import ChallengeResult

result = ChallengeResult(
    'solution',
    scoring = search.scoring,
    cv = search.cv,
    mean_test_score = search.cv_results_['mean_test_score']
)

result.write()
print(result.check())


platform linux -- Python 3.12.9, pytest-8.3.4, pluggy-1.5.0 -- /home/glaznos/.pyenv/versions/3.12.9/envs/lewagon/bin/python
cachedir: .pytest_cache
rootdir: /home/glaznos/code/roninrp/05-ML/08-Workflow/data-tuning-pipeline/tests
plugins: anyio-4.8.0, typeguard-4.4.2, dash-3.1.1
[1mcollecting ... [0mcollected 1 item

test_solution.py::TestSolution::test_cv_results [32mPASSED[0m[32m                   [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/solution.pickle

[32mgit[39m commit -m [33m'Completed solution step'[39m

[32mgit[39m push origin master



In [24]:
!git add tests/solution.pickle

!git commit -m 'Completed solution step'

!git push origin master

[master f962982] Completed solution step
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/solution.pickle
Enumerating objects: 13, done.
Counting objects: 100% (13/13), done.
Delta compression using up to 4 threads
Compressing objects: 100% (12/12), done.
Writing objects: 100% (13/13), 3.17 KiB | 1.58 MiB/s, done.
Total 13 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), done.[K
To github.com:roninrp/data-tuning-pipeline.git
 * [new branch]      master -> master


## Export

In [25]:
pipe_best = search.best_estimator_
pipe_best

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('imputer',
                                  SimpleImputer(strategy='most_frequent')),
                                 ('scaling', MinMaxScaler())])),
                ('model_svm', SVC(C=1.6470375083658972, kernel='linear'))])

Once you have built your optimal pipeline, export it as a pickle file

In [None]:
# YOUR CODE HERE
import pickle

# Export pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe_tuned, file)

🏁 Congratulation. Don't forget to add, commit and push your notebook.