In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.datasets import fetch_openml
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

import skrub
from skrub import tabular_learner, DropCols, MinHashEncoder, TableVectorizer, AggJoiner, TableReport
from skrub import _selectors as s

import skore
from skore import EstimatorReport
from skore_hub_project.project.project import Project

Fetch the dataset. We will use the "adult" census dataset (version 2) from OpenML, where the task is to predict whether a person's income exceeds 50K.

In [2]:
# Download the OpenML "adult" dataset (version 2) as pandas objects:
# X holds the feature table, y the target column.
X, y = fetch_openml(
    name="adult",
    version=2,
    as_frame=True,
    return_X_y=True,
)

In [3]:
# Overview of the feature table: per-column statistics and column associations.
skrub.TableReport(X)

Processing column  14 / 14


Unnamed: 0_level_0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
Unnamed: 0_level_1,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0.0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States
1.0,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
2.0,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
3.0,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States
4.0,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States
,,,,,,,,,,,,,,
48837.0,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States
48838.0,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States
48839.0,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States
48840.0,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States

Column,Column name,dtype,Null values,Unique values,Mean,Std,Min,Median,Max
0,age,Int64DType,0 (0.0%),74 (0.2%),38.6,13.7,17.0,37.0,90.0
1,workclass,CategoricalDtype,2799 (5.7%),8 (< 0.1%),,,,,
2,fnlwgt,Int64DType,0 (0.0%),28523 (58.4%),190000.0,106000.0,12285.0,178142.0,1490400.0
3,education,CategoricalDtype,0 (0.0%),16 (< 0.1%),,,,,
4,education-num,Int64DType,0 (0.0%),16 (< 0.1%),10.1,2.57,1.0,10.0,16.0
5,marital-status,CategoricalDtype,0 (0.0%),7 (< 0.1%),,,,,
6,occupation,CategoricalDtype,2809 (5.8%),14 (< 0.1%),,,,,
7,relationship,CategoricalDtype,0 (0.0%),6 (< 0.1%),,,,,
8,race,CategoricalDtype,0 (0.0%),5 (< 0.1%),,,,,
9,sex,CategoricalDtype,0 (0.0%),2 (< 0.1%),,,,,

Column 1,Column 2,Cramér's V,Pearson's Correlation
education,education-num,1.0,
relationship,sex,0.655,
marital-status,relationship,0.49,
marital-status,sex,0.46,
occupation,sex,0.416,
workclass,occupation,0.404,
race,native-country,0.307,
age,marital-status,0.288,
age,relationship,0.273,
sex,hours-per-week,0.252,


In [4]:
# Check the class balance of the target before modelling.
y.value_counts()

class
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [5]:
# Binarize the target: 1 for ">50K", 0 otherwise.
# NOTE: this rebinds `y`, so re-running this cell on its own would compare the
# already-binarized labels with ">50K" and turn every label into 0.
y = (y == ">50K").astype(int)

In [6]:
# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = skore.train_test_split(
    X,
    y,
    random_state=1,
)

Simpler is better.
Let's start with a simple baseline: skrub's default `tabular_learner` for classification.

In [7]:
# skrub's ready-made learner for classification, used here as the baseline.
baseline = tabular_learner("classification")
baseline

In [8]:
# Wrap the baseline and the train/test split in a skore report.
baseline_report = EstimatorReport(
    baseline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
# List what the report offers.
baseline_report.help()

In [9]:
# Default metrics of the baseline (precision/recall per class, ROC AUC,
# Brier score, fit and predict times).
baseline_report.metrics.report_metrics()

Unnamed: 0_level_0,Unnamed: 1_level_0,HistGradientBoostingClassifier
Metric,Label / Average,Unnamed: 2_level_1
Precision,0.0,0.898685
Precision,1.0,0.789835
Recall,0.0,0.943779
Recall,1.0,0.665082
ROC AUC,,0.930531
Brier score,,0.0867
Fit time (s),,0.819292
Predict time (s),,0.087698


In [10]:
# Create the project that will hold the reports of this demo.
project = Project(
    name="project demo - census",
    tenant="Probabl",
)

In [None]:
# Store the baseline report in the project under the key "baseline".
# NOTE(review): presumably this uploads the report to the hub - confirm.
project.put("baseline", baseline_report)

Let's go a bit further than that baseline by tuning its hyperparameters with a grid search.

In [None]:
# Tune the baseline pipeline with a grid search.
# `baseline` ends with a HistGradientBoostingClassifier, so the grid must target
# the "histgradientboostingclassifier" step; that model has no `n_estimators`,
# its number of boosting iterations is `max_iter`.
# The grid is kept small (4 x 2 x 3 = 24 candidates, 5 folds each) so that a
# full "Restart & Run All" stays affordable.
baseline_2 = GridSearchCV(
    estimator=baseline,
    param_grid={
        "histgradientboostingclassifier__max_leaf_nodes": [15, 31, 63, 127],
        "histgradientboostingclassifier__max_iter": [100, 200],
        "histgradientboostingclassifier__max_depth": [None, 10, 20],
    },
    cv=5,
    n_jobs=-1,
    refit=True,
    scoring="neg_log_loss",
)

# Build the second report from the grid search itself (not from a fresh
# `tabular_learner`), otherwise it would be compared against an identical,
# untuned copy of the baseline.
baseline_report_2 = EstimatorReport(
    baseline_2,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Store the second report in the project under the key "baseline_2".
# NOTE(review): presumably this uploads the report to the hub - confirm.
project.put("baseline_2", baseline_report_2)

In [None]:
# Metrics of the second report, to be read against the baseline metrics.
baseline_report_2.metrics.report_metrics()

In [None]:
# Put both reports side by side in a single comparison report.
comp = skore.ComparisonReport(
    [
        baseline_report,
        baseline_report_2,
    ]
)
# List what the comparison report offers.
comp.help()

In [None]:
# Compare the metrics of both reports, treating label 1 (">50K") as the
# positive class.
comp.metrics.report_metrics(pos_label = 1)

DEMO PART 2 - after superior review