In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.datasets import fetch_openml
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

from skrub import tabular_learner, TableReport

import skore
from skore import EstimatorReport
from skore_hub_project.project.project import Project

Fetch the dataset. We will use the Adult census dataset from OpenML.
It is a binary classification problem: the target is whether a person earns more than 50K a year.
Source: https://www.openml.org/search?type=data&sort=runs&id=1590&status=active

In [None]:
# Download the "adult" dataset (version 2) from OpenML.
# as_frame=True + return_X_y=True yields the features and the target as two
# separate pandas objects instead of a single Bunch.
X, y = fetch_openml(
    "adult",
    version=2,
    as_frame=True,
    return_X_y=True,
)

In [None]:
# Remove two columns from the feature table before modelling.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to `X`, re-running the cell would otherwise raise a KeyError once the
# columns are already gone.
X = X.drop(columns=["relationship", "fnlwgt"], errors="ignore")

In [None]:
# Display skrub's TableReport for the feature table (last expression of the
# cell, so the report is rendered as the cell output).
TableReport(X)

In [None]:
# Show how many samples fall into each target class, using the raw labels
# (this runs before the target is encoded as 0/1).
y.value_counts()

In [None]:
# Encode the target as integers: 1 where the label is ">50K", 0 otherwise.
# The dtype guard makes the cell idempotent. Without it, running the cell a
# second time would compare the already-encoded integers with ">50K" and
# silently turn every label into 0.
if y.dtype.kind not in "iu":
    y = 1 * (y == ">50K")

In [None]:
# Hold out a test set; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = skore.train_test_split(
    X,
    y,
    random_state=1,
)

Simpler is better.
Let's start with a simple baseline: skrub's `tabular_learner`.

In [None]:
# Baseline model: skrub's ready-made learner for classification tasks.
baseline = tabular_learner("classification")
# Display the estimator as the cell output.
baseline

In [None]:
# Wrap the baseline together with the train/test split in a skore
# EstimatorReport, then print the report's built-in help.
baseline_report = EstimatorReport(
    baseline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
baseline_report.help()

In [None]:
# Display the metrics table of the baseline report (called with its defaults).
baseline_report.metrics.report_metrics()

In [None]:
# Show the report's help again. NOTE(review): this repeats the help() call made
# right after the report was created -- confirm whether both are needed.
baseline_report.help()

In [None]:
# Compute permutation feature importance for the baseline, with default settings.
# NOTE(review): permutation importance is a randomised procedure and no seed
# is passed here, so values may differ between runs -- confirm whether the
# installed skore version accepts a seed argument.
fi = baseline_report.feature_importance.permutation()


In [None]:
# Display the permutation-importance result computed in the previous cell.
fi

In [None]:
# Horizontal box plot of the permutation importances: `fi` is transposed so
# that each row of `fi` becomes one box.
# The trailing semicolon suppresses the matplotlib Axes repr that would
# otherwise be printed above the figure.
fi.T.plot(
    kind="box",
    vert=False,
    figsize=(10, 5),
    title="Permutation Feature Importance",
);

In [None]:
# Create the project in which the reports of this demo are stored.
project = Project(
    name="project demo - census",
    tenant="Probabl",
)

In [None]:
# Store the baseline report in the project under the name "baseline".
project.put("baseline", baseline_report)

Let's take the baseline a bit further by tuning its hyperparameters with a grid search.

In [None]:
# Tune the gradient-boosting step of the baseline pipeline with an exhaustive
# grid search: 3 x 3 x 3 = 27 candidates, each scored by 5-fold
# cross-validated negative log loss. n_jobs=-1 uses all available cores and
# refit=True refits the best candidate so the search can be used as a model.
# NOTE(review): a tree of depth d has at most 2**d leaves, so with
# max_depth <= 5 (at most 32 leaves) the max_leaf_nodes values 60 and 90 can
# never be binding, and 30 only matters at depth 5. Several candidates are
# therefore effectively duplicates -- confirm the intended grid.
baseline_2 = GridSearchCV(
    estimator=baseline,
    param_grid={
        "histgradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2],
        "histgradientboostingclassifier__max_depth": [1, 3, 5],
        "histgradientboostingclassifier__max_leaf_nodes": [30, 60, 90],
    },
    cv=5,
    n_jobs=-1,
    refit=True,
    scoring="neg_log_loss",
)
# Display the (unfitted) search object as the cell output.
baseline_2

In [None]:
# Wrap the grid-searched model in its own EstimatorReport, using the same
# train/test split as the first baseline so the two are comparable.
baseline_report_2 = EstimatorReport(
    baseline_2,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Display the metrics table of the tuned model's report (called with its defaults).
baseline_report_2.metrics.report_metrics()

In [None]:
# Store the tuned model's report in the project under the name "baseline_2".
project.put("baseline_2", baseline_report_2)

In [None]:
# Put both reports side by side in a ComparisonReport and print its help.
comp = skore.ComparisonReport(
    [
        baseline_report,
        baseline_report_2,
    ]
)
comp.help()

In [None]:
# Compare the metrics of both models, treating label 1 (the ">50K" class in
# the encoded target) as the positive class.
comp.metrics.report_metrics(
    pos_label=1,
)

DEMO PART 2 - after superior review