# Welcome to the demo of Skore!

Let's start by fetching the dataset. We will use the Adult census dataset from OpenML.

It's a binary classification problem, where the target is whether a person earns more than 50K a year.

https://www.openml.org/search?type=data&sort=runs&id=1590&status=active

In [None]:
from sklearn.datasets import fetch_openml

# Download version 2 of the "adult" dataset from OpenML and unpack it directly
# into a feature table X and a target y (pandas objects, since as_frame=True).
X, y = fetch_openml(
    name="adult",
    version=2,
    return_X_y=True,
    as_frame=True,
)

In [None]:
from skrub import TableReport

# Build a skrub TableReport over the feature table X; being the cell's last
# expression, the report object is what the notebook displays.
TableReport(X)

In [None]:
# Count the samples per target class to see how balanced the raw labels are.
y.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then apply it to the target.
# `le` is kept around so the original class names stay recoverable.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [None]:
import pandas as pd

# Class counts again, this time on the integer-encoded target; wrapping the
# encoded labels in a Series gives access to value_counts().
pd.Series(y_encoded).value_counts()

In [None]:
import skore

# Split the features and the encoded target into train and test parts using
# skore's train_test_split; random_state=1 pins the seed given to the splitter.
X_train, X_test, y_train, y_test = skore.train_test_split(X, y_encoded, random_state=1)

Simpler is better.

Let's do a simple baseline.

In [None]:
from skrub import tabular_learner

# Ask skrub for a ready-made classification learner. The grid search later in
# this notebook addresses one of its steps as `histgradientboostingclassifier`.
baseline = tabular_learner("classification")
# Bare name so the notebook displays the estimator.
baseline

In [None]:
from skore import EstimatorReport

# Wrap the baseline learner together with its train and test data in a skore
# report, then call the report's help() as the cell's final action.
baseline_report = EstimatorReport(
    baseline, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)
baseline_report.help()

In [None]:
# Show the metrics summary of the baseline report.
baseline_report.metrics.report_metrics()

In [None]:
# Create the hub project, or connect to it if it already exists.
# NOTE(review): Project is imported from a nested module path
# (skore_hub_project.project.project); check whether the package exposes a
# shorter public import before relying on this one.
from skore_hub_project.project.project import Project

project = Project(name="project demo - census", tenant="Probabl")

In [None]:
# Store the baseline report in the project under the key "baseline".
project.put("baseline", baseline_report)

Let's take this baseline a bit further by tuning its hyperparameters.

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the gradient-boosting step of the baseline pipeline:
# 3 x 3 x 3 = 27 candidates, each scored by negative log loss with 5-fold
# cross-validation, run in parallel (n_jobs=-1), with the best one refit.
# NOTE(review): a tree of depth <= 5 has at most 2**5 = 32 leaves, so the
# max_leaf_nodes values 60 and 90 may never be binding — confirm the grid.
tuned_baseline = GridSearchCV(
    baseline,
    {
        "histgradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2],
        "histgradientboostingclassifier__max_depth": [1, 3, 5],
        "histgradientboostingclassifier__max_leaf_nodes": [30, 60, 90],
    },
    scoring="neg_log_loss",
    cv=5,
    refit=True,
    n_jobs=-1,
)
# Bare name so the notebook displays the (still unfitted) search object.
tuned_baseline

In [None]:
# Same report construction as for the baseline, this time wrapping the grid
# search and using the identical train and test data.
tuned_baseline_report = EstimatorReport(
    tuned_baseline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Store the tuned report in the project under the key "tuned_baseline".
project.put("tuned_baseline", tuned_baseline_report)

In [None]:
# Show the metrics summary of the tuned report.
tuned_baseline_report.metrics.report_metrics()

In [None]:
# Put both reports into one comparison report; the dictionary keys are the
# names given to each model within the comparison.
comp = skore.ComparisonReport(
    {
        "Baseline Model": baseline_report,
        "Tuned model": tuned_baseline_report,
    }
)
comp.help()

In [None]:
# Show the metrics of both reports together. pos_label=1 selects the class
# that LabelEncoder mapped to 1 (presumably ">50K" — confirm via le.classes_);
# indicator_favorability=True presumably adds a hint on whether higher or
# lower is better for each metric — confirm against the skore docs.
comp.metrics.report_metrics(pos_label=1, indicator_favorability=True)

# DEMO PART 2 - after review by superiors

Their request: even simpler baselines, namely a dummy classifier and a linear model.

In [None]:
from sklearn.dummy import DummyClassifier

# Simplest possible reference model: a dummy classifier configured with the
# "most_frequent" strategy, wrapped in a report on the same split as before.
dummy = DummyClassifier(strategy="most_frequent")
dummy_report = EstimatorReport(
    dummy, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)
dummy_report.help()

In [None]:
# Show the metrics summary of the dummy report.
dummy_report.metrics.report_metrics()

In [None]:
# Store the dummy report in the project under the key "dummy".
project.put("dummy", dummy_report)

In [None]:
from sklearn.linear_model import LogisticRegression

# Linear reference model: a default LogisticRegression handed to skrub's
# tabular_learner, then wrapped in a report on the same split as before.
logistic_report = EstimatorReport(
    tabular_learner(LogisticRegression()),
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)
logistic_report.help()

In [None]:
# Show the metrics summary of the logistic-regression report.
logistic_report.metrics.report_metrics()

In [None]:
# Store the logistic-regression report in the project under the key "logistic".
project.put("logistic", logistic_report)