# 01 — Quickstart (real-world data + Cleanlab)

This notebook downloads a real-world dataset, trains a model, and uses Cleanlab to find potential label issues.

In [1]:
from pathlib import Path
import sys

# Put a local `src/` directory at the front of the import path. It is looked
# for next to the current working directory first, then one level above it;
# only the first location that exists is used.
cwd = Path.cwd()
for src_candidate in (cwd / "src", cwd.parent / "src"):
    if src_candidate.exists():
        sys.path.insert(0, str(src_candidate))
        break


In [2]:
from cleanlab_demo.config import CleanlabConfig, DatasetName, DemoConfig, ModelConfig, ModelName, RunConfig
from cleanlab_demo.experiments import run_experiment

# Build each sub-configuration separately so individual settings are easy to
# tweak, then assemble them into a single RunConfig.
# NOTE(review): the meaning of each field is defined in cleanlab_demo.config,
# which is not shown in this notebook.
model_config = ModelConfig(name=ModelName.logistic_regression)
cleanlab_config = CleanlabConfig(
    enabled=True,
    cv_folds=3,
    use_datalab=True,
    max_issues=50,
)
demo_config = DemoConfig(label_noise_fraction=0.05, max_rows=8000)

config = RunConfig(
    dataset=DatasetName.adult_income,
    model=model_config,
    cleanlab=cleanlab_config,
    demo=demo_config,
)

# Run the experiment and display the returned result object.
result = run_experiment(config)
result


2026-02-07 21:30:17 | INFO     | cleanlab_demo | Loading dataset: adult_income
2026-02-07 21:30:17 | INFO     | cleanlab_demo | Downloading https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data (attempt 1/3)
2026-02-07 21:30:20 | INFO     | cleanlab_demo | Successfully downloaded to data/adult_income/adult.data
2026-02-07 21:30:20 | INFO     | cleanlab_demo | Dataset loaded: 8,000 rows
  from .autonotebook import tqdm as notebook_tqdm


RunResult(dataset=<DatasetName.adult_income: 'adult_income'>, task=<TaskType.classification: 'classification'>, model=<ModelName.logistic_regression: 'logistic_regression'>, n_train=6400, n_test=1600, metrics=Metrics(primary=0.9014128559102674, details={'accuracy': 0.856875, 'f1_weighted': 0.8489472102075996, 'roc_auc': 0.9014128559102674}), label_issues=[LabelIssue(index=5686, label='<=50K', suggested_label='>50K', score=2.078914818071098e-11), LabelIssue(index=4184, label='>50K', suggested_label='<=50K', score=0.003887453618319564), LabelIssue(index=390, label='>50K', suggested_label='<=50K', score=0.0039202938388768205), LabelIssue(index=3025, label='<=50K', suggested_label='>50K', score=0.006250149287015505), LabelIssue(index=3297, label='>50K', suggested_label='<=50K', score=0.007057242099685898), LabelIssue(index=4193, label='<=50K', suggested_label='>50K', score=0.008659379661295707), LabelIssue(index=4270, label='>50K', suggested_label='<=50K', score=0.009010053587605085), Labe

In [3]:
import pandas as pd

# Show the per-metric details as a one-column table (metric name -> value).
metric_details = pd.Series(result.metrics.details)
metrics_table = metric_details.to_frame("value")
metrics_table


Unnamed: 0,value
accuracy,0.856875
f1_weighted,0.848947
roc_auc,0.901413


In [4]:
# Turn each flagged label issue into a plain dict, collect them into a
# DataFrame, and preview the first 20 rows.
issue_records = [label_issue.model_dump() for label_issue in result.label_issues]
issues_df = pd.DataFrame(issue_records)
issues_df.head(20)


Unnamed: 0,index,label,suggested_label,score
0,5686,<=50K,>50K,2.078915e-11
1,4184,>50K,<=50K,0.003887454
2,390,>50K,<=50K,0.003920294
3,3025,<=50K,>50K,0.006250149
4,3297,>50K,<=50K,0.007057242
5,4193,<=50K,>50K,0.00865938
6,4270,>50K,<=50K,0.009010054
7,4517,>50K,<=50K,0.009690615
8,1496,>50K,<=50K,0.009756809
9,1235,>50K,<=50K,0.01080796


In [5]:
# Prefer the "issue_summary" entry rendered as a table; when that entry is
# missing or empty, fall back to displaying the whole cleanlab summary.
issue_summary = result.cleanlab_summary.get("issue_summary")
summary_view = result.cleanlab_summary
if issue_summary:
    summary_view = pd.DataFrame(issue_summary)
summary_view


{'n_label_issues': 50,
 'label_issue_examples': [{'train_row': 5686,
   'df_index': 42,
   'score': 2.078914818071098e-11,
   'label': '<=50K',
   'suggested_label': '>50K',
   'row': {'age': 48,
    'workclass': 'Private',
    'fnlwgt': 107231,
    'education': 'Prof-school',
    'education_num': 15,
    'marital_status': 'Married-civ-spouse',
    'occupation': 'Exec-managerial',
    'relationship': 'Husband',
    'race': 'White',
    'sex': 'Male',
    'capital_gain': 99999,
    'capital_loss': 0,
    'hours_per_week': 50,
    'native_country': 'United-States'}},
  {'train_row': 4184,
   'df_index': 5615,
   'score': 0.003887453618319564,
   'label': '>50K',
   'suggested_label': '<=50K',
   'row': {'age': 31,
    'workclass': 'Private',
    'fnlwgt': 248653,
    'education': '1st-4th',
    'education_num': 2,
    'marital_status': 'Never-married',
    'occupation': 'Handlers-cleaners',
    'relationship': 'Not-in-family',
    'race': 'White',
    'sex': 'Male',
    'capital_gain': 0

In [6]:
# Save the run result as indented JSON under ./artifacts (relative to the
# current working directory), creating the directory if it does not exist,
# and display the path that was written.
artifacts = Path("artifacts")
artifacts.mkdir(exist_ok=True)

result_json = result.model_dump_json(indent=2)
path = artifacts.joinpath("last_result.json")
path.write_text(result_json, encoding="utf-8")
path


PosixPath('artifacts/last_result.json')