In [None]:
# Show the interpreter version this notebook is running under.
import os
import sys

sys.version

## Save a bunch of pipelines to disk

In [None]:
import json
import logging
from pathlib import Path
import warnings

import numpy as np
import pandas as pd

from ngautonml.problem_def.problem_def import ProblemDefinition
from ngautonml.wrangler.wrangler import Wrangler
from ngautonml.wrangler.dataset import Dataset, DatasetKeys

# Keep the notebook output quiet: log errors only, and silence
# DeprecationWarning and UserWarning.
logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Base directory that the dataset and output paths below are built from.
# When `__file__` is not defined (NameError, e.g. in an interactive kernel),
# fall back to the current working directory.
#
# as_posix() keeps forward slashes on every platform. The value is
# interpolated into a JSON document further down, where Windows-style
# backslashes would be read as (mostly invalid) JSON escape sequences.
try:
    dirpath = Path(__file__).parent.as_posix()
except NameError:
    dirpath = '.'

<h2>Problem Definition</h2>

All values are case-insensitive. The problem definition itself is written in JSON.

We currently output predictions, trained models, and instantiations to memory, or to disk in our JSON format.

This problem definition specifies a `JSON` instantiator in the `output` clause, so that all the pipelines and their trained models are saved to disk.

In [None]:
# Describe the problem as a plain Python dict and let json.dumps() produce
# the JSON text handed to ProblemDefinition. Serializing (rather than
# hand-assembling a string literal) guarantees that `dirpath` is escaped
# correctly even when it contains backslashes or quotes.
problem_definition = ProblemDefinition(json.dumps({
    "_comments": [
        "A json file fully encapsulating the problem definition for openml dataset #31.",
        "This dataset is a tabular binary classification problem.",
        "People are classified as good or bad credit risks based on attributes.",
    ],
    # Train/test CSV files and the column to predict.
    "dataset": {
        "config": "local",
        "test_path": f"{dirpath}/examples/classification/credit-test.csv",
        "train_path": f"{dirpath}/examples/classification/credit-train.csv",
        "column_roles": {
            "target": {
                "name": "class",
            },
        },
    },
    "problem_type": {
        "data_type": "TABULAR",
        "task": "BINARY_CLASSIFICATION",
    },
    # Metrics requested for this problem.
    "metrics": {
        "accuracy_score": {},
        "roc_auc_score": {},
    },
    # Requesting the "JSON" instantiation under "output" is what makes the
    # pipelines and their trained models get written under "path".
    "output": {
        "path": f"{dirpath}/output/classification/credit-output",
        "instantiations": [
            "JSON",
        ],
    },
    "hyperparams": [
        "disable_grid_search",
    ],
}))

## Minimal example: the Wrangler
The `Wrangler` puts all the pieces together. In this example, we allow all components to default. The `wrangler.fit_predict_rank()` method runs all pipelines described by the problem definition and ranks them for each requested metric. Because the problem definition requests a `JSON` instantiation, all tested pipelines are saved to `output/classification/credit-output/` (relative to `dirpath`).

In [None]:
# Build a Wrangler from the problem definition, leaving every other
# component at its default.
wrangler = Wrangler(problem_definition=problem_definition)

# Fit, predict with, and rank the pipelines, then show the training results.
got = wrangler.fit_predict_rank()
print(got.train_results)

# Assemble the ranking report: a header line, then one indented line per
# entry yielded by `got.rankings`.
ranking_lines = ['The rankings are:\n']
for rank in got.rankings:
    ranking_lines.append(f' {rank}\n')
summary = ''.join(ranking_lines)
print(summary)