In [None]:
# Display the version string of the Python interpreter running this notebook,
# so readers can see which environment produced the results.
import sys
sys.version

## Load a saved pipeline

In [None]:
import logging
from pathlib import Path
import warnings

import numpy as np
import pandas as pd

from ngautonml.algorithms.impl.algorithm_auto import AlgorithmCatalogAuto
from ngautonml.executor.executor_kind import ExecutorKind
from ngautonml.instantiator.instantiator_factory import InstantiatorFactory
from ngautonml.instantiator.json_loader import JsonLoader
from ngautonml.problem_def.problem_def import ProblemDefinition
from ngautonml.wrangler.wrangler import Wrangler
from ngautonml.wrangler.dataset import DatasetKeys

# Keep the notebook output quiet: log errors only, and ignore every
# DeprecationWarning and UserWarning regardless of message or module.
logging.basicConfig(level=logging.ERROR)
warnings.simplefilter("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", category=UserWarning)

# Base directory for the example data and output paths used below.
# `__file__` exists when this code runs as a script, but not inside a
# notebook kernel, where the current working directory is used instead.
if "__file__" in globals():
    dirpath = str(Path(__file__).parent)
else:
    dirpath = "."

<h2>Problem Definition</h2>

All values are case-insensitive. The data format is `json`.

We currently output predictions, trained models, and instantiations to memory, or to disk in our JSON format.

In [None]:
# The paths below are spliced into JSON text, where a raw backslash starts an
# escape sequence. A Windows-style directory (e.g. "C:\Users\...") would
# therefore produce invalid or misread JSON, so the base directory is
# normalised to forward slashes first. On POSIX systems, and for the notebook
# fallback ".", the value is unchanged.
dirpath_posix = Path(dirpath).as_posix()

# Problem definition for the credit-risk example, written as one JSON document
# built from adjacent string literals (presumably parsed as JSON by
# ProblemDefinition -- confirm against its documentation). It declares:
#   * dataset:      local train/test CSV files, with "class" as the target column
#   * problem_type: tabular binary classification
#   * metrics:      accuracy_score and roc_auc_score
#   * output:       an output directory, with instantiations saved as JSON
#   * hyperparams:  the "disable_grid_search" entry
problem_definition = ProblemDefinition(
'{'
'    "_comments" : ['
'        "A json file fully encapsulating the problem definition for openml dataset #31.",'
'        "This dataset is a tabular binary classification problem.",'
'        "People are classified as good or bad credit risks based on attributes."'
'    ],'
'    "dataset" : {'
'        "config" : "local",'
f'        "test_path": "{dirpath_posix}/examples/classification/credit-test.csv",'
f'        "train_path" : "{dirpath_posix}/examples/classification/credit-train.csv",'
'        "column_roles": {'
'            "target": {'
'                "name": "class"'
'            }'
'        }'
'    },'
'    "problem_type" : {'
'        "data_type": "TABULAR",'
'        "task": "BINARY_CLASSIFICATION"'
'    },'
'    "metrics" :  {'
'        "accuracy_score": {},'
'        "roc_auc_score": {}'
'    },'
'    "output" : {'
f'       "path" : "{dirpath_posix}/output/classification/credit-output",'
'            "instantiations": ['
'                "JSON"'
'            ]'
'     },'
'     "hyperparams": ['
'        "disable_grid_search"'
'     ]'
'}'
)

## Minimal example: the Wrangler
The `wrangler` puts all the pieces together. In this example, we allow all components to default.

We load one saved pipeline, run it against the test data, and evaluate it with all configured metrics.

In [None]:
# Build the wrangler from the problem definition alone; every other
# component is left at its default.
wrangler = Wrangler(problem_definition=problem_definition)

# Saved pipeline file (multinomial naive Bayes) under the example output directory.
pipeline_file = Path(f"{dirpath}/output/classification/credit-output/pipelines/tabular_classification@sklearn.naive_bayes.multinomialnb.json")

# Read the pipeline back from disk. load_models=True presumably restores the
# trained model state along with the pipeline structure -- confirm against
# the JsonLoader documentation.
loader = JsonLoader(
    saver_version='1.0',
    algorithm_catalog=AlgorithmCatalogAuto(),
    pipeline_file=pipeline_file,
    load_models=True,
)
bound_pipeline = loader.pipeline

# Instantiate the loaded pipeline with the 'simple' executor and key it by
# its designator, the mapping shape passed to wrangler.predict below.
instantiator = InstantiatorFactory().build(kind=ExecutorKind('simple'))
pipelines = {bound_pipeline.designator: instantiator.instantiate(bound_pipeline)}

# Load the held-out test data and remove the target column before predicting.
test_data = pd.read_csv(f'{dirpath}/examples/classification/credit-test.csv')
test_dataset = wrangler.dataset(data=test_data.drop(columns=['class']))

# Predict with the loaded pipeline, then rank the results.
test_predictions = wrangler.predict(new_data=test_dataset, trained_pipelines=pipelines)
rankings = wrangler.rank(results=test_predictions)

# One line per ranking entry, each prefixed with a single space.
summary = 'Test rankings are:\n' + ''.join(f' {rank}\n' for rank in rankings)
print(summary)
