In [None]:
import sys
from pathlib import Path

# Work out which directory this notebook (or an exported script) lives in, so
# the bundled `examples/` data and local packages can be located from it.
try:
    # `_dh` is IPython's directory-history list; its first entry is taken as
    # the notebook directory (presumably the kernel's start directory).
    dirpath = Path(globals()['_dh'][0])
except KeyError:
    # No `_dh` in globals (e.g. running as a plain Python script): fall back
    # to the directory containing this file.
    dirpath = Path(__file__).parent
# Guard the append so re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if str(dirpath) not in sys.path:
    sys.path.append(str(dirpath))
sys.version

## Installation

At this time, the only installation available is via cloning the repository on GitLab:
```
git clone git@gitlab.com:autonlab/ngautonml.git
```
Once the project leaves the alpha stage, a PyPI package will be published for easy installation.

It is recommended to create a virtual environment to run ngautonml.  To do so with conda, run:
```
conda create -n env-name python=3.9
conda activate env-name
```

ngAutonML is designed to run on Python 3.9 and above.

A ```requirements.txt``` file is provided to install necessary libraries. Use:

```
pip install -r requirements.txt
```

In [None]:

import json
import logging
import warnings

import pandas as pd

from ngautonml.algorithms.impl.algorithm_auto import AlgorithmCatalogAuto
from ngautonml.executor.simple.simple_executor import SimpleExecutor
from ngautonml.generator.generator import GeneratorImpl
from ngautonml.instantiator.instantiator_factory import InstantiatorFactory
from ngautonml.metrics.impl.metric_auto import MetricCatalogAuto
from ngautonml.problem_def.problem_def import ProblemDefinition
from ngautonml.ranker.ranker_impl import RankerImpl
from ngautonml.templates.impl.template_auto import TemplateCatalogAuto
from ngautonml.wrangler.wrangler import Wrangler

logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

<h2>Problem Definition</h2>

All values are case-insensitive. The data format is `json`.

The dataset clause currently only supports a `config` of `local`. This is for loading from a local file.

Supported data formats for the `train_path` are `csv` and `arff`.

In `column_roles` we identify the `class` column as the target.

For `problem_type` we support `data_type` values of `tabular` and `timeseries`. The supported `task` options as of 2023-07 are `binary_classification`, `regression`, and `forecasting`. Coming soon is `multiclass_classification`.

Currently, for classification problems, the `accuracy_score` and `roc_auc_score` metrics are supported, and they have no parameters. Future metrics may specify parameters.

We currently output predictions, trained models, and instantiations to memory. We will be saving these outputs to disk soon.

In [None]:
# The problem definition is handed to ProblemDefinition as JSON text.
# It is built as a Python dict and serialized with json.dumps rather than
# hand-assembled from string fragments, so that the interpolated `dirpath`
# is always escaped correctly (e.g. backslashes in Windows paths or quotes in
# directory names would otherwise produce invalid JSON).
# Keys, values and their order are the same as in the hand-written JSON.
problem_definition = ProblemDefinition(json.dumps({
    "_comments": [
        "A json file fully encapsulating the problem definition for openml dataset #31.",
        "This dataset is a tabular binary classification problem.",
        "People are classified as good or bad credit risks based on attributes.",
    ],
    "dataset": {
        "config": "local",
        "test_path": f"{dirpath}/examples/classification/credit-test.csv",
        "train_path": f"{dirpath}/examples/classification/credit-train.csv",
        "column_roles": {
            "target": {
                "name": "class",
            },
        },
    },
    "problem_type": {
        "data_type": "TABULAR",
        "task": "BINARY_CLASSIFICATION",
    },
    "metrics": {
        "accuracy_score": {},
        "roc_auc_score": {},
    },
    "output": {
        # The disk-output settings are parked inside a "_comments" list, which
        # is how this definition keeps them "commented out".
        "_comments": [
            "uncomment these lines if you want to save predictions and instantiations to disk.",
            {
                "path": f"{dirpath}/examples/classification/credit-output",
                "instantiations": [
                    "SIMPLE",
                ],
            },
        ],
    },
    "hyperparams": [
        "disable_grid_search",
    ],
}))

<h2>Metrics</h2>

Metrics are used to rank pipelines.

The `metric_catalog` loads metrics from the top level `metrics` directory. Also available is `MetricCatalogSimple` which has an explicit list of metrics.

In [None]:
# Metric catalog passed to the Wrangler further down.
# NOTE(review): the name is bound as-is, without being called here —
# presumably the Wrangler instantiates it; confirm.
metric_catalog = MetricCatalogAuto

<h2>Algorithms</h2>

Algorithms are the steps in a data science pipeline; they can be preprocessors, or prediction models that can be trained such as classifiers or regressors.
    
Every algorithm has `fit`, `predict`, and `save` methods. You load a saved model by passing it to the constructor of the algorithm.

The `algorithm_catalog` supports loading algorithms from the top level `models/` directory. No other configuration is needed to add a new algorithm. There is also the option of `AlgorithmCatalogSimple()` which has a predefined set of models.

In [None]:
# Algorithm catalog passed to the Wrangler further down.
# NOTE(review): the name is bound as-is, without being called here —
# presumably the Wrangler instantiates it; confirm.
algorithm_catalog = AlgorithmCatalogAuto

<h2>Generator</h2>

The generator is responsible for converting the pipeline templates into bound pipelines, where hyperparams are set and algorithms are fully resolved to a specific algorithm catalog entry. A single pipeline template typically resolves to a set of bound pipelines.

In [None]:
# Generator implementation passed to the Wrangler further down.
# NOTE(review): the name is bound as-is, without being called here —
# presumably the Wrangler instantiates it; confirm.
generator = GeneratorImpl

<h2>Templates</h2>

The `template_catalog` contains the template pipelines for different problem types. As of 2023-07, there are exactly two templates.

You can add new templates by putting them in the `templates/` top level directory. Alternatively, there is a `TemplateCatalogSimple()` which has a predefined list of templates.

Templates need references to the `algorithm_catalog` and the `generator` for use in the generator phase.


In [None]:
# Template catalog passed to the Wrangler further down.
# NOTE(review): the name is bound as-is, without being called here —
# presumably the Wrangler instantiates it; confirm.
template_catalog = TemplateCatalogAuto

<h2>Ranker</h2>

The `ranker` is responsible for applying `metrics` to decide which instantiated pipeline is the best for a given metric.

In [None]:
# Ranker implementation passed to the Wrangler further down.
# NOTE(review): the name is bound as-is, without being called here —
# presumably the Wrangler instantiates it; confirm.
ranker = RankerImpl

<h2>Executor & Instantiator</h2>

The `executor` works with the `instantiator` to turn bound pipelines into executable code. The `simple` instantiator compiles a bound pipeline into a format that can be run by the `simple` executor, a single-threaded naive execution model.

Pipeline output also uses the instantiators to produce executable forms specified in the `output.instantiations` clause of the problem definition. As of 2023-07 the only supported instantiators are `simple` and `stub`, neither of which produces a persistent form, such as a file on the filesystem.

In [None]:
# Executor and instantiator factory passed to the Wrangler further down.
# Both names are bound as-is, without being called here.
instantiator_factory = InstantiatorFactory
executor = SimpleExecutor

## Putting it all together: the Wrangler
The `wrangler` puts all the pieces together. The `wrangler.fit_predict_rank()` method runs all pipelines described from the problem definition and ranks them for each requested metric.

In [None]:
# Gather every pluggable component chosen in the cells above, keyed by the
# keyword argument name the Wrangler expects.
wrangler_components = {
    'problem_definition': problem_definition,
    'template_catalog': template_catalog,
    'algorithm_catalog': algorithm_catalog,
    'metric_catalog': metric_catalog,
    'generator': generator,
    'instantiator_factory': instantiator_factory,
    'executor': executor,
    'ranker': ranker,
}
wrangler = Wrangler(**wrangler_components)

# Run the pipelines for the problem definition, then show the training
# results followed by the per-metric rankings.
got = wrangler.fit_predict_rank()
print(got.train_results)
print(got.rankings)

## Test data
You can specify an additional test set in the problem definition, and the wrangler will predict on it using pipelines trained on the full train set.

In [None]:
# `got` is the object returned by wrangler.fit_predict_rank() in the previous
# code cell; show its results for the test set.
print(got.test_results)

## Sending more data through a pipeline
You can also send test data through trained pipelines manually, without specifying it in the problem definition, using `wrangler.predict()`.

In [None]:
# The same CSV file is loaded twice below with different column roles.
credit_test_csv = f'{dirpath}/examples/classification/credit-test.csv'

# Load with the 'attribute' role (presumably the feature columns — confirm)
# and push it through the pipelines trained by fit_predict_rank().
test_dataset = wrangler.dataset(data=credit_test_csv, roles=['attribute'])
test_predictions = wrangler.predict(
    new_data=test_dataset,
    trained_pipelines=got.executable_pipelines,
)
print(test_predictions)

# get rankings on new predictions by supplying ground truth
ground_truth = wrangler.dataset(
    data=credit_test_csv,
    key='ground_truth',
    roles=['target'],
)
rankings = wrangler.rank(results=test_predictions, ground_truth=ground_truth)
print(rankings)