# Get predictions on the test set

In [14]:
import os
from tempfile import TemporaryDirectory

import ray
from ray.train.sklearn import SklearnCheckpoint, SklearnPredictor
from ray.train.batch_predictor import BatchPredictor

import pandas as pd
import numpy as np
import mlflow

## Configuration

In [2]:
# MLflow tracking server and experiment used by this notebook.
MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'
MLFLOW_EXPERIMENT_NAME = 'openfoodfacts-nova-dev'

# Point the MLflow client at the tracking server before selecting the experiment.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Last expression of the cell, so the selected Experiment is displayed as output.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', experiment_id='1', lifecycle_stage='active', name='openfoodfacts-nova-dev', tags={}>

In [3]:
# Make the existing MLflow run with this id the active run; a later cell reads
# the id back through mlflow.active_run() to locate the model artifact.
# NOTE(review): no mlflow.end_run() is visible in this section, and re-running
# this cell while the run is still active will likely fail — confirm.
mlflow.start_run(run_id='a0652b7d994e4c38b516f674289f89b0')

<ActiveRun: >

In [4]:
# Input/output locations for the batch-prediction run.
#
# Each path can be overridden with an environment variable so the notebook can
# run on another machine without editing the cell; when the variable is unset
# or empty, the original local development path is used.
# The model itself is loaded from the MLflow run artifacts, so no model path
# is configured here.
INPUT_DATA_PATH = (
    os.environ.get('OFF_INPUT_DATA_PATH')
    or '/Users/rgareev/data/openfoodfacts/wrk/20220831-dev/test.parquet'
)
# TODO this script should not deal with labels at all
LABEL_COLUMN = 'nova_group'
# Directory that receives the parquet files with the predictions.
OUTPUT_DATA_PATH = (
    os.environ.get('OFF_OUTPUT_DATA_PATH')
    or '/Users/rgareev/projects/mlops-openfoodfacts/wrk/testings/20220831-dev/model'
)

## Script

In [5]:
# Read the parquet data at INPUT_DATA_PATH into a Ray Dataset.
input_ds = ray.data.read_parquet(INPUT_DATA_PATH)

2022-09-11 21:57:26,174	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


In [6]:
# Display the column names and types of the input dataset.
input_ds.schema()

product_name: string
nova_group: int8
ingredients_list: list<item: string>
  child 0, item: string
code: string
-- schema metadata --
pandas: '{"index_columns": ["code"], "column_indexes": [{"name": null, "f' + 684

In [7]:
# Split the dataset into 10 blocks.
# NOTE(review): presumably so later per-block work can run in parallel — confirm
# that 10 is a deliberate choice and consider moving it to the configuration cell.
input_ds = input_ds.repartition(10)

Read: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.72it/s]
Repartition: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 32.02it/s]


In [8]:
from ray.data.context import DatasetContext

# Turn off tensor-extension casting on the current Ray Datasets context.
# NOTE(review): presumably needed because `ingredients_list` is a
# variable-length list column that cannot be cast to a tensor column — confirm.
ctx = DatasetContext.get_current()
ctx.enable_tensor_extension_casting = False

In [9]:
# Remove the label column so only non-label columns are passed to the model.
# The name `input_ds` is rebound, so re-running this cell operates on a dataset
# that no longer has LABEL_COLUMN.
input_ds = input_ds.drop_columns([LABEL_COLUMN])

Map_Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 12.61it/s]


Restore the model from the MLflow run artifacts

In [11]:
# Build the `runs:/<run_id>/model` URI from the currently active MLflow run
# and load the scikit-learn model stored under it.
active_run_id = mlflow.active_run().info.run_id
model_uri = f'runs:/{active_run_id}/model'
sk_pipe = mlflow.sklearn.load_model(model_uri)

In [15]:
# Wrap the loaded estimator in a Ray checkpoint and build a batch predictor.
# The checkpoint is written to a temporary directory that is deleted when the
# `with` block exits, so the predictor is constructed inside the block.
# NOTE(review): this relies on BatchPredictor capturing the checkpoint contents
# at construction time — confirm for the Ray version in use.
with TemporaryDirectory() as tmpdir:
    model_checkpoint = SklearnCheckpoint.from_estimator(sk_pipe, path=tmpdir)
    predictor = BatchPredictor(model_checkpoint, SklearnPredictor)

In [16]:
# Run batch prediction on input_ds.
# Attempted variant that does not work:
#   model_output_ds = predictor.predict(input_ds, keep_columns='code')
# NOTE(review): without keep_columns, the `code` identifier is absent from the
# zipped result (see result_ds.schema() below), so predictions can only be
# matched to products by row order. Ray documents keep_columns as a list of
# column names, so the string form may be part of the failure — confirm.
model_output_ds = predictor.predict(input_ds)

Map Progress (2 actors 1 pending): 100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.91it/s]


In [17]:
# Rebuild both datasets from their Arrow block references, then zip them so
# the input columns and the predictions end up in one dataset.
features_arrow_ds = ray.data.from_arrow_refs(input_ds.to_arrow_refs())
predictions_arrow_ds = ray.data.from_arrow_refs(model_output_ds.to_arrow_refs())
result_ds = features_arrow_ds.zip(predictions_arrow_ds)

In [18]:
# Display the schema of the zipped dataset to check that the input columns and
# the `predictions` column are both present.
result_ds.schema()

product_name: string
ingredients_list: list<item: string>
  child 0, item: string
predictions: int8
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 524

In [22]:
# Write the zipped dataset as parquet files under OUTPUT_DATA_PATH.
result_ds.write_parquet(OUTPUT_DATA_PATH)

Write Progress: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 16.74it/s]


In [23]:
# List the files written to the output directory.
!ls -alh $OUTPUT_DATA_PATH

total 17776
drwxr-xr-x  12 rgareev  staff   384B Sep 11 22:06 [1m[36m.[m[m
drwxr-xr-x   3 rgareev  staff    96B Sep 11 22:06 [1m[36m..[m[m
-rw-r--r--   1 rgareev  staff   895K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000000.parquet
-rw-r--r--   1 rgareev  staff   878K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000001.parquet
-rw-r--r--   1 rgareev  staff   886K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000002.parquet
-rw-r--r--   1 rgareev  staff   890K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000003.parquet
-rw-r--r--   1 rgareev  staff   892K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000004.parquet
-rw-r--r--   1 rgareev  staff   879K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000005.parquet
-rw-r--r--   1 rgareev  staff   889K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000006.parquet
-rw-r--r--   1 rgareev  staff   896K Sep 11 22:06 ff7876293a2f47e9b5d2602c193d6378_000007.parquet
-rw-r--r--   1 rgareev  staff   881K Sep 11 22:06 ff7876293a2f47e9b5d