In [20]:
import os
from pathlib import Path

import ray
import pandas as pd
import numpy as np

from ray.train.sklearn import SklearnTrainer
from ray.data import Dataset
from ray.data.preprocessors import BatchMapper, Chain
from ray.data.preprocessor import Preprocessor
from ray.air.config import ScalingConfig
from ray.train.sklearn import SklearnCheckpoint

import mlflow

from pprint import pprint

# Configuration

In [4]:
# MLflow tracking server and experiment that receive this notebook's runs.
MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'
MLFLOW_EXPERIMENT_NAME = 'openfoodfacts-nova-dev'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Last expression of the cell, so the selected Experiment is displayed as the cell output.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', experiment_id='1', lifecycle_stage='active', name='openfoodfacts-nova-dev', tags={}>

In [5]:
# Training split location, resolved against the current user's home directory
# rather than a hard-coded /Users/<name> prefix, so the notebook is not tied to
# one machine. Kept as a plain str because it is passed straight to
# ray.data.read_parquet.
INPUT_DATA_PATH = str(
    Path.home() / 'data' / 'openfoodfacts' / 'wrk' / '20220831-dev' / 'train.parquet'
)
# Name of the target column handed to the trainer as `label_column`.
LABEL_COLUMN = 'nova_group'

In [6]:
# Directory the trained checkpoint is exported to. Built from the current
# user's home directory rather than a hard-coded /Users/<name> prefix; kept as
# a plain str because it is passed to Checkpoint.to_directory.
OUTPUT_MODEL_PATH = str(
    Path.home() / 'projects' / 'mlops-openfoodfacts' / 'wrk' / 'trainings' / '20220831-dev' / 'model'
)

In [7]:
# Start (or attach to) the local Ray instance.
# ignore_reinit_error=True makes the cell safe to re-run in a live kernel:
# a plain ray.init() raises once Ray has already been initialised.
ray.init(ignore_reinit_error=True)

2022-09-11 21:16:32,988	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.9.12
Ray version:,2.0.0
Dashboard:,http://127.0.0.1:8265


In [8]:
# Show the resources Ray currently reports as available.
ray.available_resources()

{'object_store_memory': 2147483648.0,
 'CPU': 8.0,
 'memory': 12221115597.0,
 'node:127.0.0.1': 1.0}

# Script
## Read data

In [9]:
# Reuse the run that is already active, if any, so re-running this cell in a
# live kernel does not raise; mlflow.start_run() fails while another run is
# active. On a fresh kernel this starts a new run exactly as before.
current_run = mlflow.active_run() or mlflow.start_run()

# Display the run id so the run can be located in the MLflow UI.
current_run.info.run_id

'a0652b7d994e4c38b516f674289f89b0'

In [10]:
# Load the training parquet data as a Ray Dataset.
ds = ray.data.read_parquet(INPUT_DATA_PATH)



In [11]:
# Inspect column names and types of the loaded dataset.
ds.schema()

product_name: string
nova_group: int8
ingredients_list: list<item: string>
  child 0, item: string
code: string
-- schema metadata --
pandas: '{"index_columns": ["code"], "column_indexes": [{"name": null, "f' + 684

In [12]:
from ray.data.context import DatasetContext

# Turn off tensor-extension casting on the current Ray Datasets context.
# NOTE(review): presumably required so the list-valued `ingredients_list` column
# reaches the sklearn pipeline unchanged — confirm the reason and document it.
ctx = DatasetContext.get_current()
ctx.enable_tensor_extension_casting = False

In [13]:
# Repartition the dataset into 5 blocks.
# NOTE(review): 5 presumably mirrors the cv = 5 folds / 5 trainer CPUs configured
# for the trainer — confirm, and consider a named constant.
ds = ds.repartition(5)

Read: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.16it/s]
Repartition: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 17.14it/s]


## Train and tune model

In [14]:
# Baseline model: Bernoulli naive Bayes over dict-vectorized ingredient features.
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer

FEATURE_COLS = ['ingredients_list']


def _frame_to_records(X):
    """Return the FEATURE_COLS of ``X`` as a list of per-row dicts.

    ``X`` is expected to support DataFrame-style column selection and
    ``to_dict(orient='records')``; the resulting records feed the DictVectorizer.
    """
    return X[FEATURE_COLS].to_dict(orient='records')


# Individual pipeline stages, kept as separate names for later inspection.
df_converter = FunctionTransformer(_frame_to_records)
ingredient_encoder = DictVectorizer()
# binarize=None: BernoulliNB is told not to threshold the encoded features itself.
nb_clf = BernoulliNB(binarize=None)

pipeline_steps = [
    ('df_converter', df_converter),
    ('encoder', ingredient_encoder),
    ('clf', nb_clf),
]
sk_pipe = Pipeline(pipeline_steps)

In [15]:
# Cross-validated fit of the sklearn pipeline through Ray's SklearnTrainer.
# The trainer is given 5 CPUs and runs 5-fold CV with fold-level parallelism enabled.
trainer_scaling = ScalingConfig(trainer_resources={'CPU': 5})
train_datasets = {'train': ds}

trainer = SklearnTrainer(
    estimator=sk_pipe,
    datasets=train_datasets,
    label_column=LABEL_COLUMN,
    cv=5,
    parallelize_cv=True,
    scaling_config=trainer_scaling,
)

# Blocks until training finishes; the result carries metrics and the checkpoint.
train_result = trainer.fit()

Trial name,status,loc,iter,total time (s),fit_time
SklearnTrainer_dc4ef_00000,TERMINATED,127.0.0.1:32106,1,24.6819,6.70998




Result for SklearnTrainer_dc4ef_00000:
  cv:
    fit_time: [7.80651593208313, 7.634344816207886, 7.800932884216309, 7.588690280914307,
      7.8246169090271]
    fit_time_mean: 7.731020164489746
    fit_time_std: 0.0989461152507635
    score_time: [1.689162015914917, 1.6113872528076172, 1.6546552181243896, 1.6345617771148682,
      1.4822258949279785]
    score_time_mean: 1.614398431777954
    score_time_std: 0.07084547910949929
    test_score: [0.8784641284641285, 0.8788803788803788, 0.879009879009879, 0.8782976282976283,
      0.8792318792318792]
    test_score_mean: 0.8787767787767787
    test_score_std: 0.00034627172106322605
  date: 2022-09-11_21-18-15
  done: false
  experiment_id: cf367dc7e0d74e3d99f5bba4a91c4675
  fit_time: 6.709980249404907
  hostname: GRM-MacBook-Prov.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 32106
  should_checkpoint: true
  time_since_restore: 24.681931972503662
  time_this_iter_s: 24.681931972503662
  time_total_s: 24.681931972503662


2022-09-11 21:18:15,228	INFO tune.py:758 -- Total run time: 26.41 seconds (26.27 seconds for the tuning loop).


In [16]:
# Reported metrics of the trial as a DataFrame (a single row for this run).
train_result.metrics_dataframe

Unnamed: 0,fit_time,time_this_iter_s,should_checkpoint,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,...,warmup_time,cv/fit_time,cv/score_time,cv/test_score,cv/fit_time_mean,cv/fit_time_std,cv/score_time_mean,cv/score_time_std,cv/test_score_mean,cv/test_score_std
0,6.70998,24.681932,True,False,,,1,dc4ef_00000,cf367dc7e0d74e3d99f5bba4a91c4675,2022-09-11_21-18-15,...,0.002449,[7.80651593 7.63434482 7.80093288 7.58869028 7...,[1.68916202 1.61138725 1.65465522 1.63456178 1...,[0.87846413 0.87888038 0.87900988 0.87829763 0...,7.73102,0.098946,1.614398,0.070845,0.878777,0.000346


In [17]:
# Final metrics dict; per-fold cross-validation results are nested under 'cv'.
train_result.metrics

{'cv': {'fit_time': array([7.80651593, 7.63434482, 7.80093288, 7.58869028, 7.82461691]),
  'score_time': array([1.68916202, 1.61138725, 1.65465522, 1.63456178, 1.48222589]),
  'test_score': array([0.87846413, 0.87888038, 0.87900988, 0.87829763, 0.87923188]),
  'fit_time_mean': 7.731020164489746,
  'fit_time_std': 0.0989461152507635,
  'score_time_mean': 1.614398431777954,
  'score_time_std': 0.07084547910949929,
  'test_score_mean': 0.8787767787767787,
  'test_score_std': 0.00034627172106322605},
 'fit_time': 6.709980249404907,
 'time_this_iter_s': 24.681931972503662,
 'should_checkpoint': True,
 'done': True,
 'timesteps_total': None,
 'episodes_total': None,
 'training_iteration': 1,
 'trial_id': 'dc4ef_00000',
 'experiment_id': 'cf367dc7e0d74e3d99f5bba4a91c4675',
 'date': '2022-09-11_21-18-15',
 'timestamp': 1662956295,
 'time_total_s': 24.681931972503662,
 'pid': 32106,
 'hostname': 'GRM-MacBook-Prov.local',
 'node_ip': '127.0.0.1',
 'config': {},
 'time_since_restore': 24.68193197

In [18]:
# Checkpoint produced by the training run; the estimator is extracted from it further down.
train_result.checkpoint

Checkpoint(local_path=/Users/rgareev/ray_results/SklearnTrainer_2022-09-11_21-17-48/SklearnTrainer_dc4ef_00000_0_2022-09-11_21-17-48/checkpoint_000001)

In [17]:
# Export the checkpoint contents to OUTPUT_MODEL_PATH; the call returns that path.
train_result.checkpoint.to_directory(OUTPUT_MODEL_PATH)

'/Users/rgareev/projects/mlops-openfoodfacts/wrk/trainings/20220831-dev/model'

In [41]:
# TODO register experiment metrics
# use Ray Tune and its ray.tune.integration.mlflow.MLflowLoggerCallback

In [21]:
# Re-wrap the generic checkpoint as a SklearnCheckpoint so the estimator can be retrieved from it.
checkpoint = SklearnCheckpoint.from_checkpoint(train_result.checkpoint)

In [22]:
# Log the estimator stored in the checkpoint to the active MLflow run under 'model'.
# NOTE(review): no visible cell calls mlflow.end_run(); confirm the run is closed elsewhere.
fitted_estimator = checkpoint.get_estimator()
mlflow.sklearn.log_model(fitted_estimator, artifact_path='model')



ModelInfo(artifact_path='model', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.9.12', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.1.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/a0652b7d994e4c38b516f674289f89b0/model', model_uuid='597a895713d340b386077740c3d67aa6', run_id='a0652b7d994e4c38b516f674289f89b0', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-09-12 04:52:50.942683', mlflow_version='1.28.0')