<img src="Tarjeta.png">

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Conectar-área-de-trabajo" data-toc-modified-id="Conectar-área-de-trabajo-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Conectar área de trabajo</a></span></li><li><span><a href="#Entrenar-y-registrar-modelo" data-toc-modified-id="Entrenar-y-registrar-modelo-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Entrenar y registrar modelo</a></span></li><li><span><a href="#Generar-y-subir-datos-batch" data-toc-modified-id="Generar-y-subir-datos-batch-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Generar y subir datos batch</a></span></li><li><span><a href="#Crear-instancia" data-toc-modified-id="Crear-instancia-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Crear instancia</a></span></li><li><span><a href="#Crear-canalización-para-inferencia-batch" data-toc-modified-id="Crear-canalización-para-inferencia-batch-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Crear canalización para inferencia batch</a></span></li><li><span><a href="#Publicar-la-canalización-y-usar-como-una-interfaz-REST" data-toc-modified-id="Publicar-la-canalización-y-usar-como-una-interfaz-REST-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Publicar la canalización y usar como una interfaz REST</a></span></li></ul></div>

# Inferencia Batch

## Conectar área de trabajo

In [7]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the local configuration file
ws = Workspace.from_config()

# Report the SDK version together with the name of the connected workspace
version_banner = 'Versión de Azure ML {} y área de trabajo {}'
print(version_banner.format(azureml.core.VERSION, ws.name))

Versión de Azure ML 1.32.0 y área de trabajo aml-nuclio


## Entrenar y registrar modelo

In [8]:
import os

import joblib
import numpy as np
import pandas as pd
from azureml.core import Experiment
from azureml.core import Model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Create the experiment and start an interactive run that receives the metrics
experiment = Experiment(workspace=ws, name='nuclio-train-vino')
run = experiment.start_logging()
print("Empezando experimento:", experiment.name)

# Load the data from a local file
data = pd.read_csv('data/winequality.csv', sep=';', decimal='.')

# Separate features and target
features = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates']
X, y = data[features].values, data['top_quality'].values

# Split data into training set (70%) and test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree
print('Entrenando un decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# Calculate accuracy on the test set
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# The builtin float replaces the np.float alias, which was removed in NumPy 1.24
run.log('Accuracy', float(acc))

# Calculate AUC from the predicted probability of the second class column
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the trained model in the outputs folder
print("Guardando el modelo...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'vino_model.pkl')
joblib.dump(value=model, filename=model_file)

# Mark the run as completed
run.complete()

# Register the model, recording the metrics as properties
print('Registrando el modelo...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'vino_model',
               tags={'Training context':'Inline Training'},
               properties={'AUC': float(auc), 'Accuracy': float(acc)})

print('Modelo entrenado y registrado')

Empezando experimento: nuclio-train-vino
Entrenando un decision tree model
Accuracy: 0.8115646258503402
AUC: 0.7299711990633483
Guardando el modelo...
Registrando el modelo...
Registering model vino_model
Modelo entrenado y registrado


## Generar y subir datos batch

In [9]:
from azureml.core import Datastore, Dataset
import pandas as pd
import os

# Make the blob store the workspace default and fetch it once; the same
# handle is reused for the upload further down
ws.set_default_datastore('workspaceblobstore')
default_ds = ws.get_default_datastore()

# List the datastores and flag which one is the default
for ds_name in ws.datastores:
    print(ds_name, "- Por defecto =", ds_name == default_ds.name)

# Load the wine quality data
data = pd.read_csv('data/winequality.csv', sep=';', decimal='.')
# Keep only the feature columns and a random sample of 100 records
features = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates']
sample = data[features].sample(n=100).values

# Create the local folder
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok=True)
print("Carpeta creada!")

# Save each sampled record in its own file (1.csv, 2.csv, ...); iterating
# over the sample itself keeps the loop in step with the sample size
print("Guardando ficheros...")
for file_number, record in enumerate(sample, start=1):
    fname = str(file_number) + '.csv'
    record.tofile(os.path.join(batch_folder, fname), sep=",")
print("ficheros guardados!")

# Upload the files to the datastore
print("Subiendo ficheros al datastore...")
default_ds.upload(src_dir="batch-data", target_path="batch-data", overwrite=True, show_progress=True)

# Register a file dataset that is used as the pipeline input
batch_data_set = Dataset.File.from_files(path=(default_ds, 'batch-data/'), validate=False)
try:
    batch_data_set = batch_data_set.register(workspace=ws, 
                                             name='batch-data',
                                             description='batch data',
                                             create_new_version=True)
except Exception as ex:
    # Best effort: on failure batch_data_set still holds the unregistered
    # dataset created above, so only the error is reported
    print(ex)

print("Hecho!")

azureml_globaldatasets - Por defecto = False
workspacefilestore - Por defecto = False
workspaceblobstore - Por defecto = True
Carpeta creada!
Guardando ficheros...
ficheros guardados!
Subiendo ficheros al datastore...
Uploading an estimated of 100 files
Uploading batch-data/1.csv
Uploaded batch-data/1.csv, 1 files out of an estimated total of 100
Uploading batch-data/10.csv
Uploaded batch-data/10.csv, 2 files out of an estimated total of 100
Uploading batch-data/100.csv
Uploaded batch-data/100.csv, 3 files out of an estimated total of 100
Uploading batch-data/11.csv
Uploaded batch-data/11.csv, 4 files out of an estimated total of 100
Uploading batch-data/12.csv
Uploaded batch-data/12.csv, 5 files out of an estimated total of 100
Uploading batch-data/13.csv
Uploaded batch-data/13.csv, 6 files out of an estimated total of 100
Uploading batch-data/14.csv
Uploaded batch-data/14.csv, 7 files out of an estimated total of 100
Uploading batch-data/15.csv
Uploaded batch-data/15.csv, 8 files out

## Crear instancia

In [16]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "NuclioCluster"

try:
    # Reuse the compute cluster when it already exists in the workspace
    inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Cluster de proceso encontrado.')
except ComputeTargetException:
    # It does not exist yet, so it is created
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Show the provisioning error and re-raise it: swallowing it would
        # leave inference_cluster undefined (or stale) for the cells that
        # configure the pipeline step
        print(ex)
        raise

Cluster de proceso encontrado.


## Crear canalización para inferencia batch

In [11]:
import os
# Folder that holds the files of the batch pipeline; it is created when
# missing and left untouched when it already exists
experiment_folder = 'batch_pipeline'
os.makedirs(name=experiment_folder, exist_ok=True)

print(experiment_folder)

batch_pipeline


In [12]:
%%writefile $experiment_folder/batch_vino.py
import os
import numpy as np
from azureml.core import Model
import joblib


def init():
    """Load the registered model when the pipeline step starts.

    The deserialized model is stored in the module-level ``model`` global so
    that ``run`` can use it.
    """
    global model

    # Resolve the path of the registered 'vino_model' and deserialize it
    model = joblib.load(Model.get_model_path('vino_model'))


def run(mini_batch):
    """Score every file of a mini-batch with the model loaded by ``init``.

    Args:
        mini_batch: iterable of paths to comma-separated files; the contents
            of each file are scored as a single row of feature values.

    Returns:
        A list with one ``"<file name>: <prediction>"`` string per input file,
        in the same order as ``mini_batch``.
    """
    def _score_file(path):
        # Read the values and reshape them into one row for the model
        record = np.genfromtxt(path, delimiter=',')
        predicted = model.predict(record.reshape(1, -1))
        return "{}: {}".format(os.path.basename(path), predicted[0])

    return [_score_file(path) for path in mini_batch]

Writing batch_pipeline/batch_vino.py


In [13]:
%%writefile $experiment_folder/batch_environment.yml
# Conda environment specification loaded by the notebook to build the
# environment in which the batch scoring script (batch_vino.py) runs.
name: batch_environment
dependencies:
# NOTE(review): Python is pinned while scikit-learn is not; the pickled model
# presumably needs a scikit-learn version compatible with the one used for
# training - confirm and pin if necessary.
- python=3.6.2
# batch_vino.py also imports numpy and joblib, which are not listed here;
# presumably they are installed as scikit-learn dependencies - TODO confirm.
- scikit-learn
- pip
- pip:
  # Presumably provides the azureml.core package imported by the script - confirm.
  - azureml-defaults

Writing batch_pipeline/batch_environment.yml


A continuación se define un entorno de ejecución a partir de la especificación de Conda.

In [14]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Build the environment from the Conda specification file stored in the
# experiment folder
conda_spec_path = experiment_folder + "/batch_environment.yml"
batch_env = Environment.from_conda_specification(name="experiment_env", file_path=conda_spec_path)

# Use the default CPU image as the Docker base image
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuración preparada.')

Configuración preparada.


Se va a usar una canalización que ejecuta una predicción batch y guarda los resultados en un fichero de texto. Para ello, se puede usar **ParallelRunStep** que permite procesar los datos batch en paralelo.

In [17]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig
from azureml.core.runconfig import DockerConfiguration

# Named output where the step stores the inferences
output_dir = OutputFileDatasetConfig(name='inferences')

# Execution settings of the entry script: where it lives, the environment and
# compute it runs on, and how the input is batched and the output collected
parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,
    entry_script="batch_vino.py",
    environment=batch_env,
    compute_target=inference_cluster,
    node_count=2,
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
)

# The registered batch dataset is passed to the step under the name 'vino_batch'
vino_batch_input = batch_data_set.as_named_input('vino_batch')

parallelrun_step = ParallelRunStep(
    name='batch-score-vino',
    parallel_run_config=parallel_run_config,
    inputs=[vino_batch_input],
    output=output_dir,
    arguments=[],
    allow_reuse=True,
)

print('Paso definido')

Paso definido


> **Nota**: La ejecución de la canalización puede tardar bastante tiempo.

In [18]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Wrap the parallel scoring step in a pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

# Submit it under the 'nuclio-vino-batch' experiment and wait until it finishes
batch_experiment = Experiment(ws, 'nuclio-vino-batch')
pipeline_run = batch_experiment.submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

Created step batch-score-vino [012ab228][5ba56d6a-4cd2-4521-91c2-1919a8bbe84c], (This step will run and generate new outputs)
Submitted PipelineRun 3f81b6ca-05a8-4203-94ff-0a858883ea44
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3f81b6ca-05a8-4203-94ff-0a858883ea44?wsid=/subscriptions/87f9793d-5515-43eb-b182-0f27b97da8b3/resourcegroups/nuclio/workspaces/aml-nuclio&tid=e93a2455-92c2-4ff2-add1-a6a39d490ed6
PipelineRunId: 3f81b6ca-05a8-4203-94ff-0a858883ea44
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3f81b6ca-05a8-4203-94ff-0a858883ea44?wsid=/subscriptions/87f9793d-5515-43eb-b182-0f27b97da8b3/resourcegroups/nuclio/workspaces/aml-nuclio&tid=e93a2455-92c2-4ff2-add1-a6a39d490ed6
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 0fd5b744-5f93-4374-9f8a-669b8aa890f2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/0fd5b744-5f93-4374-9f8a-669b8aa890f2?wsid=/subscriptions/87f9793d-5515-43eb-b182-0f27b97da8b3/resourc

'Finished'

In [19]:
import pandas as pd
import shutil

# Remove the folder with previously downloaded local results
shutil.rmtree('vino-results', ignore_errors=True)

# Get the run of the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='vino-results')

# Walk the downloaded folder hierarchy looking for the results file.
# result_file is reset first so that a value left in the kernel by an earlier
# execution can never be picked up by mistake.
result_file = None
for root, dirs, files in os.walk('vino-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# Fail with a clear message instead of a NameError when nothing was found
if result_file is None:
    raise FileNotFoundError("No se encontró 'parallel_run_step.txt' en 'vino-results'")

# Each line has the form "<file>: <prediction>", hence the ':' delimiter
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Show the first 20 rows
df.head(20)

Unnamed: 0,File,Prediction
0,1.csv,0
1,10.csv,0
2,100.csv,1
3,11.csv,0
4,12.csv,0
5,13.csv,0
6,14.csv,0
7,15.csv,1
8,16.csv,0
9,17.csv,1


## Publicar la canalización y usar como una interfaz REST

In [20]:
# Publish the pipeline that was executed by pipeline_run
published_pipeline = pipeline_run.publish_pipeline(
    name='vino-batch-pipeline',
    description='Scoring batch de los datos del vino',
    version='1.0',
)

# Display the published pipeline
published_pipeline

Name,Id,Status,Endpoint
vino-batch-pipeline,cf71cfaa-590f-4ccb-8f60-319968c02485,Active,REST Endpoint


In [21]:
# Read the endpoint URL of the published pipeline and show it
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

https://francecentral.api.azureml.ms/pipelines/v1.0/subscriptions/87f9793d-5515-43eb-b182-0f27b97da8b3/resourceGroups/nuclio/providers/Microsoft.MachineLearningServices/workspaces/aml-nuclio/PipelineRuns/PipelineSubmit/cf71cfaa-590f-4ccb-8f60-319968c02485


In [22]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Authenticate interactively and obtain the authentication header that is
# sent with the request to the pipeline's REST endpoint
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print('Autentificación preparada.')

Autentificación preparada.


Ahora se puede llamar a la interfaz REST. La canalización se ejecuta de manera asíncrona, por lo que la llamada devuelve el identificador de la ejecución.

In [23]:
import requests

# Endpoint of the published pipeline (re-read here so the cell is self-contained)
rest_endpoint = published_pipeline.endpoint

# Trigger a run of the published pipeline under the 'nuclio-vino-batch' experiment
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": "nuclio-vino-batch"})

# Surface HTTP errors (e.g. an expired token) here, instead of failing later
# with an obscure KeyError/JSON error while reading the body
response.raise_for_status()

# The response body carries the id of the run that was started
run_id = response.json()["Id"]
run_id

'2b8cd4cc-257b-474e-b7a6-4af7018b0c86'

In [24]:
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

# Look up the experiment, then attach to the run identified by run_id
batch_experiment = ws.experiments['nuclio-vino-batch']
published_pipeline_run = PipelineRun(batch_experiment, run_id)

# Wait until that run finishes, streaming its output
published_pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: 2b8cd4cc-257b-474e-b7a6-4af7018b0c86
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/2b8cd4cc-257b-474e-b7a6-4af7018b0c86?wsid=/subscriptions/87f9793d-5515-43eb-b182-0f27b97da8b3/resourcegroups/nuclio/workspaces/aml-nuclio&tid=e93a2455-92c2-4ff2-add1-a6a39d490ed6
PipelineRun Status: Running

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '2b8cd4cc-257b-474e-b7a6-4af7018b0c86', 'status': 'Completed', 'startTimeUtc': '2021-09-02T14:13:01.644062Z', 'endTimeUtc': '2021-09-02T14:13:02.894834Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 'HTTP', 'azureml.parameters': '{}', 'azureml.pipelineid': 'cf71cfaa-590f-4ccb-8f60-319968c02485'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://amlnuclio7384818405.blob.core.windows.net/azureml/ExperimentRun/dcid.2b8cd4cc-257b-474e-b7a6-4af7018b0c86/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b

'Finished'

In [25]:
import pandas as pd
import shutil

# Remove the folder with previously downloaded local results
shutil.rmtree('vino-results', ignore_errors=True)

# Get the first step of the run started through the REST endpoint
# (published_pipeline_run, not the original pipeline_run) and download its output
prediction_run = next(published_pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='vino-results')

# Walk the downloaded folder hierarchy looking for the results file.
# result_file is reset first so that the path found by an earlier cell can
# never be picked up by mistake.
result_file = None
for root, dirs, files in os.walk('vino-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# Fail with a clear message instead of silently showing stale results
if result_file is None:
    raise FileNotFoundError("No se encontró 'parallel_run_step.txt' en 'vino-results'")

# Each line has the form "<file>: <prediction>", hence the ':' delimiter
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Show the first 20 rows
df.head(20)

Unnamed: 0,File,Prediction
0,1.csv,0
1,10.csv,0
2,100.csv,1
3,11.csv,0
4,12.csv,0
5,13.csv,0
6,14.csv,0
7,15.csv,1
8,16.csv,0
9,17.csv,1


**Más información**: [Compilación de una canalización de Azure Machine Learning para la puntuación por lote](https://docs.microsoft.com/azure/machine-learning/how-to-run-batch-predictions) 