# Create a simple/non parallel batch inference pipeline

Create a batch inference pipeline that reads data from the registered dataset and stores the outputs in the default datastore, in the path defined in the variables cell below.

In [None]:
# Names shared with the requirements notebook, which registers both assets
model_name = "simple-model"  # name under which the model is registered
dataset_name = "simple-batch"  # name under which the dataset is registered

# Pipeline settings
compute_cluster_name = 'cpu-cluster'
pipeline_name = 'simple-batch-pipeline'

# Folder in the default datastore that receives the output csv file
output_folder = "simple-batch-output"

## Create the inference script
Generate the python script that will be doing the batch inference:
- Load the model by name. Model name passed as parameter.
- Load dataset. Name of dataset passed as parameter. The script could be loading the inputs from the datastore directly instead of having to register a dataset.
- Make inferences
- Store the output in a folder passed as an argument. The script doesn't need to know where that folder will ultimately be stored.


We also copy the SimpleModel module next to the script and generate a conda environment file, which will be used by the pipeline.


In [None]:
# Create a folder to store the script
import os

script_path = 'script_folder'

# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(script_path, exist_ok=True)

In [None]:
# Copy the module that defines our simple model into the script folder.
# inference.py imports SimpleModel, so the module has to ship alongside
# the scoring file.
! cp SimpleModel.py script_folder/

In [None]:
%%writefile script_folder/inference.py

import argparse

# Retrieve the arguments of this script.
# Default values are provided so the script can be run and debugged locally:
#   python inference.py
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input-dataset",
    type=str,
    dest="input_dataset_name",
    help="The dataset name to use for inference",
    default="simple-batch",
)
parser.add_argument(
    "--model-name",
    type=str,
    dest="model_name",
    help="The model to use to do inferences",
    default="simple-model",
)
parser.add_argument(
    "--output-folder",
    type=str,
    dest="output_folder",
    help="Where to store the processed outputs in csv files",
    default="./output",
)
args = parser.parse_args()

# Get a reference to the workspace to be able to load things
from azureml.core import Workspace
from azureml.core.run import Run, _OfflineRun

run = Run.get_context()
# isinstance (rather than an exact type comparison) is the idiomatic check
# and also matches subclasses of _OfflineRun.
if isinstance(run, _OfflineRun):
    # No submitted run is attached: fall back to the local workspace config
    ws = Workspace.from_config()
else:
    ws = run.experiment.workspace

# Download and hydrate (deserialize) the model
from azureml.core import Model

azureml_model = Model(ws, args.model_name)
# Download latest artifacts, overriding them
azureml_model.download("./", exist_ok=True)

import os

import joblib

# Not referenced by name below; kept so the class definition is imported
# before joblib unpickles the model.
# NOTE(review): presumably the serialized object is a SimpleModel - confirm.
from SimpleModel import SimpleModel

# NOTE(review): assumes the registered artifacts unpack to model/model.joblib;
# verify against the registration step in the requirements notebook.
model_path = os.path.join("model", "model.joblib")
model = joblib.load(model_path)  # Note that we don't call the constructor
# The target column is stored in the instance we hydrated in the
# requirements notebook

# Get dataset
from azureml.core import Dataset

ds = Dataset.get_by_name(ws, args.input_dataset_name)
inferenced_df = ds.to_pandas_dataframe()

# Make inferences and store them as a new column in the dataset
inferenced_df["outputs"] = model.predict(inferenced_df)

# Store the results.
# exist_ok=True creates the output folder only when it is missing, without
# the race of a separate os.path.exists() check.
output_path = args.output_folder
os.makedirs(output_path, exist_ok=True)
output_file_path = os.path.join(output_path, "results.csv")
inferenced_df.to_csv(output_file_path, index=False)


In [None]:
%%writefile script_folder/conda_env.yaml
# Conda environment specification written next to the inference script.
# Packages should be pinned to explicit versions;
# for demo purposes they are left unpinned here.
# Also note the azureml-defaults package mentioned in https://docs.microsoft.com/en-us/azure/machine-learning/concept-environments#types-of-environments
name: inference-env
dependencies:
  - python=3.6
  - scikit-learn
  - pip
  - pip:
    - azureml-defaults

## Create the pipeline

In [None]:
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget

# Connect to the workspace using the local configuration
ws = Workspace.from_config()

# Get a reference to the compute cluster, looked up by name in the workspace
compute_cluster = ComputeTarget(
    workspace=ws,
    name=compute_cluster_name,
)

In [None]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Run configuration that the pipeline steps will use
run_config = RunConfiguration()

# Build the environment from the conda specification file and attach it
run_env = Environment.from_conda_specification(
    "run-environment",
    'script_folder/conda_env.yaml',
)
run_config.environment = run_env


In [None]:
from azureml.data import OutputFileDatasetConfig

# For more details, run help(OutputFileDatasetConfig)
datastore = ws.get_default_datastore()

# Output location: the configured folder inside the default datastore
output_destination = (datastore, output_folder)
output = OutputFileDatasetConfig(destination=output_destination)

In [None]:
from azureml.pipeline.steps import PythonScriptStep

# Command-line arguments handed to inference.py
step_arguments = [
    "--input-dataset", dataset_name,
    "--model-name", model_name,
    "--output-folder", output,
]

# Single step of the pipeline: run the inference script on the cluster
step_01 = PythonScriptStep(
    script_name='inference.py',
    name='Run inference',
    source_directory='script_folder',
    arguments=step_arguments,
    outputs=[output],
    compute_target=compute_cluster,
    runconfig=run_config,
    allow_reuse=False,
)

In [None]:
from azureml.pipeline.core import Pipeline

# Assemble the pipeline from its single step
pipeline = Pipeline(workspace=ws, steps=[step_01])

# Publish it under the configured pipeline name
published_pipeline = pipeline.publish(
    name=pipeline_name,
    description="Batch inference with predefined output",
)

In [None]:
from azureml.core import Experiment

# Experiment under which the pipeline runs are recorded
experiment = Experiment(ws, f"{pipeline_name}-runs")

# Submit the published pipeline and block until the run finishes
pipeline_run1 = experiment.submit(published_pipeline)
pipeline_run1.wait_for_completion()