# Simple parallel batch processing of multiple files

We have weather data partitioned by year and month, e.g. `greece-weather-data/{year}/{month}/data.parquet`. We want to batch process all of these files on the `cpu-cluster` compute target using 10 processes in total: 5 on each of 2 nodes.
Each process will be invoked multiple times, receiving a mini-batch of up to 10 files on each invocation.
The script will print which files it is processing on every invocation.

Results will be stored in the `inferences/greece-weather/outputs.txt` file on the workspace's default datastore.


In [None]:
from azureml.core import Workspace

# Connect to the workspace, then look up the cluster the batch job will run on
# and the workspace's default datastore (used for both input and output data).
ws = Workspace.from_config()
compute_target = ws.compute_targets["cpu-cluster"]
dstore = ws.get_default_datastore()

In [None]:
from azureml.core import Dataset

# The data is date-partitioned (greece-weather-data/{year}/{month}/data.parquet),
# so a recursive glob is used to match the parquet file of every partition.
file_paths = [(dstore, "greece-weather-data/**/*.parquet")]
file_ds = Dataset.File.from_files(validate=True, path=file_paths)

In [None]:
from azureml.pipeline.core import PipelineParameter
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

# Expose the dataset as a pipeline parameter named "dataset", with file_ds as
# its default value, and wrap it so the step can consume it as "input_dataset".
ds_pipeline_param = PipelineParameter(default_value=file_ds, name="dataset")
step01_input_dataset = DatasetConsumptionConfig(
    name="input_dataset",
    dataset=ds_pipeline_param,
)

In [None]:
from azureml.pipeline.steps import ParallelRunConfig

# Parallel step configuration, grouped by concern.
parallel_run_config = ParallelRunConfig(
    # What to run: script/file_batch.py inside the AzureML-Tutorial environment.
    source_directory="script",
    entry_script="file_batch.py",
    environment=ws.environments["AzureML-Tutorial"],
    # Where to run: 2 nodes x 5 processes per node = 10 worker processes.
    compute_target=compute_target,
    node_count=2,
    process_count_per_node=5,
    # How to feed it: 10 files per invocation, 600 s allowed per invocation.
    mini_batch_size=10,
    run_invocation_timeout=600,
    # -1 means failures are ignored rather than aborting the job.
    error_threshold=-1,
    # How to collect results: every invocation's output is appended to one file.
    output_action="append_row",
    append_row_file_name="outputs.txt",
)

In [None]:
from azureml.data import OutputFileDatasetConfig

# Destination for the inferences: a folder on the default datastore
step_output = OutputFileDatasetConfig(
    destination=(dstore, "/inferences/greece-weather/"),
    name="results_store",
)

In [None]:
from azureml.pipeline.steps import ParallelRunStep

# Assemble the parallel step from the run configuration, the input dataset
# and the output location defined in the previous cells.
parallel_step = ParallelRunStep(
    name="parallel-inference",
    parallel_run_config=parallel_run_config,
    inputs=[step01_input_dataset],
    output=step_output,
    # Do not reuse the results of an earlier run of this step.
    allow_reuse=False,
)

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# A single-step pipeline containing only the parallel inference step.
pipeline = Pipeline(steps=[parallel_step], workspace=ws)

# Submit the pipeline under the "parallel-file-inference-run" experiment.
pipeline_run = Experiment(
    workspace=ws,
    name="parallel-file-inference-run",
).submit(pipeline)