Copyright (c) Microsoft Corporation. All rights reserved.  
Licensed under the MIT License.

# AML Pipeline with AdlaStep
This notebook demonstrates how to use AdlaStep in an Azure Machine Learning (AML) pipeline.

## AML and Pipeline SDK-specific imports

In [None]:
import os
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core import Workspace, Run, Experiment
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import AdlaStep
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
from azureml.core import attach_legacy_compute_target

# Check core SDK version number.
# The `from azureml.core import ...` statements above do not bind the
# top-level name `azureml`, so import the package explicitly before
# reading its VERSION attribute.
import azureml.core

print("SDK version:", azureml.core.VERSION)

## Initialize Workspace

Initialize a workspace object from persisted configuration. Make sure the config file is present at .\config.json

In [None]:
# Build the workspace object from the persisted configuration file.
ws = Workspace.from_config()

# Show the identifying workspace properties, one per line.
print("\n".join(
    str(value)
    for value in (ws.name, ws.resource_group, ws.location, ws.subscription_id)
))

In [None]:
# Folder that is later handed to the Pipeline as its default source directory.
project_folder = './scripts'
print(f'projects will be created in {project_folder}.')

## Register Datastore

In [None]:
# Placeholders: replace every value below with the details of your own
# Azure Data Lake Store (ADLS) account and of the service principal used to
# access it. NOTE: do not commit a real client secret to source control.
subscription_id = "<my-subscription-id>"  # subscription id of ADLS account
resource_group = "<my-rg>"                # resource group of ADLS account
store_name = "<my-storename>"             # ADLS account name
tenant_id = "<my-tenant>"                 # tenant id of service principal
client_id = "<my-client-id>"              # client id of service principal
client_secret = "<my-client-secret>"      # the secret of service principal

# Register the ADLS account with the workspace under the datastore name
# 'adlstoretestdata'.
adls_datastore = Datastore.register_azure_data_lake(
    workspace=ws,
    datastore_name='adlstoretestdata',
    subscription_id=subscription_id,
    resource_group=resource_group,
    store_name=store_name,
    tenant_id=tenant_id,
    client_id=client_id,
    client_secret=client_secret)

## Create DataReferences and PipelineData

In [None]:
# Handle to the datastore named "adlstoretestdata" in the workspace.
adls_datastore = Datastore(workspace=ws, name="adlstoretestdata")

# Step input: the path testdata/testdata.txt on that datastore, referenced
# under the name "script_input".
script_input = DataReference(
    data_reference_name="script_input",
    path_on_datastore="testdata/testdata.txt",
    datastore=adls_datastore,
)

# Step output: pipeline data named "script_output" tied to the same datastore.
script_output = PipelineData(
    "script_output",
    datastore_name="adlstoretestdata",
)

## Setup Data Lake Account

In [None]:
# Name of the ADLA compute target to look up (or create) in the workspace.
adla_compute_name = "testadl"

from azureml.core.compute import ComputeTarget, AdlaCompute

def get_or_create_adla_compute(workspace, compute_name):
    """Return the ADLA compute target named ``compute_name``, creating it if absent.

    Args:
        workspace: Workspace in which to look up or create the compute target.
        compute_name: Name of the ADLA compute target.

    Returns:
        The existing ``AdlaCompute`` if the lookup succeeds; otherwise the
        compute target returned by ``ComputeTarget.create`` after waiting for
        its provisioning.

    Raises:
        ComputeTargetException: If the lookup fails for any reason other than
            the compute target not being found.
    """
    try:
        return AdlaCompute(workspace, compute_name)
    except ComputeTargetException as e:
        # Only a missing compute target is recoverable. Anything else is
        # re-raised with a bare `raise` so the original traceback is kept.
        if 'ComputeTargetNotFound' not in e.message:
            raise

    # Creation runs outside the `except` handler so that a failure here is
    # not reported as chained to the "not found" exception above.
    print('adla compute not found, creating...')
    provisioning_config = AdlaCompute.provisioning_configuration()
    adla_compute = ComputeTarget.create(workspace, compute_name, provisioning_config)
    adla_compute.wait_for_provisioning()
    return adla_compute
            
# Look up the ADLA compute target in the workspace, creating it if it does
# not exist yet.
adla_compute = get_or_create_adla_compute(ws, adla_compute_name)

# Report the provisioning outcome and which ADLA resource is in use.
print("ADLA compute state:{}".format(adla_compute.provisioning_state))
# This line reports provisioning errors, so it is labelled as such rather
# than repeating the "state" label.
print("ADLA compute errors:{}".format(adla_compute.provisioning_errors))
print("Using ADLA compute:{}".format(adla_compute.cluster_resource_id))

# Equivalent CLI commands:
# Create a new ADLA compute: az ml computetarget setup adla -n <name>
# Attach an existing one (BYOC, "bring your own compute"): az ml computetarget attach adla -n <name> -i <resource-id>

## Create an AdlaStep

**AdlaStep** is used to run a U-SQL script using Azure Data Lake Analytics.

- **name:** Name of module
- **script_name:** name of U-SQL script
- **inputs:** List of input port bindings
- **outputs:** List of output port bindings
- **adla_compute:** the ADLA compute to use for this job
- **params:** Dictionary of name-value pairs to pass to U-SQL job *(optional)*
- **degree_of_parallelism:** the degree of parallelism to use for this job *(optional)*
- **priority:** the priority value to use for the current job *(optional)*
- **runtime_version:** the runtime version of the Data Lake Analytics engine *(optional)*
- **root_folder:** folder that contains the script, assemblies etc. *(optional)*
- **hash_paths:** list of paths to hash to detect a change (script file is always hashed) *(optional)*

In [None]:
# Pipeline step that runs the U-SQL script test_adla_script.usql on the ADLA
# compute, bound to the input and output declared earlier.
adla_step = AdlaStep(
    adla_compute=adla_compute,
    script_name='test_adla_script.usql',
    name='adla_script_step',
    inputs=[script_input],
    outputs=[script_output],
)

## Build and Submit the Experiment

In [None]:
# Assemble a single-step pipeline; project_folder is passed as the default
# source directory.
pipeline = Pipeline(
    description="adla_101",
    workspace=ws,
    steps=[adla_step],
    default_source_directory=project_folder)

# Submit the pipeline as an experiment in the workspace loaded earlier in
# this notebook (`ws`). The experiment name is set explicitly here; change it
# to group runs under a different experiment.
experiment_name = "adla_101"
pipeline_run = Experiment(ws, experiment_name).submit(pipeline)

# Block until the pipeline run has completed.
pipeline_run.wait_for_completion()

### View Run Details

In [None]:
from azureml.train.widgets import RunDetails
# Show the RunDetails widget for the submitted pipeline run.
run_details_widget = RunDetails(pipeline_run)
run_details_widget.show()

### Examine the run
You can iterate over the step runs of the pipeline run and examine the job log, stdout, and stderr of each step.

In [None]:
# Walk every child run of the pipeline run, reporting its status and logs.
step_runs = pipeline_run.get_children()
for step_run in step_runs:
    status = step_run.get_status()
    print('node', step_run.name, 'status:', status)

    if status == "Failed":
        # For a failed step, print every log and also save the job log to a
        # file named after the step.
        joblog = step_run.get_job_log()
        print('job log:', joblog)
        print('stdout log:', step_run.get_stdout_log())
        print('stderr log:', step_run.get_stderr_log())
        with open("logs-" + step_run.name + ".txt", "w") as log_file:
            log_file.write(joblog)
            print("Job log written to logs-"+ step_run.name + ".txt")
    elif status == "Finished":
        # For a finished step only the stdout log is printed.
        print('stdout log:', step_run.get_stdout_log())