## References
- https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-machine-learning-pipelines
- https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.run.run?view=azure-ml-py
- https://azuredevopslabs.com/labs/vstsextend/aml/
- https://vladiliescu.net/3-ways-to-pass-data-between-azure-ml-pipeline-steps/ (3 ways to pass data between Azure ML pipeline steps)

In [35]:
import os

import pandas as pd
import numpy as np
import azureml.core
from azureml.core import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core import Datastore, Dataset

from azureml.core.compute import ComputeTarget, AmlCompute, ComputeInstance
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Environment, Experiment,ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

In [4]:
# Service-principal login for the Azure ML workspace.
#
# The client secret is read from the AZURE_CLIENT_SECRET environment variable
# instead of being embedded in the notebook, so it cannot leak through version
# control or shared copies. Any secret that was previously committed here must
# be treated as compromised and rotated in Azure AD.
# The non-secret identifiers can be overridden through the environment and
# otherwise fall back to the values this demo was written against.
tenant_id = os.environ.get("AZURE_TENANT_ID", "72f988bf-86f1-41af-91ab-2d7cd011db47")
client_id = os.environ.get("AZURE_CLIENT_ID", "2a81532b-016b-4c0e-aa43-bd9b97fbdaba")
client_secret = os.environ.get("AZURE_CLIENT_SECRET")
if not client_secret:
    # Fail fast with an actionable message rather than a late authentication error.
    raise RuntimeError(
        "AZURE_CLIENT_SECRET is not set; export the service principal secret "
        "in the environment before running this notebook."
    )

sp = ServicePrincipalAuthentication(tenant_id=tenant_id, # tenantID
                                    service_principal_id=client_id, # clientId
                                    service_principal_password=client_secret) # clientSecret

subscription_id = os.environ.get("AZURE_SUBSCRIPTION_ID", "7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7")

# Azure Machine Learning resource group NOT the managed resource group
resource_group = 'rg-mlops-demo-dev'

# Azure Machine Learning workspace name, NOT Azure Databricks workspace
workspace_name = 'ws-demo'
ws = Workspace.get(name=workspace_name,
                   auth=sp,
                   subscription_id=subscription_id,
                   resource_group=resource_group)
#ws.get_details()

# Default datastore attached to the workspace.
dstore = ws.get_default_datastore()


In [6]:
datastore_name = "dstore_diabetes"

# Register the ADLS Gen2 datastore only when the workspace does not already
# list it; otherwise reuse the existing registration. The service principal
# used for the workspace login is also the datastore credential.
if datastore_name not in ws.datastores:
    adls_datastore = Datastore.register_azure_data_lake_gen2(
        workspace=ws,
        datastore_name=datastore_name,
        filesystem="diabetes",        # name of the ADLS Gen2 filesystem
        account_name="sasampledata",  # ADLS storage account name
        tenant_id=tenant_id,          # tenant id of service principal
        client_id=client_id,          # client id of service principal
        client_secret=client_secret,  # client secret of service principal
    )
else:
    adls_datastore = ws.datastores[datastore_name]

In [10]:
compute_name = "ws-demo-compute"

# Look the compute instance up by name first; a new one is provisioned only
# when the lookup raises ComputeTargetException.
try:
    instance = ComputeInstance(workspace=ws, name=compute_name)
except ComputeTargetException:
    # Optional provisioning settings that are currently left disabled:
    #   vnet_resourcegroup_name='<my-resource-group>'
    #   vnet_name='<my-vnet-name>'
    #   subnet_name='default'
    #   admin_user_ssh_public_key='<my-sshkey>'
    compute_config = ComputeInstance.provisioning_configuration(
        vm_size='Standard_DS3_v2',
        ssh_public_access=False,
    )
    instance = ComputeInstance.create(ws, compute_name, compute_config)
    # Block until provisioning has finished, streaming progress to the cell.
    instance.wait_for_completion(show_output=True)
else:
    print('Found existing instance, use it.')

Found existing instance, use it.


In [12]:
dataset_name = "ds_diabetes"

# Build a tabular dataset from the CSV held in the ADLS datastore, then
# register it in the workspace with create_new_version=True. The value
# returned by register() is the cell's displayed result.
csv_location = (adls_datastore, "diabetes.csv")
train_ds = Dataset.Tabular.from_delimited_files(path=csv_location)
train_ds.register(workspace=ws, name=dataset_name, create_new_version=True)

{
  "source": [
    "('dstore_diabetes', 'diabetes.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "615dd694-5f49-4f6c-9818-84a0d50aacd0",
    "name": "ds_diabetes",
    "version": 1,
    "workspace": "Workspace.create(name='ws-demo', subscription_id='7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7', resource_group='rg-mlops-demo-dev')"
  }
}

In [13]:
# Load the tabular dataset into a pandas DataFrame for local inspection.
df = train_ds.to_pandas_dataframe()

In [14]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


## Without Pipeline

In [27]:
# This whole cell is a single triple-quoted string, so none of the code inside
# it runs; evaluating the cell only echoes the string back as its output.
# It sketches a direct (non-pipeline) submission of training/train.py through
# ScriptRunConfig on the compute instance.
# NOTE(review): inside the string, docker_config is created but never passed to
# ScriptRunConfig -- confirm the intent before re-enabling this code.
'''myenv = Environment("myenv")
myenv.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn==0.19.1'
                                                                           ,"numpy==1.14.5"
                                                                          ,"pandas==0.23.1"
                                                                          ,"scipy==1.0.0"])

# Enable Docker
docker_config = DockerConfiguration(use_docker=True)

# Attach Experiment
experiment_name = "mlops-demo"
exp = Experiment(workspace=ws, name=experiment_name)
print(exp.name, exp.workspace.name, sep="\n")

print("Submitting an experiment.")
src = ScriptRunConfig(
    source_directory="./code",
    script="training/train.py",
    compute_target=instance, 
      environment=myenv
)

run = exp.submit(config=src)
run

# Shows output of the run on stdout.
run.wait_for_completion(show_output=True, wait_post_processing=True)

# Raise exception if run fails
if run.get_status() == "Failed":
    raise Exception(
        "Training on local failed with following run status: {} and logs: \n {}".format(
            run.get_status(), run.get_details_with_logs()
        )
    )
    
run.get_metrics()
'''

'myenv = Environment("myenv")\nmyenv.python.conda_dependencies = CondaDependencies.create(conda_packages=[\'scikit-learn==0.19.1\'\n                                                                           ,"numpy==1.14.5"\n                                                                          ,"pandas==0.23.1"\n                                                                          ,"scipy==1.0.0"])\n\n# Enable Docker\ndocker_config = DockerConfiguration(use_docker=True)\n\n# Attach Experiment\nexperiment_name = "mlops-demo"\nexp = Experiment(workspace=ws, name=experiment_name)\nprint(exp.name, exp.workspace.name, sep="\n")\n\nprint("Submitting an experiment.")\nsrc = ScriptRunConfig(\n    source_directory="./code",\n    script="training/train.py",\n    compute_target=instance, \n      environment=myenv\n)\n\nrun = exp.submit(config=src)\nrun\n\n# Shows output of the run on stdout.\nrun.wait_for_completion(show_output=True, wait_post_processing=True)\n\n# Raise exception if run 

## With Pipeline

In [51]:
from azureml.core.runconfig import RunConfiguration

# Conda/pip packages the pipeline step's environment is built from.
step_dependencies = CondaDependencies.create(
    conda_packages=["scikit-learn", "numpy", "pandas", "scipy", "joblib"],
    pip_packages=[
        "azureml-sdk",
        "azureml-dataset-runtime[fuse,pandas]",
        "azureml-dataprep[pandas]",
    ],
    pin_sdk_version=False,
)

# Run configuration shared by pipeline steps: target the compute instance and
# use the dependency set above (user_managed_dependencies is switched off).
aml_run_config = RunConfiguration()
aml_run_config.target = instance
aml_run_config.environment.python.user_managed_dependencies = False
aml_run_config.environment.python.conda_dependencies = step_dependencies

In [31]:
from azureml.data import OutputFileDatasetConfig

# Output location in the ADLS datastore, using the 'result/{run-id}' path
# template, and a dataset registration named 'prepared_output_data' that is
# requested via register_on_complete.
output_destination = (adls_datastore, 'result/{run-id}')
output_data1 = OutputFileDatasetConfig(destination=output_destination)
output_data_dataset = output_data1.register_on_complete(name='prepared_output_data')

In [99]:
from azureml.pipeline.core import PipelineParameter

# Attach the experiment and echo its name and workspace.
experiment_name = "mlops-demo"
exp = Experiment(workspace=ws, name=experiment_name)
print(exp.name, exp.workspace.name, sep="\n")

# Pipeline parameters with defaults; they are forwarded to the training script
# as command-line arguments.
pipeline_param_fname = PipelineParameter(name="file_name", default_value="default_val")
pipeline_param_dstoraname = PipelineParameter(name="datastore_name", default_value=datastore_name)
train_script_args = [
    "--file_name", pipeline_param_fname,
    "--datastore_name", pipeline_param_dstoraname,
]

# Alternative arguments if you want to pass the default dataset instead:
#   ["--input", train_ds.as_named_input("train_ds"), "--output", output_data1, "--param1", pipeline_param]
train_step = PythonScriptStep(
    source_directory="./code",
    script_name="training/train.py",
    arguments=train_script_args,
    runconfig=aml_run_config,
    compute_target=instance,
    allow_reuse=True,
)


mlops-demo
ws-demo


In [100]:
# Steps that make up the pipeline.
compare_models = [train_step]

# Build the pipeline from the step list.
pipeline1 = Pipeline(workspace=ws, steps=[compare_models])

# Submit the pipeline under the 'DiabetesModel' experiment and wait for it to
# complete; the value returned by wait_for_completion() is the cell's output.
diabetes_experiment = Experiment(workspace=ws, name='DiabetesModel')
pipeline_run1 = diabetes_experiment.submit(pipeline1)
pipeline_run1.wait_for_completion()

Created step training/train.py [81a6ad21][2c65fe85-9fca-47b5-a34f-e461d9fb4484], (This step will run and generate new outputs)
Submitted PipelineRun ac8ea6f4-7a90-4818-bba0-84a1868913e8
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ac8ea6f4-7a90-4818-bba0-84a1868913e8?wsid=/subscriptions/7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7/resourcegroups/rg-mlops-demo-dev/workspaces/ws-demo&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRunId: ac8ea6f4-7a90-4818-bba0-84a1868913e8
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ac8ea6f4-7a90-4818-bba0-84a1868913e8?wsid=/subscriptions/7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7/resourcegroups/rg-mlops-demo-dev/workspaces/ws-demo&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRun Status: Running


StepRunId: 96100528-27f0-4d31-b9ba-4b315f4cb486
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/96100528-27f0-4d31-b9ba-4b315f4cb486?wsid=/subscriptions/7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7/resourcegroups/rg-mlo

'Finished'

In [None]:
# Publish the pipeline behind the submitted run; the same text is used for
# both its name and its description.
pipeline_title = "Diabetes retrain Pipeline"
published_pipeline1 = pipeline_run1.publish_pipeline(
    name=pipeline_title,
    description=pipeline_title,
    version="1.0",
)

In [101]:
# List the files recorded against the run (logs and outputs).
pipeline_run1.get_file_names()

['logs/azureml/executionlogs.txt',
 'logs/azureml/stderrlogs.txt',
 'logs/azureml/stdoutlogs.txt',
 'outputs/sklearn_regression_model.pkl']

In [102]:
# Register the pickled model from the run's outputs under the name "diabetes".
pipeline_run1.register_model(model_name ="diabetes", model_path = "outputs/sklearn_regression_model.pkl")

Model(workspace=Workspace.create(name='ws-demo', subscription_id='7e48a1e8-8d3e-4e00-8bc0-098c43f5ace7', resource_group='rg-mlops-demo-dev'), name=diabetes, id=diabetes:2, version=2, tags={}, properties={})