Copyright (c) Microsoft Corporation. All rights reserved.   
Licensed under the MIT License.

# Using AML Pipelines to train models on multiple datasets
### Run models in parallel
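Three independent scripts, each of which loads a dataset and trains a model, run as separate pipeline steps on the same Batch AI compute cluster. Because no step depends on another, they can run in parallel.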

In [None]:
import os
from azureml.core import Workspace, Run, Experiment

# Connect to the Azure ML workspace via Workspace.from_config().
ws = Workspace.from_config()

# Echo the workspace identity, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Local folder that holds the training scripts written by the %%writefile
# cells further down; it is later passed to each step as source_directory.
project_folder = "scripts"

# exist_ok=True makes this a no-op when the folder is already present and
# avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(project_folder, exist_ok=True)

In [None]:
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.core.datastore import Datastore
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.compute import BatchAiCompute, ComputeTarget

In [None]:
from azureml.core.compute_target import ComputeTargetException

# Reuse the Batch AI cluster with this name if the workspace already has one;
# otherwise provision it.
cpu_cluster_name = "cpu-cluster"
try:
    cpu_cluster = BatchAiCompute(ws, cpu_cluster_name)
    print("found existing cluster.")
except ComputeTargetException:
    # Only a failed compute-target lookup falls through to provisioning. A bare
    # `except:` would also swallow KeyboardInterrupt and unrelated errors and
    # then try to create a cluster anyway.
    print("creating new cluster")
    # min == max == 3 pins the cluster at three nodes; the pipeline below has
    # three independent steps.
    provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = "STANDARD_D2_v2",
                                                                    autoscale_enabled = True,
                                                                    cluster_min_nodes = 3, 
                                                                    cluster_max_nodes = 3)

    # create the cluster and block until provisioning finishes
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)
    cpu_cluster.wait_for_completion(show_output=True)    

# Datastore for output
We use the default blob datastore that comes with the workspace. 

In [None]:
# Fetch the workspace's default datastore; it is passed as the `datastore` of
# every PipelineData output declared later in this notebook.
default_datastore = ws.get_default_datastore()

## Write the training scripts to the project directory

In [None]:
%%writefile $project_folder/train_regression_boston.py
import argparse
import pickle
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from azureml.core.run import Run

parser = argparse.ArgumentParser("train lr")
parser.add_argument("--output_dir", type=str, help="output dir")
args = parser.parse_args()

X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LinearRegression()
clf.fit(X_train, y_train)

mse = mean_squared_error(y_test, clf.predict(X_test))
run = Run.get_context()
run.log("linear reg boston mse", mse)

os.makedirs(args.output_dir, exist_ok=True)

output_file = os.path.join(args.output_dir, "model.pkl")
with open(output_file, "wb") as fp:
    pickle.dump(clf, fp)

In [None]:
%%writefile $project_folder/train_rf_diabetes.py
import argparse
import pickle
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from azureml.core.run import Run

parser = argparse.ArgumentParser("train random forest on iris")
parser.add_argument("--output_dir", type=str, help="output dir")
args = parser.parse_args()

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

mse = mean_squared_error(y_test, clf.predict(X_test))
run = Run.get_context()
run.log("lr diabetes mse", mse)

os.makedirs(args.output_dir, exist_ok=True)

output_file = os.path.join(args.output_dir, "model.pkl")
with open(output_file, "wb") as fp:
    pickle.dump(clf, fp)

In [None]:
%%writefile $project_folder/train_gbt_digits.py
import argparse
import pickle
from sklearn.datasets import load_digits
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from azureml.core.run import Run

parser = argparse.ArgumentParser("train gbt")
parser.add_argument("--output_dir", type=str, help="output dir")
args = parser.parse_args()

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

accuracy = accuracy_score(y_test, clf.predict(X_test))
run = Run.get_context()
run.log("gbt digits accuracy", accuracy)

os.makedirs(args.output_dir, exist_ok=True)

output_file = os.path.join(args.output_dir, "model.pkl")
with open(output_file, "wb") as fp:
    pickle.dump(clf, fp)

In [None]:
from azureml.core.runconfig import CondaDependencies, RunConfiguration
# Conda dependencies for the training steps; scikit-learn is the only package
# requested explicitly here.
cd = CondaDependencies.create(conda_packages=['scikit-learn'])
# One run configuration shared by all three steps, with Docker turned on in
# its environment settings.
runconfig = RunConfiguration(conda_dependencies=cd)
runconfig.environment.docker.enabled = True

## Declare intermediate data

In [None]:
# One PipelineData output per model, all stored on the default datastore.
# Each one is handed to its training script as --output_dir.
lr_dir, rf_dir, gbt_dir = (
    PipelineData(output_name, datastore=default_datastore)
    for output_name in ("lr_model", "rf_model", "gbt_model")
)

## Define steps

In [None]:
# Settings shared by all three training steps: same script folder, same
# cluster, same run configuration.
step_settings = dict(
    source_directory=project_folder,
    target=cpu_cluster,
    runconfig=runconfig,
)

# Each step runs one training script and writes its model to its own
# PipelineData output. No step takes another step's output as input.
lr_step = PythonScriptStep(
    name="lr boston",
    script_name="train_regression_boston.py",
    arguments=["--output_dir", lr_dir],
    outputs=[lr_dir],
    **step_settings
)

# The script file is named after diabetes, but it trains on the iris dataset,
# so the step is named for what it actually does.
forest_step = PythonScriptStep(
    name="rf iris",
    script_name="train_rf_diabetes.py",
    arguments=["--output_dir", rf_dir],
    outputs=[rf_dir],
    **step_settings
)

gbt_step = PythonScriptStep(
    name="gbt digits",
    script_name="train_gbt_digits.py",
    arguments=["--output_dir", gbt_dir],
    outputs=[gbt_dir],
    **step_settings
)

## Create the pipeline

In [None]:
# Assemble the three training steps into a single pipeline in this workspace.
pipeline = Pipeline(workspace=ws, steps=[lr_step, forest_step, gbt_step])

In [None]:
# Validate the pipeline first, then submit it under the "lr_rf_gbt" experiment.
pipeline.validate()
experiment = Experiment(ws, "lr_rf_gbt")
pipeline_run = experiment.submit(pipeline)

## Monitor the run

In [None]:
from azureml.train.widgets import RunDetails
# Display the notebook widget for the submitted pipeline run.
RunDetails(pipeline_run).show()

In [None]:
#pipeline_run.wait_for_completion(show_output=True)

In [None]:
# Print the metrics logged by each child run of the pipeline, one line per run.
for child_run in pipeline_run.get_children():
    child_metrics = child_run.get_metrics()
    print("{}: {}".format(child_run.name, child_metrics))

## Clean up compute resources

In [None]:
#cpu_cluster.delete()