### Load your Libraries


In [90]:
# Azure Libraries
from azureml.core import Datastore
from azureml.core.dataset import Dataset
from azureml.core.workspace import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.experiment import Experiment
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.explain.model._internal.explanation_client import ExplanationClient
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal, BayesianParameterSampling, uniform, choice
from azureml.core import Run
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails

### Creating an Azure Machine Learning Pipeline
In this notebook, we will show how to prepare data, train a model, register the model, and deploy it using an Azure Machine Learning (AML) pipeline.

### Connect to Your Workspace

In [91]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()

# Report the SDK version and the workspace name as a quick sanity check
sdk_version = azureml.core.VERSION
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, ws.name))

Ready to use Azure ML 1.6.0 to work with ml-teaching-workspace


### Retrieve your Datasets by name


In [92]:
# Name under which the raw auto-mpg dataset is registered in the workspace
df_name = "auto-mpg-classification-input"  # CHANGE HERE

# Fetch the latest registered version of that dataset
df_tab = Dataset.get_by_name(workspace=ws, name=df_name, version='latest')



### Create Scripts for Pipeline Steps
Start by creating a folder to contain the scripts for each step/task of the pipeline.

In [93]:
import os

# Directory that holds one script per pipeline step; created if missing
experiment_folder = 'auto_mpg_pipeline'
os.makedirs(name=experiment_folder, exist_ok=True)

print(experiment_folder)

auto_mpg_pipeline


The first step is to transform the raw data to prepare the training dataset for the model.

In [94]:
%%writefile $experiment_folder/data_prep.py
# Data preparation step of the auto-mpg pipeline.
#
# Reads the raw dataset attached to the run under the name 'raw_data',
# cleans it, one-hot encodes the Origin column, splits off the MPG target,
# and pickles features and labels into the directory given by --clean_data.

# Import libraries
import os  # required for os.makedirs / os.path.join below (was missing)
import argparse

import joblib
import numpy as np
import pandas as pd
from azureml.core import Run
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--clean_data", type=str, dest='clean_data', help="output_training_data directory")
args = parser.parse_args()
clean_data = args.clean_data

# Get the experiment run context
run = Run.get_context()

# Load the mpg data (passed as an input dataset named 'raw_data')
print("Loading Data...")
dataset = run.input_datasets['raw_data']

# Transform the data
df = dataset.to_pandas_dataframe()

# Drop the last column; it carries no useful information for the model
df_column9_dropped = df.drop(['Column9'], axis=1)

# The remaining columns have no meaningful headers, so assign them explicitly
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']
df_column9_dropped.columns = column_names

# Remove rows whose Horsepower is the '?' placeholder.
# .copy() makes the filtered frame independent, so the column assignment
# below does not operate on a slice of the original frame
# (avoids pandas' SettingWithCopyWarning).
df_with_headers = df_column9_dropped[df_column9_dropped.Horsepower != '?'].copy()

# The "Origin" column is really categorical, not numeric, so map the codes
# to labels and convert them to one-hot columns
df_with_headers['Origin'] = df_with_headers['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
train_dataset = pd.get_dummies(df_with_headers, columns=["Origin"], prefix=["Origin_is"])

# Separate the features from the target variable (MPG)
train_labels = train_dataset.pop('MPG')

# Horsepower may still be stored as text after the '?' rows were removed
train_dataset["Horsepower"] = pd.to_numeric(train_dataset["Horsepower"])

# Save the clean dataset for the next step (e.g. the model training step)
os.makedirs(clean_data, exist_ok=True)
train_dataset_output_path = os.path.join(clean_data, "train_dataset.pkl")
train_labels_output_path = os.path.join(clean_data, "train_labels.pkl")

train_labels.to_pickle(train_labels_output_path)
train_dataset.to_pickle(train_dataset_output_path)

run.complete()

Overwriting auto_mpg_pipeline/data_prep.py


The second step is to train the model using the cleaned data from the previous step and save the model for the next step.

In [95]:
%%writefile $experiment_folder/Regression_XGBoost.py
# Training step of the auto-mpg pipeline.
#
# Loads the pickled features and labels written by the data prep step from
# the directory given by --clean_data, fits an XGBoost regressor on them,
# and saves the fitted model under ./outputs so it is not lost when the
# script exits.

# Load in Libraries
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from azureml.core import Run
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
from interpret.ext.blackbox import MimicExplainer
from interpret.ext.blackbox import TabularExplainer
from interpret.ext.glassbox import LGBMExplainableModel
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from azureml.core.dataset import Dataset
from sklearn import metrics
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
# Must be set before xgboost is imported
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBRegressor

run = Run.get_context()

# Load in Arguments.
parser = argparse.ArgumentParser()
parser.add_argument('--clean_data', type=str, dest='clean_data',  help='clean_data location')
args = parser.parse_args()
clean_data = args.clean_data

# Load the training data produced by the data prep step
train_data_path = os.path.join(clean_data, "train_dataset.pkl")
train_labels_path = os.path.join(clean_data, "train_labels.pkl")

train_dataset = pd.read_pickle(train_data_path)
train_labels = pd.read_pickle(train_labels_path)

# Fit the regressor with fixed hyperparameters and a fixed seed
xgb = XGBRegressor(colsample_bytree=0.6,
             gamma=0.1,
             learning_rate=0.07,
             max_depth=5,
             min_child_weight=6,
             n_estimators=100,
             reg_alpha=0.01,
             reg_lambda=0.45,
             subsample=0.6,
             seed=42,objective='reg:squarederror')
xgb.fit(train_dataset,train_labels)

# Persist the fitted model. Previously the model was discarded when the
# script ended. Azure ML's documentation states that files written to
# ./outputs are uploaded to the run record, so the model can be retrieved
# from there afterwards.
model_dir = 'outputs'
os.makedirs(model_dir, exist_ok=True)
joblib.dump(value=xgb, filename=os.path.join(model_dir, 'auto_mpg_xgb_model.pkl'))

run.complete()

Overwriting auto_mpg_pipeline/Regression_XGBoost.py


### Set Up Compute Cluster and Environment

In [96]:

# The pipeline steps run on a compute target that needs a Python environment
# with the required package dependencies installed, so this cell builds that
# environment and wraps it in a run configuration.

from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Look up the compute cluster that was created earlier in this workspace
compute_name = 'aml-cluster'
compute_target = ComputeTarget(workspace=ws, name=compute_name)

# Pinned package dependencies for the pipeline scripts
auto_mpg_packages = CondaDependencies.create(
    conda_packages=[
        'scikit-learn==0.20.3',
        'numpy==1.16.2',
        'matplotlib==3.2.1',
        'joblib==0.14.1',
        'xgboost==0.90',
        'seaborn==0.9.0',
        'lightgbm==2.3.0',
    ],
    pip_packages=[
        'azureml-defaults==1.3.0',
        'azureml-contrib-interpret==1.3.0',
        'azureml-explain-model==1.3.0',
        'azureml-dataprep[pandas]==1.4.3',
        'pyarrow==0.15.1',
    ],
)

# Python environment for the experiment
auto_mpg_env = Environment(name="auto-mpg-pipeline-env")
auto_mpg_env.python.user_managed_dependencies = False  # Azure ML manages the dependencies
auto_mpg_env.docker.enabled = True  # run inside a docker container
auto_mpg_env.python.conda_dependencies = auto_mpg_packages

# Register the environment so it can be reused, then read the registered copy back
auto_mpg_env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name='auto-mpg-pipeline-env')

# Run configuration shared by the pipeline steps: the cluster plus the environment
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = compute_target
pipeline_run_config.environment = registered_env

print ("Run configuration created.")




Run configuration created.


### Create and Run a Pipeline

In [97]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.estimator import Estimator

# Intermediate pipeline data: the data prep step writes the cleaned
# dataset here and the training step reads it back.
clean_data = PipelineData("clean_data", datastore=ws.get_default_datastore())

# Step 1: run data_prep.py on the raw dataset and write the cleaned data
clean_data_step = PythonScriptStep(
    name="Data Prep",
    script_name="data_prep.py",
    source_directory=experiment_folder,
    arguments=['--clean_data', clean_data],
    inputs=[df_tab.as_named_input('raw_data')],
    outputs=[clean_data],
    compute_target=compute_target,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 2: run Regression_XGBoost.py through an Estimator, using the
# cleaned data from step 1 as input
estimator = Estimator(
    source_directory=experiment_folder,
    entry_script='Regression_XGBoost.py',
    compute_target=compute_target,
    environment_definition=pipeline_run_config.environment,
)

train_step = EstimatorStep(
    name="Train Model",
    estimator=estimator,
    estimator_entry_script_arguments=['--clean_data', clean_data],
    inputs=[clean_data],
    compute_target=compute_target,
    allow_reuse=True,
)

print("Pipeline steps defined")


Pipeline steps defined


OK, now you're ready to build the pipeline from the steps you've defined and run it as an experiment.



In [98]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the pipeline from the data prep and training steps
pipeline_steps = [clean_data_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline as a run of the named experiment;
# regenerate_outputs=True is passed so step outputs are regenerated
experiment = Experiment(workspace=ws, name='auto-mpg-end-to-end-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the run widget, then block until the run finishes; the final status
# is the value of the last expression and is displayed as the cell output
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion()

Pipeline is built.
Created step Data Prep [b5031ab5][b176797d-5b33-463c-895f-9b6c6e315021], (This step will run and generate new outputs)Created step Train Model [dfa96c52][4ab0a6fa-7231-4312-9195-205b7bc09559], (This step will run and generate new outputs)

Submitted PipelineRun d80e081f-68db-4290-8b07-df9bb974ce92
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/auto-mpg-end-to-end-pipeline/runs/d80e081f-68db-4290-8b07-df9bb974ce92?wsid=/subscriptions/dcfc206a-203b-4c00-a236-bdf576a37896/resourcegroups/ml-teaching/workspaces/ml-teaching-workspace
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: d80e081f-68db-4290-8b07-df9bb974ce92
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/auto-mpg-end-to-end-pipeline/runs/d80e081f-68db-4290-8b07-df9bb974ce92?wsid=/subscriptions/dcfc206a-203b-4c00-a236-bdf576a37896/resourcegroups/ml-teaching/workspaces/ml-teaching-workspace
PipelineRun Status: Running


StepRunId: 05d185bd-e0ca-4d7c-90f9-10c8d1075386
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/auto-mpg-end-to-end-pipeline/runs/05d185bd-e0ca-4d7c-90f9-10c8d1075386?wsid=/subscriptions/dcfc206a-203b-4c00-a236-bdf576a37896/resourcegroups/ml-teaching/workspaces/ml-teaching-workspace
StepRun( Data Prep ) Status: NotStarted
StepRun( Data Prep ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_8aa3614fe830113895518f8f8d274bf65f36b50b25a77ece532e9d9dfca04aa7_d.txt
2020-06-12T03:26:01Z Starting output-watcher...
2020-06-12T03:26:01Z IsDedicatedCompute == True, won't poll for Low Pri Preemption

Streaming azureml-l




StepRunId: bd21ad66-e56f-403f-9315-2c9a0a7190d4
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/auto-mpg-end-to-end-pipeline/runs/bd21ad66-e56f-403f-9315-2c9a0a7190d4?wsid=/subscriptions/dcfc206a-203b-4c00-a236-bdf576a37896/resourcegroups/ml-teaching/workspaces/ml-teaching-workspace
StepRun( Train Model ) Status: Queued
StepRun( Train Model ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_8aa3614fe830113895518f8f8d274bf65f36b50b25a77ece532e9d9dfca04aa7_d.txt
2020-06-12T03:28:29Z Starting output-watcher...
2020-06-12T03:28:29Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
9dc8cf12d9c5489583399c606ed65847a7c1595829033cf9098d5c7bac501405

Streaming azureml-logs/65_job_prep-tvmps_8aa3614fe830113895518f8f8d274bf65f36b50b25a77ece532e9d9dfca04aa7_d.txt
Entering job preparation. Current time:2020-06-12T03:28:32.061108
Starting job preparation. Current time:2020-06-12T03:28:32.956271
Extracting the control code.
fetching and extract



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'd80e081f-68db-4290-8b07-df9bb974ce92', 'status': 'Completed', 'startTimeUtc': '2020-06-12T03:22:55.184669Z', 'endTimeUtc': '2020-06-12T03:28:53.343107Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlteachingwork9525397520.blob.core.windows.net/azureml/ExperimentRun/dcid.d80e081f-68db-4290-8b07-df9bb974ce92/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=I4iA%2BbBott3hSwhMakob0MYveuLXb1yY57Ae8S0oHKo%3D&st=2020-06-12T03%3A18%3A55Z&se=2020-06-12T11%3A28%3A55Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://mlteachingwork9525397520.blob.core.windows.net/azureml/ExperimentRun/dcid.d80e081f-68db-4290-8b07-df9bb974ce92/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=yz0E%2FoShzwGZQaAOlGCKZnQsr7OHTgwZKFRKp73bfO4%3D&st=2020-06-12T03%3A18%3A55Z&se=2020-06-12T1

'Finished'