# Pipelines avec Azure ML

<img src='https://github.com/retkowsky/images/blob/master/AzureMLservicebanniere.png?raw=true'>

> https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines

## 1. Infos

In [1]:
import sys
# Display the version string of the Python interpreter running this kernel
sys.version

'3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) \n[GCC 7.3.0]'

In [2]:
import datetime
# Record when the notebook was executed (naive datetime, i.e. the kernel's local time)
now = datetime.datetime.now()
print(now)

2020-03-10 15:50:03.226976


In [3]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.widgets import RunDetails

# Print the version of the installed Azure ML SDK
print("Version Azure ML service : ", azureml.core.VERSION)

Version Azure ML service :  1.0.83


In [4]:
# Load the workspace from its configuration (Workspace.from_config)
ws = Workspace.from_config()

## 2. Données

In [5]:
from azureml.core import Dataset

# Local CSV files to push to the workspace's default datastore
local_csv_files = ['./donnees/diabetes.csv', './donnees/diabetes2.csv']

default_ds = ws.get_default_datastore()
default_ds.upload_files(
    files=local_csv_files,
    target_path='diabetes-data/',  # folder path inside the datastore
    overwrite=True,                # replace existing files of the same name
    show_progress=True,
)

# Build a tabular dataset over every CSV found in that datastore folder
# (this may take a short while), then register it in the workspace.
unregistered_ds = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))
tab_data_set = unregistered_ds.register(
    workspace=ws,
    name='diabetes dataset',
    description='diabetes data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

print('OK')

Uploading an estimated of 2 files
Uploading ./donnees/diabetes.csv
Uploading ./donnees/diabetes2.csv
Uploaded ./donnees/diabetes2.csv, 1 files out of an estimated total of 2
Uploaded ./donnees/diabetes.csv, 2 files out of an estimated total of 2
Uploaded 2 files
OK


In [6]:
from azureml.core import ComputeTarget, Datastore, Dataset

# Inventory of the workspace: compute targets, datastores and datasets.

print("Compute Targets :")
# Read the ws.compute_targets dictionary once and walk its items, instead of
# re-reading the property on every iteration just to index it by name.
for compute_name, compute in ws.compute_targets.items():
    print("\t", compute.name, ':', compute.type)

print("Datastores :")
for datastore_name in ws.datastores:
    datastore = Datastore.get(ws, datastore_name)
    print("\t", datastore.name, ':', datastore.datastore_type)

print("Datasets :")
for dataset_name in list(ws.datasets.keys()):
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name)

Compute Targets :
	 pipeline-cpu : AmlCompute
	 aks-exemple : AKS
	 gpu-cluster2 : AmlCompute
	 gpuclusterNC6 : AmlCompute
Datastores :
	 azureml_globaldatasets : AzureBlob
	 workspacefilestore : AzureFile
	 workspaceblobstore : AzureBlob
Datasets :
	 mnist dataset
	 diabetes dataset


## 3. Création des scripts pour le pipeline

In [7]:
import os

# Create a folder for the pipeline step files
# (exist_ok=True keeps the cell safe to re-run when the folder already exists)
experiment_folder = 'WorkshopPipelines'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

WorkshopPipelines


In [8]:
%%writefile $experiment_folder/train_diabetes.py
# Training script for the pipeline: trains a decision tree on the diabetes
# dataset, logs accuracy / AUC / ROC curve to the run, and saves the model
# as <output_folder>/model.pkl.

# Import libraries
import os  # needed for os.makedirs below; it was used without being imported
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--output_folder', type=str, dest='output_folder', default="diabetes_model", help='output folder')
args = parser.parse_args()
output_folder = args.output_folder

# Get the experiment run context
run = Run.get_context()

# load the diabetes data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['diabetes_train'].to_pandas_dataframe()

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set (70%) and test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy (share of test rows predicted correctly)
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# Built-in float() replaces the np.float alias, which recent NumPy releases no longer provide
run.log('Accuracy', float(acc))

# calculate AUC from the predicted probability of the positive class (column 1)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
# The figure is already logged to the run; close it rather than calling
# plt.show(), since this script is submitted to a compute cluster.
plt.close(fig)

# Save the trained model
os.makedirs(output_folder, exist_ok=True)
output_path = output_folder + "/model.pkl"
joblib.dump(value=model, filename=output_path)

run.complete()


Overwriting WorkshopPipelines/train_diabetes.py


In [9]:
%%writefile $experiment_folder/register_diabetes.py
# Registration script for the pipeline: loads the model file written by the
# training step and registers it in the workspace of the current run.

# Import libraries
import argparse
import joblib
from azureml.core import Workspace, Model, Run

# Read the folder holding the trained model from the command line
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--model_folder', type=str, dest='model_folder', default="diabetes_model", help='model location')
model_folder = arg_parser.parse_args().model_folder

# Context of the run executing this script
run = Run.get_context()

# Load the pickled model from that folder
print("Loading model from " + model_folder)
model_file = model_folder + "/model.pkl"
model = joblib.load(model_file)

# Register the model file in the workspace that owns this run
Model.register(
    workspace=run.experiment.workspace,
    model_path=model_file,
    model_name='diabetes_model',
    tags={'Training context': 'Pipeline'},
)

run.complete()


Overwriting WorkshopPipelines/register_diabetes.py


## 4. Création Azure ML compute et environnement


In [10]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "pipelines"

try:
    # Reuse the compute target when one with this name already exists
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Otherwise provision a new AmlCompute cluster scaling between 1 and 4 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D4_V2',
        # vm_priority='lowpriority',
        min_nodes=1,
        max_nodes=4,
    )
    pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports completion, streaming its status
pipeline_cluster.wait_for_completion(show_output=True)

Creating
Succeeded..................................................
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [11]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Python environment for the experiment: dependencies managed by Azure ML,
# executed inside a Docker container.
diabetes_env = Environment("diabetes-experiment-env")
diabetes_env.python.user_managed_dependencies = False
diabetes_env.docker.enabled = True

# Conda and pip packages required by the pipeline scripts
diabetes_env.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn', 'ipykernel', 'matplotlib', 'pandas'],
    pip_packages=['azureml-sdk', 'pyarrow'],
)

# Register the environment in the workspace, then read the registered copy back
diabetes_env.register(workspace=ws)
registered_env = Environment.get(ws, 'diabetes-experiment-env')

# Run configuration for the pipeline: runs on the cluster created above,
# inside the registered environment.
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print ("OK!")

OK!


## 5. Création et exécution du pipeline

In [12]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.estimator import Estimator

# Training data: the dataset registered under this name in the workspace
diabetes_ds = ws.datasets.get("diabetes dataset")

# Intermediate location on the default datastore: written by step 1 (outputs)
# and read by step 2 (inputs)
model_folder = PipelineData("model_folder", datastore=ws.get_default_datastore())

# Step 1, run the training script through an estimator
estimator = Estimator(
    source_directory=experiment_folder,
    compute_target=pipeline_cluster,
    environment_definition=pipeline_run_config.environment,
    entry_script='train_diabetes.py',
)
train_step = EstimatorStep(
    name="1. Train Model",
    estimator=estimator,
    estimator_entry_script_arguments=['--output_folder', model_folder],
    inputs=[diabetes_ds.as_named_input('diabetes_train')],
    outputs=[model_folder],
    compute_target=pipeline_cluster,
    allow_reuse=True,
)

# Step 2, run the model registration script
register_step = PythonScriptStep(
    name="2. Register Model",
    source_directory=experiment_folder,
    script_name="register_diabetes.py",
    arguments=['--model_folder', model_folder],
    inputs=[model_folder],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

print("OK!")

OK!


## Exécution du pipeline
> Prévoir 10 minutes de temps de traitement

In [13]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the training and registration steps into a pipeline
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline under a named experiment, asking for outputs to be regenerated
experiment = Experiment(workspace=ws, name='Exemple7-Pipeline-Diabetes')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Execution du pipeline")

# Show the monitoring widget for the submitted run
RunDetails(pipeline_run).show()

Pipeline is built.
Created step 1. Train Model [70772d73][6dd52439-65d9-4bfe-a6f0-8f02ffbf0477], (This step will run and generate new outputs)
Created step 2. Register Model [87cf5258][d493a357-d31e-4f99-a18c-980721f6b6c8], (This step will run and generate new outputs)
Submitted PipelineRun 69b44604-24b2-442b-8a60-4b1b804e2ba3
Link to Azure Machine Learning studio: https://ml.azure.com/experiments/Exemple7-Pipeline-Diabetes/runs/69b44604-24b2-442b-8a60-4b1b804e2ba3?wsid=/subscriptions/70b8f39e-8863-49f7-b6ba-34a80799550c/resourcegroups/AzureMLWorkshopRG/workspaces/AzureMLWorkshop
Execution du pipeline


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [24]:
# Report the status of every step of the pipeline run
for step_run in pipeline_run.get_children():
    status = step_run.get_status()
    print('Script :', step_run.name, '- Statut =', status)

    # Only failed steps get their job log printed; relax this test to see
    # the log of steps that succeeded as well.
    if status != "Failed":
        continue
    joblog = step_run.get_job_log()
    print('job log:', joblog)

Script : 2. Register Model - Statut = Finished
Script : 1. Train Model - Statut = Finished


In [25]:
from azureml.core import Model

# Print every model registered in the workspace with its version,
# followed by its tags and its properties (one per line).
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')

Exemple10-Modele-TensorFlow version: 1
	 Training context : TensorFlow GPU


diabetes_model version: 1
	 Training context : Pipeline


sklearn_regression_model.pkl version: 1
	 area : diabetes
	 type : regression


Exemple3-AutoML-Regression version: 1
	 Training context : Azure Auto ML
	 R2 : 0.8823064055329451
	 RMSE : 0.06954349242467574


Exemple2-AutoML-Classif version: 1
	 Training context : Azure Auto ML
	 AUC : 0.9992557112092131
	 Accuracy : 0.9652407308963173


Exemple4-AutoML-Forecast version: 1
	 Training context : Azure Auto ML
	 R2 : 0.2102297702988311
	 RMSE : 0.025719636958220327


IBM_attrition_explainer version: 1


local_deploy_model version: 1


boston_model.pkl version: 1
	 algo : Regression
	 type : sklearn






## 6. Publication du pipeline

In [26]:
experiment_name = 'Exemple7-Pipeline-Diabetes'

# Look the experiment up by name; dict.get returns None when it does not exist
pipeline_experiment = ws.experiments.get(experiment_name)
if pipeline_experiment is None:
    raise ValueError("Experiment not found in workspace: " + experiment_name)

# Take only the first run yielded by get_runs() instead of materialising the
# whole run history in a list (this cell relies on the first run being the
# most recent one - per the Experiment.get_runs documentation).
pipeline_run = next(iter(pipeline_experiment.get_runs()), None)
if pipeline_run is None:
    raise ValueError("No run found for experiment: " + experiment_name)

# Publish the pipeline from the run
published_pipeline = pipeline_run.publish_pipeline(name="Exemple7-Training-Pipeline", description="Pipeline Diabetes", version="1.0")

published_pipeline

Name,Id,Status,Endpoint
Exemple7-Training-Pipeline,70497e55-2f96-4052-8ee7-c0264b291bac,Active,REST Endpoint


### API du pipeline

In [27]:
# REST endpoint of the published pipeline
rest_endpoint = published_pipeline.endpoint
print("Endpoint du pipeline :")
print(rest_endpoint)

Endpoint du pipeline :
https://westeurope.aether.ms/api/v1.0/subscriptions/70b8f39e-8863-49f7-b6ba-34a80799550c/resourceGroups/AzureMLWorkshopRG/providers/Microsoft.MachineLearningServices/workspaces/AzureMLWorkshop/PipelineRuns/PipelineSubmit/70497e55-2f96-4052-8ee7-c0264b291bac


In [28]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Obtain the authentication header sent with the REST call to the pipeline endpoint
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

# Never print the header itself: it carries a bearer token that would be saved
# in the notebook output. Show only which header(s) were obtained, value redacted.
print({header_name: 'Bearer <redacted>' for header_name in auth_header})

{'Authorization': 'Bearer <redacted>'}

In [29]:
# Run Id du pipeline
import requests

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": experiment_name})
run_id = response.json()["Id"]
run_id

'362066be-bb1d-4801-861e-43d8afb10888'

In [30]:
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

# Wrap the run identified by run_id (returned by the REST call) and show its monitoring widget
published_pipeline_run = PipelineRun(ws.experiments[experiment_name], run_id)
RunDetails(published_pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

### Planification du pipeline

In [31]:
from azureml.pipeline.core import ScheduleRecurrence, Schedule

# Schedule the published pipeline every Monday at 00:00 UTC
recurrence = ScheduleRecurrence(frequency="Week", interval=1, week_days=["Monday"], time_of_day="00:00")
weekly_schedule = Schedule.create(ws, name="Pipeline-Hebdomadaire", 
                                  description="Pipeline hebdomadaire Diabetes",
                                  pipeline_id=published_pipeline.id, 
                                  experiment_name=experiment_name, 
                                  recurrence=recurrence)

In [32]:
# Visualisation des planifications
schedules = Schedule.list(ws)
schedules

[Pipeline(Name: Pipeline-Hebdomadaire,
 Id: 471c1abb-c097-4772-9ae2-f0d2036e10b1,
 Status: Active,
 Pipeline Id: 70497e55-2f96-4052-8ee7-c0264b291bac,
 Recurrence Details: Runs at 0:00 on Monday every Week)]

In [33]:
# Look the experiment up by name; dict.get returns None when it does not exist
pipeline_experiment = ws.experiments.get(experiment_name)
if pipeline_experiment is None:
    raise ValueError("Experiment not found in workspace: " + experiment_name)

# Take only the first run yielded by get_runs() rather than loading the whole
# run history into a list (assumes runs are yielded most recent first - per
# the Experiment.get_runs documentation).
latest_run = next(iter(pipeline_experiment.get_runs()), None)
if latest_run is None:
    raise ValueError("No run found for experiment: " + experiment_name)

# Display the details of that run
latest_run.get_details()

{'runId': '69b44604-24b2-442b-8a60-4b1b804e2ba3',
 'status': 'Completed',
 'startTimeUtc': '2020-03-10T16:04:47.824394Z',
 'endTimeUtc': '2020-03-10T16:12:05.009117Z',
 'properties': {'azureml.runsource': 'azureml.PipelineRun',
  'runSource': 'SDK',
  'runType': 'SDK',
  'azureml.parameters': '{}'},
 'inputDatasets': [],
 'logFiles': {'logs/azureml/executionlogs.txt': 'https://azuremlworksho6034843387.blob.core.windows.net/azureml/ExperimentRun/dcid.69b44604-24b2-442b-8a60-4b1b804e2ba3/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=4HQ4OR9dZ2jc2MrvCMiGR0kIQ%2Bqk7mMwdaJCS7HVAXs%3D&st=2020-03-10T16%3A02%3A51Z&se=2020-03-11T00%3A12%3A51Z&sp=r',
  'logs/azureml/stderrlogs.txt': 'https://azuremlworksho6034843387.blob.core.windows.net/azureml/ExperimentRun/dcid.69b44604-24b2-442b-8a60-4b1b804e2ba3/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=DxkpC%2FLfDC2xIv0ewhCKupCOoqq9nyCPomiCbf2A8b0%3D&st=2020-03-10T16%3A02%3A51Z&se=2020-03-11T00%3A12%3A51Z&sp=r',
  'logs/azureml/stdoutlogs.

> https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-run-step

<img src="https://github.com/retkowsky/images/blob/master/Powered-by-MS-Azure-logo-v2.png?raw=true" height="300" width="300">