# Iris data training with an Azure ML sweep job (hyperparameter tuning)

In [1]:
# Azure ML workspace coordinates (subscription, resource group, workspace name).
subscriptionID = "80ea84e8-afce-4851-928a-9e2219724c69"
RG = "1-baef76e5-playground-sandbox"
ws_name = "MLOPS101"

In [2]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Workspace client built from the default Azure credential chain and the
# subscription / resource group / workspace identifiers defined earlier.
ws = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscriptionID,
    resource_group_name=RG,
    workspace_name=ws_name,
)
print(ws)

MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7fdace8b4d90>,
         subscription_id=80ea84e8-afce-4851-928a-9e2219724c69,
         resource_group_name=1-baef76e5-playground-sandbox,
         workspace_name=MLOPS101)


In [3]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# The CSV is fetched from a gist URL that is pinned to a specific revision hash.
iris_url = "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"

# Describe the asset and register it in a single call; `iris_data` is bound to
# whatever the workspace returns for the registered asset.
# NOTE(review): the original author's note says this "stores in the default
# datastore" - confirm that this holds for a web URL path.
iris_data = ws.data.create_or_update(
    Data(
        name="raw_iris_data",
        path=iris_url,
        type=AssetTypes.URI_FILE,
        description="Uncleansed IRIS dataset",
        tags={"source_type": "web"},
        version="1.0.0",
    )
)
print(f"Dataset with name {iris_data.name} was registered to workspace, the dataset version is {iris_data.version}")

Dataset with name raw_iris_data was registered to workspace, the dataset version is 1.0.0


In [4]:
from azure.ai.ml.entities import AmlCompute


def createCluster(cluster_name, size):
    """Return the compute target called ``cluster_name``, creating it if the lookup fails.

    Args:
        cluster_name: Name of the compute target to look up or create.
        size: VM size used only when a new cluster has to be created.

    Returns:
        The object returned by ``ws.compute.get`` when the cluster already
        exists, otherwise the result of the create-or-update operation.
    """
    try:
        cpu_cluster = ws.compute.get(cluster_name)
    except Exception:
        # Any lookup failure is treated as "cluster does not exist yet".
        # NOTE(review): this also covers auth/network errors; consider
        # narrowing to the SDK's not-found exception once confirmed.
        print("Creating a new cpu compute target...")
        cpu_cluster = AmlCompute(
            name=cluster_name,
            type="amlcompute",
            size=size,
            min_instances=0,
            max_instances=1,
            idle_time_before_scale_down=180,
            tier="Dedicated",
        )
        # begin_create_or_update returns a poller; .result() blocks until done.
        cpu_cluster = ws.compute.begin_create_or_update(cpu_cluster).result()
        print(f'Cluster created successfully named {cpu_cluster.name} with size {cpu_cluster.size}')
        return cpu_cluster

    # Report the name that was actually looked up (the parameter), not a
    # module-level constant that may be undefined or hold a different value.
    print(f'{cluster_name} exists!')
    return cpu_cluster

# Name and VM size of the compute cluster used for the training pipeline.
CLUSTER_NAME = "iris-cluster"
CLUSTER_SIZE = "Standard_D2_v2"

trainCluster = createCluster(cluster_name=CLUSTER_NAME, size=CLUSTER_SIZE)

Creating a new cpu compute target...
Cluster created successfully named iris-cluster with size STANDARD_D2_V2


In [5]:
from pathlib import Path

# Local working tree: one sub-directory each for the environment definition,
# the training component and the prediction component.
PARENT_DIR = Path("./E2EPipelines")
ENVIRONMENT_DIR, TRAIN_DIR, PREDICT_DIR = (
    PARENT_DIR / sub_dir for sub_dir in ("environment", "train", "predict")
)

# Create the directories up front; existing ones are left untouched.
for _stage_dir in (ENVIRONMENT_DIR, TRAIN_DIR, PREDICT_DIR):
    _stage_dir.mkdir(parents=True, exist_ok=True)

In [6]:
%%writefile {ENVIRONMENT_DIR}/conda.yaml
# Conda environment specification written to the notebook's environment directory.
name: model-env
channels:
  - conda-forge
dependencies:
  # Packages resolved by conda (from the conda-forge channel listed above).
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  # Packages installed with pip inside the conda environment.
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    - mlflow== 1.26.1
    - azureml-mlflow==1.42.0
    # matplotlib is the only unpinned dependency in this file.
    - matplotlib

Writing E2EPipelines/environment/conda.yaml


In [7]:
from azure.ai.ml.entities import Environment

custom_env_name = "iris-env"

# Build the environment from the conda file written earlier. The path is
# derived from ENVIRONMENT_DIR so it cannot drift from the location the
# %%writefile cell actually writes to.
iris_env = Environment(
    name=custom_env_name,
    description="Custom environment for IRIS data",
    tags={"scikit-learn": "0.24.2"},
    conda_file=str(ENVIRONMENT_DIR / "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.2.0",
)
# Register the environment; `iris_env` is rebound to the object the workspace returns.
iris_env = ws.environments.create_or_update(iris_env)

print(
    f"Environment with name {iris_env.name} is registered to workspace, the environment version is {iris_env.version}"
)

Environment with name iris-env is registered to workspace, the environment version is 0.2.0


## Training components

In [8]:
%%writefile {TRAIN_DIR}/train.yml

# Command component that runs train.py.
name: iris_train
display_name: IRIS Train Config
type: command
inputs:
    # Single CSV file holding the raw data.
    raw_data:
        type: uri_file
    # Passed to train.py as --split_ratio.
    split_ratio:
        type: number
        default: 0.7
    # Declared as integer because train.py parses --max_depth with type=int.
    max_depth:
        type: integer
        default: 3
outputs:
    model_output:
        type: mlflow_model
    test_data:
        type: uri_folder
code: ./train.py
environment: azureml:iris-env:0.2.0
command: >-
    python train.py
    --raw_data ${{inputs.raw_data}}
    --split_ratio ${{inputs.split_ratio}}
    --max_depth ${{inputs.max_depth}}
    --model_output ${{outputs.model_output}}
    --test_data ${{outputs.test_data}}

Writing E2EPipelines/train/train.yml


In [9]:
%%writefile {TRAIN_DIR}/train.py

import os
import argparse
from pathlib import Path
import pandas as pd
import mlflow
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from distutils.dir_util import copy_tree

def parseArgs():
    """Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace with ``raw_data``, ``split_ratio``, ``max_depth``,
        ``model_output`` and ``test_data``.

    Raises:
        SystemExit: raised by argparse when a required path argument is
            missing or a value cannot be converted to its declared type.
    """
    parser = argparse.ArgumentParser()
    # The three path arguments are required: without them the script would
    # only fail later with a less helpful error on a None path.
    parser.add_argument('--raw_data', type = str, required = True, help = 'Path of the raw CSV file')
    parser.add_argument('--split_ratio', type = float, default = 0.8, help = 'Fraction of rows used for training')
    parser.add_argument('--max_depth', type = int, default = 3, help = 'Maximum depth of the decision tree')
    parser.add_argument('--model_output', type = str, required = True, help = 'Model saved in this path')
    parser.add_argument('--test_data', type = str, required = True, help = 'X_test and y_test are stored here')
    return parser.parse_args()
    

def main(args):
    """Train a decision tree on the raw CSV and write the model and test split.

    Args:
        args: Namespace providing ``raw_data``, ``split_ratio``, ``max_depth``,
            ``model_output`` and ``test_data``.

    Side effects:
        Enables MLflow autologging, logs the train/test shapes as parameters,
        copies the saved MLflow model into ``args.model_output`` and writes
        ``X_test.csv`` / ``y_test.csv`` into ``args.test_data``.
    """
    import tempfile  # local import: only needed for the staging directory below

    mlflow.autolog()

    df = pd.read_csv(args.raw_data)
    # Stratify on the label column so both splits keep the class proportions.
    train, test = train_test_split(df, train_size = args.split_ratio, random_state = 42, stratify = df.variety)
    # pop() removes the label column from the feature frames in place.
    y_train, y_test = train.pop('variety'), test.pop('variety')

    mlflow.log_param('Train size', train.shape)
    mlflow.log_param('Test size', test.shape)

    # Seeded so repeated runs with the same inputs build the same tree.
    model = DecisionTreeClassifier(max_depth = args.max_depth, random_state = 42)
    model.fit(train, y_train)

    # Save into a fresh temporary directory and then copy into the output
    # location, instead of a fixed 'model' directory in the working directory
    # that could already exist from an earlier run.
    # NOTE(review): the model is presumably not saved straight into
    # args.model_output because that directory may already exist - confirm.
    with tempfile.TemporaryDirectory() as staging_root:
        staged_model = os.path.join(staging_root, 'model')
        mlflow.sklearn.save_model(model, staged_model)
        copy_tree(staged_model, args.model_output)

    # Make sure the target directory exists before writing the CSV files.
    test_dir = Path(args.test_data)
    test_dir.mkdir(parents = True, exist_ok = True)
    test.to_csv(test_dir / "X_test.csv", index=False)
    y_test.to_csv(test_dir / "y_test.csv", index=False)
    
if __name__ == '__main__':
    # Script entry point: parse the CLI arguments and hand them to main().
    main(parseArgs())

Writing E2EPipelines/train/train.py


## Prediction scripts

In [10]:
%%writefile {PREDICT_DIR}/predict.yml

# Command component that runs predict.py.
name: iris_predict
display_name: IRIS Predict Config
type: command
inputs:
    # MLflow model passed to predict.py as --model.
    model:
        type: mlflow_model
    # Folder passed to predict.py as --test_data.
    test_data:
        type: uri_folder 
outputs:
    # Folder passed to predict.py as --predictions.
    predictions:
        type: uri_folder
code: ./predict.py
environment: azureml:iris-env:0.2.0  
command: >-
    python predict.py
    --model ${{inputs.model}}
    --test_data ${{inputs.test_data}}
    --predictions ${{outputs.predictions}}

Writing E2EPipelines/predict/predict.yml


In [15]:
%%writefile {PREDICT_DIR}/predict.py

import os
import argparse
from pathlib import Path
import mlflow
import pandas as pd
from sklearn.tree import DecisionTreeClassifier


# Command-line interface: all three paths are supplied by the component command.
parser = argparse.ArgumentParser()
parser.add_argument('--model', type = str, required = True, help = 'Path of the MLflow model to load')
parser.add_argument('--test_data', type = str, required = True, help = 'Folder containing X_test.csv and y_test.csv')
parser.add_argument('--predictions', type = str, required = True, help = 'Folder that receives predict_result.csv')

args = parser.parse_args()

test_dir = Path(args.test_data)
X_test = pd.read_csv(test_dir / 'X_test.csv')

# Pass the model location as a plain string.
model = mlflow.sklearn.load_model(args.model)

# Log and register the loaded model. artifact_path is a name relative to the
# run's artifact root, so a fixed name is used here rather than the input
# model's filesystem path.
mlflow.sklearn.log_model(
    sk_model = model,
    registered_model_name = "irisbest",
    artifact_path = "model",
)

# Append the predictions as a new column next to the true labels.
y_test = pd.read_csv(test_dir / 'y_test.csv')
y_test['predict'] = model.predict(X_test)

# Make sure the output folder exists before writing the result file.
predictions_dir = Path(args.predictions)
predictions_dir.mkdir(parents = True, exist_ok = True)
y_test.to_csv(predictions_dir / 'predict_result.csv', index = False)

Overwriting E2EPipelines/predict/predict.py


## Pipeline definition

In [16]:
from azure.ai.ml import dsl, Input, Output, load_component
from azure.ai.ml.sweep import Choice

# Load the two command components from the YAML files in the working tree.
train_component_func = load_component(source=TRAIN_DIR / "train.yml")
score_component_func = load_component(source=PREDICT_DIR / "predict.yml")

# The compute target is passed by name (a string) rather than as the
# AmlCompute object returned by createCluster.
@dsl.pipeline(compute = trainCluster.name, description = 'Iris Sweep Job')
def irisSweepPipeline():
    """Sweep the training component over ``max_depth`` and score the swept model.

    Returns:
        dict mapping ``'model_dir'`` to the sweep step's ``model_output``.
    """
    # NOTE(review): the local variable names below are presumably used by the
    # DSL as node names - verify before renaming any of them.
    raw_data = Input(type = 'uri_file', path = iris_data.path)

    # max_depth is given a search space instead of a fixed value.
    train_model = train_component_func(raw_data = raw_data,
                                       split_ratio = 0.8,
                                       max_depth = Choice([2, 3]))

    # NOTE(review): 'training_f1_score' is presumably the metric logged by
    # mlflow.autolog() in train.py, i.e. a training-set score - confirm this
    # is the metric the sweep should optimise.
    sweep_step = train_model.sweep(
        primary_metric = 'training_f1_score',
        goal = 'maximize',
        sampling_algorithm = 'random',
    )
    sweep_step.set_limits(max_total_trials = 2, max_concurrent_trials = 2, timeout = 7200)

    # The scoring step consumes both outputs of the sweep step.
    score_data = score_component_func(
       model = sweep_step.outputs.model_output, test_data=sweep_step.outputs.test_data
    )
    return {'model_dir' : sweep_step.outputs.model_output}

pipeline_job = irisSweepPipeline()

In [17]:
# Submit the pipeline under the given experiment; `pipeline_job` is rebound to
# the object returned by the workspace and displayed as the cell output.
pipeline_job = ws.jobs.create_or_update(
    job=pipeline_job,
    experiment_name="Iris With Sweep",
)
pipeline_job

[32mUploading predict.py[32m (< 1 MB): 100%|██████████| 750/750 [00:00<00:00, 69.1kB/s]
[39m



Experiment,Name,Type,Status,Details Page
Iris With Sweep,amusing_queen_hdl8tytzwx,pipeline,Preparing,Link to Azure Machine Learning studio


In [18]:
# Stream the logs of the submitted pipeline job into the cell output.
ws.jobs.stream(name=pipeline_job.name)

RunId: amusing_queen_hdl8tytzwx
Web View: https://ml.azure.com/runs/amusing_queen_hdl8tytzwx?wsid=/subscriptions/80ea84e8-afce-4851-928a-9e2219724c69/resourcegroups/1-baef76e5-playground-sandbox/workspaces/MLOPS101

Streaming logs/azureml/executionlogs.txt

[2023-06-03 13:39:44Z] Completing processing run id 2fe2a893-1558-44bd-a25b-568d244510e8.
[2023-06-03 13:39:44Z] Submitting 1 runs, first five are: 1733a49f:e408e5b7-5651-4097-a888-6650a52cd0e0
[2023-06-03 13:40:34Z] Completing processing run id e408e5b7-5651-4097-a888-6650a52cd0e0.

Execution Summary
RunId: amusing_queen_hdl8tytzwx
Web View: https://ml.azure.com/runs/amusing_queen_hdl8tytzwx?wsid=/subscriptions/80ea84e8-afce-4851-928a-9e2219724c69/resourcegroups/1-baef76e5-playground-sandbox/workspaces/MLOPS101

