# E2E ML Pipeline With AML

- A simple classification model with E2E ML Pipelines

In [1]:
subscriptionID = '2213e8b1-dbc7-4d54-8aff-b5e315df5e5b'
RG = '1-6a7b0882-playground-sandbox'
ws_name = "MLOPS101"

In [2]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

ws = MLClient(
    DefaultAzureCredential(),
    subscription_id = subscriptionID,
    resource_group_name = RG,
    workspace_name= ws_name,
)
print(ws)

MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f7d477d82b0>,
         subscription_id=2213e8b1-dbc7-4d54-8aff-b5e315df5e5b,
         resource_group_name=1-6a7b0882-playground-sandbox,
         workspace_name=MLOPS101)


In [3]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

iris_url = "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"

iris_data = Data(
    name = "raw_iris_data",
    path = iris_url,
    type = AssetTypes.URI_FILE,
    description = "Uncleansed IRIS dataset",
    tags = {"source_type": "web"},
    version = "1.0.0",
) #Stores in the default datastore

iris_data = ws.data.create_or_update(iris_data)
print(f"Dataset with name {iris_data.name} was registered to workspace, the dataset version is {iris_data.version}")

Dataset with name raw_iris_data was registered to workspace, the dataset version is 1.0.0


In [4]:
from azure.ai.ml.entities import AmlCompute


def createCluster(cluster_name, size):
    try:
        cpu_cluster = ws.compute.get(cluster_name)
        print(f'{CLUSTER_NAME} exists!')
    except Exception:
        print("Creating a new cpu compute target...")
        cpu_cluster = AmlCompute(
            name=cluster_name,
            type="amlcompute",
            size=size,
            min_instances=0,
            max_instances=1,
            idle_time_before_scale_down=180,
            tier="Dedicated",
        )
        cpu_cluster = ws.compute.begin_create_or_update(cpu_cluster).result()
        print(f'Cluster created successfully named {cpu_cluster.name} with size {cpu_cluster.size}')
    return cpu_cluster

CLUSTER_NAME = 'iris-cluster'
CLUSTER_SIZE = 'Standard_D2_v2'

trainCluster = createCluster(CLUSTER_NAME, CLUSTER_SIZE)

Creating a new cpu compute target...
Cluster created successfully named iris-cluster with size STANDARD_D2_V2


In [5]:
from pathlib import Path

PARENT_DIR = Path("./E2EPipelines")
ENVIRONMENT_DIR = PARENT_DIR / "environment"
DATA_PREP_DIR = PARENT_DIR / "data_prep"
TRAIN_DIR = PARENT_DIR / "train"

ENVIRONMENT_DIR.mkdir(parents = True, exist_ok = True)
DATA_PREP_DIR.mkdir(parents = True, exist_ok = True)
TRAIN_DIR.mkdir(parents = True, exist_ok = True)

<pre>
Refs:

1. https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-component-pipeline-python?view=azureml-api-2
2. https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-sweep-in-pipeline?view=azureml-api-2
</pre>

In [6]:
%%writefile {ENVIRONMENT_DIR}/conda.yaml
name: model-env
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    - mlflow== 1.26.1
    - azureml-mlflow==1.42.0

Writing E2EPipelines/environment/conda.yaml


In [7]:
from azure.ai.ml.entities import Environment

custom_env_name = "iris-env"

iris_env = Environment(
    name=custom_env_name,
    description="Custom environment for IRIS data",
    tags={"scikit-learn": "0.24.2"},
    conda_file = 'E2EPipelines/environment/conda.yaml',
    image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.1.0",
)
iris_env = ws.environments.create_or_update(iris_env)

print(
    f"Environment with name {iris_env.name} is registered to workspace, the environment version is {iris_env.version}"
)

Environment with name iris-env is registered to workspace, the environment version is 0.1.0


## Data Preparation Component

In [8]:
%%writefile {DATA_PREP_DIR}/data_prep.py

import os
from pathlib import Path
import argparse
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import logging
import mlflow

def labelEncoder(df):
    le = LabelEncoder().fit(df.variety)
    df.variety = le.transform(df.variety)
    return df
    

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_data", type = str, help = "Datastore path to the input data")
    parser.add_argument("--split_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--train_data_dir", type = str, help = "Path to the data processed directory")
    parser.add_argument("--test_data_dir", type = str, help = "Path to the data processed directory")
    args = parser.parse_args()

    mlflow.start_run()

    mlflow.log_param("Input ARGS", " ".join(f"{k}={v}" for k, v in vars(args).items()))
    mlflow.log_param("input data:", args.raw_data)

    iris_df = pd.read_csv(args.raw_data)
    mlflow.log_param("num_samples", iris_df.shape[0])
    mlflow.log_param("num_features", iris_df.shape[1] - 1)
    
    iris_df = labelEncoder(iris_df)
    iris_train_df, iris_test_df = train_test_split(
        iris_df,
        test_size=args.split_ratio,
        stratify = iris_df.variety
    )
    mlflow.log_param("train_path", args.train_data_dir)
    mlflow.log_param("test_path", args.test_data_dir)
    iris_train_df.to_csv(os.path.join(args.train_data_dir, "data.csv"), index=False)
    iris_test_df.to_csv(os.path.join(args.test_data_dir, "data.csv"), index=False)
    mlflow.end_run()

if __name__ == "__main__":
    main()

Writing E2EPipelines/data_prep/data_prep.py


In [9]:
%%writefile {DATA_PREP_DIR}/data_prep.yml

name: iris_data_prep
display_name: IRIS Data Prep
type: command
inputs:
    raw_data: 
        type: uri_file
    split_ratio:
        type: number
outputs:
    train_data_dir:
        type: uri_folder
        mode: rw_mount
    test_data_dir:
        type: uri_folder
        mode: rw_mount
code:
    ./data_prep.py
environment:
    azureml:iris-env:0.1.0
command: >-
    python data_prep.py
    --raw_data  ${{inputs.raw_data}}
    --split_ratio ${{inputs.split_ratio}}
    --train_data ${{outputs.train_data_dir}}
    --test_data ${{outputs.test_data_dir}}

Writing E2EPipelines/data_prep/data_prep.yml


In [10]:
from azure.ai.ml import load_component

iris_data_prep_component = load_component(source = DATA_PREP_DIR/ 'data_prep.yml')

iris_data_prep_component = ws.create_or_update(iris_data_prep_component)

print(f"Component {iris_data_prep_component.name} with Version {iris_data_prep_component.version} is registered")

[32mUploading data_prep.py[32m (< 1 MB): 100%|██████████| 1.63k/1.63k [00:00<00:00, 126kB/s]
[39m



Component iris_data_prep with Version 1 is registered


## Model Training

In [18]:
%%writefile {TRAIN_DIR}/train.py

import argparse
from pathlib import Path
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import os
import pandas as pd
import mlflow


def select_first_file(path):
    files = os.listdir(path)
    return os.path.join(path, files[0])

mlflow.start_run()
mlflow.sklearn.autolog()

OUTPUT_DIR = Path('./outputs')
OUTPUT_DIR.mkdir(exist_ok = True)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_data_dir", type=str, help="path to train data")
    parser.add_argument("--test_data_dir", type=str, help="path to test data")
    parser.add_argument("--model_name", type=str, help="path to model file")
    parser.add_argument("--model_dir", type=str, help="path to model file")
    args = parser.parse_args()
    
    train_df = pd.read_csv(select_first_file(args.train_data_dir))
    y_train, X_train = train_df.pop("variety"), train_df.values
    
    test_df = pd.read_csv(select_first_file(args.test_data_dir))
    y_test, X_test = test_df.pop("variety"), test_df.values

    mlflow.log_param('Train data shape', X_train.shape)
    mlflow.log_param('Test data shape', X_test.shape)

    clf = GradientBoostingClassifier(
        n_estimators =  100, learning_rate = 0.01
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Registering the model to the workspace
    print("Registering the model via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=clf,
        registered_model_name = args.model_name,
        artifact_path = args.model_name,
    )
    mlflow.sklearn.save_model(
        sk_model=clf,
        path=os.path.join(args.model_dir, "trained_model"),
    )
    mlflow.end_run()
if __name__ == "__main__":
    main()
    

Overwriting E2EPipelines/train/train.py


In [19]:
%%writefile {TRAIN_DIR}/train.yml

name: iris_train
display_name: IRIS Train Step
type: command
inputs:
    train_data_dir: 
        type: uri_folder
    test_data_dir:
        type: uri_folder
    model_name:
        type: string
outputs:
    model_dir:
        type: uri_folder
code: ./train.py
environment: azureml:iris-env:0.1.0
command: >-
    python train.py
    --train_data_dir  ${{inputs.train_data_dir}}
    --test_data_dir ${{inputs.test_data_dir}}
    --model_name ${{inputs.model_name}}
    --model_dir ${{outputs.model_dir}}

Overwriting E2EPipelines/train/train.yml


In [20]:
iris_train_component = load_component(source = TRAIN_DIR/ 'train.yml')

iris_train_component = ws.create_or_update(iris_train_component)

print(f"Component {iris_train_component.name} with Version {iris_train_component.version} is registered")

[32mUploading train.py[32m (< 1 MB): 100%|██████████| 1.75k/1.75k [00:00<00:00, 117kB/s]
[39m



Component iris_train with Version 2023-05-30-14-15-26-7758220 is registered


## Building a pipeline

In [21]:
from azure.ai.ml import dsl, Input, Output


@dsl.pipeline(compute = trainCluster, description = "E2E IRIS pipeline")
def iris_pipeline(raw_data, split_ratio, model_name):
    data_prep_job = iris_data_prep_component(raw_data = raw_data, split_ratio = split_ratio)
    train_data_dir, test_data_dir = data_prep_job.outputs.train_data_dir, data_prep_job.outputs.test_data_dir
    train_job = iris_train_component(train_data_dir = train_data_dir,test_data_dir = test_data_dir, model_name = model_name)
    return {
        "pipeline_job_train_data": train_data_dir,
        "pipeline_job_test_data": test_data_dir,
    }

In [22]:
model_name = "iris_model"

# Let's instantiate the pipeline with the parameters of our choice
pipeline = iris_pipeline(
    raw_data=Input(type="uri_file", path = iris_data.path),
    split_ratio=0.25,
    model_name="iris_model",
)

In [23]:
import webbrowser

# submit the pipeline job
pipeline_job = ws.jobs.create_or_update(
    pipeline,
    experiment_name="e2e_registered_components",
)
webbrowser.open(pipeline_job.studio_url)

False

In [24]:
ws.jobs.stream(pipeline_job.name)

RunId: yellow_wolf_bvfprd58p6
Web View: https://ml.azure.com/runs/yellow_wolf_bvfprd58p6?wsid=/subscriptions/2213e8b1-dbc7-4d54-8aff-b5e315df5e5b/resourcegroups/1-6a7b0882-playground-sandbox/workspaces/MLOPS101

Streaming logs/azureml/executionlogs.txt

[2023-05-30 14:15:32Z] Completing processing run id 7610bfa3-a932-4ce4-95ff-e791c1c422e1.
[2023-05-30 14:15:32Z] Submitting 1 runs, first five are: 3e9206d3:ce8e3808-7a9b-407d-82e4-3e04ae3e0a41
[2023-05-30 14:16:09Z] Execution of experiment failed, update experiment status and cancel running nodes.

Execution Summary
RunId: yellow_wolf_bvfprd58p6
Web View: https://ml.azure.com/runs/yellow_wolf_bvfprd58p6?wsid=/subscriptions/2213e8b1-dbc7-4d54-8aff-b5e315df5e5b/resourcegroups/1-6a7b0882-playground-sandbox/workspaces/MLOPS101


JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Pipeline has failed child jobs. Failed nodes: /train_job. For more details and logs, please go to the job detail page and check the child jobs.",
        "message_format": "Pipeline has failed child jobs. {0}",
        "message_parameters": {},
        "reference_code": "PipelineHasStepJobFailed",
        "details": []
    },
    "environment": "eastus2",
    "location": "eastus2",
    "time": "2023-05-30T14:16:09.911623Z",
    "component_name": ""
} 