In [1]:
pip show azure-ai-ml

Name: azure-ai-ml
Version: 1.8.0
Summary: Microsoft Azure Machine Learning Client Library for Python
Home-page: https://github.com/Azure/azure-sdk-for-python
Author: Microsoft Corporation
Author-email: azuresdkengsysadmins@microsoft.com
License: MIT License
Location: /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages
Requires: azure-common, azure-core, azure-mgmt-core, azure-storage-blob, azure-storage-file-datalake, azure-storage-file-share, colorama, isodate, jsonschema, marshmallow, msrest, opencensus-ext-azure, pydash, pyjwt, pyyaml, strictyaml, tqdm, typing-extensions
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [3]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

# Prefer DefaultAzureCredential and fall back to an interactive browser login
# when it cannot produce a token.
try:
    credential = DefaultAzureCredential()
    # Requesting a token right away makes a non-working credential fail here
    # instead of at the first workspace call.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Report why the fallback is used instead of hiding the failure.
    print(
        f"DefaultAzureCredential failed ({type(ex).__name__}); "
        "falling back to InteractiveBrowserCredential"
    )
    credential = InteractiveBrowserCredential()

In [4]:
# Build the workspace client from the config file that from_config locates,
# authenticating with the credential chosen above.
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [5]:
import os

# Folder that receives the scripts written by the %%writefile cells.
script_folder = "src"

# exist_ok keeps this cell safe to re-run when the folder is already there.
os.makedirs(script_folder, exist_ok=True)
print(f"{script_folder} folder created")

src folder created


In [23]:
%%writefile $script_folder/prep-data.py

import argparse
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

def main(args):
    """Prepare the input data and write it as ``diabetes.csv``.

    Loads the CSV at ``args.input_data``, drops rows with missing values,
    scales the numeric feature columns and saves the result inside the
    ``args.output_data`` folder.

    Args:
        args: Parsed command-line namespace providing ``input_data`` and
            ``output_data``.
    """
    df = get_data(path=args.input_data)
    cleaned_data = clean_data(df=df)
    normalized_data = normalize_data(df=cleaned_data)

    # Create the destination folder when it is missing so the script also
    # runs when the output path has not been prepared in advance.
    output_dir = Path(args.output_data)
    output_dir.mkdir(parents=True, exist_ok=True)

    # to_csv returns None when given a path, so its result is not kept.
    normalized_data.to_csv(path_or_buf=output_dir / "diabetes.csv", index=False)

def get_data(path):
    """Load the CSV file at ``path`` and report how many rows it contains.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The file contents as a pandas DataFrame.
    """
    frame = pd.read_csv(path)
    print(f"Preparing {len(frame)} rows of data")
    return frame

def clean_data(df):
    """Return a new DataFrame without the rows that contain missing values.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        The rows of ``df`` that have no missing values.
    """
    return df.dropna()

def normalize_data(df):
    """Return a copy of ``df`` with the numeric feature columns min-max scaled.

    The scaler is fitted on ``df`` itself. The input frame is left untouched:
    the scaled values are written to a copy, which also avoids pandas'
    chained-assignment warning when ``df`` is a filtered view of another
    frame.

    Args:
        df: DataFrame containing the columns listed in ``num_cols``.

    Returns:
        A new DataFrame in which the ``num_cols`` columns are replaced by the
        scaler's output and all other columns are unchanged.
    """
    scaler = MinMaxScaler()
    num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure',
    'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']

    # Work on a copy so the caller's frame keeps its original values.
    normalized = df.copy()
    normalized[num_cols] = scaler.fit_transform(X=normalized[num_cols])
    return normalized

def parse_args():
    """Parse the command-line options of the data preparation script.

    Both options are mandatory; argparse exits with a usage message when one
    is missing instead of handing ``None`` to the rest of the script.

    Returns:
        argparse.Namespace with the string attributes ``input_data`` and
        ``output_data``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_data", dest="input_data", type=str, required=True,
        help="Path of the CSV file to prepare",
    )
    parser.add_argument(
        "--output_data", dest="output_data", type=str, required=True,
        help="Folder that receives the prepared diabetes.csv",
    )
    return parser.parse_args()

if __name__ == "__main__":
    # Script entry point: parse the CLI options, then run the preparation.
    main(args=parse_args())

    # Visual separator at the end of the job log.
    print("*" * 60)
    print("\n\n")

Overwriting src/prep-data.py


In [9]:
%%writefile $script_folder/train-model.py

import mlflow
import glob
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

def main(args):
    """Run the training workflow: load, split, train, save and evaluate.

    Args:
        args: Parsed command-line namespace providing ``training_data``,
            ``reg_rate`` and ``model_output``.
    """
    # Enable MLflow autologging for the run.
    mlflow.autolog()

    # read data
    df = get_data(data_path=args.training_data)

    # split data
    X_train, X_test, y_train, y_test = split_data(df=df)

    # train model; the output location is passed explicitly so train_model
    # does not depend on a module-level variable
    model = train_model(
        args.reg_rate, X_train, X_test, y_train, y_test,
        model_output=args.model_output,
    )
    eval_model(model=model, X_test=X_test, y_test=y_test)

def get_data(data_path):
    """Read every ``*.csv`` file directly inside ``data_path`` into one DataFrame.

    Args:
        data_path: Folder containing the CSV files.

    Returns:
        The concatenation of all CSV files found.

    Raises:
        ValueError: If the folder contains no CSV files.
    """
    # Sort the matches: glob returns them in arbitrary order, and the row
    # order influences the seeded train/test split.
    all_files = sorted(glob.glob(pathname=data_path+"/*.csv"))
    if not all_files:
        raise ValueError(f"No CSV files found in {data_path!r}")
    df = pd.concat(objs=[pd.read_csv(f) for f in all_files], sort=False)
    return df

def split_data(df):
    """Split ``df`` into train and test feature/label arrays.

    Args:
        df: DataFrame containing the feature columns listed in ``num_cols``
            and the ``Diabetic`` label column.

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by
        ``train_test_split`` with 30% of the rows held out and a fixed seed.
    """
    num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure',
    'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
    X, y = df[num_cols].values, df['Diabetic'].values
    return train_test_split(X, y, test_size=0.30, random_state=0)

def train_model(reg_rate, X_train, X_test, y_train, y_test, model_output=None):
    """Fit a logistic regression model and optionally save it in MLflow format.

    Args:
        reg_rate: Regularization rate; the model is built with ``C=1/reg_rate``,
            so it must be greater than zero.
        X_train: Training features.
        X_test: Unused; kept so existing positional calls keep working.
        y_train: Training labels.
        y_test: Unused; kept so existing positional calls keep working.
        model_output: Folder to save the model to with
            ``mlflow.sklearn.save_model``. When ``None`` the explicit save is
            skipped.

    Returns:
        The fitted ``LogisticRegression`` model.

    Raises:
        ValueError: If ``reg_rate`` is not greater than zero.
    """
    if reg_rate <= 0:
        raise ValueError(f"reg_rate must be greater than 0, got {reg_rate}")

    mlflow.log_param(key="Regularization Rate", value=reg_rate)
    print("Training Model")
    model = LogisticRegression(C=1/reg_rate, solver="liblinear").fit(X_train, y_train)

    # The save location is now an argument; the previous code read an
    # undefined ``args`` name here and failed with NameError.
    if model_output is not None:
        mlflow.sklearn.save_model(model, model_output)
    return model

def eval_model(model, X_test, y_test):
    """Print accuracy and AUC for the test split and save a ROC curve image.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    The plot is written to ``ROC-Curve.png`` in the current working directory.
    """
    # Accuracy: share of test rows whose predicted label equals the true one.
    predictions = model.predict(X_test)
    acc = np.average(predictions == y_test)
    print(f"Accuracy:{acc}")

    # AUC is computed from the second predict_proba column
    # (presumably the probability of the positive class).
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_true=y_test, y_score=positive_scores)
    print(f"AUC: {str(auc)}")

    # ROC curve drawn over a dashed diagonal reference line.
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=positive_scores)
    plt.figure(figsize=(6, 4))
    plt.plot([0, 1], [0, 1], "k--")
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.savefig("ROC-Curve.png")

def parse_args():
    """Parse the command-line options of the training script.

    ``--training_data`` and ``--model_output`` are mandatory; argparse exits
    with a usage message when one is missing instead of handing ``None`` to
    the rest of the script. ``--reg_rate`` defaults to 0.01.

    Returns:
        argparse.Namespace with ``training_data`` (str), ``reg_rate`` (float)
        and ``model_output`` (str).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--training_data", dest="training_data", type=str, required=True,
        help="Folder containing the prepared CSV files",
    )
    parser.add_argument(
        "--reg_rate", dest="reg_rate", type=float, default=0.01,
        help="Regularization rate (default: 0.01)",
    )
    parser.add_argument(
        "--model_output", dest="model_output", type=str, required=True,
        help="Folder that receives the saved model",
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: parse the CLI options, then run the training.
    main(parse_args())

    # Visual separator at the end of the job log.
    print("*" * 60)
    print("\n\n")

Overwriting src/train-model.py


<h3> Create a YAML definition for each component that will run as a step in the pipeline </h3>

In [24]:
%%writefile prep-data.yml
# Command component definition that runs prep-data.py.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: prep_data
display_name: Prepare Training Data
version: 1
type: command
inputs:
    # Single file handed to prep-data.py as --input_data.
    input_data:
        type: uri_file
outputs:
    # Folder handed to prep-data.py as --output_data.
    output_data:
        type: uri_folder
# Folder holding prep-data.py (presumably uploaded together with the component).
code: ./src
# NOTE(review): the environment name indicates Ubuntu 18.04 / Python 3.7;
# confirm this curated environment is still available in the workspace.
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# Folded scalar (>-): the lines below are joined into a single command line.
command: >-
    python prep-data.py
    --input_data ${{inputs.input_data}}
    --output_data ${{outputs.output_data}}

Overwriting prep-data.yml


In [25]:
%%writefile train-model.yml
# Command component definition that runs train-model.py.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_model
display_name: Train a Logistic Regression Classifier Model
version: 1
type: command
inputs:
    # Folder handed to train-model.py as --training_data.
    training_data:
        type: uri_folder
    # Handed to train-model.py as --reg_rate; 0.01 is the declared default.
    reg_rate:
        type: number
        default: 0.01
outputs:
    # Location handed to train-model.py as --model_output, typed as an MLflow model.
    model_output:
        type: mlflow_model
# Folder holding train-model.py (presumably uploaded together with the component).
code: ./src
# NOTE(review): the environment name indicates Ubuntu 18.04 / Python 3.7;
# confirm this curated environment is still available in the workspace.
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# Folded scalar (>-): the lines below are joined into a single command line.
command: >-
    python train-model.py
    --training_data ${{inputs.training_data}}
    --reg_rate ${{inputs.reg_rate}}
    --model_output ${{outputs.model_output}}

Overwriting train-model.yml


<h3>  Load the Components </h3>

In [26]:
from azure.ai.ml import load_component
# Prefix for the component YAML paths; empty means they are resolved
# relative to the current working directory.
parent_dir = ""

# Load each YAML definition as a component that can be called in a pipeline.
prep_data = load_component(source=f"{parent_dir}./prep-data.yml")
train_logistic_regression = load_component(source=f"{parent_dir}./train-model.yml")

<h3> Build The Pipeline</h3>

In [27]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# Two-step pipeline: prepare the data, then train a model on the prepared data.
# NOTE(review): the local variable names are left as-is on purpose; the printed
# job lists the first step as `prepared_data`, matching the variable name.
# Documentation is kept in comments because a docstring may be picked up by
# the @pipeline decorator (e.g. as the job description) - TODO confirm.
@pipeline()
def diabetes_classification(pipeline_job_input):
    prepared_data=prep_data(input_data=pipeline_job_input)  # step 1: the source file passed in as the pipeline input
    trained_model=train_logistic_regression(training_data=prepared_data.outputs.output_data)  # step 2: consumes the folder produced by the preparation step

    # Expose both step outputs as outputs of the pipeline itself.
    return {
        "pipeline_job_transformed_data":prepared_data.outputs.output_data,  # output of the preparation step
        "pipeline_job_trained_model": trained_model.outputs.model_output  # output of the training step
    }

# Pipeline input: a single file referenced as `azureml:diabetes-data:1`.
diabetes_data_input = Input(path="azureml:diabetes-data:1", type=AssetTypes.URI_FILE)

# Calling the decorated function builds the pipeline job; nothing is submitted yet.
pipeline_job = diabetes_classification(pipeline_job_input=diabetes_data_input)

In [28]:
# Print the pipeline job's configuration (inputs, outputs and the jobs it
# contains) so it can be reviewed before submission.
print(pipeline_job)

display_name: diabetes_classification
type: pipeline
inputs:
  pipeline_job_input:
    type: uri_file
    path: azureml:diabetes-data:1
outputs:
  pipeline_job_transformed_data:
    type: uri_folder
  pipeline_job_trained_model:
    type: mlflow_model
jobs:
  prepared_data:
    type: command
    inputs:
      input_data:
        path: ${{parent.inputs.pipeline_job_input}}
    outputs:
      output_data: ${{parent.outputs.pipeline_job_transformed_data}}
    component:
      $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
      name: prep_data
      version: '1'
      display_name: Prepare Training Data
      type: command
      inputs:
        input_data:
          type: uri_file
      outputs:
        output_data:
          type: uri_folder
      command: python prep-data.py --input_data ${{inputs.input_data}} --output_data
        ${{outputs.output_data}}
      environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
      code: /mnt/batc

In [29]:
# Pipeline-level defaults: the compute target and the datastore.
pipeline_job.settings.default_compute = "aml-cluster"
pipeline_job.settings.default_datastore = "workspaceblobstore"

# Switch both pipeline outputs to "upload" mode.
for pipeline_output in (
    pipeline_job.outputs.pipeline_job_transformed_data,
    pipeline_job.outputs.pipeline_job_trained_model,
):
    pipeline_output.mode = "upload"

<h3> Submit Pipeline Job </h3>

In [30]:
# Submit the pipeline under the "pipeline diabetes" experiment and keep the
# returned job object in place of the local definition.
pipeline_job = ml_client.jobs.create_or_update(
    experiment_name="pipeline diabetes",
    job=pipeline_job,
)

# Link for following the run in the studio UI.
studio_url = pipeline_job.studio_url
print(f"Monitor Pipeline Job: {studio_url}")

[32mUploading src (0.0 MBs):   0%|          | 0/3923 [00:00<?, ?it/s][32mUploading src (0.0 MBs): 100%|██████████| 3923/3923 [00:00<00:00, 39500.51it/s]
[39m



Monitor Pipeline Job: https://ml.azure.com/runs/sleepy_egg_xsc2kkl7w1?wsid=/subscriptions/18a1f27f-edf5-495e-9acb-753c93335294/resourcegroups/rg-dp100-lb638a26176bc414592/workspaces/mlw-dp100-lb638a26176bc414592&tid=6a1d2f96-8cdf-4d1a-943d-7b73f4dfbb6d
