In [1]:
!pip install -r lib/requirements.txt

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import sklearn
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

In [4]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Environment, BuildContext
from azureml.mlflow import register_model
import mlflow
import pandas as pd

# Coordinates of the target Azure ML workspace; replace the placeholders before running.
subscription_id = "<SUBSCRIPTION_ID>"
resource_group = "<RESOURCE_GROUP>"
workspace = "<AML_WORKSPACE_NAME>"

registry_name = "azureml"

# Connect to the workspace using whatever credential DefaultAzureCredential resolves.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

In [5]:
# Version label interpolated into both component YAML definitions and into the
# experiment name of the training pipeline.
rai_hospital_version_string = "1"
# NOTE(review): `version` is not referenced by any cell visible here.
version = "1"

In [6]:
# Name of the compute target the pipeline runs on; the next cell looks it up
# in the workspace and creates it when it is missing.
compute_name = "trainingcompute"

In [7]:
from azure.ai.ml.entities import AmlCompute

# Names of every compute target that already exists in the workspace.
all_compute_names = [x.name for x in ml_client.compute.list()]

if compute_name in all_compute_names:
    print(f"Found existing compute: {compute_name}")
else:
    my_compute = AmlCompute(
        name=compute_name,
        size="Standard_DS2_v2",
        min_instances=0,
        max_instances=4,
        idle_time_before_scale_down=3600
    )
    creation_poller = ml_client.compute.begin_create_or_update(my_compute)
    print("Initiated compute creation")
    # begin_create_or_update only *starts* provisioning and hands back a poller.
    # Block on it so the pipeline submitted further down never targets a
    # cluster that does not exist yet.
    creation_poller.result()
    print(f"Created compute: {compute_name}")

Found existing compute: trainingcompute


In [8]:
# Load the training and testing splits from the local data folder.
train_data, test_data = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('data/training_data.parquet', 'data/testing_data.parquet')
)

In [9]:
# Label column; handed to the training pipeline as its `target_column_name` argument.
target_column = "readmit_status"

In [10]:
# Rich-display the training frame for a quick visual check of columns and values.
display(train_data)

Unnamed: 0,race,gender,age,discharge_destination,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,prior_outpatient,...,prior_inpatient,primary_diagnosis,number_diagnoses,max_glu_serum,A1Cresult,insulin,diabetes_Med_prescribe,readmit_status,medicare,medicaid
39348,Caucasian,Male,Over 60 years,Discharged to Home,Other,1,18,1,12,0,...,1,Other,9,,,No,No,not readmitted,False,False
52111,Caucasian,Male,Over 60 years,Other,Other,5,29,3,21,1,...,0,Other,9,,,Steady,Yes,not readmitted,True,False
42567,Caucasian,Male,Over 60 years,Other,Other,7,50,1,15,0,...,0,Respitory Issues,8,,,Down,Yes,not readmitted,True,False
55713,AfricanAmerican,Male,Over 60 years,Discharged to Home,Other,3,46,2,18,0,...,1,Respitory Issues,9,,,Steady,Yes,not readmitted,True,False
60066,Other,Female,30-60 years,Discharged to Home,Other,3,1,0,4,1,...,1,Other,6,,,No,Yes,readmitted,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35764,Caucasian,Female,Over 60 years,Discharged to Home,Other,2,64,0,10,0,...,1,Respitory Issues,5,,Norm,Steady,Yes,not readmitted,False,False
46719,Caucasian,Female,Over 60 years,Discharged to Home,Other,7,65,0,19,0,...,0,Other,9,,,No,No,not readmitted,True,False
73613,Caucasian,Male,Over 60 years,Other,Other,10,50,0,25,0,...,0,Other,8,,,No,Yes,not readmitted,True,False
56143,AfricanAmerican,Male,Over 60 years,Discharged to Home,Other,2,31,0,10,0,...,0,Other,8,,,Steady,Yes,readmitted,True,False


In [11]:
import os
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


def _register_parquet(asset_name, local_path, summary):
    """Build a URI_FILE data asset for one parquet file and register it.

    Returns a pair: the locally built ``Data`` entity and whatever
    ``ml_client.data.create_or_update`` returns for it.
    """
    asset = Data(
        name=asset_name,
        path=local_path,
        type=AssetTypes.URI_FILE,
        description=summary,
    )
    return asset, ml_client.data.create_or_update(asset)


training_dataset_filename = 'hospital_train_parquet'
testing_dataset_filename = 'hospital_test_parquet'

# Training split first, then the testing split.
training_data, tr_data = _register_parquet(
    training_dataset_filename,
    'data/training_data.parquet',
    "RAI hospital  train data",
)

testing_data, te_data = _register_parquet(
    testing_dataset_filename,
    'data/testing_data.parquet',
    "RAI hospital  test data",
)


In [12]:
import os

# Working folders: 'component' and 'register_model_src' receive the scripts
# written by the %%writefile cells below.
for folder in ('component', 'register_model_src', 'environment'):
    os.makedirs(folder, exist_ok=True)

In [13]:
%%writefile component/hospital_training.py

import argparse
import os
import shutil
import tempfile

from azureml.core import Run

import mlflow
import mlflow.sklearn

import pandas as pd
import numpy as np
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split



def parse_args():
    """Parse the training script's command-line options.

    Returns:
        argparse.Namespace with ``training_data``, ``target_column_name`` and
        ``model_output``; each is a ``str``, or ``None`` when the flag is omitted.
    """
    cli = argparse.ArgumentParser()

    # flag -> help text, registered in this order
    option_help = {
        "--training_data": "Path to training data",
        "--target_column_name": "Name of target column",
        "--model_output": "Path of output model",
    }
    for flag, help_text in option_help.items():
        cli.add_argument(flag, type=str, help=help_text)

    return cli.parse_args()

def get_categorical_index(categorical_fields):
    """Return the positions of the object-dtype columns of a DataFrame.

    Args:
        categorical_fields: ``pandas.DataFrame`` whose columns are inspected.

    Returns:
        list: ``columns.get_loc(name)`` for every column whose dtype equals
        ``'object'``, in column order. The list is also printed to stdout.
    """
    # DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
    # items() yields the same (name, Series) pairs and exists in every version.
    cat_idx = [
        categorical_fields.columns.get_loc(col)
        for col, value in categorical_fields.items()
        if value.dtype == 'object'
    ]
    print("col indices: ", cat_idx)
    return cat_idx



def main(args):
    """Train the classifier on the given parquet file and save it as an MLflow model.

    Args:
        args: parsed CLI namespace providing ``training_data`` (parquet path),
            ``target_column_name`` (label column) and ``model_output``
            (directory the MLflow model files are copied into).
    """
    # Point MLflow at the tracking URI of the workspace this run belongs to.
    current_experiment = Run.get_context().experiment
    tracking_uri = current_experiment.workspace.get_mlflow_tracking_uri()
    print("tracking_uri: {0}".format(tracking_uri))
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(current_experiment.name)

    # Read in data
    print("Reading data")
    all_training_data = pd.read_parquet(args.training_data)
    target = all_training_data[args.target_column_name]
    features = all_training_data.drop([args.target_column_name], axis = 1)

    # Transform string data to numeric: non-numeric columns are one-hot
    # encoded, numeric columns are standardised.
    numerical_selector = selector(dtype_include=np.number)
    categorical_selector = selector(dtype_exclude=np.number)

    numerical_columns = numerical_selector(features)
    categorical_columns = categorical_selector(features)

    categorial_encoder = OneHotEncoder(handle_unknown="ignore")
    numerical_encoder = StandardScaler()

    preprocessor = ColumnTransformer([
    ('categorical-encoder', categorial_encoder, categorical_columns),
    ('standard_scaler', numerical_encoder, numerical_columns)])

    # Called only for the column-index line it prints to the job log; the
    # pipeline itself selects columns by name, so the result is not kept.
    get_categorical_index(features)
    clf = make_pipeline(preprocessor, LogisticRegression())

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=1)

    print("Training model...")

    model = clf.fit(X_train, y_train)

    # The 30% hold-out used to be split off and then ignored. Report its
    # accuracy so the job log shows how the fitted model performs on unseen rows.
    print("Hold-out accuracy: {0:.4f}".format(model.score(X_test, y_test)))

    # Saving model with mlflow - leave this section unchanged
    model_dir =  "./model_output"
    with tempfile.TemporaryDirectory() as td:
        print("Saving model with MLFlow to temporary directory")
        tmp_output_dir = os.path.join(td, model_dir)
        mlflow.sklearn.save_model(sk_model=model, path=tmp_output_dir)

        print("Copying MLFlow model to output path")
        for file_name in os.listdir(tmp_output_dir):
            print("  Copying: ", file_name)
            shutil.copy2(src=os.path.join(tmp_output_dir, file_name), dst=os.path.join(args.model_output, file_name))


# script entry point
if __name__ == "__main__":
    banner = "*" * 60

    # visual break before the script's own log lines
    print(banner)
    print("\n\n")

    # read the command line and run the training
    args = parse_args()
    main(args)

    # visual break after the run
    print(banner)
    print("\n\n")

Overwriting component/hospital_training.py


In [14]:
%%writefile register_model_src/model_register.py

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import argparse
import json
import os
import time


from azureml.core import Run

import mlflow
import mlflow.sklearn

# Based on example:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli
# which references
# https://github.com/Azure/azureml-examples/tree/main/cli/jobs/train/lightgbm/iris


def parse_args():
    """Parse the registration script's command-line options.

    Returns:
        argparse.Namespace with ``model_input_path``, ``model_info_output_path``
        and ``model_base_name``; each is a ``str``, or ``None`` when omitted.
    """
    cli = argparse.ArgumentParser()

    # flag -> help text, registered in this order
    option_help = {
        "--model_input_path": "Path to input model",
        "--model_info_output_path": "Path to write model info JSON",
        "--model_base_name": "Name of the registered model",
    }
    for flag, help_text in option_help.items():
        cli.add_argument(flag, type=str, help=help_text)

    return cli.parse_args()


def main(args):
    """Register the trained model through MLflow and write its ``name:version`` id.

    Args:
        args: parsed CLI namespace providing ``model_input_path`` (MLflow
            sklearn model directory), ``model_base_name`` (registered model
            name) and ``model_info_output_path`` (directory that receives
            ``model_info.json``).
    """
    # Point MLflow at the tracking URI of the workspace this run belongs to.
    current_experiment = Run.get_context().experiment
    tracking_uri = current_experiment.workspace.get_mlflow_tracking_uri()
    print("tracking_uri: {0}".format(tracking_uri))
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(current_experiment.name)

    print("Loading model")
    mlflow_model = mlflow.sklearn.load_model(args.model_input_path)

    registered_name = args.model_base_name
    print(f"Registering model as {registered_name}")

    print("Registering via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=mlflow_model,
        registered_model_name=registered_name,
        artifact_path=registered_name,
    )

    # The name is fixed, so every run adds another version to the same
    # registered model. Look the newest version up instead of hard-coding 1,
    # which would keep pointing at the model from the very first run.
    registry = mlflow.tracking.MlflowClient()
    known_versions = [
        int(entry.version)
        for entry in registry.search_model_versions("name='{0}'".format(registered_name))
    ]
    # Fall back to the previous fixed value if the registry returns nothing.
    model_version = max(known_versions) if known_versions else 1

    print("Writing JSON")
    model_info = {"id": "{0}:{1}".format(registered_name, model_version)}
    output_path = os.path.join(args.model_info_output_path, "model_info.json")
    with open(output_path, "w") as of:
        json.dump(model_info, fp=of)


# script entry point
if __name__ == "__main__":
    banner = "*" * 60

    # visual break before the script's own log lines
    print(banner)
    print("\n\n")

    # read the command line and run the registration
    args = parse_args()
    main(args)

    # visual break after the run
    print(banner)
    print("\n\n")

Overwriting register_model_src/model_register.py


In [15]:
from azure.ai.ml import load_component

# Definition of the training command component, written as ONE f-string:
# `{rai_hospital_version_string}` is interpolated, and every `{{{{` / `}}}}`
# collapses to the literal `{{` / `}}` of the `${{inputs.x}}` bindings.
# (Previously the text was an f-string glued to a raw string and then passed
# through `str.format` a second time, which would break on any brace in the
# interpolated value; the bytes written to disk are unchanged.)
yaml_contents = f"""
$schema: http://azureml/sdk-2-0/CommandComponent.json
name: rai_training_component
display_name: hospital  classification training component for RAI example
version: {rai_hospital_version_string}
type: command
inputs:
  training_data:
    type: path
  target_column_name:
    type: string
outputs:
  model_output:
    type: path
code: ./component/
environment: azureml://registries/azureml/environments/AzureML-responsibleai-0.20-ubuntu20.04-py38-cpu/versions/4

command: >-
  python hospital_training.py
  --training_data ${{{{inputs.training_data}}}}
  --target_column_name ${{{{inputs.target_column_name}}}}
  --model_output ${{{{outputs.model_output}}}}
"""

yaml_filename = "RAIhospitalClassificationTrainingComponent.yaml"

# Persist the definition, load it back as a component object, and register it
# in the workspace.
with open(yaml_filename, 'w') as f:
    f.write(yaml_contents)

train_component_definition = load_component(
    source=yaml_filename
)

ml_client.components.create_or_update(train_component_definition)

CommandComponent({'auto_increment_version': False, 'source': 'REMOTE.WORKSPACE.COMPONENT', 'is_anonymous': False, 'name': 'rai_training_component', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/8a0f6419-1f4c-45b3-8d92-ee53be1ea443/resourceGroups/demoRG/providers/Microsoft.MachineLearningServices/workspaces/aml-ws/components/rai_training_component/versions/13', 'Resource__source_path': None, 'base_path': './', 'creation_context': <azure.ai.ml._restclient.v2022_05_01.models._models_py3.SystemData object at 0x7fba48512d90>, 'serialize': <msrest.serialization.Serializer object at 0x7fba4856a910>, 'command': 'python hospital_training.py --training_data ${{inputs.training_data}} --target_column_name ${{inputs.target_column_name}} --model_output ${{outputs.model_output}}', 'code': '/subscriptions/8a0f6419-1f4c-45b3-8d92-ee53be1ea443/resourceGroups/demoRG/providers/Microsoft.MachineLearningServices/workspaces/aml-ws/codes/2b04ca47-e160-4a6f-8345-dcc160a87ba8/versions

In [16]:
# Definition of the model-registration command component. This is an f-string:
# `{rai_hospital_version_string}` is interpolated, and every `{{{{` / `}}}}`
# is emitted as the literal `{{` / `}}` of the `${{inputs.x}}` bindings.
yaml_contents = f"""
$schema: http://azureml/sdk-2-0/CommandComponent.json
name: register_trained_model
display_name: Register hospital Model
version: {rai_hospital_version_string}
type: command
is_deterministic: False
inputs:
  model_input_path:
    type: path
  model_base_name:
    type: string
outputs:
  model_info_output_path:
    type: path
code: ./register_model_src/
environment: azureml://registries/azureml/environments/AzureML-responsibleai-0.20-ubuntu20.04-py38-cpu/versions/4
command: >-
  python model_register.py
  --model_input_path ${{{{inputs.model_input_path}}}}
  --model_base_name ${{{{inputs.model_base_name}}}}
  --model_info_output_path ${{{{outputs.model_info_output_path}}}}

"""
yaml_filename = "model_register.yaml"

# Write the definition to disk so load_component can read it back.
with open(yaml_filename, 'w') as f:
    f.write(yaml_contents)
    
# Component object built from the YAML file; called as a step inside the pipeline below.
register_component = load_component(
    source=yaml_filename
)

# Register the component in the workspace.
ml_client.components.create_or_update(register_component)

CommandComponent({'auto_increment_version': False, 'source': 'REMOTE.WORKSPACE.COMPONENT', 'is_anonymous': False, 'name': 'register_trained_model', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/8a0f6419-1f4c-45b3-8d92-ee53be1ea443/resourceGroups/demoRG/providers/Microsoft.MachineLearningServices/workspaces/aml-ws/components/register_trained_model/versions/13', 'Resource__source_path': None, 'base_path': './', 'creation_context': <azure.ai.ml._restclient.v2022_05_01.models._models_py3.SystemData object at 0x7fba484e5370>, 'serialize': <msrest.serialization.Serializer object at 0x7fba484cd1c0>, 'command': 'python model_register.py --model_input_path ${{inputs.model_input_path}} --model_base_name ${{inputs.model_base_name}} --model_info_output_path ${{outputs.model_info_output_path}}', 'code': '/subscriptions/8a0f6419-1f4c-45b3-8d92-ee53be1ea443/resourceGroups/demoRG/providers/Microsoft.MachineLearningServices/workspaces/aml-ws/codes/9047cb19-1cde-428f-b03c-ed46

In [17]:
import time

# Value passed to the register component as `model_base_name`, i.e. the name
# the trained model is registered under.
model_base_name = 'rai_hospital_model'

In [18]:
from azure.ai.ml import dsl, Input


def _local_parquet_input(parquet_path):
    """Wrap a local parquet file as a download-mode ``uri_file`` pipeline input."""
    return Input(type="uri_file", path=parquet_path, mode="download")


hospital_train_parquet = _local_parquet_input("data/training_data.parquet")

hospital_test_parquet = _local_parquet_input("data/testing_data.parquet")

# Two-step pipeline: train the classifier, then register the model it produced.
# (Documented with comments rather than a docstring so the text handed to the
# DSL decorator stays exactly as it was.)
@dsl.pipeline(
    compute=compute_name,
    description="Register Model for RAI hospital ",
    experiment_name=f"RAI_hospital_Model_Training_{rai_hospital_version_string}",
)
def my_training_pipeline(target_column_name, training_data):
    # Step 1: run the training component on the supplied data and label column.
    trained_model = train_component_definition(
        target_column_name=target_column_name,
        training_data=training_data
    )
    # Cap on the training step's run time.
    # NOTE(review): unit is presumably seconds — confirm against the SDK docs.
    trained_model.set_limits(timeout=120)

    # Step 2: register the model written by step 1. The returned node is not
    # used afterwards, hence the throwaway name.
    _ = register_component(
        model_input_path=trained_model.outputs.model_output,
        model_base_name=model_base_name,
    )

    # The pipeline declares no outputs of its own.
    return {}

# Build the pipeline job for the label column and the training parquet input;
# submission happens in a later cell.
model_registration_pipeline_job = my_training_pipeline(target_column, hospital_train_parquet)

In [19]:
from azure.ai.ml.entities import PipelineJob
import webbrowser

def submit_and_wait(ml_client, pipeline_job, poll_interval_seconds: int = 30) -> PipelineJob:
    """Submit a pipeline job, poll until it reaches a terminal status, and return it.

    Args:
        ml_client: client whose ``jobs.create_or_update`` / ``jobs.get`` are used.
        pipeline_job: the pipeline job to submit.
        poll_interval_seconds: pause between two status polls (default 30,
            the previously hard-coded value).

    Returns:
        The job object as last fetched, whatever its final status. A status
        other than 'Completed' is reported on stdout but does not raise.

    Raises:
        AssertionError: if submission returns ``None``.
    """
    terminal_states = ('Completed', 'Failed', 'Canceled', 'NotResponding')

    created_job = ml_client.jobs.create_or_update(pipeline_job)
    assert created_job is not None

    while created_job.status not in terminal_states:
        time.sleep(poll_interval_seconds)
        created_job = ml_client.jobs.get(created_job.name)
        print("Latest status : {0}".format(created_job.status))

    # open the pipeline in web browser
    webbrowser.open(created_job.services["Studio"].endpoint)

    # Success is deliberately not asserted, so a failed run can still be
    # inspected from the returned object. Make the failure visible, though,
    # instead of returning as if nothing had happened.
    if created_job.status != 'Completed':
        print("WARNING: job {0} ended with status {1}".format(created_job.name, created_job.status))
    return created_job

# This is the actual submission
training_job = submit_and_wait(ml_client, model_registration_pipeline_job)

Latest status : Running
Latest status : Running
Latest status : Completed
