In [1]:
#conda install azure-common azure-ai-ml==0.1.0b7 mltable==0.1.0b4 azureml_dataprep azureml_dataprep_rslex responsibleai raiwidgets pandas pyarrow shap 

In [2]:
import warnings

# Install a blanket "ignore" filter so that warnings raised by the libraries
# used in this notebook do not clutter the cell outputs.
warnings.simplefilter("ignore")

In [3]:
import pandas as pd
import sklearn
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

from raiwidgets import ResponsibleAIDashboard
from responsibleai import RAIInsights
from urllib.request import urlretrieve
import zipfile

'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.


In [4]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Environment, BuildContext
from azureml.mlflow import register_model
import mlflow
import pandas as pd

# Connect to the workspace: one credential object is shared by both clients.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# Second client, pointed at the "azureml" registry but reusing the workspace
# client's subscription and resource group.
registry_name = "azureml"
ml_client_registry = MLClient(
    credential=credential,
    registry_name=registry_name,
    subscription_id=ml_client.subscription_id,
    resource_group_name=ml_client.resource_group_name,
)

Found the config file in: ./config.json


In [5]:
# Name of the compute target: looked up (and created if missing) in the next
# cell, and passed to the pipeline decorator as its `compute`.
compute_name: str = "trainingcompute"

In [6]:
from azure.ai.ml.entities import AmlCompute

# Names of every compute target the workspace client can list.
all_compute_names = [target.name for target in ml_client.compute.list()]

if compute_name not in all_compute_names:
    # Nothing with this name yet: describe the cluster and request its creation.
    my_compute = AmlCompute(
        name=compute_name,
        size="Standard_DS2_v2",
        min_instances=0,
        max_instances=4,
        idle_time_before_scale_down=3600,
    )
    # The value returned by begin_create_or_update is discarded, so this cell
    # does not wait for the creation request to finish.
    ml_client.compute.begin_create_or_update(my_compute)
    print("Initiated compute creation")
else:
    print(f"Found existing compute: {compute_name}")

Found existing compute: trainingcompute


In [7]:
# Both names start out as the string '1'.
# NOTE: `version` is reassigned further down from the RAI constructor component.
rai_hospital_classifier_version_string = version = '1'

In [25]:
# "<base>_<suffix>:1" identifier of the model, and the same identifier with
# the "azureml:" prefix used when the model is passed as a pipeline input.
expected_model_id = '{}_{}:1'.format(model_base_name, model_name_suffix)
azureml_model_id = 'azureml:' + expected_model_id

In [26]:
def get_categorical_numerical_data(dataset, target=None):
    """Split the feature columns of ``dataset`` into categorical and numerical names.

    A column counts as categorical when its dtype is ``object`` or ``bool``;
    every other feature column is reported as numerical.

    Args:
        dataset: pandas DataFrame containing the features and the target column.
            It is not modified.
        target: Name of the target column to exclude. When omitted, the
            notebook-level ``target_column`` variable is used, which keeps
            existing single-argument calls working.

    Returns:
        A ``(categorical, numerical)`` tuple: ``categorical`` is a list of
        column names in frame order, ``numerical`` is a pandas ``Index`` of
        the remaining feature columns (sorted, as produced by
        ``Index.difference``).

    Raises:
        KeyError: If the target column is not present in ``dataset``.
    """
    if target is None:
        # Backward-compatible fallback to the global defined in a later cell.
        target = target_column
    features = dataset.drop(columns=[target])
    # Iterate over dtypes with .items(): DataFrame.iteritems() was removed in
    # pandas 2.0, so the original loop fails on current pandas versions.
    categorical = [
        col
        for col, dtype in features.dtypes.items()
        if dtype == 'object' or dtype == 'bool'
    ]
    numerical = features.columns.difference(categorical)
    return categorical, numerical

In [None]:
# Read the train and test splits from the local parquet files (train first).
train_data, test_data = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('data/train_dataset.parquet', 'data/test_dataset.parquet')
)

# name of registered datasets
training_dataset_filename, testing_dataset_filename = (
    'hospital_train_parquet',
    'hospital_test_parquet',
)

In [None]:
# Label column: dropped from the feature frame by
# get_categorical_numerical_data and passed to the pipeline as the target.
target_column: str = "readmit_status"

In [27]:
# Derive the categorical / numerical feature names from the training split
# and show both groups (printed text is unchanged: label, two spaces, value).
categorical, numerical = get_categorical_numerical_data(train_data)
print(f"categorical columns:  {categorical}")
print(f"numerical field:  {numerical}")

categorical columns:  ['race', 'gender', 'age', 'discharge_destination', 'max_glu_serum', 'A1Cresult', 'insulin', 'diabetes_Med_prescribe', 'medicare', 'medicaid']
numerical field:  Index(['admission_source', 'num_lab_procedures', 'num_medications',
       'num_procedures', 'number_diagnoses', 'primary_diagnosis',
       'prior_emergency', 'prior_inpatient', 'prior_outpatient',
       'time_in_hospital'],
      dtype='object')


In [28]:
label = "latest"

# The constructor component is resolved by label; its version then pins the
# version requested for every other RAI component.
rai_constructor_component = ml_client_registry.components.get(
    name="microsoft_azureml_rai_tabular_insight_constructor", label=label
)
version = rai_constructor_component.version

# Fetch the remaining components in the same order as before:
# explanation, error analysis, gather.
(
    rai_explanation_component,
    rai_erroranalysis_component,
    rai_gather_component,
) = [
    ml_client_registry.components.get(name=component_name, version=version)
    for component_name in (
        "microsoft_azureml_rai_tabular_explanation",
        "microsoft_azureml_rai_tabular_erroranalysis",
        "microsoft_azureml_rai_tabular_insight_gather",
    )
]

In [None]:
import json

# Pipeline definition: builds RAI insights for the hospital readmission
# classifier, adds an explanation and an error analysis, and gathers them
# into a dashboard plus a UX JSON output.
#
# Inputs:
#   target_column_name - name of the label column, forwarded to the constructor.
#   training_data      - forwarded as the constructor's `train_dataset`.
#   testing_data       - forwarded as the constructor's `test_dataset`.
# Returns a dict with the gather step's "dashboard" and "ux_json" outputs.
#
# NOTE(review): documented with `#` comments (no docstring) and with local
# variable names left untouched, because the pipeline decorator may derive
# step names / metadata from them - confirm before refactoring.
@dsl.pipeline(
        compute=compute_name,
        description="RAI computation on hospital readmit classification data",
        experiment_name=
        f"RAI_hospital_Classification_RAIInsights_Computation_{model_name_suffix}",
    )
def rai_classification_pipeline(
        target_column_name,
        training_data,
        testing_data
    ):
        # Initiate the RAIInsights: the constructor receives the model (as an
        # MLflow model input referenced by `azureml_model_id`), both datasets,
        # the target column and the categorical column names as a JSON string.
        create_rai_job = rai_constructor_component(
            title="RAI Dashboard",
            task_type="classification",
            model_info=expected_model_id,
            model_input=Input(type=AssetTypes.MLFLOW_MODEL, path=azureml_model_id),            
            train_dataset=training_data,
            test_dataset=testing_data,
            target_column_name=target_column_name,
            #classes=json.dumps(['not readmitted', 'readmitted']),
            categorical_column_names=json.dumps(categorical),
        )
        # Every step gets the same limit of 120 (presumably seconds - TODO confirm).
        create_rai_job.set_limits(timeout=120)
        
        # Add an explanation, computed from the constructor's dashboard output
        explain_job = rai_explanation_component(
            comment="Explanation for hospital remitted less than 30days  classification",
            rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,
        )
        explain_job.set_limits(timeout=120)
        
        # Add error analysis, also computed from the constructor's dashboard output
        erroranalysis_job = rai_erroranalysis_component(
            rai_insights_dashboard=create_rai_job.outputs.rai_insights_dashboard,
        )
        erroranalysis_job.set_limits(timeout=120)

        # Combine everything: the gather step takes the constructor output plus
        # the explanation (slot insight_1) and the error analysis (slot insight_4).
        rai_gather_job = rai_gather_component(
            constructor=create_rai_job.outputs.rai_insights_dashboard,
            insight_1=explain_job.outputs.explanation,
            insight_4=erroranalysis_job.outputs.error_analysis,
        )
        rai_gather_job.set_limits(timeout=120)

        # Both gathered outputs are switched to "upload" mode.
        rai_gather_job.outputs.dashboard.mode = "upload"
        rai_gather_job.outputs.ux_json.mode = "upload"

        # Expose the gather step's outputs as the pipeline's outputs.
        return {
            "dashboard": rai_gather_job.outputs.dashboard,
            "ux_json": rai_gather_job.outputs.ux_json
        }

In [31]:
import uuid
from azure.ai.ml import Output

# Random folder name so each submission writes its outputs to a fresh path.
rand_path = str(uuid.uuid4())

# Pipeline to construct the RAI Insights
insights_pipeline_job = rai_classification_pipeline(
    target_column_name=target_column,
    training_data=hospital_train_parquet,
    testing_data=hospital_test_parquet,
)

# Workaround to enable the download: both pipeline outputs are redirected to
# explicit folders on the workspace blob store, in upload mode.
_upload_folder_kwargs = {"mode": "upload", "type": "uri_folder"}
insights_pipeline_job.outputs.dashboard = Output(
    path=f"azureml://datastores/workspaceblobstore/paths/{rand_path}/dashboard/",
    **_upload_folder_kwargs,
)
insights_pipeline_job.outputs.ux_json = Output(
    path=f"azureml://datastores/workspaceblobstore/paths/{rand_path}/ux_json/",
    **_upload_folder_kwargs,
)

# submit pipeline
insights_job = submit_and_wait(ml_client, insights_pipeline_job)

[32mUploading rai_hospital_readmission_score_card_config.json[32m (< 1 MB): 100%|██████████| 528/528 [00:00<00:00, 48.8kB/s]
[39m



Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
