In [2]:
from azureml.core import Workspace, Datastore, Dataset

# Connect to the Azure ML workspace. No arguments are passed, so the workspace
# details are presumably read from a local config file — confirm one is present.
ws = Workspace.from_config()

In [5]:
from azureml.core import Dataset
# Look up the previously registered diabetes dataset by its name
ds = Dataset.get_by_name(workspace=ws, name='diab_dataset')
# Pull only the first five records and show them as a pandas DataFrame
preview = ds.take(5)
preview.to_pandas_dataframe().head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


## Hyperparameter tuning using HyperDrive

In [16]:
from azureml.train.hyperdrive import RandomParameterSampling, choice, BanditPolicy
import numpy as np
# Search space: each key is a command-line argument handed to the training script,
# and random sampling draws one value per argument for every trial.
param_space = {
    '--eta': choice(*np.linspace(0.001, 1, 10).tolist()),  # 10 evenly spaced values in [0.001, 1]
    '--colsample': choice(0.5, 0.6, 0.7, 0.8, 0.9, 1),
    '--max_depth': choice(3, 4, 5, 6, 7, 8, 9, 10),
}
param_sampling = RandomParameterSampling(param_space)

# Early-termination policy. NOTE: the slack is 3% (0.03), not 5%; the intervals
# below count metric-reporting intervals of a run, not whole runs.
stopping_policy = BanditPolicy(
    slack_factor=0.03,      # stop runs whose metric falls outside a 3% slack of the best run
    evaluation_interval=1,  # apply the policy at every interval in which the metric is logged
    delay_evaluation=5,     # skip the policy for the first 5 intervals
)

In [17]:
from azureml.core import Experiment
from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal
from azureml.widgets import RunDetails


# Folder holding the training code, and the entry-point script inside it
script_dir = 'param_tuning'
script_name = 'tuner.py'

# Expose the registered dataset to the training script under a fixed name
training_input = ds.as_named_input('training_data')

# Estimator: describes how a single trial is executed on the remote cluster
estimator = Estimator(
    source_directory=script_dir,
    entry_script=script_name,
    compute_target='DS-Ass-Cluster',
    conda_packages=['scikit-learn'],
    inputs=[training_input],
    # azureml-dataprep[pandas] is required when using datasets
    pip_packages=['azureml-dataprep[pandas]', 'xgboost'],
)

# HyperDrive configuration: sampling strategy, early stopping, metric to
# optimise and the run budget
hyperdrive_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=stopping_policy,
    primary_metric_name='AUC',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_concurrent_runs=10,
    max_total_runs=500,
)

# Create (or reuse) the experiment and launch the sweep
experiment = Experiment(workspace=ws, name='param_tuning_hyperdrive')
hyperdrive_run = experiment.submit(config=hyperdrive_config)

# Show the live monitoring widget, then block until the sweep has finished
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_9f761e88-3dfa-42bc-969e-4d34c5e45896',
 'target': 'DS-Ass-Cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-20T14:19:26.129485Z',
 'endTimeUtc': '2020-11-20T15:50:15.008939Z',
 'properties': {'primary_metric_config': '{"name": "AUC", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'e29803b8-14d2-4fec-8b64-9455d43457ef',
  'score': '0.9618356721434783',
  'best_child_run_id': 'HD_9f761e88-3dfa-42bc-969e-4d34c5e45896_452',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://ml2ssandbox5194139173.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_9f761e88-3dfa-42bc-969e-4d34c5e45896/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=sYzZTbCE5dRtRV6%2FOjXdLxkSnvA4WIY9TBFm2dMpcc0%3D&st=2020-11-23T07%3A51%3A37Z&se=2020-11-23T16%3A01%3A37Z&sp=r'}}

In [55]:
# Re-display the monitoring widget for the HyperDrive run submitted earlier
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [59]:
# Print the id of every child run next to the metrics it logged
for run in hyperdrive_run.get_children():
    run_id = run.id
    run_metrics = run.get_metrics()
    print(run_id, run_metrics)

HD_5acbc787-5999-4d7b-9efc-a506e0661c90_28 {'AUC': 0.7404206598803899}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_27 {'AUC': 0.7344015882019286}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_29 {'AUC': 0.7511490978157646}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_22 {'AUC': 0.7464181563343173}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_24 {'AUC': 0.7362974102370158}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_23 {'AUC': 0.747506245896425}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_25 {'AUC': 0.7345022058489108}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_26 {'AUC': 0.7430770988144193}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_21 {'AUC': 0.7502530297767402}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_20 {'AUC': 0.7352198936390024}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_19 {'AUC': 0.7277860958935911}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_18 {'AUC': 0.7533856022808268}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_16 {'AUC': 0.7317330965743941}
HD_5acbc787-5999-4d7b-9efc-a506e0661c90_17 {'AUC': 0.7332319542538716}
HD_5acb

In [21]:
# Select the child run with the best primary metric and register its model,
# storing that run's AUC as a model property
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_auc = best_run.get_metrics()['AUC']
best_run.register_model(
    model_name='model_xgb',
    model_path='outputs/model.pkl',
    tags={'Training context': 'Hyperdrive'},
    properties={'AUC': best_auc},
)

Model(workspace=Workspace.create(name='ML-2s-sandbox', subscription_id='08265842-251e-450c-8d28-a06ee3f3c611', resource_group='DataSandbox'), name=model_xgb, id=model_xgb:1, version=1, tags={'Training context': 'Hyperdrive'}, properties={'AUC': '0.9618356721434783'})

In [27]:
# Show only the first six child runs together with their logged metrics
for position, child_run in enumerate(hyperdrive_run.get_children(), start=1):
    print(child_run, child_run.get_metrics())
    if position > 5:
        break

Run(Experiment: param_tuning_hyperdrive,
Id: HD_9f761e88-3dfa-42bc-969e-4d34c5e45896_476,
Type: azureml.scriptrun,
Status: Completed) {'AUC': 0.9553389953542393}
Run(Experiment: param_tuning_hyperdrive,
Id: HD_9f761e88-3dfa-42bc-969e-4d34c5e45896_477,
Type: azureml.scriptrun,
Status: Completed) {'AUC': 0.9439469320066335}
Run(Experiment: param_tuning_hyperdrive,
Id: HD_9f761e88-3dfa-42bc-969e-4d34c5e45896_479,
Type: azureml.scriptrun,
Status: Completed) {'AUC': 0.9496653213081535}
Run(Experiment: param_tuning_hyperdrive,
Id: HD_9f761e88-3dfa-42bc-969e-4d34c5e45896_478,
Type: azureml.scriptrun,
Status: Completed) {'AUC': 0.9359734115137323}
Run(Experiment: param_tuning_hyperdrive,
Id: HD_9f761e88-3dfa-42bc-969e-4d34c5e45896_475,
Type: azureml.scriptrun,
Status: Completed) {'AUC': 0.9505793168020689}
Run(Experiment: param_tuning_hyperdrive,
Id: HD_9f761e88-3dfa-42bc-969e-4d34c5e45896_472,
Type: azureml.scriptrun,
Status: Completed) {'AUC': 0.9508537217991235}


## Explainability

In [31]:
from azureml.core import Model
# Get a handle to the model registered under the name 'model_xgb' in the workspace
model_obj = Model(workspace=ws, name='model_xgb')

In [35]:
import joblib
# Download the registered model and load it from the path that download() reports,
# rather than assuming the file is named 'model.pkl' inside the target folder.
model_path = model_obj.download('param_tuning/', exist_ok=True)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load models that come from a trusted workspace.
model = joblib.load(model_path)

In [45]:
from interpret.ext.blackbox import TabularExplainer

# Define features and labels.
# Materialise the dataset once and reuse the frame; every to_pandas_dataframe()
# call loads the whole dataset again.
feature_df = ds.to_pandas_dataframe().drop(columns=['PatientID', 'Diabetic'])
features = feature_df.columns.tolist()
labels = ['negative', 'positive']  # presumably Diabetic == 0 / 1 — confirm the class order

# Build the explainer around the downloaded model, using the feature frame as
# initialization examples
tab_explainer = TabularExplainer(model=model,
                                 initialization_examples=feature_df,
                                 features=features,
                                 classes=labels)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
The option feature_dependence has been renamed to feature_perturbation!
The option feature_perturbation="independent" is has been renamed to feature_perturbation="interventional"!


In [54]:
# Global explanation on a 100-row sample of the feature columns.
# random_state pins the sample so the reported importances are reproducible on re-run.
df = ds.to_pandas_dataframe().drop(columns=['PatientID', 'Diabetic']).sample(n=100, random_state=42)
glob_tab = tab_explainer.explain_global(df.values)
glob_tab.get_feature_importance_dict()

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




{'Pregnancies': 0.09719966318529832,
 'DiabetesPedigree': 0.007862072745263543,
 'Age': 0.0,
 'BMI': 0.0,
 'SerumInsulin': 0.0,
 'TricepsThickness': 0.0,
 'DiastolicBloodPressure': 0.0,
 'PlasmaGlucose': 0.0}

In [59]:
# Local explanation for two rows of the sampled frame.
# random_state pins which rows are explained so the output is reproducible on re-run.
df_loc = df.sample(n=2, random_state=42)
loc_tab = tab_explainer.explain_local(df_loc.values)
# Feature names ranked by local importance, then the matching importance values
display(loc_tab.get_ranked_local_names())
display(loc_tab.get_ranked_local_values())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [60]:
# Explanations can also be retrieved directly from the experiment by using ExplanationClient in the input script