# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.pipeline.steps import AutoMLStep
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory


# Print the installed Azure ML SDK version so the environment that produced
# the outputs in this notebook is recorded alongside them.
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.

This dataset contains California housing price data downloaded from https://raw.githubusercontent.com/ageron/handson-ml/master/; it is used as an example in *Hands-On Machine Learning with Scikit-Learn & TensorFlow* by Aurélien Géron. The task performed in this notebook is regression: predicting `median_house_value` from the remaining columns.

The dataset is adapted from the original data in the StatLib repository, which was collected during the 1990 California census.

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [2]:
# Attach to the workspace described by the local configuration file.
ws = Workspace.from_config()

# Echo the workspace identity, one attribute per line, so the run is traceable.
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

# Every run submitted from this notebook is grouped under this experiment.
experiment_name = 'California-housing-price-projection'
experiment = Experiment(workspace=ws, name=experiment_name)

quick-starts-ws-136785
aml-quickstarts-136785
southcentralus
1b944a9b-fdae-4f97-aeb1-b7eea0beac53


## AutoML Configuration

TODO: Explain why you chose the automl settings and configuration you used below.

In [3]:
# Name of the AmlCompute cluster used as the compute target.
amlcompute_cluster_name = "compute-housing"

# Reuse the cluster if the workspace already has one under this name;
# otherwise provision a new low-priority cluster with up to 4 nodes.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        vm_priority='lowpriority',
        max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block (up to 10 minutes) until the cluster reports a terminal provisioning state.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=0,
    timeout_in_minutes=10)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [4]:
# Load the housing dataset from the workspace, registering it first when it is
# not there yet.
import train

key = "California-housing-price"
description_text = "California housing price from 1990 census"

# `ws.datasets` is fetched once and reused for both the membership test and
# the lookup instead of being queried twice.
registered_datasets = ws.datasets
found = key in registered_datasets

if found:
    # Already registered (e.g. by an earlier run of this notebook): reuse it.
    dataset = registered_datasets[key]
else:
    # Not registered yet: fetch the data from the external source and register it.
    DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
    HOUSING_PATH = "datasets/housing"
    HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
    # NOTE(review): `train.fetch_housing_data` lives in the local `train` module,
    # which is not visible here; it is presumably expected to return a pandas
    # DataFrame, since its result is passed as `dataframe` below -- confirm.
    housing_raw_df = train.fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    datastore = ws.get_default_datastore()
    # register_pandas_dataframe uploads the frame to the datastore AND registers
    # it under `name`, so the description is supplied here and no second
    # `dataset.register(...)` call under the same name is needed.
    dataset = TabularDatasetFactory.register_pandas_dataframe(
        dataframe=housing_raw_df,
        target=datastore,
        name=key,
        description=description_text)

# Materialize the registered dataset locally and show summary statistics.
df = dataset.to_pandas_dataframe()
df.describe()

datasets/housing/housing.csv


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [5]:
# Run limits and optimisation target for the AutoML experiment.
# NOTE(review): max_concurrent_iterations is 5, while the cluster provisioned in
# this notebook is created with max_nodes=4 -- confirm the cluster can actually
# run 5 iterations in parallel, otherwise the extra iteration just waits.
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=5,
    primary_metric='r2_score')

# Folder handed to AutoMLConfig as `path`.
project_folder = "./housing"

# Regression on the registered dataset, predicting `median_house_value`, with
# automatic featurization and early stopping enabled.
automl_config = AutoMLConfig(
    task="regression",
    compute_target=compute_target,
    training_data=dataset,
    label_column_name="median_house_value",
    path=project_folder,
    featurization='auto',
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings)

In [6]:
# Declare the two outputs that the AutoML step exposes from the pipeline:
# the metrics of the child runs and the best model.
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()

# Names under which the outputs are later retrieved from the pipeline run.
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

metrics_training_output = TrainingOutput(type='Metrics')
metrics_data = PipelineData(
    name='metrics_data',
    datastore=ds,
    pipeline_output_name=metrics_output_name,
    training_output=metrics_training_output)

model_training_output = TrainingOutput(type='Model')
model_data = PipelineData(
    name='model_data',
    datastore=ds,
    pipeline_output_name=best_model_output_name,
    training_output=model_training_output)

In [7]:
# Wrap the AutoML configuration in a pipeline step that publishes the metrics
# and best-model outputs; allow_reuse=True is passed through to the step.
step_outputs = [metrics_data, model_data]
automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    outputs=step_outputs,
    allow_reuse=True)

In [8]:
from azureml.pipeline.core import Pipeline

# Single-step pipeline containing only the AutoML step.
pipeline_steps = [automl_step]
pipeline = Pipeline(
    workspace=ws,
    steps=pipeline_steps,
    description="pipeline_with_automlstep")

In [9]:
# Submit the pipeline under the experiment created earlier; the returned run
# object is used by the following cells to monitor the run and fetch its outputs.
pipeline_run = experiment.submit(pipeline)

Created step automl_module [b9e70506][094db952-69ad-47e5-b327-d1ae0c1840a7], (This step will run and generate new outputs)
Submitted PipelineRun ec47336c-7dcf-4e32-98f2-99fd50ff7f05
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/California-housing-price-projection/runs/ec47336c-7dcf-4e32-98f2-99fd50ff7f05?wsid=/subscriptions/1b944a9b-fdae-4f97-aeb1-b7eea0beac53/resourcegroups/aml-quickstarts-136785/workspaces/quick-starts-ws-136785


## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [10]:
from azureml.widgets import RunDetails

# Show the live monitoring widget for the submitted pipeline run.
run_details_widget = RunDetails(pipeline_run)
run_details_widget.show()

# Block until the pipeline run ends; as the last expression, the returned
# status is rendered as the cell output.
pipeline_run.wait_for_completion()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: ec47336c-7dcf-4e32-98f2-99fd50ff7f05
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/California-housing-price-projection/runs/ec47336c-7dcf-4e32-98f2-99fd50ff7f05?wsid=/subscriptions/1b944a9b-fdae-4f97-aeb1-b7eea0beac53/resourcegroups/aml-quickstarts-136785/workspaces/quick-starts-ws-136785
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 1b2b697f-cadf-4bf7-8e9f-de4eb81bba06
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/California-housing-price-projection/runs/1b2b697f-cadf-4bf7-8e9f-de4eb81bba06?wsid=/subscriptions/1b944a9b-fdae-4f97-aeb1-b7eea0beac53/resourcegroups/aml-quickstarts-136785/workspaces/quick-starts-ws-136785
StepRun( automl_module ) Status: NotStarted
StepRun( automl_module ) Status: Running

StepRun(automl_module) Execution Summary
StepRun( automl_module ) Status: Finished



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'ec47336c-7dcf-4e32-98f2-99fd50ff7f05', 

'Finished'

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [11]:
# Look up the pipeline output that holds the metrics of all child runs (by the
# output name declared earlier) and download it into the current directory.
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

Downloading azureml/1b2b697f-cadf-4bf7-8e9f-de4eb81bba06/metrics_data
Downloaded azureml/1b2b697f-cadf-4bf7-8e9f-de4eb81bba06/metrics_data, 1 files out of an estimated total of 1


In [12]:
import json

# Read the downloaded metrics file as text.
# NOTE(review): `_path_on_datastore` is a private attribute of the output
# reference; this relies on it matching the local download location.
with open(metrics_output._path_on_datastore) as metrics_file:
    metrics_output_result = metrics_file.read()

# Parse the JSON payload and tabulate it: one column per child run, one row
# per metric.
deserialized_metrics_output = json.loads(metrics_output_result)
df_metrics = pd.DataFrame(data=deserialized_metrics_output)
df_metrics

Unnamed: 0,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_5,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_10,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_9,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_26,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_24,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_27,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_25,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_1,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_19,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_33,...,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_38,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_21,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_20,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_23,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_29,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_2,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_32,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_11,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_17,1b2b697f-cadf-4bf7-8e9f-de4eb81bba06_15
spearman_correlation,[0.8114307785141688],[0.8861186841138643],[0.8122774499910725],[0.7661459729251996],[0.8723283337696267],[0.786575546002285],[0.9062704869297559],[0.875283217338246],[0.8321833015976092],[0.9022547029840364],...,[0.9145893169429459],[0.8025097078978682],[0.8288584218425185],[0.9070610960935945],[0.8000327688939929],[0.8299326159668942],[0.8023495650592405],[0.8674872380319342],[0.7869871600771507],[0.847045725846481]
normalized_root_mean_squared_error,[0.1491826274700455],[0.12006148359100904],[0.14779832564567758],[0.15877378750513169],[0.1201919677891714],[0.14815327795426203],[0.11329556475413084],[0.11883562937823601],[0.1299801834129745],[0.10794496008217953],...,[0.0976958934709766],[0.15015103687220613],[0.15537506039481722],[0.10142035783486895],[0.15090103197674543],[0.133924844042316],[0.1510390654793365],[0.12909649927564615],[0.15585744042530622],[0.134996172362517]
normalized_root_mean_squared_log_error,[NaN],[0.07884307713870943],[NaN],[0.10337648936184347],[0.0800209935114424],[0.09945274581751443],[0.07007695853455194],[0.0792614346604747],[0.09024818820267289],[0.0712401411263438],...,[0.06599551347350652],[NaN],[0.10526592527223773],[0.06958510192148994],[0.10509729646984065],[0.09185701573526359],[NaN],[0.08478368178162322],[0.10003031296508891],[0.08679398070494634]
mean_absolute_error,[52161.87334915281],[40730.83474539882],[51641.45938374579],[55119.82278418726],[39739.46619503997],[52428.15820756892],[36033.0990264656],[39196.122573734254],[45503.066447886384],[35158.91182127567],...,[31072.60719799691],[52435.98374221255],[56144.618868774174],[33331.870365540104],[52078.94277205459],[46881.57469472959],[51976.82376630766],[44405.59732195013],[54426.2321855119],[45310.72201564285]
root_mean_squared_log_error,[NaN],[0.2764681301614278],[NaN],[0.36249605867411766],[0.2805985668322732],[0.34873720906674954],[0.24572919267677143],[0.27793512670670373],[0.31646085805293334],[0.2498079644327654],...,[0.2314173529397374],[NaN],[0.3691214826449191],[0.24400446701846729],[0.36853017531164417],[0.32210231137806233],[NaN],[0.29729922804926395],[0.3507624840195734],[0.3043484656559961]
median_absolute_error,[39813.05827668565],[28949.48659545777],[38581.541260464],[38637.42578125],[27205.46875],[37974.754886917595],[22012.3515625],[26423.3828125],[32740.671875],[23738.823796536817],...,[19497.09037145035],[38080.0324012473],[43239.68017322791],[22429.069198810103],[37462.18647113597],[34583.22265625],[37781.26715859905],[32087.50216033429],[39967.70986073365],[31834.710970690096]
mean_absolute_percentage_error,[29.790225364231144],[22.815411761515225],[29.179542054023404],[29.92953598579733],[22.312130408101375],[30.102628823492715],[18.204275495553734],[21.951776813172085],[26.270742406203603],[19.66086758797743],...,[17.25734077413065],[29.401632919857654],[33.610098159118785],[18.847008793612574],[28.55465529073423],[26.560440961585655],[28.189693865919182],[24.978384638650954],[29.948206217590315],[24.97978975072058]
normalized_mean_absolute_error,[0.1075498108237756],[0.08398075625543568],[0.10647679676320054],[0.11364865048842532],[0.08193670581779038],[0.10809884950488642],[0.0742947431690294],[0.08081641431114563],[0.0938203686745341],[0.07249230275602095],...,[0.06406696714239717],[0.10811498456132665],[0.11576162339283998],[0.06872522250535071],[0.10737882064827482],[0.0966626419988569],[0.10716826686551326],[0.09155755506564948],[0.11221857267704442],[0.09342378385170133]
r2_score,[0.6074500478874915],[0.7457471462475641],[0.6147013802769821],[0.5553523158365333],[0.74519419609607],[0.6128484924520496],[0.7735959235080877],[0.7509125961073937],[0.702002434648006],[0.7944757059622451],...,[0.8316508053710795],[0.6023370809492482],[0.5741849119274066],[0.8185701665083969],[0.5983545563259673],[0.683640614287985],[0.5976194270404502],[0.7060405973368314],[0.5715368222522277],[0.6785589525296538]
explained_variance,[0.6074652812787776],[0.74577737879601],[0.6147217236052167],[0.5706391812937526],[0.7454132247360674],[0.6128690891098267],[0.7881122352775481],[0.7510976159241117],[0.70224632416489],[0.794681770060073],...,[0.8317536475472811],[0.6023661082431537],[0.5741886110495287],[0.8188550470335458],[0.5990784344280575],[0.6837499846054192],[0.5999292055299998],[0.7060579657830467],[0.5715906560619539],[0.6785877524884434]


In [13]:
# Look up the best-model pipeline output (by the output name declared earlier)
# and download it into the current directory.
best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download('.', show_progress=True)

Downloading azureml/1b2b697f-cadf-4bf7-8e9f-de4eb81bba06/model_data
Downloaded azureml/1b2b697f-cadf-4bf7-8e9f-de4eb81bba06/model_data, 1 files out of an estimated total of 1


In [14]:
# Print the output reference itself (not the model) to inspect what was returned.
print(best_model_output)

$AZUREML_DATAREFERENCE_best_model_output


In [15]:
import pickle

# Deserialize the best-model file downloaded from this notebook's pipeline run.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files produced by runs you trust.
with open(best_model_output._path_on_datastore, "rb" ) as f:
    best_model = pickle.load(f)
# Bare last expression so the notebook renders the model's repr.
best_model

RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

In [16]:
# List the (name, estimator) steps that make up the best model's pipeline.
best_model.steps

[('datatransformer',
  DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                  feature_sweeping_config=None, feature_sweeping_timeout=None,
                  featurization_config=None, force_text_dnn=None,
                  is_cross_validation=None, is_onnx_compatible=None, logger=None,
                  observer=None, task=None, working_dir=None)),
 ('stackensembleregressor',
  StackEnsembleRegressor(base_learners=[('4',
                                         Pipeline(memory=None,
                                                  steps=[('standardscalerwrapper',
                                                          <azureml.automl.runtime.shared.model_wrappers.StandardScalerWrapper object at 0x7efd60305b38>),
                                                         ('xgboostregressor',
                                                          XGBoostRegressor(base_score=0.5,
                                                                           boost

In [17]:
# Display the full repr of the loaded best model.
best_model

RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

In [18]:
# Show the concrete class of the loaded model object.
type(best_model)

azureml.automl.runtime.shared.model_wrappers.RegressionPipeline

In [29]:
# Persist the best model locally with joblib.
import joblib

output_dir = './outputs_automl'
ml_path = r'./outputs_automl/modelbest.joblib'

# Make sure the target folder exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

# joblib.dump returns the list of written file names, which is rendered as the
# cell output.
joblib.dump(value=best_model, filename=ml_path)

['./outputs_automl/modelbest.joblib']

## Model Deployment

Remember you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [36]:
# Publish the pipeline behind the completed run so it can be triggered through
# a REST endpoint.
published_pipeline = pipeline_run.publish_pipeline(
    name="California-Housing-Price",
    description="California Housing Price Prediction",
    version="1.0")

# Render the published pipeline's summary as the cell output.
published_pipeline

Name,Id,Status,Endpoint
California-Housing-Price,30d67724-3dd1-4002-a213-161ddd5f8860,Active,REST Endpoint


In [37]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Authenticate interactively and build the HTTP authentication header that is
# sent with the REST request to the published pipeline endpoint.
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

TODO: In the cell below, send a request to the web service you deployed to test it.

In [38]:
import requests

# REST endpoint of the published pipeline.
rest_endpoint = published_pipeline.endpoint

# The JSON body names the experiment under which the triggered run is recorded.
request_payload = {"ExperimentName": "pipeline-rest-endpoint"}
response = requests.post(rest_endpoint, headers=auth_header, json=request_payload)

TODO: In the cell below, print the logs of the web service and delete the service

In [39]:
# Fail loudly, with the full response context, when the endpoint rejected the
# request. raise_for_status() signals HTTP errors with HTTPError, so only that
# exception is caught and it is chained explicitly to keep the original cause.
try:
    response.raise_for_status()
except requests.exceptions.HTTPError as http_error:
    raise Exception("Received bad response from the endpoint: {}\n"
                    "Response Code: {}\n"
                    "Headers: {}\n"
                    "Content: {}".format(rest_endpoint, response.status_code, response.headers, response.content)) from http_error

# The id of the triggered pipeline run is read from the JSON body of the response.
run_id = response.json().get('Id')
if run_id is None:
    # Without an id the run cannot be looked up afterwards, so stop here
    # instead of carrying a None forward.
    raise Exception("Endpoint response did not contain a run 'Id': {}".format(response.content))
print('Submitted pipeline run: ', run_id)

Submitted pipeline run:  b5d1c561-5448-4d71-b0e0-1f491a7b2ef9


In [41]:
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

# The REST call recorded its run under this experiment; look it up, rebuild
# the run object from the returned id, and show the monitoring widget for it.
rest_experiment = ws.experiments["pipeline-rest-endpoint"]
published_pipeline_run = PipelineRun(rest_experiment, run_id)

published_run_widget = RunDetails(published_pipeline_run)
published_run_widget.show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …