# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [39]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.pipeline.steps import AutoMLStep
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory


# Report the installed Azure ML core SDK version so runs can be reproduced.
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.20.0


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.
This dataset contains California housing price data downloaded from https://raw.githubusercontent.com/ageron/handson-ml/master/ and was used as an example in *Hands-On Machine Learning with Scikit-Learn & TensorFlow* by Aurélien Géron.
It is an adapted version of the original data from the StatLib repository, which was collected from the 1990 California census. The task is regression: predicting `median_house_value` from the other columns.

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [40]:
# Connect to the workspace described by the local Azure ML config file
# and echo its identifying details, one per line.
ws = Workspace.from_config()
print("\n".join(map(str, (ws.name, ws.resource_group, ws.location, ws.subscription_id))))

# Experiment under which every run of this notebook is grouped.
experiment_name = 'California-housing-price-projection'
experiment = Experiment(workspace=ws, name=experiment_name)

quick-starts-ws-136753
aml-quickstarts-136753
southcentralus
510b94ba-e453-4417-988b-fbdc37b55ca7


## AutoML Configuration

TODO: Explain why you chose the AutoML settings and configuration you used below.

In [41]:
# Attach to the project's AmlCompute cluster, provisioning it on first use.
amlcompute_cluster_name = "compute-housing"

try:
    # Raises ComputeTargetException when no cluster with this name exists.
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        vm_priority='lowpriority',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=amlcompute_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

# Block (up to 10 minutes) until provisioning has finished; no nodes need to be up yet.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=0,
    timeout_in_minutes=10,
)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [42]:
# Reuse the registered dataset when it already exists in the workspace;
# otherwise download the housing data and register it.
import train  # project-local module providing fetch_housing_data

key = "California-housing-price"
description_text = "California housing price from 1990 census"

found = key in ws.datasets
if found:
    dataset = ws.datasets[key]
else:
    # First run: download the raw data and register it in the workspace.
    DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
    HOUSING_PATH = "datasets/housing"
    HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
    df = train.fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    datastore = ws.get_default_datastore()
    # register_pandas_dataframe uploads the frame AND registers it under `name`,
    # so the description is passed here. The previous follow-up call to
    # dataset.register(..., update=True) was redundant and raised TypeError,
    # because TabularDataset.register has no `update` parameter.
    dataset = TabularDatasetFactory.register_pandas_dataframe(
        dataframe=df,
        target=datastore,
        name=key,
        description=description_text,
    )

# Always work from the registered dataset so both paths yield the same frame.
df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [43]:
# AutoML run limits and optimisation objective.
automl_settings = {
    # Hard stop for the whole experiment.
    "experiment_timeout_minutes": 20,
    # One iteration runs per cluster node. This was 5, which exceeds the
    # max_nodes=4 the "compute-housing" cluster is provisioned with in this
    # notebook, so the fifth iteration could only queue.
    "max_concurrent_iterations": 4,
    # Regression objective: coefficient of determination.
    "primary_metric" : 'r2_score'
}
# Folder used by AutoML for the run's project files.
project_folder = "./housing"

# Regression on the registered housing dataset, predicting median_house_value.
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "regression",
                             training_data=dataset,
                             label_column_name="median_house_value",
                             path = project_folder,
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             **automl_settings)

In [44]:
# Declare the two outputs the AutoML pipeline step will publish.
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

# Metrics output of the AutoML training.
metrics_data = PipelineData(
    name='metrics_data',
    datastore=ds,
    pipeline_output_name=metrics_output_name,
    training_output=TrainingOutput(type='Metrics'),
)

# Model output of the AutoML training.
model_data = PipelineData(
    name='model_data',
    datastore=ds,
    pipeline_output_name=best_model_output_name,
    training_output=TrainingOutput(type='Model'),
)

In [45]:
# Wrap the AutoML configuration in a pipeline step that emits the metrics and
# model outputs; allow_reuse lets a later submission reuse a previous run's output.
automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    allow_reuse=True,
    outputs=[metrics_data, model_data],
)

In [46]:
from azureml.pipeline.core import Pipeline

# Single-step pipeline that runs only the AutoML step.
pipeline = Pipeline(
    workspace=ws,
    steps=[automl_step],
    description="pipeline_with_automlstep",
)

In [47]:
# Submit the pipeline under the experiment; the returned run is monitored below.
pipeline_run = experiment.submit(config=pipeline)

Created step automl_module [9a3ef573][af640437-dcae-4802-9a5c-0048cd7dc46e], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 7d6f47b2-cede-401e-8c4b-25b20df4b0ec
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/California-housing-price-projection/runs/7d6f47b2-cede-401e-8c4b-25b20df4b0ec?wsid=/subscriptions/510b94ba-e453-4417-988b-fbdc37b55ca7/resourcegroups/aml-quickstarts-136753/workspaces/quick-starts-ws-136753


## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [48]:
from azureml.widgets import RunDetails

# Show the monitoring widget for the pipeline run, then block until it finishes.
run_details_widget = RunDetails(pipeline_run)
run_details_widget.show()
pipeline_run.wait_for_completion()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 7d6f47b2-cede-401e-8c4b-25b20df4b0ec
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/California-housing-price-projection/runs/7d6f47b2-cede-401e-8c4b-25b20df4b0ec?wsid=/subscriptions/510b94ba-e453-4417-988b-fbdc37b55ca7/resourcegroups/aml-quickstarts-136753/workspaces/quick-starts-ws-136753
PipelineRun Status: Running


StepRunId: a6c2e206-c098-43fd-a0b2-dffa8acacbe6
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/California-housing-price-projection/runs/a6c2e206-c098-43fd-a0b2-dffa8acacbe6?wsid=/subscriptions/510b94ba-e453-4417-988b-fbdc37b55ca7/resourcegroups/aml-quickstarts-136753/workspaces/quick-starts-ws-136753
StepRun( automl_module ) Status: NotStarted

StepRun(automl_module) Execution Summary
StepRun( automl_module ) Status: Finished



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '7d6f47b2-cede-401e-8c4b-25b20df4b0ec', 'status': 'Completed', 'startTimeUtc': '2021-01-31T19:20:28.065353Z', 'e

'Finished'

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [38]:
# Fetch the metrics output of the pipeline run (metrics of all the child runs)
# and download it into the current working directory.
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download(local_path='.', show_progress=True)

Downloading azureml/76de7827-9fdb-421b-bdcc-cf3e9bc83995/metrics_data
Downloaded azureml/76de7827-9fdb-421b-bdcc-cf3e9bc83995/metrics_data, 1 files out of an estimated total of 1


In [None]:
# Read the downloaded metrics file, parse it as JSON and tabulate it for inspection.
import json

with open(metrics_output._path_on_datastore) as metrics_file:
    metrics_output_result = metrics_file.read()

deserialized_metrics_output = json.loads(metrics_output_result)
df_metrics = pd.DataFrame(data=deserialized_metrics_output)
df_metrics

In [49]:
# Fetch the best-model output of the pipeline run and download it into the
# current working directory.
best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download(local_path='.', show_progress=True)

Downloading azureml/76de7827-9fdb-421b-bdcc-cf3e9bc83995/model_data
Downloaded azureml/76de7827-9fdb-421b-bdcc-cf3e9bc83995/model_data, 1 files out of an estimated total of 1


In [64]:
# Print the string form of the output reference returned by get_pipeline_output.
print(best_model_output)

$AZUREML_DATAREFERENCE_best_model_output


In [50]:
import pickle

# Deserialize the downloaded best model.
# NOTE(review): pickle.load executes arbitrary code from the file; this is only
# acceptable because the file is produced by this notebook's own pipeline run.
with open(best_model_output._path_on_datastore, mode="rb") as model_file:
    best_model = pickle.load(model_file)
best_model

RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

In [51]:
# List the (name, estimator) steps that make up the fitted pipeline.
best_model.steps

[('datatransformer',
  DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                  feature_sweeping_config=None, feature_sweeping_timeout=None,
                  featurization_config=None, force_text_dnn=None,
                  is_cross_validation=None, is_onnx_compatible=None, logger=None,
                  observer=None, task=None, working_dir=None)),
 ('prefittedsoftvotingregressor',
  PreFittedSoftVotingRegressor(estimators=[('27',
                                            Pipeline(memory=None,
                                                     steps=[('maxabsscaler',
                                                             MaxAbsScaler(copy=True)),
                                                            ('lightgbmregressor',
                                                             LightGBMRegressor(boosting_type='gbdt',
                                                                               class_weight=None,
                            

In [60]:
# Display the loaded model again (same object as shown right after unpickling).
best_model

RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

In [62]:
# Show the concrete class of the unpickled model object.
type(best_model)

azureml.automl.runtime.shared.model_wrappers.RegressionPipeline

In [58]:
# Persist the best model to ./outputs_automl so it can be reloaded without
# re-running the pipeline.
import joblib

output_dir = './outputs_automl'
os.makedirs(output_dir, exist_ok=True)
ml_path = f'{output_dir}/modelbest.joblib'
joblib.dump(best_model, ml_path)

['./outputs_automl/modelbest.joblib']

## Model Deployment

Remember that you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

TODO: In the cell below, send a request to the web service you deployed to test it.

TODO: In the cell below, print the logs of the web service and delete the service