# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources
import joblib

from azureml.widgets import RunDetails
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.pipeline.steps import AutoMLStep
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.datastore import Datastore


# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


In [2]:
#assign work space
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

# choose a name for experiment
experiment_name = 'California-housing-price-projection'
experiment=Experiment(ws, experiment_name)

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code F8RUBE6W2 to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
quick-starts-ws-137724
aml-quickstarts-137724
southcentralus
d7f39349-a66b-446e-aba6-0053c2cf1c11


In [3]:
#create a compute target
amlcompute_cluster_name = "compute-housing"

# Verify that cluster does not exist already, if not create a new one with amlcompute_cluster_name
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           vm_priority = 'lowpriority', 
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 0, timeout_in_minutes = 10)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.

This dataset contains California Housing Price data downloaded from https://raw.githubusercontent.com/ageron/handson-ml/master/ and was used as an example in Hands-On Machine Learning with Scikit-Learn & TensorFlow by Aurelien Geron. 

This dataset is an adapted version from the original data from the StatLib repository collected from the 1990 California census. 

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [4]:
#check if the dataset exists and register the dataset if not
import train
found = False
key = "California_housing_price-1"
description_text = "California housing price from 1990 census"

if key in ws.datasets.keys():
    found = True
    dataset = ws.datasets[key]
    print("found dataset") 

if not found:       
        #If the dataset doesn't exist, then download data and register the Dataset in Workspace
#        DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
#        HOUSING_PATH = "datasets/housing"
#        HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
#        df = train.fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    path = r"https://github.com/second-husky/Capstone-Azure-ML/raw/master/starter_file/housing.csv"
    datastore = Datastore(ws, name = None)
    df = pd.read_csv(path).dropna()
#    print(df.head(5))
    dataset = TabularDatasetFactory.register_pandas_dataframe(df, datastore, 'housing_data')


df = dataset.to_pandas_dataframe()
df.describe()

datasets/housing/housing.csv
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
Validating arguments.
Arguments validated.
Suc

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [5]:
# TODO: Put your automl settings here
automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 5,
    "iterations" : 20,
    "primary_metric" : 'r2_score'
}
project_folder = "./housing"
# TODO: Put your automl config here
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "regression",
                             training_data=dataset,
                             label_column_name="median_house_value",   
                             path = project_folder,
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             **automl_settings)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

Trained models include StackEnsemble, VotingEnsemble, XGBoostRegressor, LightGBM, RandomForest, etc. 
StackEnsemble and VotingEnsemble are the highest proforming models. 
And most models with high performance used XGBoostRegressor

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [6]:
exp_automl = Experiment(ws, 'AutoML-housing')
automl_run = exp_automl.submit(automl_config, show_output = False)
RunDetails(automl_run).show()
automl_run.wait_for_completion()

Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_2b27b78d-cfc2-4f6d-8b9b-8574b8bfd6a7',
 'target': 'compute-housing',
 'status': 'Completed',
 'startTimeUtc': '2021-02-07T03:59:48.615989Z',
 'endTimeUtc': '2021-02-07T04:21:32.397144Z',
 'properties': {'num_iterations': '20',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'r2_score',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'compute-housing',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"28a7bfcf-43c5-4346-9e8c-1f2dad47ceeb\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"managed-dataset/13be8556-e873-4849-8ef8-30b87c0aa392/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-137724\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"d7f39349-a66b-4

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [7]:
#save the best model using joblib
best_automl_run, best_automl_model = automl_run.get_output()
os.makedirs('./outputs_automl', exist_ok = True)
ml_path = r'./outputs_automl/model_best.joblib'
joblib.dump(value = best_automl_model, filename = ml_path)

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


['./outputs_automl/model_best.joblib']

In [8]:
best_automl_model

RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

In [9]:
#register the best model using model name
model_name = best_automl_run.properties['model_name']
automl_model = automl_run.register_model(model_name = model_name)

In [10]:
best_automl_run.get_metrics()

{'explained_variance': 0.8456366105088856,
 'spearman_correlation': 0.9276782962094277,
 'root_mean_squared_log_error': 0.22132043925448125,
 'r2_score': 0.8456265202164054,
 'normalized_root_mean_squared_log_error': 0.06311607943499828,
 'mean_absolute_error': 30367.897935756602,
 'median_absolute_error': 19605.844965430166,
 'root_mean_squared_error': 46132.68588960423,
 'normalized_mean_absolute_error': 0.06261396434603693,
 'normalized_median_absolute_error': 0.040424255911171846,
 'mean_absolute_percentage_error': 16.563774128931886,
 'normalized_root_mean_squared_error': 0.09511854773713146,
 'residuals': 'aml://artifactId/ExperimentRun/dcid.AutoML_2b27b78d-cfc2-4f6d-8b9b-8574b8bfd6a7_19/residuals',
 'predicted_true': 'aml://artifactId/ExperimentRun/dcid.AutoML_2b27b78d-cfc2-4f6d-8b9b-8574b8bfd6a7_19/predicted_true'}

In [20]:
#display the properties of the best model
best_automl_run.get_properties()

{'runTemplate': 'automl_child',
 'pipeline_id': '__AutoML_Stack_Ensemble__',
 'pipeline_spec': '{"pipeline_id":"__AutoML_Stack_Ensemble__","objects":[{"module":"azureml.train.automl.stack_ensemble","class_name":"StackEnsemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'regression\',\'primary_metric\':\'r2_score\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'AutoML-housing\',\'compute_target\':\'compute-housing\',\'subscription_id\':\'d7f39349-a66b-446e-aba6-0053c2cf1c11\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_2b27b78d-cfc2-4f6d-8b9b-8574b8bfd6a7_19","experiment_name":"AutoML-housing","workspace_name":"quick-starts-ws-137724","subscription_id":"d7f39349-a66b-446e-aba6-0053c2cf1c11","resource_group_name":"aml-quickstarts-137724"}}]}',
 'training_percent': '100',
 'predicted_cost': None,
 'iteration': '19',
 '_aml_system_scenario_identification': 'Remote.Child',
 '_

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [11]:
#check the files associated with best run
best_automl_run.get_file_names()

['automl_driver.py',
 'azureml-logs/55_azureml-execution-tvmps_22113568c726b683c8fb7763203f4788263cd985e17bc112724d1343b42777dc_p.txt',
 'azureml-logs/65_job_prep-tvmps_22113568c726b683c8fb7763203f4788263cd985e17bc112724d1343b42777dc_p.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_22113568c726b683c8fb7763203f4788263cd985e17bc112724d1343b42777dc_p.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/103_azureml.log',
 'logs/azureml/azureml_automl.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/conda_env_v_1_0_0.yml',
 'outputs/env_dependencies.json',
 'outputs/model.pkl',
 'outputs/pipeline_graph.json',
 'outputs/scoring_file_v_1_0_0.py',
 'predicted_true',
 'residuals']

In [12]:
#download environment and scoring script associated with the best run
best_automl_run.download_file('outputs/conda_env_v_1_0_0.yml')
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py')

In [13]:
#create an inference configuration
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig

env = Environment.from_conda_specification("myenv", 'conda_env_v_1_0_0.yml')
#env = Environment.get(workspace = ws, name = "AzureML-AutoML")

#for pip_package in ["pandas","scikit-learn","joblib"]:
#    env.python.conda_dependencies.add_pip_package(pip_package)

inference_config = InferenceConfig(entry_script='scoring_file_v_1_0_0.py',
                                    environment=env)

In [19]:
from azureml.core.webservice import AciWebservice, AksWebservice, LocalWebservice
from azureml.core.model import InferenceConfig, Model

#deployment_config = LocalWebservice.deploy_configuration(port = 9000)
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1, enable_app_insights = True)
first_model = Model(ws, name = model_name)
service = Model.deploy(ws, "automl-service", [first_model], inference_config, deployment_config, overwrite = True)
service.wait_for_deployment(show_output = True)
print(service.state)
print(service.get_logs())

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running........................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy
2021-02-07T04:51:01,414135400+00:00 - gunicorn/run 
2021-02-07T04:51:01,425539000+00:00 - iot-server/run 
2021-02-07T04:51:01,450414800+00:00 - rsyslog/run 
2021-02-07T04:51:01,491695600+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/n

TODO: In the cell below, send a request to the web service you deployed to test it.

In [15]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

In [16]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    # bypass the server certificate verification on client side
    if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

data = {
    "data":
    [
        {
            'longitude': "-122.23",
            'latitude': "37.88",
            'housing_median_age': "41",
            'total_rooms': "880",
            'total_bedrooms': "129",
            'population': "322",
            'households': "126",
            'median_income': "8.3252",
            'ocean_proximity': "NEAR BAY",
        },
    ],
}

body = str.encode(json.dumps(data))

url = service.scoring_uri
api_key = '' 
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}

req = urllib.request.Request(url, body, headers)

try:
    response = urllib.request.urlopen(req)

    result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(json.loads(error.read().decode("utf8", 'ignore')))

b'"{\\"result\\": [461201.17410603084]}"'


TODO: In the cell below, print the logs of the web service and delete the service

In [17]:
service.get_logs()


'2021-02-07T04:36:11,320189500+00:00 - iot-server/run \n2021-02-07T04:36:11,338040300+00:00 - rsyslog/run \n2021-02-07T04:36:11,356115100+00:00 - gunicorn/run \n2021-02-07T04:36:11,372335900+00:00 - nginx/run \n/usr/sbin/nginx: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\nrsyslogd: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libuuid.so.1: no version information available (required by rsyslogd)\n/usr/sbin/nginx: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_7785023fceb74e4facc1b1a577b1faf9/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx

In [18]:
service.delete()