# AutoML Regression


<img src='https://github.com/retkowsky/images/blob/master/AzureMLservicebanniere.png?raw=true'>

In [1]:
# Record the Python interpreter version in the notebook output.
import sys
print("Version Python : ", sys.version)

Version Python :  3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) 
[GCC 7.3.0]


In [2]:
# Timestamp the run: print the current date and time as returned by datetime.now().
import datetime
now = datetime.datetime.now()
print(now)

2020-02-14 08:50:54.573149


In [3]:
# Print the installed Azure ML SDK version (azureml.core.VERSION).
import azureml.core
print("Version Azure ML service :", azureml.core.VERSION)

Version Azure ML service : 1.0.83


In [4]:
import logging

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
import azureml.dataprep as dprep
from azureml.automl.core.featurization import FeaturizationConfig
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

## 1. Création de l'expérimentation

In [5]:
# Load the workspace through Workspace.from_config().
ws = Workspace.from_config()

experiment_name = 'workshop3-automregression'
project_folder = './sample_projects/workshop3'

# Experiment object under which the AutoML run is submitted later in the notebook.
experiment = Experiment(ws, experiment_name)

# Session settings gathered in a dict and displayed below as a one-column table.
output = {}
output['SDK version'] = azureml.core.VERSION
output['Workspace Name'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
output['Experiment Name'] = experiment.name

# Do not truncate cell contents. `None` is the supported "no limit" value since
# pandas 1.0, where the historical -1 is deprecated; older pandas releases only
# accept an integer for this option and raise ValueError, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.DataFrame(data = output, index = ['']).T

Unnamed: 0,Unnamed: 1
SDK version,1.0.83
Workspace Name,workshop-aml-2020
Resource Group,workshopaml2020RG
Location,westeurope
Project Directory,./sample_projects/workshop3
Experiment Name,workshop3-automregression


### Création d'un Training Cluster

In [6]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget

# Name of the AmlCompute cluster to reuse, or to create if it is missing.
amlcompute_cluster_name = "automl2"

# A target counts as found only if it exists in the workspace under that name
# AND its type is 'AmlCompute'.
cts = ws.compute_targets
found = amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute'

if found:
    print('Found existing compute target.')
    compute_target = cts[amlcompute_cluster_name]
else:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        max_nodes=4)

    # Create the cluster.
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)

print('Checking cluster status...')
# Can poll for a minimum number of nodes and for a specific timeout.
# With min_node_count=None, the cluster's own scale settings are used.
compute_target.wait_for_completion(show_output=True,
                                   min_node_count=None,
                                   timeout_in_minutes=20)

# For a more detailed view of current AmlCompute status, use get_status().

Creating a new compute target...
Checking cluster status...
Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


## 2. Chargement des données

In [7]:
# URL of the delimited (CSV) file holding the machine data.
data = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv'

# Build a tabular dataset from that file.
dataset = Dataset.Tabular.from_delimited_files(data)

# Random split into a train part (percentage=0.8) and a test part, with a fixed seed.
train_data, test_data = dataset.random_split(seed=223, percentage=0.8)

# Register the train split, then the test split, in the workspace
# (a new version is created on each registration).
for split, registered_name, summary in (
        (train_data, 'machineData_train_dataset', 'hardware performance training data'),
        (test_data, 'machineData_test_dataset', 'hardware performance test data')):
    split.register(workspace=ws,
                   name=registered_name,
                   description=summary,
                   create_new_version=True)

# Column passed as the label to AutoML.
label = "ERP"

In [8]:
# Preview: materialize the train split as a pandas DataFrame and show its first 10 rows.
train_data.to_pandas_dataframe().head(10)

Unnamed: 0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
4,amdahl,470v/b,26,8000,32000,64,8,32,318,290
5,amdahl,580-5840,23,16000,32000,64,16,32,367,381
6,amdahl,580-5850,23,16000,32000,64,16,32,489,381
7,amdahl,580-5860,23,16000,64000,64,16,32,636,749
8,amdahl,580-5880,23,32000,64000,128,32,64,1144,1238
9,apollo,dn420,400,512,3500,4,1,6,40,24


> Accès direct au dataset référencé (ici Train par exemple) :

In [9]:
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset

# Reuse the workspace handle `ws` loaded earlier in this notebook instead of
# hard-coding a subscription ID / resource group / workspace name: the dataset
# was registered in `ws`, so it must be looked up there, and no account
# identifier ends up in the notebook.
workspace = ws

# Fetch the registered training dataset by name and load it as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='machineData_train_dataset')
df=dataset.to_pandas_dataframe()

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
df.describe()

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,199.9,3100.9,12499.41,27.2,5.1,19.31,115.43,109.54
std,255.0,4234.49,12550.22,43.52,7.43,27.69,177.22,170.89
min,17.0,64.0,64.0,0.0,0.0,0.0,6.0,15.0
25%,50.0,768.0,4000.0,0.0,1.0,5.0,26.0,28.0
50%,110.0,2000.0,8000.0,8.0,3.0,8.0,50.0,45.5
75%,225.0,4000.0,16000.0,32.0,6.0,24.0,117.0,113.0
max,1500.0,32000.0,64000.0,256.0,52.0,176.0,1150.0,1238.0


In [11]:
# Pairwise correlation of the numeric columns. The string columns (VendorName,
# ModelName) are excluded explicitly: older pandas dropped them silently, while
# pandas 2.x raises when corr() meets non-numeric data.
df.select_dtypes(include='number').corr()

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
MYCT,1.0,-0.35,-0.38,-0.32,-0.31,-0.27,-0.32,-0.3
MMIN,-0.35,1.0,0.76,0.53,0.51,0.27,0.8,0.82
MMAX,-0.38,0.76,1.0,0.53,0.57,0.55,0.87,0.91
CACH,-0.32,0.53,0.53,1.0,0.61,0.51,0.65,0.65
CHMIN,-0.31,0.51,0.57,0.61,1.0,0.57,0.62,0.61
CHMAX,-0.27,0.27,0.55,0.51,0.57,1.0,0.63,0.61
PRP,-0.32,0.8,0.87,0.65,0.62,0.63,1.0,0.97
ERP,-0.3,0.82,0.91,0.65,0.61,0.61,0.97,1.0


In [12]:
# (rows, columns) of the training frame.
df.shape

(164, 10)

## 3. Configuration AutoML


In [13]:
# Custom featurization settings, passed to AutoMLConfig further down.
featurization_config = FeaturizationConfig()
featurization_config.blocked_transformers = ['LabelEncoder']
# featurization_config.drop_columns = ['MMIN']

# Explicit column purposes for two columns.
for column_name, purpose in (('MYCT', 'Numeric'), ('VendorName', 'CategoricalHash')):
    featurization_config.add_column_purpose(column_name, purpose)

# Default strategy is mean (per the original note); set Imputer params for 3 columns.
for column_name, strategy in (('CACH', 'median'),
                              ('CHMIN', 'median'),
                              ('PRP', 'most_frequent')):
    featurization_config.add_transformer_params('Imputer', [column_name], {"strategy": strategy})
# featurization_config.add_transformer_params('HashOneHotEncoder', [], {"number_of_bits": 3})

In [14]:
# Run settings, expanded into AutoMLConfig as keyword arguments below.
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_minutes=15,
    max_concurrent_iterations=4,
    max_cores_per_iteration=-1,
    n_cross_validations=3,
    primary_metric='normalized_root_mean_squared_error',
    verbosity=logging.INFO,
)

# Regression task on the train split, run on the remote compute target with the
# custom featurization configuration; `label` names the target column.
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automl_errors.log',
    compute_target=compute_target,
    featurization=featurization_config,
    training_data=train_data,
    label_column_name=label,
    **automl_settings
)

## 4. AutoML

In [15]:
# Submit the AutoML configuration to the experiment; show_output=True streams the
# run's progress into the cell output.
remote_run = experiment.submit(automl_config, show_output = True)

Running on remote compute: automl2
Parent Run ID: AutoML_65319070-1e3b-4bd5-9950-0f8e41c45907

Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         3   StandardScalerWrapper ElasticNet               0:08:16       0.0364    0.0364
         2   StandardScalerWrapper Ela

In [16]:
# Display the run object (experiment, id, type, status and links).
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
workshop3-automregression,AutoML_65319070-1e3b-4bd5-9950-0f8e41c45907,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [17]:
# Show the run-details widget for the AutoML run.
from azureml.widgets import RunDetails
RunDetails(remote_run).show() 

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [18]:
# Collect, for every child run, its float-valued metrics keyed by iteration number.
children = list(remote_run.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    # Keep only metrics whose value is a float.
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

# One column per iteration, columns sorted by iteration number. The axis is passed
# by keyword: the positional form sort_index(1) is deprecated in pandas 1.x and is
# no longer accepted by pandas 2.x, where the arguments are keyword-only.
rundata = pd.DataFrame(metricslist).sort_index(axis=1)
rundata

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
explained_variance,0.91,0.91,0.92,0.92,0.88,0.66,0.86,0.88,0.83,0.59,...,0.57,0.95,0.81,0.73,0.92,0.91,0.91,0.79,0.95,0.91
mean_absolute_error,24.86,24.61,24.71,25.19,23.73,35.86,27.47,21.23,24.72,34.01,...,35.34,13.51,26.74,37.79,24.98,24.5,25.03,34.29,14.52,21.67
mean_absolute_percentage_error,35.03,34.36,34.4,36.34,19.79,29.82,18.89,17.97,16.54,18.79,...,20.34,9.02,18.05,42.79,35.88,34.33,36.06,22.4,10.36,21.61
median_absolute_error,14.1,14.19,13.23,14.86,6.8,8.71,6.33,6.35,5.56,5.66,...,7.03,3.15,5.5,12.28,15.27,13.81,15.37,7.1,3.99,8.19
normalized_mean_absolute_error,0.02,0.02,0.02,0.02,0.02,0.03,0.02,0.02,0.02,0.03,...,0.03,0.01,0.02,0.03,0.02,0.02,0.02,0.03,0.01,0.02
normalized_median_absolute_error,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.0,...,0.01,0.0,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.01
normalized_root_mean_squared_error,0.04,0.04,0.04,0.04,0.05,0.08,0.05,0.05,0.05,0.07,...,0.07,0.03,0.06,0.07,0.04,0.04,0.04,0.06,0.03,0.04
normalized_root_mean_squared_log_error,,,,,0.06,0.08,0.06,0.05,0.05,0.06,...,0.06,0.03,0.06,0.1,,,,0.07,0.03,0.08
r2_score,0.9,0.9,0.91,0.91,0.87,0.63,0.85,0.88,0.82,0.57,...,0.54,0.95,0.8,0.7,0.91,0.9,0.91,0.79,0.95,0.9
root_mean_squared_error,44.52,45.33,45.23,44.48,58.41,92.82,64.21,55.48,62.92,89.07,...,90.21,36.14,74.42,90.77,44.21,44.91,44.18,76.29,34.96,46.51


### Best Run

In [19]:
# get_output() yields a pair, unpacked here as the best run and its fitted model.
best_run, fitted_model = remote_run.get_output()

In [20]:
# Only one AutoML run (with the customized featurization) exists in this notebook,
# so the "customized" names refer to the same best run and fitted model obtained in
# the previous cell. Reuse those objects instead of calling remote_run.get_output()
# a second time.
best_run_customized, fitted_model_customized = best_run, fitted_model

### Transparence

In [21]:
# Take the 'datatransformer' step of the fitted model to inspect the featurization.
custom_featurizer = fitted_model_customized.named_steps['datatransformer']

In [22]:
# Per raw feature: detected type, whether it was dropped, number of engineered
# features and the transformations applied.
custom_featurizer.get_featurization_summary()

[{'RawFeatureName': 'VendorName',
  'TypeDetected': 'CategoricalHash',
  'Dropped': 'No',
  'EngineeredFeatureCount': 16,
  'Transformations': ['StringCast-HashOneHotEncoder']},
 {'RawFeatureName': 'ModelName',
  'TypeDetected': 'Hashes',
  'Dropped': 'Yes',
  'EngineeredFeatureCount': 0,
  'Transformations': ['']},
 {'RawFeatureName': 'MYCT',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Transformations': ['MeanImputer']},
 {'RawFeatureName': 'MMIN',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Transformations': ['MeanImputer']},
 {'RawFeatureName': 'MMAX',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Transformations': ['MeanImputer']},
 {'RawFeatureName': 'CACH',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureCount': 1,
  'Transformations': ['MeanImputer']},
 {'RawFeatureName': 'CHMIN',
  'TypeDetected': 'Numeric',
  'Dropped': 'No',
  'EngineeredFeatureC

In [23]:
# Same summary with is_user_friendly=False: the output additionally carries a
# 'TransformationParams' entry describing each transformer step.
custom_featurizer.get_featurization_summary(is_user_friendly=False)

[{'RawFeatureName': 'VendorName',
  'TypeDetected': 'CategoricalHash',
  'Dropped': 'No',
  'EngineeredFeatureCount': 16,
  'Transformations': ['StringCast-HashOneHotEncoder'],
  'TransformationParams': {'Transformer1': {'Input': ['VendorName'],
    'TransformationFunction': 'StringCast',
    'Operator': None,
    'FeatureType': 'CategoricalHash',
    'ShouldOutput': False,
    'TransformationParams': None},
   'Transformer2': {'Input': ['Transformer1'],
    'TransformationFunction': 'HashOneHotEncoder',
    'Operator': None,
    'FeatureType': None,
    'ShouldOutput': True,
    'TransformationParams': {'hashing_seed_val': 314489979, 'num_cols': 16}}}},
 {'RawFeatureName': 'ModelName',
  'TypeDetected': 'Hashes',
  'Dropped': 'Yes',
  'EngineeredFeatureCount': 0,
  'Transformations': [''],
  'TransformationParams': {'Transformer1': {'Input': ['ModelName'],
    'TransformationFunction': '',
    'Operator': None,
    'FeatureType': 'Hashes',
    'ShouldOutput': True,
    'Transformation

In [24]:
# Per column: the statistics gathered on the raw data and the resulting feature type.
custom_featurizer.get_stats_feature_type_summary()

[{'statistic': '{"num_unique_vals": 30, "total_number_vals": 164, "total_number_vals_including_nans": 164, "num_na": 0, "column_type": "string", "num_unique_lens": 10, "average_entry_length": 5.085365853658536, "average_number_spaces": 0.0, "cardinality_ratio": 0.18292682926829268, "is_datetime": false, "is_all_nan": false}',
  'feature type': 'CategoricalHash',
  'column name': 'VendorName'},
 {'statistic': '{"num_unique_vals": 164, "total_number_vals": 164, "total_number_vals_including_nans": 164, "num_na": 0, "column_type": "string", "num_unique_lens": 15, "average_entry_length": 7.420731707317073, "average_number_spaces": 0.0, "cardinality_ratio": 1.0, "is_datetime": false, "is_all_nan": false}',
  'feature type': 'Hashes',
  'column name': 'ModelName'},
 {'statistic': '{"num_unique_vals": 55, "total_number_vals": 164, "total_number_vals_including_nans": 164, "num_na": 0, "column_type": "integer", "num_unique_lens": 0, "average_entry_length": 0, "average_number_spaces": 0, "cardina

<img src="https://github.com/retkowsky/images/blob/master/Powered-by-MS-Azure-logo-v2.png?raw=true" height="300" width="300">