In [None]:
import logging

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import azureml.core
from azureml.core.datastore import Datastore
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.data.datapath import DataPath
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.utilities import get_primary_metrics
from azureml.interpret import ExplanationClient

In [None]:
!pip show cryptography

In [None]:
# Report which Azure ML SDK version this kernel has loaded.
print('The current version of the Azure ML SDK is {}.'.format(azureml.core.VERSION))

# Snippets

```python
# Load the workspace specified by your parameters.
# Configuration code to access the workspace from all notebooks using the Workspace.from_config() method
from azureml.core import Workspace

try:
    ws = Workspace(subscription_id, resource_group, workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded. Skip the workspace creation steps below")
except:
    print("Workspace not accessible. Change your parameters or create a new workspace below")
```

In [None]:
# Connect to the workspace described by the saved configuration.
ws = Workspace.from_config()

# Name under which the runs of this notebook are grouped.
experiment_name = 'automl-car-price-prediction'
experiment = Experiment(ws, experiment_name)

# Collect the connection details and display them as a one-column table.
output = {
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Experiment Name': experiment.name,
}
outputDf = pd.DataFrame(data=output, index=[''])
outputDf.T

# Load data

```python
# Get the default datastore
default_ds = ws.get_default_datastore()

# Enumerate all datastores, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)
```

In [None]:
# Resolve the workspace's default datastore by name.
default_ds_name = ws.get_default_datastore().name
curr_ds = Datastore.get(workspace=ws, datastore_name=default_ds_name)

```python
# Upload files from local machine to the blob container the current datastore points to
car_datasets_path = [
    'output/car_train.csv',
    'output/car_test.csv'
]

curr_ds.upload_files(car_datasets_path, target_path='./car_dataset', overwrite=False, show_progress=True)
```

In [None]:
# Column the model is trained to predict.
label_column_name = 'price'

# Blob-storage locations of the train and test CSV files.
train_data = 'https://cardealershipa0525196035.blob.core.windows.net/azureml-blobstore-52e628b0-c309-4152-8e63-77d08607560b/car_dataset/car_train.csv'
test_data = 'https://cardealershipa0525196035.blob.core.windows.net/azureml-blobstore-52e628b0-c309-4152-8e63-77d08607560b/car_dataset/car_test.csv'

# Wrap each CSV file as a tabular dataset (train first, then test).
train_dataset = Dataset.Tabular.from_delimited_files(path=train_data)
test_dataset = Dataset.Tabular.from_delimited_files(path=test_data)

# Train

## Choosing main metrics

1) Difference between RMSE and MAE
- RMSE squares each error before averaging, so large errors are weighted more heavily than small ones -- that is, being off by 10 counts as more than twice as bad as being off by 5. When large errors are disproportionately costly, RMSE is the more appropriate measure of error.
- If being off by ten is just twice as bad as being off by 5, then MAE is more appropriate.

2) R2 is the coefficient of determination: the fraction of the response variance that is captured by the model. It is usually between 0 and 1, but it can be negative when the model fits worse than always predicting the mean. **It directly measures the goodness of fit in capturing the variance in training data.**

- If R2 = 1, the model fits the data perfectly.
- If R2 = 0.7, the model explains 70% of the variance in the real data; the remaining 30% is unexplained.



In [None]:
# Settings for the AutoML regression experiment; unpacked into AutoMLConfig below.
automl_settings = {
    'training_data': train_dataset,
    'label_column_name': label_column_name,
    'task': 'regression',
    'featurization': 'auto',  # TODO: confirm whether 'auto' standardizes the numeric features
    'primary_metric': 'normalized_root_mean_squared_error',
    'validation_size': 0.08,
    # Names of the supported regression models:
    # https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.constants.supportedmodels.regression?view=azure-ml-py
    'blocked_models': ['FastLinearRegressor', 'KNearestNeighborsRegressor', 'DecisionTreeRegressor'],
    # Spelling matters: the key must match AutoMLConfig's `model_explainability` parameter.
    'model_explainability': True,
    'enable_dnn': False,
    'path': './',
    # 'compute_target': '', leave empty if run in local environment?
}

automl_config = AutoMLConfig(**automl_settings)

# Monitoring

In [None]:
# Submit the AutoML configuration to the experiment, with output shown in the cell.
local_run = experiment.submit(config=automl_config, show_output=True)

In [None]:
from azureml.widgets import RunDetails

# Build the monitoring widget for the submitted run, then render it.
run_details_widget = RunDetails(local_run)
run_details_widget.show()

# Test

## Retrieve the best model

In [None]:
# Fetch the best run together with its fitted model and print both, one per line.
best_run, fitted_model = local_run.get_output()
print(best_run, fitted_model, sep='\n')

## Retrieve best model based on other metric

```python
lookup_metric = "root_mean_squared_error"
best_run, fitted_model = remote_run.get_output(metric = lookup_metric)
print(best_run)
print(fitted_model)
```

In [None]:
def _split_features_and_label(dataset):
    """Return ``(features_df, label_df)`` pandas DataFrames for ``dataset``.

    The features frame is ``dataset`` without the ``label_column_name`` column;
    the label frame contains only that column.
    """
    features_df = dataset.drop_columns(columns=[label_column_name]).to_pandas_dataframe()
    label_df = dataset.keep_columns(columns=[label_column_name], validate=True).to_pandas_dataframe()
    return features_df, label_df


X_train_df, y_train_df = _split_features_and_label(train_dataset)
X_test_df, y_test_df = _split_features_and_label(test_dataset)

In [None]:
# Ground-truth labels as 1-D arrays. Selecting the column (rather than using
# DataFrame.values, which is shaped (n, 1)) keeps the subtraction below
# element-wise instead of broadcasting into an (n, n) matrix.
y_train = y_train_df[label_column_name].to_numpy()
y_test = y_test_df[label_column_name].to_numpy()

# Predict from the feature frames; `train_data` / `test_data` are only URL strings.
y_pred_train = fitted_model.predict(X_train_df)
y_residual_train = y_train - y_pred_train

y_pred_test = fitted_model.predict(X_test_df)
y_residual_test = y_test - y_pred_test

## Plotting predictions

In [None]:
%matplotlib inline
from sklearn.metrics import mean_squared_error, r2_score

# Set up a multi-plot chart.
f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})
f.suptitle('Regression Residual Values', fontsize = 18)
f.set_figheight(6)
f.set_figwidth(16)

# Plot residual values of training set.
a0.axis([0, 360, -100, 100])
a0.plot(y_residual_train, 'bo', alpha = 0.5)
a0.plot([-10,360],[0,0], 'r-', lw = 3)
a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)
a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)),fontsize = 12)
a0.set_xlabel('Training samples', fontsize = 12)
a0.set_ylabel('Residual Values', fontsize = 12)

# Plot residual values of test set.
a1.axis([0, 90, -100, 100])
a1.plot(y_residual_test, 'bo', alpha = 0.5)
a1.plot([-10,360],[0,0], 'r-', lw = 3)
a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)
a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)),fontsize = 12)
a1.set_xlabel('Test samples', fontsize = 12)
a1.set_yticklabels([])

plt.show()

In [None]:
%matplotlib inline
test_pred = plt.scatter(y_test, y_pred_test, color='')
test_test = plt.scatter(y_test, y_test, color='g')
plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)
plt.show()

# Sources

- https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/local-run-classification-credit-card-fraud/auto-ml-classification-credit-card-fraud-local.ipynb
- https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb