# Step 1 - Create Workspace and prepare training data

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn import linear_model 
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import azureml
from azureml.core import Run
from azureml.core import Workspace
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
from azureml.data.azure_storage_datastore import AzureBlobDatastore
from azureml.core import Dataset
import pickle

# Confirm the Azure ML SDK is importable and print the installed version.
# Version history: https://pypi.org/project/azureml-sdk/#history
print("SDK Version:", azureml.core.VERSION)

## Create a workspace

In [None]:
# ID of your existing Azure subscription; replace the placeholder before running.
subscription_id = "xxx-xxx-xxx"

# Resource group and workspace names used when the workspace is created.
resource_group = "service-labs"
workspace_name = "service-labs-ws"

# Azure region for the workspace.
# Options listed for this lab: eastus, westcentralus, southeastasia, australiaeast, westeurope
workspace_region = "eastus"

In [None]:
# Create the workspace from the settings defined above.
# Passing exist_ok=True means that if the workspace already exists we get a
# reference to the existing workspace instead.
ws = Workspace.create(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group,
                      location=workspace_region,
                      exist_ok=True)

print("Workspace Provisioning complete.")

## Create Azure Machine Learning TabularDataset

Upload the training data to the default workspace datastore, which is backed by Azure Blob Storage. Next, using the training data saved in the default workspace datastore, we will create an unregistered TabularDataset pointing to the path in the datastore. This dataset reference will allow us to seamlessly access the training data during model training without worrying about connection strings or data paths.

In [None]:
# Print the working directory, since the relative ./data path below is
# resolved against it.
print("Current working directory is ", os.path.abspath(os.curdir))

# Copy the training CSV into the workspace's default datastore.
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=['./data/UsedCars_Affordability.csv'],
    target_path='train-dataset/tabular/',
    overwrite=True,
    show_progress=True,
)

# Build a TabularDataset that references the uploaded file by its
# (datastore, relative path) location.
dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, 'train-dataset/tabular/UsedCars_Affordability.csv')]
)

# Name of the target (label) column in the dataset.
target_column_name = 'Affordable'

# Preview the first 5 rows of the dataset.
dataset.take(5).to_pandas_dataframe()

# Step 2 - Define a helper method that will use AutoML to train multiple models and pick the best one #

In [None]:
def auto_train_model(ws, experiment_name, dataset, target_column_name, training_target_accuracy):
    """Run an AutoML classification experiment and return its best model.

    Args:
        ws: Workspace in which the experiment is created.
        experiment_name: Name given to the ``Experiment``.
        dataset: Training data, passed to AutoML as ``training_data``.
        target_column_name: Label column, passed as ``label_column_name``.
        training_target_accuracy: Passed as ``experiment_exit_score``
            (presumably the accuracy at which AutoML stops early; confirm
            against the AutoMLConfig documentation).

    Returns:
        A tuple ``(best_model, run, best_run)``: ``run`` is the object
        returned by ``experiment.submit`` and ``best_run`` / ``best_model``
        come from ``run.get_output()``.
    """
    # A training run always belongs to an experiment.
    experiment = Experiment(ws, experiment_name)

    # AutoML job settings. Training is configured to run on the local machine.
    # All settings are documented at
    # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train
    # Inputs are not scaled by hand: AutoML tries various data scaling
    # approaches automatically.
    automl_settings = {
        'task': 'classification',
        'training_data': dataset,
        'label_column_name': target_column_name,
        'primary_metric': 'accuracy',
        'preprocess': True,
        'iteration_timeout_minutes': 3,
        'iterations': 20,
        'n_cross_validations': 3,
        'experiment_exit_score': training_target_accuracy,
        'blacklist_models': ['KNN'],
        'path': './04-automl/outputs',
    }
    automl_config = AutoMLConfig(**automl_settings)

    # Submit the job and stream its output.
    run = experiment.submit(automl_config, show_output=True)

    # get_output() yields the run with the highest accuracy and its model.
    best_run, best_model = run.get_output()

    return best_model, run, best_run

# Step 3 - Execute the AutoML driven training #

In [None]:
# Experiment name and the accuracy target handed to the AutoML helper.
experiment_name = "Experiment-AutoML-04"
training_target_accuracy = 0.93

best_model, run, best_run = auto_train_model(
    ws,
    experiment_name,
    dataset,
    target_column_name,
    training_target_accuracy,
)

# Show only the float-valued metrics recorded for the best run.
import pprint
pprint.pprint({
    metric: value
    for metric, value in best_run.get_metrics().items()
    if isinstance(value, float)
})

# Step 4 - Try the best model #

In [None]:
# Two sample cars, each described by an (Age, KM) pair.
age1, km1 = 60, 40000
age2, km2 = 23, 100000

# Arrange the samples as a two-column frame whose column names are
# 'Age' and 'KM'.
inputs = [[age1, km1], [age2, km2]]
data_df = pd.DataFrame(np.array(inputs).reshape(-1, 2), columns=['Age', 'KM'])

print(data_df)
print('')
print('Predictions: ', best_model.predict(data_df))

# Step 5 - Register the best performing model for later use and deployment #

In [None]:
# Name and description under which the model is stored in the workspace.
model_name = "usedcarsmodel"
description = "AutoML trained used cars classifier"

# Register through the `run` object returned by auto_train_model.
# NOTE(review): presumably this registers the run's best model; confirm
# against the AutoMLRun.register_model documentation.
run.register_model(model_name=model_name, description=description)