# AppEase Machine Learning

In this notebook, you use automated machine learning in Azure Machine Learning service to create a classification model to predict labels. This process accepts training data and configuration settings, and automatically iterates through combinations of different feature normalization/standardization methods, models, and hyperparameter settings to arrive at the best model.

To run this notebook, you only need an Azure subscription. Additionally, the data and labels JSON files must be in the local directory. The two files must be mergeable on their index, and they should contain at least 63 records of data for training (the minimum, assuming a 0.8/0.2 train/test split).

We found it easier to run this notebook in an Azure Data Science Virtual Machine using a Python 3 kernel to ensure that all the necessary packages were available.


In [30]:
# import packages
import azureml.core
from azureml.core.workspace import Workspace
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
import logging
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.widgets import RunDetails
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle

In [2]:
# --- Azure account (required) ---
subscription = "<your-azure-subscription-id>"

# --- Azure resources (pick your own values) ---
# Keep the resource group as None to have the workspace cell request a new
# one; set it to a name to reuse a pre-built resource group.
workspace_resource_group = None
workspace_loc = "eastus"  # any Azure region can be used here
workspace_name = "appeasewML"
compute_cluster_name = "appeasecompute"

# --- Local input files ---
# The data and labels files are merged on their indexes.
# NOTE: the data file must hold at least 63 records for training
# (with a 0.8/0.2 train/test split).
data_file_name = "simulated_health_data.json"
labels_file_name = "random_labels.json"
label_column_name = "Label"

In [None]:
# Load the Azure ML workspace, creating it when it cannot be loaded.
# A new resource group is requested only when none was configured.
create_RG = workspace_resource_group is None

try:
    ws = Workspace.get(
        name=workspace_name,
        subscription_id=subscription,
        resource_group=workspace_resource_group,
    )
except Exception as get_error:
    # Still a broad catch (any failed lookup falls back to creation, as
    # before), but it no longer swallows KeyboardInterrupt/SystemExit and the
    # reason for the fallback is reported instead of being hidden.
    print('Could not load workspace, creating it instead:', get_error)
    ws = Workspace.create(
        name=workspace_name,
        subscription_id=subscription,
        resource_group=workspace_resource_group,
        create_resource_group=create_RG,
        location=workspace_loc,
    )

In [None]:
# Reuse the named compute cluster when it already exists; otherwise provision it.
# NOTE(review): cpu_cluster is not referenced by the later cells visible in
# this notebook -- confirm whether the experiment is meant to run on it.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=compute_cluster_name)
except ComputeTargetException:
    # Lookup failed, so build a fresh cluster that scales up to four nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(
        workspace=ws,
        name=compute_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports completion, echoing progress.
cpu_cluster.wait_for_completion(show_output=True)

In [5]:
# Load the feature data from the local JSON file.
data = pd.read_json(data_file_name)

# Encode the categorical columns as integers.
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`: that chained in-place form acts on
# an intermediate Series, which pandas warns about and which leaves `data`
# unchanged once Copy-on-Write is active.
blood_type_codes = {
    'A-': 0, 'A+': 1, 'B-': 2, 'B+': 3,
    'AB-': 4, 'AB+': 5, 'O-': 6, 'O+': 7,
}
sex_codes = {'Male': 0, 'Female': 1}
data['bloodType'] = data['bloodType'].replace(blood_type_codes)
data['sex'] = data['sex'].replace(sex_codes)

# Keep only the integer that follows the first four characters of each name.
# NOTE(review): presumably the names share a fixed four-character prefix --
# confirm against the data file.
data['name'] = data['name'].map(lambda x: int(x[4:]))
data['TimeStamp'] = data['TimeStamp'].astype(int)

# Load the labels as a Series and join them to the features on the index.
# The column is named via label_column_name so it matches the name that is
# handed to AutoMLConfig.
labels = pd.read_json(labels_file_name, typ='series')
final_df = data.merge(labels.rename(label_column_name), left_index=True, right_index=True)

In [6]:
# Hold out 20% of the merged rows for testing; the fixed seed keeps the
# split reproducible between runs.
x_train, x_test = train_test_split(
    final_df,
    test_size=0.2,
    random_state=223,
)

In [7]:
# Settings for the automated ML run. Parameter reference:
# https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train
automl_settings = dict(
    n_cross_validations=3,
    primary_metric="accuracy",
    # Time limit meant for testing only: remove it for real use cases, since
    # it drastically limits the ability to find the best possible model.
    experiment_timeout_hours=0.25,
    verbosity=logging.INFO,
    enable_stack_ensemble=False,
)

# Classification task trained on the training split, with the label column
# identified by name.
automl_config = AutoMLConfig(
    task="classification",
    training_data=x_train,
    label_column_name=label_column_name,
    debug_log="automl_errors.log",
    **automl_settings,
)

In [None]:
# Register the experiment in the workspace and submit the AutoML configuration.
# With the default settings this can take about 20 minutes.
experiment = Experiment(workspace=ws, name="AppEaseML")
local_run = experiment.submit(automl_config, show_output=True)

In [None]:
# explore the results and retrieve the best model
# get_output() is unpacked into two values, kept as best_run and best_model.
best_run, best_model = local_run.get_output()
# Bare expression: as the last statement of a notebook cell it displays the model.
best_model

In [None]:
# Evaluate the best model on the held-out test split: root mean squared
# error, mean absolute percent error, and classification accuracy.
# Local import keeps this cell self-contained; sklearn.metrics is already a
# dependency of this notebook.
from sklearn.metrics import accuracy_score

# pop() removes the label column from x_test in place, so what remains is the
# feature matrix. Re-running this cell requires re-running the split first.
y_test = x_test.pop(label_column_name)
y_predict = best_model.predict(x_test)

y_actual = y_test.values.flatten().tolist()
rmse = sqrt(mean_squared_error(y_actual, y_predict))
print("Model RMSE:")
print(rmse)
print()

# Total absolute error divided by the total of the actual values.
sum_errors = sum(
    abs(actual_val - predict_val)
    for actual_val, predict_val in zip(y_actual, y_predict)
)
sum_actuals = sum(y_actual)

# Guard the division: the actual labels can legitimately sum to zero.
mean_abs_percent_error = sum_errors / sum_actuals if sum_actuals else float("nan")
print("Model MAPE:")
print(mean_abs_percent_error)
print()

# For a classifier, accuracy is the share of exactly matching predictions;
# the former `1 - MAPE` value is a regression-style figure, not an accuracy.
accuracy = accuracy_score(y_actual, y_predict)
print("Model Accuracy:")
print(accuracy)

In [29]:
# Persist the best model to disk with pickle.
# NOTE: only unpickle this file in an environment you trust; loading a pickle
# executes code embedded in it.
pkl_filename = "best_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_model, file)

# Save the encoded feature data and the labels as CSV files.
# NOTE(review): `data` and `labels` are the full data sets, not only the
# training split, and both are written without their index -- confirm that
# the consumers of these files expect that.
data.to_csv('train_data.csv', index=False, header=True)
# `header` must be a list to act as a column alias. A plain string is only
# treated as a truthy flag, so the file header would be the Series' own name
# instead of the label column name.
labels.to_csv('train_labels.csv', index=False, header=[label_column_name])