In [2]:
# https://docs.microsoft.com/en-us/learn/modules/train-local-model-with-azure-mls/6-train-model-using-experiment-service
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core import Workspace,Experiment,Run
import os

# Step 1: name the cluster and set the minimal and maximal number of nodes 
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 3)

# Step 2: choose environment variables 
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

provisioning_config = AmlCompute.provisioning_configuration(
    vm_size = vm_size, min_nodes = min_nodes, max_nodes = max_nodes)

ws = Workspace.from_config()

# create the cluster
compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

In [3]:
#upload data by using get_default_datastore()
ds = ws.get_default_datastore()
ds.upload(src_dir='./data_mnist', target_path='mnist', overwrite=True, show_progress=True)

Uploading an estimated of 4 files
Uploading ./data_mnist/test-images.gz
Uploading ./data_mnist/test-labels.gz
Uploading ./data_mnist/train-images.gz
Uploading ./data_mnist/train-labels.gz
Uploaded ./data_mnist/train-labels.gz, 1 files out of an estimated total of 4
Uploaded ./data_mnist/test-labels.gz, 2 files out of an estimated total of 4
Uploaded ./data_mnist/test-images.gz, 3 files out of an estimated total of 4
Uploaded ./data_mnist/train-images.gz, 4 files out of an estimated total of 4
Uploaded 4 files


$AZUREML_DATAREFERENCE_2e6c0a7ccfd34f0c93cde177b7853be9

In [4]:
import os

# create the folder
folder_training_script = './trial_model_mnist'
os.makedirs(folder_training_script, exist_ok=True)

In [5]:
%%writefile $folder_training_script/train.py

import argparse
import os
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib

from azureml.core import Run

import gzip
import struct

# load compressed MNIST gz files and return numpy arrays
def load_data(filename, label=False):
    with gzip.open(filename) as gz:
        struct.unpack('I', gz.read(4))
        n_items = struct.unpack('>I', gz.read(4))
        if not label:
            n_rows = struct.unpack('>I', gz.read(4))[0]
            n_cols = struct.unpack('>I', gz.read(4))[0]
            res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
            res = res.reshape(n_items[0], n_rows * n_cols)
        else:
            res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
            res = res.reshape(n_items[0], 1)
    return res

# create three parameters, the location of the data files, and the maximun value of k and the interval
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
parser.add_argument('--kmax', type=int, dest='kmax', default=15, help='max k value')
parser.add_argument('--kinterval', type=int, dest='kinterval', default=2, help='k interval')
args = parser.parse_args()

data_folder = os.path.join(args.data_folder, 'mnist')
print('Data folder:', data_folder)

# load the train and test set into numpy arrays
X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / 255.0
X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0

# Print variable set dimension
print(X_train.shape, X_test.shape, sep = '\n')

y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)
y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)

# Print the response variable dimension
print( y_train.shape, y_test.shape, sep = '\n')

# Get hold of the current run
run = Run.get_context()

print('Train kNN models with k equals to', range(1, args.kmax, args.kinterval))

# Generate a wide range of k and find the best models.  Also create a list to store the evaluation result for each value of k
kVals = range(1, args.kmax,args.kinterval)
evaluation = []

# loop over the models with different parameters to find the one with the lowest error rate
for k in kVals:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)

    # use the test dataset for evaluation and append the result to the evaluation list
    score = model.score(X_test, y_test)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    evaluation.append(score)

# Find the value of k with the best performance
i = int(np.argmax(evaluation))
print("k = %d with best performance with %.2f%% accuracy given current testset" % (kVals[i], evaluation[i] * 100))

model = KNeighborsClassifier(n_neighbors=kVals[i])

run.log('Best_k', kVals[i])
run.log('accuracy', evaluation[i])

os.makedirs('outputs', exist_ok=True)

# Save the model as a pickle file in the outputs folder of the experiment workspace. The pickle file is used to deploy the 
# model.  The file saved in the outputs folder automatically uploads into the experiment record
joblib.dump(value = model, filename = 'outputs/knn_mnist_model.pkl')

Writing ./trial_model_mnist/train.py


In [6]:
from azureml.train.estimator import Estimator

script_params = {
    '--data-folder': ds.as_mount(),
    '--kmax': 5,
    '--kinterval': 2
}

# Import the Scikit-learn package 
est = Estimator(source_directory=folder_training_script,
                script_params=script_params,
                compute_target=compute_target,
                entry_script='train.py',
                conda_packages=['scikit-learn'])

In [8]:
from azureml.core import Experiment

# Create an experiment
experiment = Experiment(workspace = ws, name = "my-first-experiment")

run = experiment.submit(config=est)
run

Experiment,Id,Type,Status,Details Page,Docs Page
my-first-experiment,my-first-experiment_1565405753_f0741b5d,azureml.scriptrun,Starting,Link to Azure Portal,Link to Documentation


In [9]:
# monitor the run
from azureml.widgets import RunDetails

RunDetails(run).show()

A Jupyter Widget

In [10]:
#get the result
print(run.get_metrics())

{}
