In [21]:
import inspect

from azureml.core import Workspace, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.core.graph import DataType
from azureml.pipeline.wrapper import Module, dsl, Pipeline

In [22]:
# Coordinates of the Azure ML workspace this notebook runs against.
subscription_id = '4f455bd0-f95a-4b7d-8d08-078611508e0b'
resource_group = 'fundamental'
workspace_name = 'fundamental2'  # use another workspace

# The module namespace mirrors the workspace name; it is passed to
# Module.load when the pipeline modules are looked up.
namespace = workspace_name

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Echo the resolved workspace details, one value per line.
print('\n'.join(str(detail) for detail in (
    workspace.name,
    workspace.resource_group,
    workspace.location,
    workspace.subscription_id,
    workspace.compute_targets.keys(),
)))

fundamental2
fundamental
eastasia
4f455bd0-f95a-4b7d-8d08-078611508e0b
dict_keys(['aml-compute', 'aml-compute2'])


In [23]:
# Show every compute target currently attached to the workspace.
print(workspace.compute_targets)
aml_compute_name = 'aml-compute'
try:
    # Reuse the cluster when it is already present in the workspace.
    aml_compute = AmlCompute(workspace, aml_compute_name)
    print("Found existing compute target: {}".format(aml_compute_name))
except ComputeTargetException:
    # Only a failed compute lookup should trigger provisioning. A bare
    # `except:` would also swallow authentication/network errors and
    # KeyboardInterrupt, and then try to create a cluster anyway.
    print("Creating new compute target: {}".format(aml_compute_name))

    provisioning_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                                min_nodes=1,
                                                                max_nodes=4)
    aml_compute = ComputeTarget.create(workspace, aml_compute_name, provisioning_config)
    # Block until provisioning finishes (or the 20 minute timeout elapses).
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print(aml_compute)

{'aml-compute': AmlCompute(workspace=Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental2/computes/aml-compute, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None), 'aml-compute2': AmlCompute(workspace=Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute2, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental2/computes/aml-compute2, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None)}
Found existing compute target: aml-compute
AmlCompute(workspace=Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d-

In [24]:
# Register the two custom data types used by the pipeline modules:
# a directory-valued type and a file-valued type.
# (won't register repeatedly)
DataType.create_data_type(
    workspace,
    'MyDirectory',
    description='',
    is_directory=True,
)
# Kept as the final expression so the notebook displays the returned DataType.
DataType.create_data_type(
    workspace,
    'MyFile',
    description='',
    is_directory=False,
)

<azureml.pipeline.core.graph.DataType at 0x1f4d1deba58>

In [25]:
# Load the two FileDatasets consumed by the pipeline, registering each one
# from its public blob URL the first time the notebook runs in a workspace.
dataset_name = 'THUCNews_TXT'
char2index_name = 'Char2Index_JSON'

if dataset_name not in workspace.datasets:
    print('Registering a THUCNews dataset for fasttext pipeline ...')
    Dataset.File.from_files(
        path=['https://datastore4fasttext.blob.core.windows.net/mytest3/THUCNews.txt']
    ).register(workspace=workspace, name=dataset_name, description='THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011')
    print('Registerd')
data = workspace.datasets[dataset_name]

if char2index_name not in workspace.datasets:
    print('Registering a Char2Index_JSON for fasttext pipeline ...')
    # Do NOT assign the unregistered dataset to `data` here: `data` must keep
    # pointing at the THUCNews dataset, which later cells feed to the split
    # module as `input_dir`.
    Dataset.File.from_files(
        path=['https://datastore4fasttext.blob.core.windows.net/mytest3/character2index.json']
    ).register(workspace=workspace, name=char2index_name, description='The mapping relationship between character and index ')
    print('Registerd')
char2index = workspace.datasets[char2index_name]

print(data)
print(char2index)

FileDataset
{
  "source": [
    "https://datastore4fasttext.blob.core.windows.net/mytest3/THUCNews.txt"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "0a805b2d-f748-49a7-94a9-be849144332f",
    "name": "THUCNews_TXT",
    "version": 1,
    "description": "THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011",
    "workspace": "Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}
FileDataset
{
  "source": [
    "https://datastore4fasttext.blob.core.windows.net/mytest3/character2index.json"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "d2b97358-08d5-4428-b17e-559cf3a3de9b",
    "name": "Char2Index_JSON",
    "version": 1,
    "description": "The mapping relationship between character and index ",
    "workspace": "Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-

In [26]:
# load module
def _load_or_register_module(module_name, yaml_file, label):
    """Return the module function for ``module_name``, registering it if needed.

    Args:
        module_name: Display name of the module to load from the workspace.
        yaml_file: Path of the module spec used to register the module when
            loading fails.
        label: Short identifier used in the progress messages.

    Returns:
        The object returned by ``Module.load`` or, on failure, ``Module.register``.
    """
    try:
        module_func = Module.load(workspace=workspace, namespace=namespace, name=module_name)
        print('found {}'.format(label))
    except Exception:
        # NOTE(review): the exact exception Module.load raises for a missing
        # module is not visible here, so any Exception falls back to
        # registration. Unlike a bare `except:`, this lets KeyboardInterrupt
        # and SystemExit propagate.
        print('not found {}, register it now...'.format(label))
        module_func = Module.register(workspace=workspace, yaml_file=yaml_file)
    return module_func


split_data_txt_module_func = _load_or_register_module(
    'Split Data Txt', 'split_data_txt.spec.yaml', 'split_data_txt_module')
fasttext_train_module_func = _load_or_register_module(
    'FastText Train', 'fasttext_train.spec.yaml', 'fasttext_train_module')
fasttext_test_module_func = _load_or_register_module(
    'FastText Test', 'fasttext_test.spec.yaml', 'fasttext_test_module')

# inspect signature
print(inspect.signature(split_data_txt_module_func))
print(inspect.signature(fasttext_train_module_func))
print(inspect.signature(fasttext_test_module_func))

found split_data_txt_module
found fasttext_train_module
found fasttext_test_module
(input_dir:'Input Dir'=None, training_data_ratio:'Training Data Ratio'='0.7', validation_data_ratio:'Validation Data Ratio'='0.1', random_split:'Random Split'='False', seed:'Seed'='0')
(training_data_dir:'Training Data Dir'=None, validation_data_dir:'Validation Data Dir'=None, char2_index_dir:'Char2Index Dir'=None, epochs:'Epochs'='2', batch_size:'Batch Size'='32', learning_rate:'Learning Rate'='0.0005', embedding_dim:'Embedding Dim'='128')
(trained_model_dir:'Trained Model Dir'=None, test_data_dir:'Test Data Dir'=None, char2_index_dir:'Char2Index Dir'=None)


In [27]:
# Wire the three modules together: split -> train -> test.

# Step 1: split the raw THUCNews text into training/validation/test outputs.
split_data_txt = split_data_txt_module_func(
    input_dir=data,
    training_data_ratio=0.7,
    validation_data_ratio=0.1,
    random_split=False,
    seed=1,
)
print(split_data_txt.outputs)

# Step 2: train on the training/validation outputs of the split step.
fasttext_train = fasttext_train_module_func(
    training_data_dir=split_data_txt.outputs.training_data_output,
    validation_data_dir=split_data_txt.outputs.validation_data_output,
    char2_index_dir=char2index,
    epochs=1,
    batch_size=64,
    learning_rate=0.0005,
    embedding_dim=128,
)
print(fasttext_train.outputs)

# Step 3: evaluate the trained model on the test output of the split step.
fasttext_test = fasttext_test_module_func(
    trained_model_dir=fasttext_train.outputs.trained_model_dir,
    test_data_dir=split_data_txt.outputs.test_data_output,
    char2_index_dir=char2index,
)

{'training_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001F4D1DEBA20>, 'validation_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001F4D1DEBC88>, 'test_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001F4D1DEB8D0>}
{'trained_model_dir': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001F4D1D857F0>}


In [28]:
# Assemble the pipeline from the three connected module nodes; nodes run on
# the compute target named by `aml_compute_name` by default.
pipeline = Pipeline(
    nodes=[split_data_txt, fasttext_train, fasttext_test],
    workspace=workspace,
    default_compute_target=aml_compute_name,
)

In [29]:
# Validate the assembled pipeline before submitting it. As the last
# expression of the cell, the returned result is displayed by the notebook.
pipeline.validate()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SupportDetectView()

{'result': 'validation passed', 'errors': []}

In [30]:
# Submit the pipeline, wait for the run to finish, then save the pipeline
# under the same experiment name.
experiment_name = 'fasttext_test'

run = pipeline.submit(experiment_name=experiment_name)
run.wait_for_completion()
# Kept as the final expression so the notebook displays the value returned by save().
pipeline.save(experiment_name=experiment_name)

vfVJpax4bk6BiyIl47eco%2Bc%2Fq60AbYcbL0%3D&st=2020-06-17T09%3A12%3A56Z&se=2020-06-17T17%3A22%3A56Z&sp=r', 'azureml-logs/75_job_post-tvmps_0e5b20827f3ec455d9622f3716ef6ecba3723f15ab0c6630ee1b90283621397b_d.txt': 'https://fundamental29908941189.blob.core.windows.net/azureml/ExperimentRun/dcid.84d4c61b-7adf-429a-af0d-ed2cf2f9096c/azureml-logs/75_job_post-tvmps_0e5b20827f3ec455d9622f3716ef6ecba3723f15ab0c6630ee1b90283621397b_d.txt?sv=2019-02-02&sr=b&sig=Ah%2Fq9OaTPIQMinoKOux%2F49ArA7bjQdflqqwms%2FQlx8I%3D&st=2020-06-17T09%3A12%3A56Z&se=2020-06-17T17%3A22%3A56Z&sp=r', 'azureml-logs/process_info.json': 'https://fundamental29908941189.blob.core.windows.net/azureml/ExperimentRun/dcid.84d4c61b-7adf-429a-af0d-ed2cf2f9096c/azureml-logs/process_info.json?sv=2019-02-02&sr=b&sig=LuX9pvuGT9mqh3YLkP8S8h1lfCbX0UbDqT0FfBFqb%2Bw%3D&st=2020-06-17T09%3A12%3A56Z&se=2020-06-17T17%3A22%3A56Z&sp=r', 'azureml-logs/process_status.json': 'https://fundamental29908941189.blob.core.windows.net/azureml/ExperimentRun/d

Name,Id,Details page,Pipeline type,Updated on,Created by,Tags
Pipeline-Created-on-6-17-2020,b2ad2df4-643e-46a1-8366-89535caff476,Link,TrainingPipeline,"June 17, 2020 05:23 PM",Xiaoyu Yang,azureml.Designer: true

0
azureml.Designer: true
