In [1]:
import inspect

from azureml.core import Workspace, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.core.graph import DataType
from azureml.pipeline.wrapper import Module, dsl, Pipeline

In [2]:
# Coordinates of the Azure ML workspace this notebook runs against.
subscription_id = '4f455bd0-f95a-4b7d-8d08-078611508e0b'
resource_group = 'fundamental'
workspace_name = 'fundamental3'  # use another workspace

# Namespace passed to Module.load when looking up registered modules.
namespace = workspace_name

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Print the resolved workspace details, one value per line.
workspace_summary = (
    workspace.name,
    workspace.resource_group,
    workspace.location,
    workspace.subscription_id,
    workspace.compute_targets.keys(),
)
print(*workspace_summary, sep='\n')

fundamental3
fundamental
eastasia
4f455bd0-f95a-4b7d-8d08-078611508e0b
dict_keys(['aml-compute'])


In [3]:
# Choose the compute target: reuse the named AmlCompute cluster when the lookup
# succeeds, otherwise provision a new one under the same name.
print(workspace.compute_targets)
aml_compute_name = 'aml-compute'
try:
    aml_compute = AmlCompute(workspace, aml_compute_name)
    print("Found existing compute target: {}".format(aml_compute_name))
except ComputeTargetException:
    # Only a failed compute-target lookup should lead to provisioning. A bare
    # `except:` would also swallow KeyboardInterrupt, authentication errors and
    # plain typos, and then try to create a cluster anyway.
    print("Creating new compute target: {}".format(aml_compute_name))

    provisioning_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                                min_nodes=1,
                                                                max_nodes=4)
    aml_compute = ComputeTarget.create(workspace, aml_compute_name, provisioning_config)
    # Wait for provisioning to finish, streaming progress, for at most 20 minutes.
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print(aml_compute)

{'aml-compute': AmlCompute(workspace=Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental3/computes/aml-compute, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None)}
Found existing compute target: aml-compute
AmlCompute(workspace=Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental3/computes/aml-compute, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None)


In [4]:
# Register the two custom data types in the workspace: one directory-typed and
# one file-typed. Per the original author's note, repeated calls won't register
# the type again.
DataType.create_data_type(
    workspace,
    name='MyDirectory',
    description='',
    is_directory=True,
)
# Kept as the cell's last expression so the returned DataType is still displayed.
DataType.create_data_type(
    workspace,
    name='MyFile',
    description='',
    is_directory=False,
)

<azureml.pipeline.core.graph.DataType at 0x7fecc93fa8d0>

In [5]:
# Load the two file datasets used by the pipeline, registering each one in the
# workspace first when it is not there yet.
#   data       -> registered dataset named by dataset_name
#   char2index -> registered dataset named by char2index_name
dataset_name = 'THUCNews_TXT'
char2index_name = 'Char2Index_JSON'

if dataset_name not in workspace.datasets:
    print('Registering a THUCNews dataset for fasttext pipeline ...')
    path = ['https://datastore4fasttext.file.core.windows.net/data4fasttext/THUCNews.txt']
    thucnews_files = Dataset.File.from_files(path=path)
    thucnews_files.register(workspace=workspace, name=dataset_name, description='THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011')
    print('Registerd')
data = workspace.datasets[dataset_name]

if char2index_name not in workspace.datasets:
    print('Registering a Char2Index_JSON for fasttext pipeline ...')
    path = ['https://datastore4fasttext.file.core.windows.net/data4fasttext/character2index.json']
    # Use a dedicated name here. Assigning to `data` in this branch would
    # overwrite the THUCNews handle fetched above, so `data` would point at the
    # character-to-index JSON whenever this dataset still had to be registered.
    char2index_files = Dataset.File.from_files(path=path)
    char2index_files.register(workspace=workspace, name=char2index_name, description='The mapping relationship between character and index ')
    print('Registerd')
char2index = workspace.datasets[char2index_name]

print(data)
print(char2index)

FileDataset
{
  "source": [
    "https://datastore4fasttext.blob.core.windows.net/mytest3/THUCNews.txt"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "9e16ea04-3074-4f84-8a8c-83adb226c4ae",
    "name": "THUCNews_TXT",
    "version": 1,
    "description": "THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011",
    "workspace": "Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}
FileDataset
{
  "source": [
    "https://datastore4fasttext.blob.core.windows.net/mytest3/character2index.json"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "9de9b550-46a4-41ce-b1b4-54b6c665fcf3",
    "name": "Char2Index_JSON",
    "version": 1,
    "description": "The mapping relationship between character and index ",
    "workspace": "Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-

In [6]:
# load module
def load_or_register_module(name, yaml_file, label):
    """Return the module function for `name`, registering it if loading fails.

    Args:
        name: Module name passed to Module.load (looked up under `namespace`
            in `workspace`).
        yaml_file: Path of the module spec YAML given to Module.register when
            the load attempt raises.
        label: Short identifier used in the progress messages.

    Returns:
        The object returned by Module.load, or by Module.register on fallback.
    """
    try:
        module_func = Module.load(workspace=workspace, namespace=namespace, name=name)
        print('found {}'.format(label))
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit propagate instead of triggering a registration.
        print('not found {}, register it now...'.format(label))
        module_func = Module.register(workspace=workspace, yaml_file=yaml_file)
    return module_func


split_data_txt_module_func = load_or_register_module(
    name='Split Data Txt',
    yaml_file='split_data_txt/split_data_txt.spec.yaml',
    label='split_data_txt_module',
)

# The train and test modules are each used for two pipeline nodes. Both handles
# refer to the same module, so it is loaded (or registered) once and aliased
# rather than registering the same YAML twice.
# NOTE(review): assumes each call of the module function creates a new node - confirm.
fasttext_train_module_func1 = load_or_register_module(
    name='FastText Train',
    yaml_file='fasttext_train/fasttext_train.spec.yaml',
    label='fasttext_train_module',
)
fasttext_train_module_func2 = fasttext_train_module_func1

fasttext_test_module_func1 = load_or_register_module(
    name='FastText Test',
    yaml_file='fasttext_test/fasttext_test.spec.yaml',
    label='fasttext_test_module',
)
fasttext_test_module_func2 = fasttext_test_module_func1

# inspect signature
print(inspect.signature(split_data_txt_module_func))
print(inspect.signature(fasttext_train_module_func1))
print(inspect.signature(fasttext_test_module_func1))

found split_data_txt_module
found fasttext_train_module
found fasttext_test_module
(input_dir:'Input Dir'=None, training_data_ratio:'Training Data Ratio'='0.7', validation_data_ratio:'Validation Data Ratio'='0.1', random_split:'Random Split'='False', seed:'Seed'='0')
(training_data_dir:'Training Data Dir'=None, validation_data_dir:'Validation Data Dir'=None, char2_index_dir:'Char2Index Dir'=None, epochs:'Epochs'='2', batch_size:'Batch Size'='32', learning_rate:'Learning Rate'='0.0005', embedding_dim:'Embedding Dim'='128')
(trained_model_dir:'Trained Model Dir'=None, test_data_dir:'Test Data Dir'=None, char2_index_dir:'Char2Index Dir'=None)


In [7]:
# Connect the modules: one split node feeds two train nodes (differing only in
# `epochs`), and each trained model is evaluated by its own test node.
split_data_txt = split_data_txt_module_func(
    input_dir=data,
    training_data_ratio=0.7,
    validation_data_ratio=0.1,
    random_split=False,
    seed=1,
)
print(split_data_txt.outputs)

split_outputs = split_data_txt.outputs

# Hyperparameters shared by both training nodes.
train_hyperparams = {
    'batch_size': 64,
    'learning_rate': 0.0005,
    'embedding_dim': 128,
}

fasttext_train1 = fasttext_train_module_func1(
    training_data_dir=split_outputs.training_data_output,
    validation_data_dir=split_outputs.validation_data_output,
    char2_index_dir=char2index,
    epochs=1,
    **train_hyperparams,
)
print(fasttext_train1.outputs)

fasttext_train2 = fasttext_train_module_func2(
    training_data_dir=split_outputs.training_data_output,
    validation_data_dir=split_outputs.validation_data_output,
    char2_index_dir=char2index,
    epochs=2,
    **train_hyperparams,
)

# Each test node consumes the model from its matching train node and the
# shared test split.
fasttext_test1 = fasttext_test_module_func1(
    trained_model_dir=fasttext_train1.outputs.trained_model_dir,
    test_data_dir=split_outputs.test_data_output,
    char2_index_dir=char2index,
)

fasttext_test2 = fasttext_test_module_func2(
    trained_model_dir=fasttext_train2.outputs.trained_model_dir,
    test_data_dir=split_outputs.test_data_output,
    char2_index_dir=char2index,
)

{'training_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x7fecc93faef0>, 'validation_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x7fecc93fae10>, 'test_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x7fecc8913c18>}
{'trained_model_dir': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x7fecc836ca58>}


In [8]:
# Assemble the pipeline from the five module nodes created above.
pipeline_nodes = [
    split_data_txt,
    fasttext_train1,
    fasttext_test1,
    fasttext_train2,
    fasttext_test2,
]
pipeline = Pipeline(
    nodes=pipeline_nodes,
    workspace=workspace,
    default_compute_target=aml_compute_name,
)

In [9]:
# Validate the pipeline before submitting it; the result is shown as the cell output.
validation_result = pipeline.validate()
validation_result

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SupportDetectView()

{'result': 'validation passed', 'errors': []}

In [10]:
# Submit the pipeline, block until the run completes, then call pipeline.save
# under the same experiment name (its return value is the cell output).
experiment_name = 'fasttext_test'
run = pipeline.submit(experiment_name=experiment_name)
run.wait_for_completion()
pipeline.save(experiment_name=experiment_name)

arning_Rate', '--embedding_dim', '$AZUREML_PARAMETER_Embedding_Dim', '--trained_model_dir', '$AZUREML_DATAREFERENCE_Trained_Model_Dir'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'aml-compute', 'dataReferences': {'Training_Data_Dir': {'dataStoreName': 'workspaceblobstore', 'mode': 'Mount', 'pathOnDataStore': 'azureml/c2774f44-79a8-4937-9609-1d13eb9c0d3d/Training_Data_Output', 'pathOnCompute': None, 'overwrite': True}, 'Validation_Data_Dir': {'dataStoreName': 'workspaceblobstore', 'mode': 'Mount', 'pathOnDataStore': 'azureml/c2774f44-79a8-4937-9609-1d13eb9c0d3d/Validation_Data_Output', 'pathOnCompute': None, 'overwrite': True}, 'Trained_Model_Dir': {'dataStoreName': 'workspaceblobstore', 'mode': 'Mount', 'pathOnDataStore': 'azureml/7315dfc1-1bb6-4ff7-9525-dbf5db2cd0c3/Trained_Model_Dir', 'pathOnCompute': None, 'overwrite': True}}, 'data': {'Char2_Index_Dir': {'dataLocation': {'dataset': {'id': '9de9b550-46a4-41ce-b1b4-54b6c665fcf3', 'name

Name,Id,Details page,Pipeline type,Updated on,Created by,Tags
Pipeline-Created-on-6-21-2020,102ba191-14d8-43d4-9050-3926850f5d5a,Link,TrainingPipeline,"June 21, 2020 08:40 AM",Xiaoyu Yang,azureml.Designer: true

0
azureml.Designer: true
