In [7]:
import inspect

from azureml.core import Workspace, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.wrapper import Module, dsl, Pipeline

In [8]:
# Coordinates of the Azure ML workspace this notebook runs against.
subscription_id = '4f455bd0-f95a-4b7d-8d08-078611508e0b'
resource_group = 'fundamental'
workspace_name = 'fundamental' # use another workspace

# Namespace passed to Module.load when looking up pipeline modules.
namespace = 'fundamental'

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Echo the resolved workspace details, one per line, as a sanity check.
workspace_details = (
    workspace.name,
    workspace.resource_group,
    workspace.location,
    workspace.subscription_id,
    workspace.compute_targets.keys(),
)
for detail in workspace_details:
    print(detail)

fundamental2
fundamental
eastasia
4f455bd0-f95a-4b7d-8d08-078611508e0b
dict_keys(['aml-compute', 'aml-compute2'])


In [9]:
# choose compute target
# Reuse the named AmlCompute cluster when it exists; otherwise provision it.
print(workspace.compute_targets)
aml_compute_name = 'aml-compute'
try:
    aml_compute = AmlCompute(workspace, aml_compute_name)
    print("Found existing compute target: {}".format(aml_compute_name))
except ComputeTargetException:
    # Only a failed compute-target lookup should trigger provisioning.
    # Anything else (authentication failure, network error, Ctrl-C) must
    # propagate instead of silently attempting to create a new cluster.
    print("Creating new compute target: {}".format(aml_compute_name))

    provisioning_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                                min_nodes=1,
                                                                max_nodes=4)
    aml_compute = ComputeTarget.create(workspace, aml_compute_name, provisioning_config)
    # Block until provisioning finishes (or 20 minutes elapse).
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print(aml_compute)

{'aml-compute': AmlCompute(workspace=Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental2/computes/aml-compute, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None), 'aml-compute2': AmlCompute(workspace=Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute2, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental2/computes/aml-compute2, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None)}
Found existing compute target: aml-compute2
AmlCompute(workspace=Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d

In [10]:
# load data
# Make sure both file datasets are registered in the workspace, then bind
# `data` (news corpus) and `char2index` (character -> index mapping) to the
# registered versions.
dataset_name = 'THUCNews_TXT'
char2index_name = 'Char2Index_JSON'

if dataset_name not in workspace.datasets:
    print('Registering a THUCNews dataset for fasttext pipeline ...')
    path = ['https://datastore4fasttext.file.core.windows.net/data4fasttext/THUCNews.txt']
    thucnews_files = Dataset.File.from_files(path=path)
    thucnews_files.register(workspace=workspace, name=dataset_name, description='THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011')
    print('Registerd')
data = workspace.datasets[dataset_name]

if char2index_name not in workspace.datasets:
    print('Registering a Char2Index_JSON for fasttext pipeline ...')
    path = ['https://datastore4fasttext.file.core.windows.net/data4fasttext/character2index.json']
    # Use a dedicated name here: assigning to `data` would replace the
    # THUCNews dataset bound above with the char2index file on a first run.
    char2index_files = Dataset.File.from_files(path=path)
    char2index_files.register(workspace=workspace, name=char2index_name, description='The mapping relationship between character and index ')
    print('Registerd')
char2index = workspace.datasets[char2index_name]

print(data)
print(char2index)

FileDataset
{
  "source": [
    "https://datastore4fasttext.file.core.windows.net/data4fasttext/THUCNews.txt"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "8e18110b-5217-4cac-b340-059b1f1134cd",
    "name": "THUCNews_TXT",
    "version": 1,
    "description": "THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011",
    "workspace": "Workspace.create(name='fundamental2', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}
FileDataset
{
  "source": [
    "https://datastore4fasttext.file.core.windows.net/data4fasttext/character2index.json"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "e542f20b-c02b-41bc-ba8b-3372bdcf51c6",
    "name": "Char2Index_JSON",
    "version": 1,
    "description": "The mapping relationship between character and index ",
    "workspace": "Workspace.create(name='fundamental2', subscription_id='4f

In [11]:
# load module
def _load_or_register_module(name, yaml_file, label):
    """Return the module function for `name`, registering it if loading fails.

    Args:
        name: Module name passed to ``Module.load``.
        yaml_file: Spec file passed to ``Module.register`` when loading fails.
        label: Short identifier used only in the progress messages.

    Returns:
        Whatever ``Module.load`` or ``Module.register`` returns for the module.
    """
    try:
        module_func = Module.load(workspace=workspace, namespace=namespace, name=name)
        print('found {}'.format(label))
    except Exception:
        # `except Exception` rather than a bare `except:` so KeyboardInterrupt
        # and SystemExit are not mistaken for "module not found".
        # NOTE(review): the exact exception Module.load raises for a missing
        # module is not visible here - narrow this further once confirmed.
        print('not found {}, register it now...'.format(label))
        module_func = Module.register(workspace=workspace, yaml_file=yaml_file)
    return module_func


split_data_txt_module_func = _load_or_register_module(
    'Split Data Txt', 'split_data_txt.spec.yaml', 'split_data_txt_module')

# Each module is loaded (or registered) exactly once; previously the fallback
# registered the same spec file twice. Both names are kept because the next
# cell uses them.
# NOTE(review): assumes the loaded function can be called more than once to
# build independent pipeline nodes - confirm against the wrapper docs.
fasttext_train_module_func1 = _load_or_register_module(
    'FastText Train', 'fasttext_train.spec.yaml', 'fasttext_train_module')
fasttext_train_module_func2 = fasttext_train_module_func1

fasttext_test_module_func1 = _load_or_register_module(
    'FastText Test', 'fasttext_test.spec.yaml', 'fasttext_test_module')
fasttext_test_module_func2 = fasttext_test_module_func1

# inspect signature
for module_func in (split_data_txt_module_func,
                    fasttext_train_module_func1,
                    fasttext_test_module_func1):
    print(inspect.signature(module_func))

found split_data_txt_module
found fasttext_train_module
found fasttext_test_module
(input_dir:'Input Dir'=None, training_data_ratio:'Training Data Ratio'='0.7', validation_data_ratio:'Validation Data Ratio'='0.1', random_split:'Random Split'='False', seed:'Seed'='0')
(training_data_dir:'Training Data Dir'=None, validation_data_dir:'Validation Data Dir'=None, char2_index_dir:'Char2Index Dir'=None, epochs:'Epochs'='2', batch_size:'Batch Size'='32', learning_rate:'Learning Rate'='0.0005', embedding_dim:'Embedding Dim'='128')
(trained_model_dir:'Trained Model Dir'=None, test_data_dir:'Test Data Dir'=None, char2_index_dir:'Char2Index Dir'=None)


In [12]:
# connect module
# Split the corpus once, then feed the same splits into two train/test
# branches that differ only in the number of training epochs.
split_data_txt = split_data_txt_module_func(
    input_dir=data,
    training_data_ratio=0.7,
    validation_data_ratio=0.1,
    random_split=False,
    seed=1,
)
print(split_data_txt.outputs)

# Inputs and hyperparameters common to both training runs.
shared_train_args = dict(
    training_data_dir=split_data_txt.outputs.training_data_output,
    validation_data_dir=split_data_txt.outputs.validation_data_output,
    char2_index_dir=char2index,
    batch_size=64,
    learning_rate=0.0005,
    embedding_dim=128,
)

fasttext_train1 = fasttext_train_module_func1(epochs=1, **shared_train_args)
print(fasttext_train1.outputs)

fasttext_train2 = fasttext_train_module_func2(epochs=3, **shared_train_args)

# Inputs common to both evaluation runs; only the trained model differs.
shared_test_args = dict(
    test_data_dir=split_data_txt.outputs.test_data_output,
    char2_index_dir=char2index,
)

fasttext_test1 = fasttext_test_module_func1(
    trained_model_dir=fasttext_train1.outputs.trained_model_dir,
    **shared_test_args
)

fasttext_test2 = fasttext_test_module_func2(
    trained_model_dir=fasttext_train2.outputs.trained_model_dir,
    **shared_test_args
)

{'training_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000002881BDC2F28>, 'validation_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000002881F61A710>, 'test_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000002881F61ADA0>}
{'trained_model_dir': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000002881F5C2470>}


In [13]:
# pipeline
# Assemble every node into one pipeline that runs on the chosen compute target.
pipeline_nodes = [
    split_data_txt,
    fasttext_train1,
    fasttext_test1,
    fasttext_train2,
    fasttext_test2,
]
pipeline = Pipeline(
    nodes=pipeline_nodes,
    workspace=workspace,
    default_compute_target=aml_compute_name,
)

In [14]:
# validate
# Validate the assembled pipeline before it is submitted; as the last
# expression in the cell, the returned result is shown as the cell output.
pipeline.validate()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SupportDetectView()

{'result': 'validation passed', 'errors': []}

In [None]:
# run
# Submit the pipeline, wait for the run to finish, then save the pipeline
# under the same experiment name.
experiment_name = 'fasttext_test'

run = pipeline.submit(experiment_name=experiment_name)
run.wait_for_completion()
pipeline.save(experiment_name=experiment_name)

Submitted PipelineRun 589a0082-17d6-424e-9654-a453d40eecfe
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/fasttext_test/runs/589a0082-17d6-424e-9654-a453d40eecfe?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental2
PipelineRunId: 589a0082-17d6-424e-9654-a453d40eecfe
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/fasttext_test/runs/589a0082-17d6-424e-9654-a453d40eecfe?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental2
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 7954ad74-3683-4345-93ab-62fea9b8d58a
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/fasttext_test/runs/7954ad74-3683-4345-93ab-62fea9b8d58a?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental2
StepRun( Split Data Txt ) Status: NotStarted
