In [1]:
from azureml.core import Workspace, Dataset
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.wrapper import Module, dsl

In [2]:
# Coordinates of the Azure ML workspace this notebook runs against.
subscription_id = '4f455bd0-f95a-4b7d-8d08-078611508e0b'
resource_group = 'fundamental'
workspace_name = 'fundamental3'
tenant_id = "72f988bf-86f1-41af-91ab-2d7cd011db47"
# for loading module
namespace = workspace_name
experiment_name = 'fasttext_with_one_training_process'

# Sign in against the explicit tenant and hand that credential to the workspace,
# so the workspace is resolved with the tenant chosen here rather than with
# whatever default credential happens to be cached on the machine.
interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id)
workspace = Workspace(subscription_id, resource_group, workspace_name, auth=interactive_auth)

# Sanity check: show which workspace was resolved and which compute targets it exposes.
print(workspace.name, workspace.resource_group, workspace.location, workspace.subscription_id,
      workspace.compute_targets.keys(), sep='\n')

fundamental3
fundamental
eastasia
4f455bd0-f95a-4b7d-8d08-078611508e0b
dict_keys(['myaks2', 'aml-compute', 'my-compute', 'compute-deploy', 'aml-compute-tmp'])


In [3]:
# Reuse the named AmlCompute cluster when it already exists; otherwise provision it.
aml_compute_name = 'aml-compute'
try:
    aml_compute = AmlCompute(workspace, aml_compute_name)
    print("Found existing compute target: {}".format(aml_compute_name))
except ComputeTargetException:
    # Only a failed compute lookup should trigger provisioning. Authentication or
    # network errors (and KeyboardInterrupt) must propagate instead of silently
    # attempting to create a new cluster.
    print("Creating new compute target: {}".format(aml_compute_name))

    provisioning_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2", min_nodes=1, max_nodes=4)
    aml_compute = ComputeTarget.create(workspace, aml_compute_name, provisioning_config)
    # Block until provisioning finishes (or 20 minutes elapse) so later cells can use the cluster.
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Found existing compute target: aml-compute


In [4]:
# load data
dataset_name = "THUNews"
char2index_name = 'CharToIndex'

# Register the news corpus on first use; later runs pick up the registered dataset.
if dataset_name not in workspace.datasets:
    web_path = ['https://datastore4fasttext.blob.core.windows.net/data/dataset/']
    data = Dataset.File.from_files(path=web_path)
    # The description is built with implicit string concatenation: a backslash
    # continuation inside the literal would embed the next line's indentation
    # into the registered text.
    data.register(workspace=workspace, name=dataset_name,
                  description=('THUCNews dataset is generated by filtering '
                               'and filtering historical data of Sina News RSS subscription channel '
                               'from 2005 to 2011'),
                  create_new_version=True)
dataset = workspace.datasets[dataset_name]

# Same pattern for the character-to-index mapping consumed by the fasttext modules.
if char2index_name not in workspace.datasets:
    print('Registering CharToIndex for fasttext pipeline ...')
    path = ['https://datastore4fasttext.blob.core.windows.net/data/map/']
    data = Dataset.File.from_files(path=path)
    data.register(workspace=workspace, name=char2index_name,
                  description='The mapping relationship between character and index ')
    print('Registered')
char2index = workspace.datasets[char2index_name]

In [5]:
# load module
def _load_module_func(module_name):
    """Load the module spec stored at '<module_name>/<module_name>.spec.yaml'."""
    return Module.from_yaml(workspace, '{0}/{0}.spec.yaml'.format(module_name))


# Each module lives in a folder named after it, next to this notebook.
split_data_txt_module_func = _load_module_func('split_data_txt')
fasttext_train_module_func = _load_module_func('fasttext_train')
fasttext_evaluation_module_func = _load_module_func('fasttext_evaluation')
fasttext_score_module_func = _load_module_func('fasttext_score')
compare_two_models_module_func = _load_module_func('compare_two_models')

In [6]:
@dsl.pipeline(name='training_pipeline', description='A sub pipeline including data processing/train/evaluation',
              default_compute_target=aml_compute_name)
def training_pipeline(epochs):
    """Sub-pipeline: split the dataset, train a fasttext model, then evaluate it.

    Args:
        epochs: Forwarded unchanged to the training module's ``epochs`` input.

    Returns:
        A dict merging the evaluation module's outputs with the training
        module's outputs. On a key collision the training output wins, because
        it is unpacked last.
    """
    # Split the registered dataset; the ratios are fixed here rather than exposed as parameters.
    # NOTE(review): whatever is not training (0.1) or validation (0.2) presumably
    # becomes test data — confirm against the split module's spec.
    split_data_txt = split_data_txt_module_func(
        input_dir=dataset,
        training_data_ratio=0.1,
        validation_data_ratio=0.2
    )
    # Train on the training/validation splits using the shared char-to-index mapping.
    fasttext_train = fasttext_train_module_func(
        training_data_dir=split_data_txt.outputs.training_data_output,
        validation_data_dir=split_data_txt.outputs.validation_data_output,
        char2index_dir=char2index,
        epochs=epochs
    )

    # Evaluate the trained model on the test split produced by the same split step.
    fasttext_evaluation = fasttext_evaluation_module_func(
        trained_model_dir=fasttext_train.outputs.trained_model_dir,
        test_data_dir=split_data_txt.outputs.test_data_output,
        char2index_dir=char2index
    )

    # Expose both modules' outputs so a parent pipeline can wire them into later steps.
    return {**fasttext_evaluation.outputs, **fasttext_train.outputs}

In [7]:
@dsl.pipeline(name='dummy_automl_pipeline',
              description='A dummy pipeline that trains two models and output the better one',
              default_compute_target=aml_compute_name)
def dummy_automl_pipeline():
    """Train two models with different epoch counts, compare them, and score a sentence.

    Runs ``training_pipeline`` twice (epochs=1 and epochs=2), feeds both trained
    models and their test results into the comparison module, then passes the
    model the comparison module emits as ``the_better_model`` to the scoring
    module together with a fixed sample sentence.

    Returns:
        A dict merging the comparison module's outputs with the scoring
        module's outputs. On a key collision the scoring output wins, because
        it is unpacked last.
    """
    # Fixed sample input for the scoring step (roughly: "Affected by the epidemic,
    # many students have to attend classes at home").
    sentence = '受疫情影响, 很多学生不得不在家上课'
    # Two independent runs of the training sub-pipeline, differing only in epoch count.
    train_and_evalute_model1 = training_pipeline(epochs=1)
    train_and_evalute_model2 = training_pipeline(epochs=2)
    # Hand both models and their evaluation results to the comparison module.
    compare = compare_two_models_module_func(
        first_trained_model=train_and_evalute_model1.outputs.trained_model_dir,
        first_trained_result=train_and_evalute_model1.outputs.model_testing_result,
        second_trained_model=train_and_evalute_model2.outputs.trained_model_dir,
        second_trained_result=train_and_evalute_model2.outputs.model_testing_result
    )

    # Score the sample sentence with the model selected by the comparison step.
    fasttext_score = fasttext_score_module_func(
        input_sentence=sentence,
        fasttext_model_dir=compare.outputs.the_better_model,
        char2index_dir=char2index
    )
    return {**compare.outputs, **fasttext_score.outputs}


In [8]:
# Build the pipeline object by calling the decorated pipeline function.
pipeline = dummy_automl_pipeline()
# Optional (disabled): presumably saves the pipeline under the experiment without running it — confirm.
# pipeline.save(experiment_name=experiment_name)

In [9]:
# Validate the assembled pipeline before submitting it; the call's result is shown as the cell output.
pipeline.validate()

<IPython.core.display.Javascript object>

ValidateView(container_id='container_id_4b83c5e7-716c-48fc-9baa-194a764c47af_widget', env_json='{"subscription…

{'result': 'validation passed', 'errors': []}

In [10]:
# Submit the pipeline under the configured experiment name and keep the run handle.
# NOTE(review): regenerate_outputs=True presumably forces every step to rerun
# instead of reusing previous outputs — confirm against the SDK docs.
pipeline_run = pipeline.submit(experiment_name=experiment_name, regenerate_outputs=True)
# Uncomment to block the notebook until the submitted run has finished.
# pipeline_run.wait_for_completion()

Submitted PipelineRun ba4df1a2-776e-441b-8ec7-07649c1638b5
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/fasttext_with_one_training_process/runs/ba4df1a2-776e-441b-8ec7-07649c1638b5?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental3
