In [1]:
from azureml.pipeline.wrapper import dsl
from fasttext_pipeline_utils import choose_workspace, choose_compute_target, load_dataset, load_module

In [2]:
# Choose the Azure ML workspace this notebook runs against.
# NOTE(review): the subscription and tenant IDs are hardcoded here; consider
# reading them from environment variables or a config file before sharing.
subscription_id = '4f455bd0-f95a-4b7d-8d08-078611508e0b'
resource_group = 'fundamental'
workspace_name = 'fundamental3'
# Set this if your account has access to multiple tenants.
tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47"
# choose_workspace is a helper imported from fasttext_pipeline_utils; the
# returned object is passed as `workspace` to the later cells of this notebook.
# Presumably it authenticates and fetches the workspace -- confirm in the helper.
workspace=choose_workspace(subscription_id, resource_group, workspace_name, tenant_id)

name: fundamental3
resource_group fundamental
location eastasia
subscription_id 4f455bd0-f95a-4b7d-8d08-078611508e0b
compute_targets dict_keys(['myaks2', 'aml-compute', 'my-compute', 'compute-deploy'])


In [3]:
# Choose a compute target by name.
# The name string (not the returned object) is what the pipeline decorator
# receives as its default_compute_target.
aml_compute_name='aml-compute'
# choose_compute_target is a helper imported from fasttext_pipeline_utils.
aml_compute = choose_compute_target(workspace=workspace, name=aml_compute_name)

Found existing compute target: aml-compute
AmlCompute(workspace=Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental3/computes/aml-compute, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None)


In [4]:
# load data
# Two datasets feed the pipeline: the news corpus itself and the
# character-to-index mapping shared by the training and evaluation steps.
data = load_dataset(
    name='THUCNews',
    path=['https://datastore4fasttext.blob.core.windows.net/data/dataset/'],
    description='THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011',
    workspace=workspace,
)

char2index = load_dataset(
    name='Char2Index',
    path=['https://datastore4fasttext.blob.core.windows.net/data/map/'],
    description='The mapping relationship between character and index',
    workspace=workspace,
)

# Echo the descriptions as a quick sanity check that both handles are usable.
print('data:', data.description)
print('char2index:', char2index.description)

Successfully loaded THUCNews
Successfully loaded Char2Index
data: THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011
char2index: The mapping relationship between character and index


In [5]:
# load module
namespace = workspace.name

# (module display name, path to its spec YAML), listed in the order they are loaded.
module_specs = (
    ('Split Data Txt', 'split_data_txt/split_data_txt.spec.yaml'),
    ('FastText Train', 'fasttext_train/fasttext_train.spec.yaml'),
    ('FastText Evaluation', 'fasttext_evaluation/fasttext_evaluation.spec.yaml'),
    ('FastText Score', 'fasttext_score/fasttext_score.spec.yaml'),
)

# A plain for-loop (rather than a comprehension) keeps `name` and
# `yaml_file_path` bound at module level after the cell has run.
module_funcs = []
for name, yaml_file_path in module_specs:
    module_funcs.append(load_module(workspace, namespace, name, yaml_file_path))

# Unpack in the same order as module_specs.
(
    split_data_txt_module_func,
    fasttext_train_module_func,
    fasttext_evaluation_module_func,
    fasttext_score_module_func,
) = module_funcs

found the module of Split Data Txt
found the module of FastText Train
found the module of FastText Evaluation
found the module of FastText Score


In [6]:
# construct pipeline
@dsl.pipeline(
    name='test deploy',
    description='Test Deploy',
    default_compute_target=aml_compute_name,
)
def training_pipeline(epochs):
    """Wire up the split -> train -> evaluate steps of the FastText pipeline.

    Args:
        epochs: Passed straight through to the FastText Train module as its
            ``epochs`` input.

    Returns:
        A dict that merges the evaluation step's outputs with the training
        step's outputs; on a key clash the training step's entry wins.
    """
    # Step 1: split the raw dataset. Ratios of 0.1/0.1 are given for the
    # training and validation portions; the remainder presumably goes to the
    # test output -- TODO confirm against the module spec.
    split_data_txt = split_data_txt_module_func(
        input_dir=data,
        training_data_ratio=0.1,
        validation_data_ratio=0.1,
        random_split=True,
        seed=9,
    )

    # Step 2: train on the training/validation outputs of the split step.
    fasttext_train = fasttext_train_module_func(
        training_data_dir=split_data_txt.outputs.training_data_output,
        validation_data_dir=split_data_txt.outputs.validation_data_output,
        char2index_dir=char2index,
        epochs=epochs,
        batch_size=64,
        learning_rate=0.0005,
        embedding_dim=128,
    )

    # Step 3: evaluate the trained model on the test output of the split step.
    fasttext_evaluation = fasttext_evaluation_module_func(
        test_data_dir=split_data_txt.outputs.test_data_output,
        trained_model_dir=fasttext_train.outputs.trained_model_dir,
        char2index_dir=char2index,
    )

    return {
        **fasttext_evaluation.outputs,
        **fasttext_train.outputs,
    }


In [7]:
# Instantiate the pipeline with epochs=1 (presumably a quick smoke run rather
# than a full training -- confirm before using the results).
pipeline = training_pipeline(epochs=1)

In [8]:
# visualization
# pipeline.validate()

In [9]:
# pipeline_draft = pipeline.save(experiment_name='my test')
# pipeline_draft

In [10]:
# Submit the pipeline under the 'deploy' experiment and wait for it to finish.
run = pipeline.submit(experiment_name='deploy')
run.wait_for_completion()
# Bare last expression so the notebook displays the run object's summary.
run


Submitted PipelineRun b7a58aa2-9ff7-4410-a36d-8d96ece7736f
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/deploy/runs/b7a58aa2-9ff7-4410-a36d-8d96ece7736f?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental3
PipelineRunId: b7a58aa2-9ff7-4410-a36d-8d96ece7736f
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/deploy/runs/b7a58aa2-9ff7-4410-a36d-8d96ece7736f?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental3


<IPython.core.display.Javascript object>

ValidateView(container_id='container_id_a10c9c17-2cf2-48e6-82c3-a7f5302f9fe3_widget', env_json='{}', graph_jso…

Experiment,Id,Type,Status,Details Page,Docs Page
deploy,b7a58aa2-9ff7-4410-a36d-8d96ece7736f,azureml.PipelineRun,Completed,Link to Azure Machine Learning studio,Link to Documentation
