In [1]:
import inspect

from azureml.core import Workspace, Dataset
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.core.graph import DataType
from azureml.pipeline.wrapper import Module, dsl, Pipeline

In [2]:
# Coordinates of the Azure ML workspace this notebook attaches to.
subscription_id = '4f455bd0-f95a-4b7d-8d08-078611508e0b'
resource_group = 'fundamental'
workspace_name = 'fundamental3'
# Modules are loaded later under a namespace equal to the workspace name.
namespace = workspace_name
# set this if you have multiple tenant
tenant_id = "72f988bf-86f1-41af-91ab-2d7cd011db47"
interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id)

# Hand the tenant-scoped login to the workspace explicitly; without `auth=`
# the object created above is never used by the Workspace.
workspace = Workspace(subscription_id, resource_group, workspace_name, auth=interactive_auth)
# One value per line: name, resource group, location, subscription, compute target names.
print(workspace.name, workspace.resource_group, workspace.location, workspace.subscription_id, workspace.compute_targets.keys(), sep='\n')

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


fundamental3
fundamental
eastasia
4f455bd0-f95a-4b7d-8d08-078611508e0b
dict_keys(['aml-compute'])


In [3]:
# Attach to the named compute cluster, creating it only when it does not exist.
print(workspace.compute_targets)
aml_compute_name = 'aml-compute'
try:
    aml_compute = AmlCompute(workspace, aml_compute_name)
    print("Found existing compute target: {}".format(aml_compute_name))
except ComputeTargetException:
    # Only a compute-target lookup failure should trigger provisioning;
    # auth/network errors and KeyboardInterrupt must propagate instead of
    # silently starting a new cluster.
    print("Creating new compute target: {}".format(aml_compute_name))

    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=1,
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(workspace, aml_compute_name, provisioning_config)
    # Block until provisioning finishes (or the 20-minute timeout elapses).
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print(aml_compute)

{'aml-compute': AmlCompute(workspace=Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental3/computes/aml-compute, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None)}
Found existing compute target: aml-compute
AmlCompute(workspace=Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental'), name=aml-compute, id=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental3/computes/aml-compute, type=AmlCompute, provisioning_state=Succeeded, location=eastasia, tags=None)


In [4]:
# Register the custom data types used by this pipeline's modules:
# one directory-shaped type and one single-file type.
# (Per the original author's note, repeated calls won't register repeatedly.)
DataType.create_data_type(
    workspace=workspace,
    name='MyDirectory',
    description='',
    is_directory=True,
)
# Kept as the last expression so the notebook still displays the returned DataType.
DataType.create_data_type(
    workspace=workspace,
    name='MyFile',
    description='',
    is_directory=False,
)

<azureml.pipeline.core.graph.DataType at 0x1884db798d0>

In [5]:
# load data
# Each dataset is registered in the workspace on first use and afterwards
# fetched by name, so re-running this cell does not register it again.
dataset_name = 'THUCNews_TXT'
char2index_name = 'Char2Index_JSON'

if dataset_name not in workspace.datasets:
    print('Registering a THUCNews dataset for fasttext pipeline ...')
    thucnews_path = ['https://datastore4fasttext.blob.core.windows.net/mytest3/THUCNews.txt']
    thucnews_files = Dataset.File.from_files(path=thucnews_path)
    thucnews_files.register(workspace=workspace, name=dataset_name, description='THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011')
    print('Registerd')
data = workspace.datasets[dataset_name]

if char2index_name not in workspace.datasets:
    print('Registering a Char2Index_JSON for fasttext pipeline ...')
    char2index_path = ['https://datastore4fasttext.blob.core.windows.net/mytest3/character2index.json']
    # Use a dedicated name here: assigning to `data` would replace the THUCNews
    # handle above and feed the char2index file into the pipeline as its corpus.
    char2index_files = Dataset.File.from_files(path=char2index_path)
    char2index_files.register(workspace=workspace, name=char2index_name, description='The mapping relationship between character and index ')
    print('Registerd')
char2index = workspace.datasets[char2index_name]

print(data)
print(char2index)

FileDataset
{
  "source": [
    "https://datastore4fasttext.blob.core.windows.net/mytest3/THUCNews.txt"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "9e16ea04-3074-4f84-8a8c-83adb226c4ae",
    "name": "THUCNews_TXT",
    "version": 1,
    "description": "THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011",
    "workspace": "Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}
FileDataset
{
  "source": [
    "https://datastore4fasttext.blob.core.windows.net/mytest3/character2index.json"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "9de9b550-46a4-41ce-b1b4-54b6c665fcf3",
    "name": "Char2Index_JSON",
    "version": 1,
    "description": "The mapping relationship between character and index ",
    "workspace": "Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-

In [6]:
# load module
def _load_or_register_module(name, yaml_file, label):
    """Return the module function named `name`, registering it if loading fails.

    Args:
        name: Display name of the module to load from `namespace` in `workspace`.
        yaml_file: Path of the module spec YAML used for registration when
            loading fails.
        label: Short identifier used only in the progress messages.

    Returns:
        Whatever `Module.load` or `Module.register` returns; later cells call
        it with keyword arguments to create a pipeline node.
    """
    try:
        module_func = Module.load(workspace=workspace, namespace=namespace, name=name)
    except Exception:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit are not mistaken for "module not found".
        print('not found {}, register it now...'.format(label))
        return Module.register(workspace=workspace, yaml_file=yaml_file)
    print('found {}'.format(label))
    return module_func


split_data_txt_module_func = _load_or_register_module(
    'Split Data Txt', 'split_data_txt/split_data_txt.spec.yaml', 'split_data_txt_module')
fasttext_train_module_func = _load_or_register_module(
    'FastText Train', 'fasttext_train/fasttext_train.spec.yaml', 'fasttext_train_module')
fasttext_score_module_func = _load_or_register_module(
    'FastText Score', 'fasttext_score/fasttext_score.spec.yaml', 'fasttext_score_module')
compare2model_module_func = _load_or_register_module(
    'Compare Two Models', 'compare2model/compare2model.spec.yaml', 'compare2model_module')
fasttext_predict_module_func = _load_or_register_module(
    'FastText Predict', 'fasttext_predict_module/fasttext_predict.spec.yaml', 'fasttext_predict_module')

# inspect signature
# Print each module function's call signature so the keyword names used when
# wiring the pipeline can be checked against it.
for module_func in (
    split_data_txt_module_func,
    fasttext_train_module_func,
    fasttext_score_module_func,
    compare2model_module_func,
    fasttext_predict_module_func,
):
    print(inspect.signature(module_func))

found split_data_txt_module
found fasttext_train_module
found fasttext_score_module
found compare2model_module
found fasttext_predict_module
(input_dir:'Input Dir'=None, training_data_ratio:'Training Data Ratio'='0.7', validation_data_ratio:'Validation Data Ratio'='0.1', random_split:'Random Split'='False', seed:'Seed'='0')
(training_data_dir:'Training Data Dir'=None, validation_data_dir:'Validation Data Dir'=None, char2_index_dir:'Char2Index Dir'=None, epochs:'Epochs'='2', batch_size:'Batch Size'='32', learning_rate:'Learning Rate'='0.0005', embedding_dim:'Embedding Dim'='128')
(trained_model_dir:'Trained Model Dir'=None, test_data_dir:'Test Data Dir'=None, char2_index_dir:'Char2Index Dir'=None)
(first_trained_model:'First Trained Model'=None, first_trained_result:'First Trained Result'=None, second_trained_model:'Second Trained Model'=None, second_trained_result:'Second Trained Result'=None)
(fasttext_model:'Fasttext Model'=None, char2_index_dir:'Char2Index Dir'=None, input_sentence:

In [7]:
# connect module
# Each call below creates one pipeline node; passing one node's output as
# another node's input is what links them together.

# Node 1: split the registered text dataset.
split_data_txt = split_data_txt_module_func(
    input_dir=data,
    training_data_ratio=0.7,
    validation_data_ratio=0.1,
    random_split=False,
    seed=1,
)
print(split_data_txt.outputs)
split_outputs = split_data_txt.outputs

# Node 2: train on the training/validation outputs of the split.
fasttext_train = fasttext_train_module_func(
    training_data_dir=split_outputs.training_data_output,
    validation_data_dir=split_outputs.validation_data_output,
    char2_index_dir=char2index,
    epochs=1,
    batch_size=64,
    learning_rate=0.0005,
    embedding_dim=128,
)
print(fasttext_train.outputs)
trained_model = fasttext_train.outputs.trained_model_dir

# Node 3: score the trained model on the test output of the split.
fasttext_score = fasttext_score_module_func(
    trained_model_dir=trained_model,
    test_data_dir=split_outputs.test_data_output,
    char2_index_dir=char2index,
)

# Node 4: run the trained model on a single example sentence.
sentence = '受疫情影响, 今年很多学生在家里待了半年'
fasttext_predict = fasttext_predict_module_func(
    input_sentence=sentence,
    fasttext_model=trained_model,
    char2_index_dir=char2index,
)
print(fasttext_predict.outputs)

{'training_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001884DB6AEB8>, 'validation_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001884E903E48>, 'test_data_output': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001884E903EB8>}
{'trained_model_dir': <azureml.pipeline.wrapper._module._OutputBuilder object at 0x000001884E92A198>}
{}


In [8]:
# pipeline
# Gather the four nodes into one pipeline; the compute target named here is
# passed as the pipeline's default.
pipeline_nodes = [split_data_txt, fasttext_train, fasttext_score, fasttext_predict]
pipeline = Pipeline(
    nodes=pipeline_nodes,
    workspace=workspace,
    default_compute_target=aml_compute_name,
)

In [9]:
# validate
# Check the assembled pipeline before submitting it. The call is the cell's
# last expression, so the notebook displays the returned result.
pipeline.validate()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SupportDetectView()

{'result': 'validation passed', 'errors': []}

In [10]:
# run
# Submit the pipeline, block until the run finishes, then save the pipeline
# under the same experiment name.
experiment_name = 'fasttext_with_one_training_process'
run = pipeline.submit(experiment_name=experiment_name)
run.wait_for_completion()
pipeline.save(experiment_name=experiment_name)



Submitted PipelineRun 9946ae68-98c1-4866-9b02-5e592faa1eed
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/fasttext_with_one_training_process/runs/9946ae68-98c1-4866-9b02-5e592faa1eed?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental3
PipelineRunId: 9946ae68-98c1-4866-9b02-5e592faa1eed
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/fasttext_with_one_training_process/runs/9946ae68-98c1-4866-9b02-5e592faa1eed?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental3
PipelineRun Status: Running


StepRunId: 42cb7c9f-fb09-444c-87fb-4c83b7419e03
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/fasttext_with_one_training_process/runs/42cb7c9f-fb09-444c-87fb-4c83b7419e03?wsid=/subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourcegroups/fundamental/workspaces/fundamental3

StepRun(FastText Train) Execution Summa

Name,Id,Details page,Pipeline type,Updated on,Created by,Tags
Pipeline-Created-on-6-25-2020,a7d064b7-f217-4e47-a73e-b0eb68fbaf7c,Link,TrainingPipeline,"June 25, 2020 06:00 PM",Xiaoyu Yang,azureml.Designer: true

0
azureml.Designer: true
