In [1]:
import inspect

from azureml.core import Workspace, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.wrapper import Module, dsl, Pipeline

In [2]:
# Coordinates of the Azure ML workspace this notebook runs against.
# Edit these three values to target a different workspace.
subscription_id = '4f455bd0-f95a-4b7d-8d08-078611508e0b'
resource_group = 'fundamental'
workspace_name = 'fundamental'

# Namespace passed to Module.load when looking up registered modules.
namespace = 'fundamental'

# Connect to the workspace (may prompt for interactive login).
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Sanity check: one line per property, ending with the compute target names.
print(
    workspace.name,
    workspace.resource_group,
    workspace.location,
    workspace.subscription_id,
    workspace.compute_targets.keys(),
    sep='\n',
)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


fundamental
fundamental
eastus
4f455bd0-f95a-4b7d-8d08-078611508e0b
dict_keys(['fundamental-d3v2', 'default', 'cpu-cluster', 'aml-compute'])


In [3]:
# Get the AmlCompute cluster used to run the pipeline, creating it if the
# workspace does not have one with this name yet.
print(workspace.compute_targets)
aml_compute_name = 'aml-compute'
try:
    aml_compute = AmlCompute(workspace, aml_compute_name)
    print("Found existing compute target: {}".format(aml_compute_name))
except ComputeTargetException:
    # Only a compute-target lookup failure should trigger provisioning.
    # A bare `except:` would also swallow KeyboardInterrupt and unrelated
    # auth/network errors and then try to create a cluster anyway.
    print("Creating new compute target: {}".format(aml_compute_name))

    # NOTE(review): min_nodes=1 keeps one node allocated even when idle;
    # confirm that is intended before reusing this config elsewhere.
    provisioning_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                                min_nodes=1,
                                                                max_nodes=4)
    aml_compute = ComputeTarget.create(workspace, aml_compute_name, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print(aml_compute)

{'fundamental-d3v2': id: /subscriptions/4f455bd0-f95a-4b7d-8d08-078611508e0b/resourceGroups/fundamental/providers/Microsoft.MachineLearningServices/workspaces/fundamental/computes/fundamental-d3v2,
name: fundamental-d3v2,
location: eastus,
tags: None,
properties: {'description': None, 'computeType': 'ComputeInstance', 'computeLocation': 'eastus', 'provisioningErrors': None, 'properties': {'vmSize': 'STANDARD_D3_V2', 'applications': [{'displayName': 'Jupyter', 'endpointUri': 'https://fundamental-d3v2.eastus.instances.azureml.net'}, {'displayName': 'Jupyter Lab', 'endpointUri': 'https://fundamental-d3v2.eastus.instances.azureml.net/lab'}, {'displayName': 'RStudio', 'endpointUri': 'https://fundamental-d3v2-8787.eastus.instances.azureml.net'}], 'connectivityEndpoints': {'publicIpAddress': '20.185.101.4', 'privateIpAddress': '10.0.0.4'}, 'sshSettings': {'sshPublicAccess': 'Disabled', 'adminUserName': 'azureuser', 'adminPublicKey': None, 'sshPort': 4000}, 'subnet': {'id': None}, 'errors': No

In [4]:
# Load the two input datasets, registering each in the workspace on first use.
#
# `data`       -> the THUCNews text corpus (input to the split module)
# `char2index` -> the character-to-index mapping JSON
dataset_name = 'THUCNews_TXT'
char2index_name = 'Char2Index_JSON'

if dataset_name not in workspace.datasets:
    print('Registering a THUCNews dataset for fasttext pipeline ...')
    path = ['https://datastore4fasttext.file.core.windows.net/data4fasttext/THUCNews.txt']
    thucnews_files = Dataset.File.from_files(path=path)
    thucnews_files.register(workspace=workspace, name=dataset_name, description='THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011')
    print('Registerd')
data = workspace.datasets[dataset_name]

if char2index_name not in workspace.datasets:
    print('Registering a Char2Index_JSON for fasttext pipeline ...')
    path = ['https://datastore4fasttext.file.core.windows.net/data4fasttext/character2index.json']
    # Use a dedicated temporary here. Assigning to `data` would silently
    # replace the THUCNews dataset handle with the char2index file, so the
    # pipeline would later receive the wrong `input_dir`.
    char2index_files = Dataset.File.from_files(path=path)
    char2index_files.register(workspace=workspace, name=char2index_name, description='The mapping relationship between character and index ')
    print('Registerd')
char2index = workspace.datasets[char2index_name]

print(data)
print(char2index)

FileDataset
{
  "source": [
    "('workspaceblobstore', 'UI/06-13-2020_021050_UTC/THUCNews.txt')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "d138049a-c5bb-40ef-b9ae-d2af9c3528de",
    "name": "THUCNews_TXT",
    "version": 1,
    "workspace": "Workspace.create(name='fundamental', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}
FileDataset
{
  "source": [
    "('workspaceblobstore', 'UI/06-13-2020_020957_UTC/character2index.json')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "c5e6e1ce-967b-4422-96ff-bb7dbe973381",
    "name": "Char2Index_JSON",
    "version": 1,
    "workspace": "Workspace.create(name='fundamental', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}


In [None]:
# Load the three pipeline modules, registering each from its YAML spec when it
# cannot be loaded from the workspace.
def _load_or_register_module(name, label, yaml_file):
    """Return the module function for `name`, registering it if loading fails.

    Uses the notebook-level `workspace` and `namespace` variables.

    Args:
        name: Registered module name passed to `Module.load`.
        label: Short identifier used only in the progress messages.
        yaml_file: Path of the module spec passed to `Module.register` when
            the module cannot be loaded.

    Returns:
        The object returned by `Module.load` or, on fallback, `Module.register`.
    """
    try:
        module_func = Module.load(workspace=workspace, namespace=namespace, name=name)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit are not mistaken for "module not found".
        # NOTE(review): the exact exception type raised by Module.load for a
        # missing module is not visible here; narrow this once confirmed.
        print('not found {}, register it now...'.format(label))
        return Module.register(workspace=workspace, yaml_file=yaml_file)
    print('found {}'.format(label))
    return module_func


split_data_txt_module_func = _load_or_register_module(
    name='Split Data Txt',
    label='split_data_txt_module',
    yaml_file='split_data_txt.spec.yaml',
)
fasttext_train_module_func = _load_or_register_module(
    name='FastText Train',
    label='fasttext_train_module',
    yaml_file='fasttext_train.spec.yaml',
)
fasttext_test_module_func = _load_or_register_module(
    name='FastText Test',
    label='fasttext_test_module',
    yaml_file='fasttext_test.spec.yaml',
)

# inspect signature: shows the parameters each module function accepts
print(inspect.signature(split_data_txt_module_func))
print(inspect.signature(fasttext_train_module_func))
print(inspect.signature(fasttext_test_module_func))

found split_data_txt_module


In [None]:
# Wire the modules together: split -> train -> test.

# Step 1: split the registered text dataset. 70% goes to training and 10% to
# validation (presumably the remainder becomes the test output; confirm
# against the module spec). The split is not randomised.
split_data_txt = split_data_txt_module_func(
    input_dir=data,
    training_data_ratio=0.7,
    validation_data_ratio=0.1,
    random_split=False,
    seed=1,
)
print(split_data_txt.outputs)

# Step 2: train FastText on the training/validation outputs of the split step.
fasttext_train = fasttext_train_module_func(
    training_data_dir=split_data_txt.outputs.training_data_output,
    validation_data_dir=split_data_txt.outputs.validation_data_output,
    char2_index_dir=char2index,
    epochs=1,
    batch_size=64,
    learning_rate=0.0005,
    embedding_dim=128,
)
print(fasttext_train.outputs)

# Step 3: evaluate the trained model on the test output of the split step.
fasttext_test = fasttext_test_module_func(
    trained_model_dir=fasttext_train.outputs.trained_model_dir,
    test_data_dir=split_data_txt.outputs.test_data_output,
    char2_index_dir=char2index,
)

In [None]:
# Assemble the three connected steps into one pipeline; steps run on the
# compute target named by `aml_compute_name` unless overridden.
pipeline = Pipeline(
    nodes=[split_data_txt, fasttext_train, fasttext_test],
    workspace=workspace,
    default_compute_target=aml_compute_name,
)

In [None]:
# Check the assembled pipeline before submitting it; the result is shown as
# this cell's output.
pipeline.validate()

In [None]:
# Submit the pipeline, wait for the run to finish, then save the pipeline
# under the same experiment name.
experiment_name = 'fasttext_test'

run = pipeline.submit(experiment_name=experiment_name)
run.wait_for_completion()
pipeline.save(experiment_name=experiment_name)