In [14]:
from azureml.core import Workspace, Datastore, Dataset

# Connect to the Azure ML workspace described by the locally stored workspace
# configuration; every later cell uses `ws` as its entry point to the service.
ws = Workspace.from_config()

In [8]:
# Fetch the workspace's default datastore. Later cells upload the raw CSV to it
# and build the tabular dataset from it.
ds_def = Datastore.get_default(ws) # Get workspace blob store

In [17]:
# Copy the local diabetes CSV into the 'dia-data/' folder of the default datastore.
# When the target file already exists the upload is skipped rather than replaced,
# as the recorded output of this cell shows.
ds_def.upload_files(
    files=['data/diabetes.csv'],
    target_path='dia-data/',
    show_progress=True,
)

Uploading an estimated of 1 files
Target already exists. Skipping upload for dia-data/diabetes.csv
Uploaded 0 files


$AZUREML_DATAREFERENCE_e510e4d414344af4b117032480f3cdf1

In [18]:
# Define a tabular dataset over the CSV at 'dia-data/diabetes.csv' on the default
# datastore. This only creates the dataset object; it is registered in the
# workspace by the next cell.
tab_ds = Dataset.Tabular.from_delimited_files(path=(ds_def, 'dia-data/diabetes.csv'))

In [21]:
# Register the tabular dataset in the workspace under the name 'diab_dataset'.
# create_new_version=True allows registering again under the same name as a new
# version instead of failing on the existing one.
# The call is left as the cell's last expression so the registration summary is shown.
tab_ds.register(
    workspace=ws,
    name='diab_dataset',
    description='diabetes data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

{
  "source": [
    "('workspaceblobstore', 'dia-data/diabetes.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "c6830310-2400-40f1-8578-1cd942f89e34",
    "name": "diab_dataset",
    "version": 1,
    "description": "diabetes data",
    "tags": {
      "format": "CSV"
    },
    "workspace": "Workspace.create(name='ML-2s-sandbox', subscription_id='08265842-251e-450c-8d28-a06ee3f3c611', resource_group='DataSandbox')"
  }
}

In [None]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.estimator import Estimator
from azureml.core.runconfig import RunConfiguration

# Name of the Azure ML compute target that both estimators and both pipeline
# steps run on. (This is the cluster name, not an output folder.)
compute_target = 'DS-Ass-Cluster'

# Intermediate pipeline data lives on the workspace's default datastore.
# Look it up once and reuse it for every PipelineData object.
default_datastore = ws.get_default_datastore()

# Hand-off location between the two steps: produced by step1, consumed by step2.
prepped_data = PipelineData(name='prepped_data', datastore=default_datastore)

# Pipeline data object named 'model'.
# NOTE(review): it is not attached to any step in this cell, so nothing is
# written to it here — confirm whether step2 was meant to output to it.
model_pipe = PipelineData(name='model', datastore=default_datastore)


def _build_estimator(entry_script):
    """Create an Estimator for a script in the 'pipeline_script' directory.

    Both pipeline steps share the same source directory, compute target and
    package environment; only the entry script differs. Building them through
    one helper keeps the two configurations from drifting apart.

    Args:
        entry_script: File name of the script, relative to 'pipeline_script'.

    Returns:
        A configured ``Estimator`` instance.
    """
    return Estimator(source_directory='pipeline_script',
                     entry_script=entry_script,
                     compute_target=compute_target,
                     conda_packages=['scikit-learn', 'matplotlib'],
                     pip_packages=['azureml-dataprep[pandas]'])


# Estimator for the data-preparation script.
estimator = _build_estimator('preprocess_data.py')

# Estimator for the model-training script.
estimator2 = _build_estimator('diabetes_experiment.py')

# First step: runs preprocess_data.py with the tabular dataset exposed as the
# named input 'raw_data', and passes the prepped_data location to the script via
# --output-folder. (Per the original author's note, the preprocessing just
# removes a column.)
step1 = EstimatorStep(name='prep data',
                      estimator=estimator,
                      compute_target=compute_target,
                      inputs=[tab_ds.as_named_input('raw_data')],
                      outputs=[prepped_data],
                      estimator_entry_script_arguments=['--output-folder', prepped_data])

# Second step: runs diabetes_experiment.py, which receives the prepped_data
# location via --input-folder. Listing prepped_data as an input ties this step
# to the data that step1 outputs.
step2 = EstimatorStep(name='train model',
                      estimator=estimator2,
                      compute_target=compute_target,
                      inputs=[prepped_data],
                      estimator_entry_script_arguments=['--input-folder', prepped_data])

In [58]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Assemble the two steps into a pipeline bound to the workspace.
pipe_steps = [step1, step2]
pipe = Pipeline(workspace=ws, steps=pipe_steps)

# Submit the pipeline under the 'diabetes-test-pipeline' experiment.
# regenerate_outputs=True makes every step run and produce new outputs, as the
# "This step will run and generate new outputs" lines in the recorded output show.
experiment = Experiment(workspace=ws, name='diabetes-test-pipeline')
pipeline_run = experiment.submit(pipe, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Block until the run finishes, streaming its logs into the cell output.
# Left as the last expression so the final status is displayed.
pipeline_run.wait_for_completion(show_output=True)

Created step prep data [26eb2e10][50af0a44-acf7-435e-8c28-d5c356a8def3], (This step will run and generate new outputs)
Created step train model [c7b16640][fec89449-8b51-4478-93a7-dca54c311451], (This step will run and generate new outputs)
Submitted PipelineRun 3ff91f37-a11d-4d57-a9c5-df44797d7b90
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/diabetes-test-pipeline/runs/3ff91f37-a11d-4d57-a9c5-df44797d7b90?wsid=/subscriptions/08265842-251e-450c-8d28-a06ee3f3c611/resourcegroups/DataSandbox/workspaces/ML-2s-sandbox
Pipeline submitted for execution.
PipelineRunId: 3ff91f37-a11d-4d57-a9c5-df44797d7b90
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/diabetes-test-pipeline/runs/3ff91f37-a11d-4d57-a9c5-df44797d7b90?wsid=/subscriptions/08265842-251e-450c-8d28-a06ee3f3c611/resourcegroups/DataSandbox/workspaces/ML-2s-sandbox
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: f9a75782-f51a-43c1-a237-b5bb3c504167
Link to Azure 

'Finished'

In [64]:
# Display the run object so the notebook renders its summary
# (experiment, run id, type, status and links).
pipeline_run

Experiment,Id,Type,Status,Details Page,Docs Page
diabetes-test-pipeline,3ff91f37-a11d-4d57-a9c5-df44797d7b90,azureml.PipelineRun,Completed,Link to Azure Machine Learning studio,Link to Documentation
