In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import os, shutil
import json

#  Azure Machine Learning Python SDK

In [None]:
import azureml.core

# Confirm the SDK imports cleanly and report the installed version.
print(f"Ready to use Azure ML {azureml.core.VERSION}")

# Connect to your workspace

In [None]:
from azureml.core import Workspace

# Load the workspace from the saved configuration; `ws` is reused by later cells.
ws = Workspace.from_config()
print(f"{ws.name} loaded")

# Run Experiments

In [None]:
from azureml.core import Experiment

# Experiment named "diabetes-experiment" in the loaded workspace
experiment = Experiment(
    workspace=ws,
    name="diabetes-experiment",
)

# Begin logging; `run` is the handle the following cells log to
run = experiment.start_logging()
print(f"Starting experiment: {experiment.name}")

### taking logs

In [None]:
# NOTE(review): `data`, `fig`, `col`, `keys` and `values` are not defined in any
# visible cell; they must be created by earlier cells before this one runs.

# Single value, list of values, and a figure
run.log('observations', len(data))
run.log_list('pregnancy categories', data.A.unique())
run.log_image(name='label distribution', plot=fig)

# One row per statistic, all logged under the `col` metric name
for index, stat_name in enumerate(keys):
    run.log_row(col, stat=stat_name, value=values[index])

### saving data and completing run

In [None]:
# Write a random 100-row sample to sample.csv, then attach that file to the run
# under outputs/sample.csv
sample_rows = data.sample(100)
sample_rows.to_csv('sample.csv', index=False, header=True)
run.upload_file(
    name='outputs/sample.csv',
    path_or_stream='./sample.csv',
)

# Mark the run as finished
run.complete()

# Run an experiment script

In [None]:
 # create a folder for the experiment files, and copy the data into it:
# NOTE(review): `folder_name` is not defined in any visible cell; it must be set earlier.
# The destination folder has to exist before copying, otherwise shutil.copy raises.
os.makedirs(folder_name, exist_ok=True)
shutil.copy('data/diabetes.csv', os.path.join(folder_name, "diabetes.csv"))

## create a Python script containing the code for our experiment, and save it in the experiment folder.
Note: This code creates the script - it doesn't run it!

In [None]:
%%writefile $folder_name/diabetes_experiment.py
from azureml.core import Run

# Get the experiment run context
run = Run.get_context()
##########script####################

# Complete the run
run.complete()

# Configure and submit the script-based experiment

In [None]:

from azureml.core import Experiment, ScriptRunConfig
from azureml.widgets import RunDetails

In [None]:

# Run configuration: which folder to send and which script in it to execute
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_experiment.py',
)

# Submit the configuration as a run of 'diabetes-experiment'
experiment = Experiment(workspace=ws, name='diabetes-experiment')
run = experiment.submit(config=script_config)

# Show the run widget, then block until the run has finished
RunDetails(run).show()
run.wait_for_completion()

# Create a training script

In [None]:
%%writefile $training_folder/diabetes_training.py

# Get the experiment run context
run = Run.get_context()

###############code########
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')
run.complete()

# Run the training script as an experiment

In [None]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

# Environment the training script will run in
sklearn_env = Environment("sklearn-env")

# pip requirements for the script: scikit-learn plus the Azure ML defaults
packages = CondaDependencies.create(
    pip_packages=['scikit-learn', 'azureml-defaults'],
)
sklearn_env.python.conda_dependencies = packages

In [None]:
# Run configuration: training folder, entry script, and the environment to run in
script_config = ScriptRunConfig(
    source_directory=training_folder,
    script='diabetes_training.py',
    environment=sklearn_env,
)

# Submit the configuration as a run of the 'diabetes-training' experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)

# Display the run widget in the notebook
RunDetails(run).show()

# Wait here until the run has completed
run.wait_for_completion()

In [None]:
# Print every metric logged by the run, then every file the run produced.
# Plain loops are used instead of list comprehensions: a comprehension used only
# for its print side effect builds a list of None values, and as the last
# expression of the cell that list would be shown as the cell output.
metrics = run.get_metrics()
for key, value in metrics.items():
    print(key, value)
for file_name in run.get_file_names():
    print(file_name)

# Create a parameterized training script

You can increase the flexibility of your training experiment by adding parameters to your script, enabling you to repeat the same training experiment with different settings. 

In [None]:
# NOTE(review): everything down to run.complete() looks like the body of the
# training script (diabetes_training.py) rather than notebook code - confirm, and
# if so move it under a %%writefile cell. `Run` and `argparse` are not imported in
# this cell, and parse_args() reads the command line of the running process.

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter
# --reg_rate is parsed as a float into args.reg and defaults to 0.01 when omitted
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg


#########code############
run.complete()

# Create a script config that passes --reg_rate 0.1 to diabetes_training.py
script_config = ScriptRunConfig(source_directory=training_folder, script='diabetes_training.py',
                                arguments = ['--reg_rate', 0.1],environment=sklearn_env) 

In [None]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")

# Ensure the required packages are installed (we need scikit-learn and Azure ML defaults)
packages = CondaDependencies.create(pip_packages=['scikit-learn', 'azureml-defaults'])
sklearn_env.python.conda_dependencies = packages

# Create a script config. This assignment replaces any earlier `script_config`,
# and it is the one the next cell submits, so the --reg_rate argument is passed
# here; without it the parameterized script would silently run with its default.
script_config = ScriptRunConfig(source_directory=training_folder,
                                script='diabetes_training.py',
                                arguments=['--reg_rate', 0.1],
                                environment=sklearn_env)

In [None]:

# Submit the current `script_config` as a run of the 'diabetes-training' experiment
experiment_name = 'diabetes-training'
experiment = Experiment(
    workspace=ws,
    name=experiment_name,
)
run = experiment.submit(config=script_config)

# Display the run widget in the notebook
RunDetails(run).show()

# Wait here until the run has completed
run.wait_for_completion()

# Register the trained model

Note that the outputs of the experiment include the trained model file (diabetes_model.pkl). 

In [None]:
from azureml.core import Model

# Fetch the run's metrics once and reuse them, instead of calling
# run.get_metrics() separately for each property.
run_metrics = run.get_metrics()

# Register the model file produced by the run, recording its AUC and accuracy
run.register_model(model_path='outputs/diabetes_model.pkl',
                   model_name='diabetes_model',
                   tags={'Training context': 'Script'},  # change the tags when registering a new version
                   properties={'AUC': run_metrics['AUC'], 'Accuracy': run_metrics['Accuracy']})

In [None]:
# For every model registered in the workspace, print its name and version,
# followed by each tag value and each property value
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        print(model.tags[tag_name])
    for prop_name in model.properties:
        print(model.properties[prop_name])


# View datastores

In [None]:
ws = Workspace.from_config()
default_ds = ws.get_default_datastore()

# Print each datastore name with a flag telling whether it is the default one.
# A plain loop avoids the list of None values that a print-only comprehension
# would leave as the cell output.
for ds_name in ws.datastores:
    print(ds_name, ds_name == default_ds.name)

## Upload data to a datastore

In [None]:
# Upload both diabetes CSV files from ./data into the 'diabetes-data/' folder of
# the default datastore, overwriting files that are already there
default_ds.upload_files(
    files=['./data/diabetes.csv', './data/diabetes2.csv'],
    target_path='diabetes-data/',
    overwrite=True,
    show_progress=True,
)

# Work with datasets
 Datasets can be tabular or file-based.

In [None]:
# Dataset is imported here because no earlier cell brings it into scope
from azureml.core import Dataset

# Create a tabular dataset from the path on the datastore (this may take a short while)
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

# Preview the first 20 rows as a pandas dataframe
tab_data_set.take(20).to_pandas_dataframe()

In [None]:
# Create a file dataset from the path on the datastore (this may take a short while)
file_data_set = Dataset.File.from_files(path=(default_ds, 'diabetes-data/*.csv'))

# Print the path of every file in the dataset. A plain loop avoids the list of
# None values that a print-only comprehension would leave as the cell output.
for file_path in file_data_set.to_path():
    print(file_path)

# Register datasets

In [None]:
# Register the tabular dataset as 'diabetes dataset' (a new version on each call)
tab_data_set = tab_data_set.register(
    workspace=ws,
    name='diabetes dataset',
    description='diabetes data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

# Register the file dataset as 'diabetes file dataset' (a new version on each call)
file_data_set = file_data_set.register(
    workspace=ws,
    name='diabetes file dataset',
    description='diabetes files',
    tags={'format': 'CSV'},
    create_new_version=True,
)

# Train a model from a tabular dataset
The dataset is passed to the script as a parameter (or argument). In the case of a tabular dataset, this argument will contain the ID of the registered dataset, so you could write code in the script to get the experiment's workspace from the run context, and then get the dataset using its ID, like this:

```python
run = Run.get_context()
ws = run.experiment.workspace
dataset = Dataset.get_by_id(ws, id=args.training_dataset_id)
diabetes = dataset.to_pandas_dataframe()
```

In [None]:

# Get the script arguments (regularization rate and training dataset ID)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')
args = parser.parse_args()

# Set regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate

# Get the experiment run context; the named input dataset is read from it below
run = Run.get_context()

# Load the dataset passed to the run under the name 'training_data'
diabetes = run.input_datasets['training_data'].to_pandas_dataframe()

The --input-data argument passes the dataset as a named input that includes a friendly name for the dataset, which is used by the script to read it from the input_datasets collection in the experiment run. The string value in the --input-data argument is actually the registered dataset's ID. As an alternative approach, you could simply pass diabetes_ds.id, in which case the script can access the dataset ID from the script arguments and use it to get the dataset from the workspace, but not from the input_datasets collection.

In [None]:
# pip requirements: scikit-learn, the Azure ML defaults, and Azure ML dataprep with pandas support
packages = CondaDependencies.create(
    pip_packages=['scikit-learn', 'azureml-defaults', 'azureml-dataprep[pandas]'],
)
sklearn_env.python.conda_dependencies = packages

# Look up the registered tabular dataset by name
diabetes_ds = ws.datasets.get("diabetes dataset")

# Run configuration: the dataset is passed as a named input called 'training_data'
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_training.py',
    arguments=[
        '--regularization', 0.1,  # Regularization rate parameter
        '--input-data', diabetes_ds.as_named_input('training_data'),  # Reference to dataset
    ],
    environment=sklearn_env,
)

# Train a model from a file dataset

When you're using a file dataset, the dataset argument passed to the script represents a mount point containing file paths.
For CSV files, you can use the Python glob module to create a list of the files in the virtual mount point defined by the dataset, and read them all into pandas dataframes that are concatenated into a single dataframe.

In [None]:
# Parse the script arguments: regularization rate and file dataset mount point
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument('--input-data', type=str, dest='dataset_folder', help='data mount point')
args = parser.parse_args()

# Regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate

# Experiment run context
run = Run.get_context()

# Load the diabetes dataset: read every CSV file found under the path of the
# 'training_files' named input and stack the frames into one dataframe
data_path = run.input_datasets['training_files']
all_files = glob.glob(data_path + "/*.csv")
frames = [pd.read_csv(csv_file) for csv_file in all_files]
diabetes = pd.concat(frames, sort=False)

You also need to change the way the dataset is passed to the script: it needs to define a path from which the script can read the files. You can use either the as_download or the as_mount method to do this. Using as_download causes the files in the file dataset to be downloaded to a temporary location on the compute where the script is being run, while as_mount creates a mount point from which the files can be streamed directly from the datastore.

You can combine the access method with the as_named_input method to include the dataset in the input_datasets collection in the experiment run (if you omit this, for example by setting the argument to diabetes_ds.as_mount(), the script will be able to access the dataset mount point from the script arguments, but not from the input_datasets collection).

In [None]:
# Get the registered file dataset. The training script reads CSV files from a
# folder path, so it must be given the file dataset rather than the tabular one.
diabetes_ds = ws.datasets.get("diabetes file dataset")

# Create a script config; the files are downloaded and exposed to the script as
# the 'training_files' named input
script_config = ScriptRunConfig(source_directory=experiment_folder,
                                script='diabetes_training.py',
                                arguments=['--regularization', 0.1,  # Regularization rate parameter
                                           '--input-data', diabetes_ds.as_named_input('training_files').as_download()],  # Reference to dataset location
                                environment=sklearn_env)

# Work with Compute

## Prepare data for an experiment

In [None]:


from azureml.core import Dataset

default_ds = ws.get_default_datastore()

# Upload and register the data only when 'diabetes dataset' is not registered yet
if 'diabetes dataset' not in ws.datasets:
    # Put both diabetes CSV files into 'diabetes-data/' on the default datastore,
    # replacing existing files of the same name
    default_ds.upload_files(
        files=['./data/diabetes.csv', './data/diabetes2.csv'],
        target_path='diabetes-data/',
        overwrite=True,
        show_progress=True,
    )

    # Build a tabular dataset over the uploaded CSV files (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register it in the workspace under the name checked above
    tab_data_set = tab_data_set.register(
        workspace=ws,
        name='diabetes dataset',
        description='diabetes data',
        tags={'format': 'CSV'},
        create_new_version=True,
    )

## Define an environment
 The conda dependencies are installed first, followed by the pip dependencies. Since the pip package is required to install the pip dependencies, it's good practice to include it in the conda dependencies 

In [None]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Environment for the experiment: dependencies managed by Azure ML, run in a Docker container
diabetes_env = Environment("diabetes-experiment-env")
diabetes_env.python.user_managed_dependencies = False
diabetes_env.docker.enabled = True

# Package set: conda packages (pip is listed among them) plus pip packages
diabetes_packages = CondaDependencies.create(
    conda_packages=['scikit-learn', 'ipykernel', 'matplotlib', 'pandas', 'pip'],
    pip_packages=['azureml-sdk', 'pyarrow'],
)

# Attach the packages to the environment
diabetes_env.python.conda_dependencies = diabetes_packages

print(diabetes_env.name, 'defined.')

Now you can use the environment to run a script as an experiment.

In [None]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

# Look up the registered tabular dataset by name
diabetes_ds = ws.datasets.get("diabetes dataset")

# Run configuration: the dataset is passed as the 'training_data' named input
# and the script runs in `diabetes_env`
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='diabetes_training.py',
    arguments=[
        '--regularization', 0.1,  # Regularization rate parameter
        '--input-data', diabetes_ds.as_named_input('training_data'),  # Reference to dataset
    ],
    environment=diabetes_env,
)

# Submit as a run of 'diabetes-training', show the widget, and wait for completion
experiment_name = 'diabetes-training'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

The experiment successfully used the environment

# Register the environment

In [None]:
from azureml.core import Environment

# Environments available in the workspace; the next cell iterates them by name
envs = Environment.list(ws)

In [None]:
# Print the name and conda dependencies of every environment whose name starts with "AzureML"
for env in envs:
    if env.startswith("AzureML"):
        print("Name", env)
        print("packages", envs[env].python.conda_dependencies.serialize_to_string())

# NOTE(review): the calls below rank child runs by a primary metric, so `run` is
# presumably a hyperparameter-tuning run at this point - confirm which cell sets it.

# Print all child runs, sorted by the primary metric
for child_run in run.get_children_sorted_by_primary_metric():
    print(child_run)

# Get the best run, and its metrics and arguments
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
script_arguments = best_run.get_details()['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)
print(best_run_metrics['AUC'], best_run_metrics['Accuracy'])
print(' -Arguments:', script_arguments)