In [2]:
import azureml.core
from azureml.core import Workspace

ws = Workspace.from_config()
print('Ready to work with- {} in workspace -{}'.format(azureml.core.VERSION, ws.name))

Ready to work with- 1.6.0 in workspace -dp101-workspace


2 ways to work with the data, **Datastore** and **Datasets**

# 1. DataStore

### 1.1 View Datastores

In [3]:
# Get the default datastore
default_ds = ws.get_default_datastore()

# Enumerate all datastores, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, '-default name:', ds_name == default_ds.name)

workspacefilestore -default name: False
workspaceblobstore -default name: True


### 1.2 Upload Data to a Datastore

In [4]:
default_ds.upload_files(files=['../mslearn-aml-labs/data/diabetes.csv', '../mslearn-aml-labs/data/diabetes2.csv'], 
                       target_path='diabetes-data/', # Put it in a folder path in the datastore
                       overwrite=True,
                       show_progress=True)

Uploading an estimated of 2 files
Uploading ../mslearn-aml-labs/data/diabetes.csv
Uploading ../mslearn-aml-labs/data/diabetes2.csv
Uploaded ../mslearn-aml-labs/data/diabetes.csv, 1 files out of an estimated total of 2
Uploaded ../mslearn-aml-labs/data/diabetes2.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_62b1b41e460d4f29b1f01f97c5448047

### 1.3 Train a Model from a Datastore
When you uploaded the files in the code cell above, note that the code returned a **data reference**. A data reference provides a way to pass the path to a folder in a datastore to a script,

In [5]:
data_ref = default_ds.path('diabetes-data').as_download(path_on_compute='diabetes_data')
print(data_ref)

$AZUREML_DATAREFERENCE_a9659df2be124eb5aa24f09983040d63


In [6]:
# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_datastore'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_datastore folder created


In [7]:
%%writefile $experiment_folder/diabetes_training.py

#import libraries
import os
import argparse
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder reference')

args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes data from the data reference
data_folder = args.data_folder
print("Loading data from", data_folder)


# Load all files and concatenate their contents as a single dataframe
all_files = os.listdir(data_folder)
diabetes = pd.concat((pd.read_csv(os.path.join(data_folder, csv_file)) for csv_file in all_files))

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
                 'SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()







Overwriting diabetes_training_from_datastore/diabetes_training.py


In [8]:
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.widgets import RunDetails

#set the script parameter
script_params = {'--regularization': 0.1}

#get the training dataset
diabetes_ds = ws.datasets.get("diabetes file dataset")

# Create an estimator
estimator = SKLearn(source_directory = experiment_folder,
                   entry_script = 'diabetes_training.py',
                   script_params= script_params,
                   compute_target= 'local'
                   )

# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)

#Run the experiment
run = experiment.submit(config=estimator)

# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()
                    




_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1591242565_7b19cf21',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2020-06-04T03:49:34.914319Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User program failed with TypeError: expected str, bytes or os.PathLike object, not NoneType',
   'detailsUri': 'https://aka.ms/azureml-known-errors',
   'details': [],
   'debugInfo': {'type': 'TypeError',
    'message': 'expected str, bytes or os.PathLike object, not NoneType',
    'stackTrace': '  File "azureml-setup/context_manager_injector.py", line 148, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name="__main__")\n  File "/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/runpy.py", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File "/azureml-envs/azureml_12c51bdabb987f6db1eeb8e263909841/lib/python3.6/runpy.py", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File "/azureml-envs/azurem

# 2. Datasets

### 2.1 Create a Tabular dataset

In [9]:
from azureml.core import Dataset

#get the default datastore
default_ds = ws.get_default_datastore()

#Create a tabular dataset from the path on the datastore (this may take a short while)
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

#Display the first 20 record
tab_data_set.take(20).to_pandas_dataframe()






Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
5,1619297,0,82,92,9,253,19.72416,0.103424,26,0
6,1660149,0,133,47,19,227,21.941357,0.17416,21,0
7,1458769,0,67,87,43,36,18.277723,0.236165,26,0
8,1201647,8,80,95,33,24,26.624929,0.443947,53,1
9,1403912,1,72,31,40,42,36.889576,0.103944,26,0


### 2.2 Create a File Dataset

The dataset you created is a tabular dataset that can be read as a dataframe containing all of the data in the structured files that are included in the dataset definition. This works well for tabular data, but in some machine learning scenarios you might need to work with data that is unstructured; or you may simply want to handle reading the data from files in your own code. To accomplish this, you can use a file dataset, which creates a list of file paths in a virtual mount point, which you can use to read the data in the files.

In [10]:
#Create a file dataset from the path on the datastore (this may take a short while)
file_data_set = Dataset.File.from_files(path=(default_ds, 'diabetes-data/*.csv'))
                #Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

# Get the files in the dataset
for file_path in file_data_set.to_path():
    print(file_path)

/diabetes.csv
/diabetes2.csv


### 2.3 Register Datasets
Now that we have created datasets that reference the diabetes data, you can register them to make them easily accessible to any experiment being run in the workspace.

In [11]:
# Register the tabular dataset
try:
    tab_data_set = tab_data_set.register(workspace=ws, 
                                        name='diabetes dataset',
                                        description='diabetes data',
                                        tags = {'format':'CSV'},
                                        create_new_version=True)
except Exception as ex:
    print(ex)

# Register the file dataset
try:
    file_data_set = file_data_set.register(workspace=ws,
                                            name='diabetes file dataset',
                                            description='diabetes files',
                                            tags = {'format':'CSV'},
                                            create_new_version=True)
except Exception as ex:
    print(ex)

print('Datasets registered')

Datasets registered


### 2.4 view dataset

In [12]:
print("Datasets:")
for dataset_name in list(ws.datasets.keys()):
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version)

Datasets:
	 diabetes file dataset version 1
	 diabetes dataset version 1


### 2.5 Train Model with Tabular dataset (Don't run the cell, just learn how to)

In [None]:

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['diabetes'].to_pandas_dataframe()

### 2.6 Train a model from file dataset (Don't run the cell, just learn)

In [None]:
import glob

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes'] # Get the training data from the estimator input
all_files = glob.glob(data_path + "/*.csv")
diabetes = pd.concat((pd.read_csv(f) for f in all_files))