In [4]:
from azureml.core import Datastore, Workspace

# Workspace handle used by every cell below.
# NOTE(review): from_config() presumably reads a local workspace config file - confirm.
ws = Workspace.from_config()

In [2]:
# Print the name of each datastore the workspace exposes
for datastore_name in ws.datastores:
    print(datastore_name)

azureml_globaldatasets
workspaceblobstore
workspacefilestore


In [11]:
# Look up the workspace's blob datastore by name
blob_store = Datastore.get(
    workspace=ws,
    datastore_name='workspaceblobstore',
)
# Push the local CSV into the 'diabetes-data-2/' folder of that datastore
blob_store.upload_files(
    files=['data/diabetes.csv'],
    target_path='diabetes-data-2/',
    show_progress=True,
)

Uploading an estimated of 1 files
Uploading data/diabetes.csv
Uploaded data/diabetes.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_297c798700ff47d6957e8ca05576de57

In [13]:
# Fetch the files stored under the 'diabetes-data-2/' prefix into downloads/
blob_store.download(
    target_path='downloads/',
    prefix='diabetes-data-2/',
)

Downloading diabetes-data-2/diabetes.csv
Downloaded diabetes-data-2/diabetes.csv, 1 files out of an estimated total of 1


1

In [17]:
# Build a download-mode reference to the uploaded folder.
# The reference can be handed to a training script as a script parameter
# (e.g. '--data-folder': ref); the script then receives the path where the
# data is available on the compute target.
ref = (
    blob_store
    .path('diabetes-data-2')
    .as_download(path_on_compute='path_on_compute')
)

In [21]:
# Build a tabular dataset over every CSV in the freshly uploaded folder
from azureml.core import Dataset

tab_ds = Dataset.Tabular.from_delimited_files(
    path=(blob_store, 'diabetes-data-2/*.csv'),
)

In [24]:
# Preview the first five rows as a pandas DataFrame
(
    tab_ds
    .take(5)
    .to_pandas_dataframe()
)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


In [25]:
# Register the dataset under a name so it can be looked up from other contexts
tab_ds.register(
    workspace=ws,
    name='diabetes_csv',
)

{
  "source": [
    "('workspaceblobstore', 'diabetes-data-2/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "a21721af-49fc-4a74-887e-31227e1aebf0",
    "name": "diabetes_csv",
    "version": 1,
    "workspace": "Workspace.create(name='ML-2s-sandbox', subscription_id='08265842-251e-450c-8d28-a06ee3f3c611', resource_group='DataSandbox')"
  }
}

In [32]:
# Registered datasets are also reachable through the workspace's `datasets`
# mapping, keyed by the registered name
ws.datasets['diabetes_csv']

{
  "source": [
    "('workspaceblobstore', 'diabetes-data-2/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "a21721af-49fc-4a74-887e-31227e1aebf0",
    "name": "diabetes_csv",
    "version": 1,
    "workspace": "Workspace.create(name='ML-2s-sandbox', subscription_id='08265842-251e-450c-8d28-a06ee3f3c611', resource_group='DataSandbox')"
  }
}

In [38]:
# List every dataset registered in the workspace (name -> registration info)
Dataset.get_all(ws)

{'diabetes_csv': DatasetRegistration(id='a21721af-49fc-4a74-887e-31227e1aebf0', name='diabetes_csv', version=1, description='', tags={}), 'diabetes target': DatasetRegistration(id='7d97d13c-ad3d-4cd9-bfed-157fba387e76', name='diabetes target', version=1, description='diabetes target data', tags={'format': 'CSV'}), 'diabetes baseline': DatasetRegistration(id='9e4e5917-5885-4e17-84f8-e9623fb74f29', name='diabetes baseline', version=1, description='diabetes baseline data', tags={'format': 'CSV'}), 'diabetes file dataset': DatasetRegistration(id='e3990bf4-01bb-4f72-b2bd-0d80121ec403', name='diabetes file dataset', version=1, description='diabetes files', tags={'format': 'CSV'}), 'diabetes dataset': DatasetRegistration(id='521a535d-8a19-4ccf-bbc4-d89d8c370412', name='diabetes dataset', version=1, description='diabetes data', tags={'format': 'CSV'}), 'penguins': DatasetRegistration(id='f9ed263a-e01b-4114-b17e-9dcc9772c085', name='penguins', version=1, description='', tags={}), 'TD-Auto_Price

In [41]:
# Look the registered dataset up by its name
Dataset.get_by_name(
    workspace=ws,
    name='diabetes_csv',
)

{
  "source": [
    "('workspaceblobstore', 'diabetes-data-2/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "a21721af-49fc-4a74-887e-31227e1aebf0",
    "name": "diabetes_csv",
    "version": 1,
    "workspace": "Workspace.create(name='ML-2s-sandbox', subscription_id='08265842-251e-450c-8d28-a06ee3f3c611', resource_group='DataSandbox')"
  }
}

In [46]:
# Upload the same local file a second time.
# NOTE: target_path is used as a folder, not as a file name - the upload log
# reports the target as 'diabetes-data-2/diabetes_copy.csv\diabetes.csv', so
# 'diabetes_copy.csv' ends up being a folder that contains diabetes.csv.
blob_store.upload_files(
    files=['data/diabetes.csv'],
    target_path='diabetes-data-2/diabetes_copy.csv',
    show_progress=True,
)

Uploading an estimated of 1 files
Target already exists. Skipping upload for diabetes-data-2/diabetes_copy.csv\diabetes.csv
Uploaded 0 files


$AZUREML_DATAREFERENCE_c868d6f4b1a34a969c9a807a0ffc61ce

In [56]:
# Register a new version of 'diabetes_csv' that points only at the copied file's path.
# Note that this rebinds tab_ds to the new dataset definition.
tab_ds = Dataset.Tabular.from_delimited_files(
    path=(blob_store, 'diabetes-data-2/diabetes_copy.csv'),
)
tab_ds.register(
    workspace=ws,
    name='diabetes_csv',
    create_new_version=True,
)

{
  "source": [
    "('workspaceblobstore', 'diabetes-data-2/diabetes_copy.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "6eadd105-1803-48c2-ba8d-cb450cb37604",
    "name": "diabetes_csv",
    "version": 3,
    "workspace": "Workspace.create(name='ML-2s-sandbox', subscription_id='08265842-251e-450c-8d28-a06ee3f3c611', resource_group='DataSandbox')"
  }
}

In [61]:
# Retrieve one specific registered version (3) of the dataset
ds = Dataset.get_by_name(
    workspace=ws,
    name='diabetes_csv',
    version=3,
)

In [63]:
# Load the selected dataset version into a pandas DataFrame
df = ds.to_pandas_dataframe()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


In [74]:
%%writefile experiment_script/diabetes_experiment_data.py
# Let's create a new script that uses this data, using named input "csv_data"
from azureml.core import Run
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, plot_confusion_matrix

parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg


# Start logging
run = Run.get_context()

# Input
data = run.input_datasets['csv_data'].to_pandas_dataframe()
X = data.drop(columns=['PatientID', 'Diabetic'])
y = data.iloc[:,-1]

# Scale X
X_scaled = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2)

# Number of rows in data for funsies
run.log('observations', len(data))
run.log('positives', len(data[data['Diabetic'] == 1]))
run.log('negatives', len(data[data['Diabetic'] == 0]))

# Train a model
model = LogisticRegression(C=1/reg, random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)

auc = roc_auc_score(y_test, y_pred)
run.log('AUC', auc)

conf = plot_confusion_matrix(model, X_test, y_test)
run.log_image(name = "confusion_matrix", plot = plt)

# Upload sample file, because why not
sample = data.sample(n=100)
# Creates outputs folder which is auto-uploaded to the experiment, instead of run.upload_file()
sample.to_csv('outputs/sample.csv')
# Save model
joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()

Overwriting experiment_script/diabetes_experiment_data.py


In [71]:
from azureml.core import Experiment
from azureml.train.estimator import Estimator

# Describe the remote training job: which script to run, where to run it, the
# packages it needs, and the registered dataset it receives as "csv_data".
# NOTE(review): newer azureml SDK releases reportedly deprecate Estimator in
# favour of ScriptRunConfig - confirm before upgrading the SDK.
estimator = Estimator(
    source_directory='experiment_script',
    entry_script='diabetes_experiment_data.py',
    script_params={'--reg_rate': 0.1},
    inputs=[tab_ds.as_named_input('csv_data')],
    compute_target='DS-Ass-Cluster',
    conda_packages=['scikit-learn', 'matplotlib'],
    pip_packages=['azureml-dataprep[pandas]'],
)

# Submit the job to the experiment and stream its logs until it finishes
experiment = Experiment(workspace=ws, name='diabetes_experiment_script')
run = experiment.submit(config=estimator)
run.wait_for_completion(show_output=True)



RunId: diabetes_experiment_script_1605791290_91e79e8a
Web View: https://ml.azure.com/experiments/diabetes_experiment_script/runs/diabetes_experiment_script_1605791290_91e79e8a?wsid=/subscriptions/08265842-251e-450c-8d28-a06ee3f3c611/resourcegroups/DataSandbox/workspaces/ML-2s-sandbox

Streaming azureml-logs/20_image_build_log.txt

2020/11/19 13:08:20 Downloading source code...
2020/11/19 13:08:21 Finished downloading source code
2020/11/19 13:08:22 Creating Docker network: acb_default_network, driver: 'bridge'
2020/11/19 13:08:22 Successfully set up Docker network: acb_default_network
2020/11/19 13:08:22 Setting up Docker configuration...
2020/11/19 13:08:23 Successfully set up Docker configuration
2020/11/19 13:08:23 Logging in to registry: 4f065ca9923c4e649f7d27454535e444.azurecr.io
2020/11/19 13:08:24 Successfully logged into 4f065ca9923c4e649f7d27454535e444.azurecr.io
2020/11/19 13:08:24 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: 'acb_default

{'runId': 'diabetes_experiment_script_1605791290_91e79e8a',
 'target': 'DS-Ass-Cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-19T13:16:30.610074Z',
 'endTimeUtc': '2020-11-19T13:18:33.714534Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'f4d5acc7-cc96-4958-b863-56dab9bcb5a6',
  'azureml.git.repository_uri': 'https://github.com/rnymke/ml_notebooks.git',
  'mlflow.source.git.repoURL': 'https://github.com/rnymke/ml_notebooks.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '8af96ae52bd6c42d317ab31b4d725835ec37e29b',
  'mlflow.source.git.commit': '8af96ae52bd6c42d317ab31b4d725835ec37e29b',
  'azureml.git.dirty': 'True',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '6eadd105-1803-48c2-ba8d-cb450cb37604'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'csv_data', 'mecha

In [73]:
# Show the metrics the training script logged to the run
run.get_metrics()

{'observations': 15000,
 'positives': 5000,
 'negatives': 10000,
 'AUC': 0.7304744988835704,
 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.diabetes_experiment_script_1605791290_91e79e8a/confusion_matrix_1605791890.png'}