## Azure ML

In [1]:
import sys
sys.version

'3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) \n[GCC 7.3.0]'

In [2]:
import os, shutil

# Create a folder for the experiment files
training_folder = 'portoseguro'
os.makedirs(training_folder, exist_ok=True)

# Copy the data file into the experiment folder
shutil.copy('data/porto_seguro_safe_driver_prediction_input.csv', os.path.join(training_folder, "porto_seguro_safe_driver_prediction_input.csv"))


'portoseguro/porto_seguro_safe_driver_prediction_input.csv'

In [3]:
!ls portoseguro/porto_seguro_safe_driver_prediction_input.csv -l

-rwxrwxrwx 1 root root 115852544 Mar 22 12:32 portoseguro/porto_seguro_safe_driver_prediction_input.csv


## train.py
This file defines the key functions required to train the model.  
The file can be invoked with `python train.py` for development purposes.

In [4]:
%%writefile $training_folder/train.py
import os
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import lightgbm


def split_data(data_df):
    """Split a dataframe into training and validation datasets"""
    
    features = data_df.drop(['target', 'id'], axis = 1)
    labels = np.array(data_df['target'])
    features_train, features_valid, labels_train, labels_valid = train_test_split(features, 
                                                                                  labels, 
                                                                                  test_size=0.2, 
                                                                                  random_state=0)

    train_data = lightgbm.Dataset(features_train, label=labels_train)
    valid_data = lightgbm.Dataset(features_valid, label=labels_valid, free_raw_data=False)
    
    return (train_data, valid_data)


def train_model(data, parameters):
    """Train a model with the given datasets and parameters"""
    # The object returned by split_data is a tuple.
    # Access train_data with data[0] and valid_data with data[1]
       
    model = lightgbm.train(parameters,
                           data[0],
                           valid_sets=data[1],
                           num_boost_round=500,
                           early_stopping_rounds=20)
    
    return model


def get_model_metrics(model, data):
    """Construct a dictionary of metrics for the model"""
    
    predictions = model.predict(data[1].data)
    fpr, tpr, thresholds = metrics.roc_curve(data[1].label, predictions)
    model_metrics = {"auc": (metrics.auc(fpr, tpr))}
        
    return model_metrics


def main():
    """This method invokes the training functions for development purposes"""
    
    # Read data from a file
    data_df = pd.read_csv('porto_seguro_safe_driver_prediction_input.csv')

    # Hard code the parameters for training the model
    parameters = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'sub_feature': 0.7,
        'num_leaves': 60,
        'min_data': 100,
        'min_hessian': 1,
        'verbose': 2
    }

    # Call the functions defined in this file
    data=split_data(data_df)
    model=train_model(data, parameters)
    model_metrics=get_model_metrics(model, data)
    
    # Print the resulting metrics for the model
    print(model_metrics)
    
if __name__ == '__main__':
    main()

Overwriting portoseguro/train.py


In [5]:
!ls portoseguro/train.py -l

-rwxrwxrwx 1 root root 2513 Mar 22 12:32 portoseguro/train.py


## parameters.json
This file will specify the parameters used to train the model.

In [6]:
%%writefile $training_folder/parameters.json
{
    "training":
    {
        "learning_rate": 0.01,
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "sub_feature": 0.7,
        "num_leaves": 60,
        "min_data": 100,
        "min_hessian": 1,
        "verbose": 0
    }
}


Overwriting portoseguro/parameters.json


In [7]:
!ls portoseguro/parameters.json -l

-rwxrwxrwx 1 root root 278 Mar 22 12:32 portoseguro/parameters.json


## driver_training.py
This file will be the entry script when running an Azure ML context.  
It calls the functions defined in train.py for data preparation and training, but reads parameters from a file, and logs output to the Azure ML context.  
The file can be invoked with `python driver_training.py` for development purposes.

In [8]:
%%writefile $training_folder/driver_training.py
# Import libraries
import argparse
from azureml.core import Run
import joblib
import json
import os
import pandas as pd
import shutil

# Import functions from train.py
from train import split_data, train_model, get_model_metrics

# Get the output folder for the model from the '--output_folder' parameter
parser = argparse.ArgumentParser()
parser.add_argument('--output_folder', type=str, dest='output_folder', default="output_folder")
args = parser.parse_args()
output_folder = args.output_folder

# Get the experiment run context
run = Run.get_context()

# load the safe driver prediction dataset
train_df = pd.read_csv('porto_seguro_safe_driver_prediction_input.csv')

# Load the parameters for training the model from the file
with open("parameters.json") as f:
    pars = json.load(f)
    parameters = pars["training"]

# Log each of the parameters to the run
for param_name, param_value in parameters.items():
    run.log(param_name, param_value)
    
# Use the functions imported from train.py to prepare data, train the model, and calculate the metrics

data=split_data(train_df)
model=train_model(data, parameters)
model_metrics=get_model_metrics(model, data)

# run.log of the AUC
run.log("AUC", model_metrics['auc'])

# Save the trained model to the output folder

model_name = "Portomlmodel.pkl"
filename = "outputs/" + model_name
    
joblib.dump(value=model, filename=filename)
run.upload_file(name=model_name, path_or_stream=filename)


run.complete()

Overwriting portoseguro/driver_training.py


In [9]:
!ls portoseguro/driver_training.py -l

-rwxrwxrwx 1 root root 1467 Mar 22 12:32 portoseguro/driver_training.py


In [10]:
import azureml.core
from azureml.core import Workspace

# Load the workspace
ws = Workspace.from_config()

## Use an Estimator to Run the Script as an Experiment

See [this tutorial](https://github.com/MicrosoftDocs/mslearn-aml-labs/blob/master/02-Training_Models.ipynb) for a starting point

Use the scikit-learn and lightgbm conda packages

In [11]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment

# Create an estimator
# Need to use the ScriptRunConfig

estimator = Estimator(source_directory=training_folder,
                      entry_script='driver_training.py',
                      compute_target='local',
                      conda_packages=['scikit-learn', 'lightgbm']
                      )

# Create an experiment
experiment_name = 'portosegurodemo'
experiment = Experiment(workspace = ws, name = experiment_name)

# Run the experiment based on the estimator
run = experiment.submit(config=estimator)
run.wait_for_completion(show_output=True)

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


RunId: portosegurodemo_1616416374_9e44ef43
Web View: https://ml.azure.com/experiments/portosegurodemo/runs/portosegurodemo_1616416374_9e44ef43?wsid=/subscriptions/70b8f39e-8863-49f7-b6ba-34a80799550c/resourcegroups/AMLworkshop-rg/workspaces/AMLworkshop

Streaming azureml-logs/70_driver_log.txt

[2021-03-22T12:33:05.039432] Entering context manager injector.
Cannot provide tracer without any exporter configured.
[context_manager_injector.py] Command line Options: Namespace(inject=['ProjectPythonPath:context_managers.ProjectPythonPath', 'RunHistory:context_managers.RunHistory', 'TrackUserError:context_managers.TrackUserError', 'UserExceptions:context_managers.UserExceptions'], invocation=['driver_training.py'])
Script type = None
Starting the daemon thread to refresh tokens in background for process with pid = 9
[2021-03-22T12:33:05.714809] Entering Run History Context Manager.
[2021-03-22T12:33:06.522555] Current directory: /azureml-run
[2021-03-22T12:33:06.522609] Preparing to call scr

[238]	valid_0's auc: 0.635588
[239]	valid_0's auc: 0.63564
[240]	valid_0's auc: 0.635584
[241]	valid_0's auc: 0.635613
[242]	valid_0's auc: 0.635603
[243]	valid_0's auc: 0.635652
[244]	valid_0's auc: 0.635682
[245]	valid_0's auc: 0.635731
[246]	valid_0's auc: 0.635798
[247]	valid_0's auc: 0.635816
[248]	valid_0's auc: 0.635852
[249]	valid_0's auc: 0.635879
[250]	valid_0's auc: 0.635905
[251]	valid_0's auc: 0.635987
[252]	valid_0's auc: 0.636002
[253]	valid_0's auc: 0.636038
[254]	valid_0's auc: 0.636038
[255]	valid_0's auc: 0.636024
[256]	valid_0's auc: 0.636047
[257]	valid_0's auc: 0.636103
[258]	valid_0's auc: 0.636134
[259]	valid_0's auc: 0.636148
[260]	valid_0's auc: 0.636169
[261]	valid_0's auc: 0.636192
[262]	valid_0's auc: 0.636193
[263]	valid_0's auc: 0.636197
[264]	valid_0's auc: 0.636246
[265]	valid_0's auc: 0.636289
[266]	valid_0's auc: 0.6363
[267]	valid_0's auc: 0.636298
[268]	valid_0's auc: 0.636331
[269]	valid_0's auc: 0.636378
[270]	valid_0's auc: 0.636369
[271]	valid_0

{'runId': 'portosegurodemo_1616416374_9e44ef43',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-03-22T12:33:02.173312Z',
 'endTimeUtc': '2021-03-22T12:33:43.53785Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'c389f134-6893-4dd7-89fc-cd45b1febfd2'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'driver_training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': None,
  'nodeCount': 1,
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'environment': {'name': 'Experiment portosegurodemo Environment',
   'version': 'Autosave_2021-03-22T12:32:59Z_85fbcd62',
   'python': {'interpreterPath': 'python',
    'userManagedDependencies': False,
    'condaDependenci

In [12]:
# Print the resulting metrics
metrics = run.get_metrics()
for k, v in metrics.items():
        print(k, v)

learning_rate 0.01
boosting_type gbdt
objective binary
metric auc
sub_feature 0.7
num_leaves 60
min_data 100
min_hessian 1
verbose 0
AUC 0.6387827870370796


In [13]:
AUC=metrics['AUC']
AUC

0.6387827870370796

In [14]:
LR=metrics['learning_rate']
LR

0.01

In [15]:
experiment

Name,Workspace,Report Page,Docs Page
portosegurodemo,AMLworkshop,Link to Azure Machine Learning studio,Link to Documentation
