# Ajustar hiperparámetros con un job de barrido

In [None]:
pip show azure-ai-ml

## Conectar a workspace

In [40]:
# Connect to the Azure ML workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the credential first, then let from_config() resolve the workspace
# from the local config file.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

print(f"Conectado al Workspace: {ml_client.workspace_name}")


Found the config file in: /config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


Conectado al Workspace: naturgyml4


## Crear el script de entrenamiento
El script espera dos parámetros:

- `--training_data` Especifica la ruta a un activo de datos registrado como datos de entrenamiento de entrada.
- `--reg_rate` Especifica la tasa de regularización, que es el hiperparámetro que se va a ajustar.

In [43]:
import os

# Directory that holds the training script files
script_folder = "/home/azureuser/cloudfiles/code/Users/formacion/azure-machine-learning/2.Ciclo_Vida_Creación_Entrenamiento_Modelos/labs/src"

# exist_ok=True keeps this cell re-runnable when the folder is already there
os.makedirs(script_folder, exist_ok=True)
print(f"{script_folder} folder created")

/home/azureuser/cloudfiles/code/Users/formacion/azure-machine-learning/2.Ciclo_Vida_Creación_Entrenamiento_Modelos/labs/src folder created


In [45]:
%%writefile $script_folder/train.py
# import libraries
import mlflow
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

def main(args):
    """Run the whole pipeline: load the data, split it, train and evaluate.

    ``args`` is the parsed command line; it must expose ``training_data``
    (path handed to ``get_data``) and ``reg_rate`` (handed to ``train_model``).
    """
    dataset = get_data(args.training_data)

    X_train, X_test, y_train, y_test = split_data(dataset)

    fitted_model = train_model(args.reg_rate, X_train, X_test, y_train, y_test)

    eval_model(fitted_model, X_test, y_test)

# function that reads the data
def get_data(path):
    """Load the CSV file at ``path`` and return its contents as a DataFrame."""
    print("Reading data...")
    return pd.read_csv(path)

# function that splits the data
def split_data(df):
    """Separate features from the label and hold out 30% of the rows.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``. The split uses
    a fixed ``random_state`` so repeated runs produce the same partition.
    """
    print("Splitting data...")

    feature_columns = [
        'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
        'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
    ]
    features = df[feature_columns].values
    labels = df['Diabetic'].values

    # tuple() keeps the return type the callers already unpack
    return tuple(train_test_split(features, labels, test_size=0.30, random_state=0))

# function that trains the model
def train_model(reg_rate, X_train, X_test, y_train, y_test):
    """Fit a logistic regression classifier on the training split.

    Args:
        reg_rate: Regularization rate. Must be strictly positive because the
            estimator's ``C`` argument is computed as ``1 / reg_rate``.
        X_train: Training features.
        X_test: Not used here; kept so existing callers keep working.
        y_train: Training labels.
        y_test: Not used here; kept so existing callers keep working.

    Returns:
        The fitted ``LogisticRegression`` model.

    Raises:
        ValueError: If ``reg_rate`` is not strictly positive.
    """
    # Fail fast with a clear message instead of a ZeroDivisionError raised
    # after the parameter has already been logged. `not x > 0` also rejects NaN.
    if not reg_rate > 0:
        raise ValueError(f"reg_rate must be a positive number, got {reg_rate!r}")

    mlflow.log_param("Regularization rate", reg_rate)
    print("Training model...")
    model = LogisticRegression(C=1/reg_rate, solver="liblinear").fit(X_train, y_train)

    return model

# function that evaluates the model
def eval_model(model, X_test, y_test):
    """Score ``model`` on the given split and log the results to MLflow.

    Logs the accuracy (metric ``training_accuracy_score``) and the ROC AUC
    (metric ``AUC``), then saves the ROC curve as ``ROC-Curve.png`` in the
    working directory and logs that file as an artifact.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Features to evaluate on.
        y_test: True labels matching ``X_test``.
    """
    # calculate accuracy
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    print('Accuracy:', acc)
    # NOTE: despite its name this metric is computed on the data passed in
    # (the held-out split in main). The name is kept because metric
    # consumers, such as a sweep's primary_metric, look it up by this key.
    mlflow.log_metric("training_accuracy_score", acc)

    # calculate AUC from the scores of the second class column
    y_scores = model.predict_proba(X_test)
    positive_scores = y_scores[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    print('AUC: ' + str(auc))
    mlflow.log_metric("AUC", auc)

    # plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        # Plot the diagonal 50% line
        ax.plot([0, 1], [0, 1], 'k--')
        # Plot the FPR and TPR achieved by our model
        ax.plot(fpr, tpr)
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curve')
        fig.savefig("ROC-Curve.png")
    finally:
        # Release the figure even if plotting or saving fails; pyplot keeps
        # every figure alive until it is closed explicitly.
        plt.close(fig)
    mlflow.log_artifact("ROC-Curve.png")

def parse_args():
    """Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace with:
            training_data (str): path to the CSV file with the training data.
            reg_rate (float): regularization rate, 0.01 when not given.

    Exits with a usage error (SystemExit) when ``--training_data`` is missing,
    instead of letting a ``None`` path reach the CSV reader later on.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--training_data", dest='training_data',
                        type=str, required=True,
                        help="Path to the CSV file containing the training data")
    parser.add_argument("--reg_rate", dest='reg_rate',
                        type=float, default=0.01,
                        help="Regularization rate (default: 0.01)")

    return parser.parse_args()

# run script
if __name__ == "__main__":
    separator = "*" * 60

    # Blank lines plus a separator make this run easy to spot in the job logs
    print("\n\n")
    print(separator)

    # Parse the command line, then run the pipeline
    args = parse_args()
    main(args)

    # Mirror the opening banner to mark the end of the run
    print(separator)
    print("\n\n")


Writing /home/azureuser/cloudfiles/code/Users/formacion/azure-machine-learning/2.Ciclo_Vida_Creación_Entrenamiento_Modelos/labs/src/train.py


## Configurar y ejecutar un job de comando

In [56]:
from azure.ai.ml import command, Input
from azure.ai.ml.constants import AssetTypes

# Configure the command job that runs train.py on the compute target.
# `script_folder` is the directory train.py was written to earlier in this
# notebook; reusing the variable instead of repeating the literal path keeps
# the uploaded code folder and the written script from drifting apart.
job = command(
    code=script_folder,
    command="python train.py --training_data ${{inputs.diabetes_data}} --reg_rate ${{inputs.reg_rate}}",
    inputs={
        "diabetes_data": Input(type=AssetTypes.URI_FILE, path="azureml:diabetes-data-local-ric:1"),
        "reg_rate": 0.01,
    },
    # NOTE(review): per its name this curated environment targets
    # Ubuntu 18.04 / Python 3.7 — confirm it is still available in the workspace.
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="ricardoenm",
    display_name="diabetes-train-hyperparams-ric",
    experiment_name="diabetes-training-hyp-ric",
    tags={"model_type": "LogisticRegression"},
)

# submit job
# returned_job = ml_client.create_or_update(job)
# aml_url = returned_job.studio_url
# print("Monitor your job at", aml_url)

## Definir el espacio de búsqueda

In [57]:
from azure.ai.ml.sweep import Choice

# Override the fixed `reg_rate` input with a discrete set of candidate
# values to search over
command_job_for_sweep = job(reg_rate=Choice(values=[0.01, 0.1, 1]))

## Configurar y enviar el trabajo de barrido

In [None]:
# Aplicar el parámetro de barrido para obtener el Sweep_Job
from azure.ai.ml.sweep import BanditPolicy

# Build the sweep job. `primary_metric` must match the metric name that
# train.py logs through mlflow.log_metric.
sweep_job = command_job_for_sweep.sweep(
    primary_metric="training_accuracy_score",
    goal="Maximize",
    sampling_algorithm="grid",
    compute="ricardoenm",
)

# Limits for the sweep.
# NOTE(review): timeout is presumably expressed in seconds (7200 = 2 h) — confirm.
sweep_job.set_limits(max_total_trials=4, max_concurrent_trials=2, timeout=7200)

# Early termination policy
sweep_job.early_termination = BanditPolicy(
    slack_amount=0.2,
    delay_evaluation=2,
    evaluation_interval=1,
)

# Experiment under which the sweep job is recorded
sweep_job.experiment_name = "sweep-diabetes-ric"

Enviar el job de barrido.

In [59]:
# Submit the sweep job and print the Studio link for monitoring it
returned_sweep_job = ml_client.create_or_update(sweep_job)
aml_url = returned_sweep_job.studio_url
print(f"Monitor your job at {aml_url}")

Monitor your job at https://ml.azure.com/runs/yellow_panda_lph9yl9f13?wsid=/subscriptions/7decb7a4-f615-4cc3-9d7d-5de10998373f/resourcegroups/naturgy/workspaces/naturgyml4&tid=8b85184d-f6f0-4ace-80dd-c3bda5a6c156


Cuando se complete el trabajo, navegue a su página de descripción general.
La pestaña **Trials** mostrará todos los modelos entrenados y su puntuación de `accuracy` (métrica `training_accuracy_score`).