# **Rain prediction Australia**

*Contexto*  

Predecir la lluvia del día siguiente entrenando modelos con AutoML de Azure, la variable objetivo RainTomorrow.

In [1]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import automl, Input, MLClient
from pprint import pprint
from azure.ai.ml.entities import ResourceConfiguration
from azure.ai.ml.entities import AmlCompute

In [2]:
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential)

Found the config file in: /config.json


##### **1. Read and split data**

Se dividió el dataset en train (80%) y test (20%)

In [26]:
# Crear MLTABLE
def create_ml_table_file(filename):
    """Create ML Table definition"""

    return (
        "paths:\n"
        "  - file: ./{0}\n"
        "transformations:\n"
        "  - read_delimited:\n"
        "        delimiter: ','\n"
        "        encoding: 'utf8'\n"
        "        empty_as_string: false"
    ).format(filename)


def save_ml_table_file(output_path, mltable_file_contents):
    with open(os.path.join(output_path, "MLTable"), "w") as f:
        f.write(mltable_file_contents)
   

In [27]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

weather_data = pd.read_csv('https://raw.githubusercontent.com/sharonmaygua/rain_prediction/main/weatherAUS_ML.csv')
train_data, test_data = train_test_split(weather_data, test_size=0.2, random_state=11)

# Train file and its MLTABLE

train_folder = './data-ml-table/training-ml-table'
os.makedirs(train_folder, exist_ok=True)
train_file_path = os.path.join(train_folder,'train_data.csv')
train_data.to_csv(train_file_path, index=False)

train_mltable_file_contents = create_ml_table_file(
    os.path.basename(train_file_path)
)  
save_ml_table_file(train_folder, train_mltable_file_contents) 


# Validation file and its MLTABLE

valid_folder = './data-ml-table/validation-ml-table'
os.makedirs(input_train_folder, exist_ok=True)
valid_file_path = os.path.join(valid_folder,'valid_data.csv')
train_data.to_csv(valid_file_path, index=False)

valid_mltable_file_contents = create_ml_table_file(
    os.path.basename(valid_file_path)
)  
save_ml_table_file(valid_folder, valid_mltable_file_contents) 

In [28]:
# READ data en formato MLTABLE
training_mltable_path   = "./data-ml-table/training-ml-table/"
validation_mltable_path = "./data-ml-table/validation-ml-table/"

# Training MLTable defined locally, with local data to be uploaded
my_training_data_input = Input(type=AssetTypes.MLTABLE, path=training_mltable_path)

# Validation MLTable defined locally, with local data to be uploaded
my_validation_data_input = Input(type=AssetTypes.MLTABLE, path=validation_mltable_path)

##### **2. Configurar el trabajo de clasificación**

Se utilizó un trabajo de clasificación para entrenar un modelo que prediga "RainTomorrow". Se entrenan varios modelos utilizando los datos de entrenamiento. El modelo con el mejor rendimiento en los datos de validación basados en la métrica principal se selecciona como modelo final.

- El compute cluster utilizado es de tipo General purpose, **STANDARD_DS3_V2**, con 2 de 4 cores disponibles, 8GB RAM, 75GB storage. Las características del cluster se adaptan a la tarea de clasificación y al conjunto de datos que no requiere un amplio almacenamiento, pero nos permite utilizar dos nodos para paralelizar el proceso.

- Para seguir experimentando con los modelos se configuro a 30 minutos para ejecutar todo el experimento.

In [30]:
from azure.ai.ml.entities import AmlCompute
from azure.core.exceptions import ResourceNotFoundError

compute_name = "cpu-cluster-prediction"
size_name    = "Standard_D4_v3"
try:
    _ = ml_client.compute.get(compute_name)
    print("Found existing compute target.")
except ResourceNotFoundError:
    print("Creating a new compute target...")
    compute_config = AmlCompute(
        name=compute_name,
        type="amlcompute",
        size=size_name,
        idle_time_before_scale_down=120,
        min_instances=0,
        max_instances=6,
        tier="Dedicated",
    )
    ml_client.begin_create_or_update(compute_config).result()

Creating a new compute target...


In [31]:
exp_name     = 'rain-prediction'
exp_timeout  = 30

# Create the AutoML classification job with the related factory-function.
classification_job = automl.classification(
    compute=compute_name,
    experiment_name=exp_name,
    training_data=my_training_data_input,
    validation_data=my_validation_data_input,
    target_column_name="RainTomorrow",
    primary_metric="precision_score_weighted",
    enable_model_explainability=True,
    tags={"resource": exp_name}
)

# Limits are all optional

classification_job.set_limits(timeout_minutes=exp_timeout,
                               max_trials=5,
                               max_nodes=4)

In [32]:
returned_job = ml_client.jobs.create_or_update(
    classification_job
)  # submit the job to the backend

print(f"Created job: {returned_job}")

[32mUploading training-ml-table (7.42 MBs): 100%|██████████| 7419704/7419704 [00:00<00:00, 21795047.28it/s]
[39m

[32mUploading validation-ml-table (7.42 MBs): 100%|██████████| 7419704/7419704 [00:00<00:00, 42353946.76it/s]
[39m



Created job: compute: azureml:cpu-cluster-prediction
creation_context:
  created_at: '2023-10-28T05:04:39.012205+00:00'
  created_by: Carolina Aldunate
  created_by_type: User
display_name: shy_machine_vgd0l85mnl
experiment_name: rain-prediction
id: azureml:/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourceGroups/diplomado_ucb/providers/Microsoft.MachineLearningServices/workspaces/ws-rain-prediction/jobs/shy_machine_vgd0l85mnl
limits:
  enable_early_termination: true
  max_concurrent_trials: 1
  max_cores_per_trial: -1
  max_nodes: 4
  max_trials: 5
  timeout_minutes: 30
  trial_timeout_minutes: 30
log_verbosity: info
name: shy_machine_vgd0l85mnl
outputs: {}
primary_metric: precision_score_weighted
properties: {}
resources:
  instance_count: 1
  shm_size: 2g
services:
  Studio:
    endpoint: https://ml.azure.com/runs/shy_machine_vgd0l85mnl?wsid=/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-rain-prediction&tid=192a4dcb-d77a-44ee

# 

**Entrenamiento...**

In [33]:
ml_client.jobs.stream(returned_job.name)

RunId: shy_machine_vgd0l85mnl
Web View: https://ml.azure.com/runs/shy_machine_vgd0l85mnl?wsid=/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-rain-prediction

Execution Summary
RunId: shy_machine_vgd0l85mnl
Web View: https://ml.azure.com/runs/shy_machine_vgd0l85mnl?wsid=/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-rain-prediction



Con base a los resultados, los modelos que entreno AutoML son: StackEnsemble, VotingEnsemble, MaxAbsScaler-XGBoostClassifier, MaxAbsScaler-LightGBM y MaxAbsScaler, ExtremeRandomTrees. El mejor modelo *StackEnsemble* tiene una precision alta de 0.85540

In [34]:
# Get a URL for the status of the job
returned_job.services["Studio"].endpoint
print(returned_job)

compute: azureml:cpu-cluster-prediction
creation_context:
  created_at: '2023-10-28T05:04:39.012205+00:00'
  created_by: Carolina Aldunate
  created_by_type: User
display_name: shy_machine_vgd0l85mnl
experiment_name: rain-prediction
id: azureml:/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourceGroups/diplomado_ucb/providers/Microsoft.MachineLearningServices/workspaces/ws-rain-prediction/jobs/shy_machine_vgd0l85mnl
limits:
  enable_early_termination: true
  max_concurrent_trials: 1
  max_cores_per_trial: -1
  max_nodes: 4
  max_trials: 5
  timeout_minutes: 30
  trial_timeout_minutes: 30
log_verbosity: info
name: shy_machine_vgd0l85mnl
outputs: {}
primary_metric: precision_score_weighted
properties: {}
resources:
  instance_count: 1
  shm_size: 2g
services:
  Studio:
    endpoint: https://ml.azure.com/runs/shy_machine_vgd0l85mnl?wsid=/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/workspaces/ws-rain-prediction&tid=192a4dcb-d77a-44ee-815c-5ca0ba5

#### **3. Recuperar la mejor prueba**

Utilizar MLFLowClient para acceder a los resultados  
Inicializar cliente MLFlow

In [35]:
# Obtain the tracking URI for MLFlow
import mlflow

# Obtain the tracking URL from MLClient
MLFLOW_TRACKING_URI = ml_client.workspaces.get(
    name=ml_client.workspace_name
).mlflow_tracking_uri

print(MLFLOW_TRACKING_URI)

azureml://eastus.api.azureml.ms/mlflow/v1.0/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourceGroups/diplomado_ucb/providers/Microsoft.MachineLearningServices/workspaces/ws-rain-prediction


In [36]:
# Set the MLFLOW TRACKING URI
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print("\nCurrent tracking uri: {}".format(mlflow.get_tracking_uri()))


Current tracking uri: azureml://eastus.api.azureml.ms/mlflow/v1.0/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourceGroups/diplomado_ucb/providers/Microsoft.MachineLearningServices/workspaces/ws-rain-prediction


In [37]:
from mlflow.tracking.client import MlflowClient
from mlflow.artifacts import download_artifacts

# Initialize MLFlow client
mlflow_client = MlflowClient()

In [38]:
# Get the AutoML parent job
job_name = returned_job.name

# Get the parent run
mlflow_parent_run = mlflow_client.get_run(job_name)

print("Parent Run: ")
print(mlflow_parent_run)

Parent Run: 
<Run: data=<RunData: metrics={'AUC_macro': 0.9332377604374513,
 'AUC_micro': 0.9591190960559728,
 'AUC_weighted': 0.9332377597960146,
 'accuracy': 0.8901886979347234,
 'average_precision_score_macro': 0.9071936745946523,
 'average_precision_score_micro': 0.9599754421181079,
 'average_precision_score_weighted': 0.947337788420737,
 'balanced_accuracy': 0.7940624879486162,
 'f1_score_macro': 0.8228425658237979,
 'f1_score_micro': 0.8901886979347234,
 'f1_score_weighted': 0.8840626829935706,
 'log_loss': 0.26430608203075623,
 'matthews_correlation': 0.6582308509458612,
 'norm_macro_recall': 0.5881249758972324,
 'precision_score_macro': 0.8683467552758894,
 'precision_score_micro': 0.8901886979347234,
 'precision_score_weighted': 0.8865463906811467,
 'recall_score_macro': 0.7940624879486162,
 'recall_score_micro': 0.8901886979347234,
 'recall_score_weighted': 0.8901886979347234,
 'weighted_accuracy': 0.9403582837699775}, params={}, tags={'automl_best_child_run_id': 'shy_machine

In [39]:
# Get the best model's child run

best_child_run_id = mlflow_parent_run.data.tags["automl_best_child_run_id"]
print("Found best child run id: ", best_child_run_id)

best_run = mlflow_client.get_run(best_child_run_id)

print("Best child run: ")
print(best_run)

Found best child run id:  shy_machine_vgd0l85mnl_1
Best child run: 
<Run: data=<RunData: metrics={'AUC_macro': 0.9332377604374513,
 'AUC_micro': 0.9591190960559728,
 'AUC_weighted': 0.9332377597960146,
 'accuracy': 0.8901886979347234,
 'average_precision_score_macro': 0.9071936745946523,
 'average_precision_score_micro': 0.9599754421181079,
 'average_precision_score_weighted': 0.947337788420737,
 'balanced_accuracy': 0.7940624879486162,
 'f1_score_macro': 0.8228425658237979,
 'f1_score_micro': 0.8901886979347234,
 'f1_score_weighted': 0.8840626829935706,
 'log_loss': 0.26430608203075623,
 'matthews_correlation': 0.6582308509458612,
 'norm_macro_recall': 0.5881249758972324,
 'precision_score_macro': 0.8683467552758894,
 'precision_score_micro': 0.8901886979347234,
 'precision_score_weighted': 0.8865463906811467,
 'recall_score_macro': 0.7940624879486162,
 'recall_score_micro': 0.8901886979347234,
 'recall_score_weighted': 0.8901886979347234,
 'weighted_accuracy': 0.9403582837699775}, pa

In [40]:
# Get best model run's metrics
best_run.data.metrics

{'balanced_accuracy': 0.7940624879486162,
 'matthews_correlation': 0.6582308509458612,
 'recall_score_macro': 0.7940624879486162,
 'accuracy': 0.8901886979347234,
 'precision_score_macro': 0.8683467552758894,
 'AUC_weighted': 0.9332377597960146,
 'f1_score_micro': 0.8901886979347234,
 'recall_score_weighted': 0.8901886979347234,
 'recall_score_micro': 0.8901886979347234,
 'average_precision_score_micro': 0.9599754421181079,
 'norm_macro_recall': 0.5881249758972324,
 'average_precision_score_weighted': 0.947337788420737,
 'AUC_micro': 0.9591190960559728,
 'average_precision_score_macro': 0.9071936745946523,
 'f1_score_macro': 0.8228425658237979,
 'AUC_macro': 0.9332377604374513,
 'precision_score_weighted': 0.8865463906811467,
 'precision_score_micro': 0.8901886979347234,
 'f1_score_weighted': 0.8840626829935706,
 'log_loss': 0.26430608203075623,
 'weighted_accuracy': 0.9403582837699775}

##### **3.1. Download the best model locally**

In [41]:
# Create local folder
local_dir = "./artifact_downloads"
if not os.path.exists(local_dir):
    os.mkdir(local_dir)

In [42]:
# Download run's artifacts/outputs
local_path = download_artifacts(
    run_id=best_run.info.run_id, artifact_path="outputs", dst_path=local_dir
)
print("Artifacts downloaded in: {}".format(local_path))
print("Artifacts: {}".format(os.listdir(local_path)))

Artifacts downloaded in: /mnt/batch/tasks/shared/LS_root/mounts/clusters/aldunatelipac1/code/Users/aldunatelipac/artifact_downloads/outputs
Artifacts: ['conda_env_v_1_0_0.yml', 'engineered_feature_names.json', 'env_dependencies.json', 'featurization_summary.json', 'generated_code', 'mlflow-model', 'model.pkl', 'pipeline_graph.json', 'run_id.txt', 'scoring_file_pbi_v_1_0_0.py', 'scoring_file_v_1_0_0.py', 'scoring_file_v_2_0_0.py']


In [43]:
# Show the contents of the MLFlow model folder
os.listdir("./artifact_downloads/outputs/mlflow-model")

['conda.yaml', 'MLmodel', 'model.pkl', 'python_env.yaml', 'requirements.txt']

#### **4. Register Best Model and Deploy**.
##### 4.1 Create managed online endpoint

In [44]:
# import required libraries
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment,
    CodeConfiguration,
    ProbeSettings,
)
from azure.ai.ml.constants import ModelType

In [47]:
# Creating a unique endpoint name with current datetime to avoid conflicts
import datetime

online_endpoint_name = "rainpredictionaus-" + datetime.datetime.now().strftime("%m%d%H%M%f")

# create an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="this is a sample online endpoint for mlflow model",
    auth_mode="key",
    tags={"foo": "bar"},
)

In [48]:
ml_client.begin_create_or_update(endpoint).result()

ManagedOnlineEndpoint({'public_network_access': 'Enabled', 'provisioning_state': 'Succeeded', 'scoring_uri': 'https://rainpredictionaus-10280538867760.eastus.inference.ml.azure.com/score', 'openapi_uri': 'https://rainpredictionaus-10280538867760.eastus.inference.ml.azure.com/swagger.json', 'name': 'rainpredictionaus-10280538867760', 'description': 'this is a sample online endpoint for mlflow model', 'tags': {'foo': 'bar'}, 'properties': {'azureml.onlineendpointid': '/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourcegroups/diplomado_ucb/providers/microsoft.machinelearningservices/workspaces/ws-rain-prediction/onlineendpoints/rainpredictionaus-10280538867760', 'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/providers/Microsoft.MachineLearningServices/locations/eastus/mfeOperationsStatus/oe:fe38c996-ca2d-4473-bf5b-739c564da9f0:c365c9a8-e259-4af5-8e3e-9b6ade4bb1e7?api-version=2022-02-01-preview'}, 'print_as_yaml': True, '

##### 4.1 Register best model and deploy

In [49]:
model_name = "rain-prediction-aus-model"
model = Model(
    path=f"azureml://jobs/{best_run.info.run_id}/outputs/artifacts/outputs/mlflow-model/",
    name=model_name,
    description="my sample classification model",
    type=AssetTypes.MLFLOW_MODEL,
)

# for downloaded file
# model = Model(path="artifact_downloads/outputs/model.pkl", name=model_name)

registered_model = ml_client.models.create_or_update(model)

In [50]:
registered_model.id

'/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourceGroups/diplomado_ucb/providers/Microsoft.MachineLearningServices/workspaces/ws-rain-prediction/models/rain-prediction-aus-model/versions/1'

In [51]:
from azure.ai.ml.entities import OnlineRequestSettings

# Setting the request timeout 90s
req_timeout = OnlineRequestSettings(request_timeout_ms=90000)

deployment = ManagedOnlineDeployment(
    name='rain-prediction-deploy',
    endpoint_name=online_endpoint_name,
    model=registered_model,
    instance_type="Standard_D2as_v4",  
    instance_count=1,
    request_settings=req_timeout
)

In [52]:
ml_client.online_deployments.begin_create_or_update(deployment).result()

Check: endpoint rainpredictionaus-10280538867760 exists


..............................................................................................

ManagedOnlineDeployment({'private_network_connection': None, 'provisioning_state': 'Succeeded', 'endpoint_name': 'rainpredictionaus-10280538867760', 'type': 'Managed', 'name': 'rain-prediction-deploy', 'description': None, 'tags': {}, 'properties': {'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/providers/Microsoft.MachineLearningServices/locations/eastus/mfeOperationsStatus/od:fe38c996-ca2d-4473-bf5b-739c564da9f0:5cefa5b3-9c7d-42f7-b343-ce14afe572ed?api-version=2023-04-01-preview'}, 'print_as_yaml': True, 'id': '/subscriptions/3deaa453-5a6c-4bcd-85f1-1645c3ccd539/resourceGroups/diplomado_ucb/providers/Microsoft.MachineLearningServices/workspaces/ws-rain-prediction/onlineEndpoints/rainpredictionaus-10280538867760/deployments/rain-prediction-deploy', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/aldunatelipac1/code/Users/aldunatelipac', 'creation_context': None, 'serialize': <msres

##### 4.2 Test the deployment

In [53]:
import json

print(test_data.iloc[0,:])

request_json = {
    "input_data":{
        "columns":["Location","MinTemp","MaxTemp","Rainfall",'WindGustDir','WindGustSpeed',
                    "WindDir9am","WindDir3pm","WindSpeed9am","WindSpeed3pm","Humidity9am",
                    "Humidity3pm","Pressure9am","Pressure3pm","RainToday"],
        "data":   [{
                    "Location":21.0,
                    "MinTemp":6.1,
                    "MaxTemp":20.7,
                    "Rainfall":0.0,
                    "WindGustDir":0.0,
                    "WindGustSpeed":30.0,
                    "WindDir9am":0.0,
                    "WindDir3pm":8.0,
                    "WindSpeed9am":17.0,
                    "WindSpeed3pm":9.0,
                    "Humidity9am":68.0,
                    "Humidity3pm":39.0,
                    "Pressure9am":1026.0,
                    "Pressure3pm":1021.9,
                    "RainToday":0.0
                  }],
    }
}

Location           21.0
MinTemp             6.1
MaxTemp            20.7
Rainfall            0.0
WindGustDir         0.0
WindGustSpeed      30.0
WindDir9am          0.0
WindDir3pm          8.0
WindSpeed9am       17.0
WindSpeed3pm        9.0
Humidity9am        68.0
Humidity3pm        39.0
Pressure9am      1026.0
Pressure3pm      1021.9
RainToday           0.0
RainTomorrow        0.0
Name: 12670, dtype: float64


In [54]:
request_file_name = "sample_request_data.json"
with open(request_file_name, "w") as request_file:
    json.dump(request_json, request_file)

resp = ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name=deployment.name,
    request_file=request_file_name,
)
print('Predicción con el mejor modelo')
resp

Predicción con el mejor modelo


'[0]'

In [57]:
import json
print(test_data.iloc[2,:])
request_json = {
    "input_data":{
        "columns":["Location","MinTemp","MaxTemp","Rainfall",'WindGustDir','WindGustSpeed',
                    "WindDir9am","WindDir3pm","WindSpeed9am","WindSpeed3pm","Humidity9am",
                    "Humidity3pm","Pressure9am","Pressure3pm","RainToday"],
        "data":   [{
                    "Location":15.0,
                    "MinTemp":15.1,
                    "MaxTemp":22.7,
                    "Rainfall":5.8,
                    "WindGustDir":6.0,
                    "WindGustSpeed":65.0,
                    "WindDir9am":6.74,
                    "WindDir3pm":5.0,
                    "WindSpeed9am":2.0,
                    "WindSpeed3pm":9.0,
                    "Humidity9am":7.0,
                    "Humidity3pm":82.0,
                    "Pressure9am":1006.0,
                    "Pressure3pm":1002.9,
                    "RainToday":1.0
                  }],
    }
}


Location           15.000000
MinTemp            15.100000
MaxTemp            22.700000
Rainfall            5.800000
WindGustDir         6.000000
WindGustSpeed      65.000000
WindDir9am          6.742962
WindDir3pm          5.000000
WindSpeed9am        2.000000
WindSpeed3pm        7.000000
Humidity9am        94.000000
Humidity3pm        82.000000
Pressure9am      1006.200000
Pressure3pm      1002.100000
RainToday           1.000000
RainTomorrow        1.000000
Name: 111579, dtype: float64


In [56]:
request_file_name = "sample_request_data.json"
with open(request_file_name, "w") as request_file:
    json.dump(request_json, request_file)

resp = ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name=deployment.name,
    request_file=request_file_name,
)
resp

'[1]'

In [58]:
os.getcwd()

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/aldunatelipac1/code/Users/aldunatelipac'

In [61]:
import shutil


shutil.make_archive(output_filename, 'zip', os.getcwd())

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/aldunatelipac1/code/Users/aldunatelipac/20231028_ochestrator_prediction-rain_aus.zip'