# Ejecutar script en Azure ML

## Conectar al workspace

In [None]:
# Show the installed azure-ai-ml package details to confirm the SDK is available in this kernel.
! pip show azure-ai-ml

In [None]:
# conectar al workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the credential first, then let MLClient load the workspace details
# through from_config.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# Report which workspace the client ended up bound to.
workspace_name = ml_client.workspace_name
print(f"Conectado al Workspace: {workspace_name}")

## Datos

In [None]:
# MLTABLE
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Local folder that is registered as the data asset.
# NOTE(review): because the asset type is MLTABLE, this folder presumably
# holds an MLTable definition next to the data file — confirm.
my_path = '/home/azureuser/cloudfiles/code/Users/formacion/azure-machine-learning/1.Introduccion_AML_Pipelines/ejercicios/2.script_housing_data/data'

# Describe the asset; the description is stored as metadata on the registered
# asset, so the typo in the original text ("dara") is corrected here.
my_data = Data(
    path=my_path,
    type=AssetTypes.MLTABLE,
    description="Housing data",
    name="housing-data-mltable-ric-2",
)

# Register the asset (or update it) in the connected workspace.
ml_client.data.create_or_update(my_data)

## Usar el SDK de Python para entrenar un modelo

In [None]:
%%writefile /home/azureuser/cloudfiles/code/Users/formacion/azure-machine-learning/1.Introduccion_AML_Pipelines/ejercicios/2.script_housing_data/src/house-training.py

# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import argparse
from pathlib import Path

def main(args):
    """Train a Lasso regression on the housing data and print its metrics.

    Reads ``house_prices.csv`` from the folder in ``args.input_data``, drops
    the date/location columns, fits a scaling + one-hot + Lasso pipeline on an
    80/20 train/test split and prints the test MSE, the variance of the test
    target and the ratio between the two.

    Args:
        args: Parsed command-line namespace; only ``args.input_data`` is read.
    """
    # Step 1: load the CSV expected inside the input folder.
    csv_location = args.input_data+"/house_prices.csv"
    frame = pd.read_csv(csv_location)

    # Step 2: show the available columns in the log.
    print(frame.columns)

    # Step 3: remove the date and location columns, which are not used as
    # predictors.
    frame = frame.drop(columns=['date', 'street', 'city', 'statezip', 'country'])

    # Step 4: split predictors from the target ('price').
    features = frame.drop(columns=['price'])
    target = frame['price']

    # Step 5: hold out 20% of the rows for evaluation (fixed seed).
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Step 6: scale numeric columns and one-hot encode text columns.
    numeric_cols = features.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = features.select_dtypes(include=['object']).columns
    scaling_step = ('num', StandardScaler(), numeric_cols)
    encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

    # Step 7: chain preprocessing with the Lasso regressor
    # (alpha is the regularisation strength).
    regressor = Lasso(alpha=1, random_state=42)
    lasso_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])

    # Steps 8-9: fit on the training split, predict on the held-out split.
    lasso_model.fit(features_train, target_train)
    predictions = lasso_model.predict(features_test)

    # Step 10: mean squared error on the held-out split.
    test_mse = mean_squared_error(target_test, predictions)
    print(f"Error Cuadrático Medio (MSE): {test_mse:.2f}")

    # Step 11: variance of the held-out target values.
    target_variance = np.var(target_test)
    print(f"Varianza de los datos objetivo: {target_variance:.2f}")

    # Step 12: express the MSE relative to that variance.
    ratio = test_mse / target_variance
    print(f"Relación MSE/Varianza: {ratio:.2f}")


def parse_args():
    """Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace: Namespace whose ``input_data`` attribute holds the
        value passed with ``--input_data``.

    Raises:
        SystemExit: Raised by ``argparse`` (exit code 2) when ``--input_data``
            is not supplied.
    """
    parser = argparse.ArgumentParser()

    # The argument is required: without it ``input_data`` would be None and
    # the script would only fail later, when the CSV path is built from it.
    parser.add_argument(
        "--input_data",
        dest="input_data",
        type=str,
        required=True,
        help="Folder that contains the house_prices.csv file.",
    )

    args = parser.parse_args()

    # Echo the resolved input location so it is visible in the job logs.
    print('reading from:', args.input_data)

    return args


# script entry point
if __name__ == "__main__":
    separator = "*" * 60

    # opening banner so the run stands out in the logs
    print("\n\n")
    print(separator)

    # read the command-line arguments, then run the training routine
    args = parse_args()
    main(args)

    # closing banner
    print(separator)
    print("\n\n")

## Generar el job

In [None]:
from azure.ai.ml import command, MLClient
from azure.ai.ml.entities import Environment
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes, InputOutputModes

# Job inputs: a single MLTABLE input is declared (no outputs are configured).
# NOTE(review): this references data asset "housing-data-mltable-ric" version 1,
# whereas the data-registration cell creates "housing-data-mltable-ric-2" —
# confirm which asset the job is meant to consume.
housing_input = Input(type=AssetTypes.MLTABLE, path="azureml:housing-data-mltable-ric:1")
my_job_inputs = {"local_data": housing_input}

# Settings of the command job: the source folder to upload, the command line
# to run, and the environment/compute it runs on.
job_settings = dict(
    code="/home/azureuser/cloudfiles/code/Users/formacion/azure-machine-learning/1.Introduccion_AML_Pipelines/ejercicios/2.script_housing_data/src",
    command="python house-training.py --input_data ${{inputs.local_data}}",
    inputs=my_job_inputs,
    environment="azureml://registries/azureml/environments/sklearn-1.5/versions/26",
    compute="ricardoenm",
    display_name="housing-training-data-ric",
    experiment_name="housing-training-data-ric",
)
job = command(**job_settings)

# Submit the job and print the object returned by the service.
returned_job = ml_client.jobs.create_or_update(job)
print(f"Experimento enviado: {returned_job}")

