#### Creating a handle to the workspace

In [70]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the credential object used to authenticate against Azure.
credential = DefaultAzureCredential()

# Handle to the Azure ML workspace. The three identifiers below are
# placeholders (the author's real account details were removed on purpose);
# fill in your own values before running this cell.
ml_client = MLClient(
    workspace_name="NAME OF YOUR WORKSPACE",
    resource_group_name="NAME OF YOUR RESOURCE GROUP",
    subscription_id="ENTER YOUR SUBSCRIPTION ID",
    credential=credential,
)

In [71]:
import os

# Local folder for dependency files; created if missing, left untouched if it
# already exists (so re-running the cell is harmless).
dependencies_dir = "./dependencies"
os.makedirs(name=dependencies_dir, exist_ok=True)

In [72]:
import os

# Local folder that receives the training script written by the next cell;
# created if missing, left untouched if it already exists.
train_src_dir = "./src"
os.makedirs(name=train_src_dir, exist_ok=True)

In [73]:
%%writefile {train_src_dir}/main.py
import os
import argparse
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, f1_score, auc, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
    
def roc_auc_plot(Y_test, Y_preds):
    """Plot the ROC curve for the given labels and predictions and log it.

    The curve and its area are computed with ``roc_curve`` / ``auc``. The
    figure is saved as 'roc.png' in the current working directory, passed to
    ``mlflow.log_artifact``, and the pyplot figure is closed afterwards.

    Args:
        Y_test: true labels, forwarded to ``roc_curve`` as the first argument.
        Y_preds: predicted values, forwarded to ``roc_curve`` as the second argument.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(Y_test, Y_preds)
    area = auc(false_pos_rate, true_pos_rate)

    plt.title('Receiver Operating Characteristic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # The labelled curve is drawn first so the legend only lists the AUC entry.
    plt.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], 'r--')

    output_file = 'roc.png'
    plt.savefig(output_file)
    mlflow.log_artifact(output_file)
    plt.close()

def confusion_matrix_plot(Y_test, Y_preds):
    """Render a 2x2 confusion matrix as an annotated heat map and log it.

    Each of the four cells is annotated with its role (TN/FP/FN/TP) and count.
    The figure is saved as 'confusion_matrix.png' in the current working
    directory, passed to ``mlflow.log_artifact``, and then closed.

    Args:
        Y_test: true labels, forwarded to ``confusion_matrix``.
        Y_preds: predicted labels, forwarded to ``confusion_matrix``.
    """
    matrix = confusion_matrix(Y_test, Y_preds)
    class_names = ['0', '1']
    cell_tags = [['TN', 'FP'], ['FN', 'TP']]
    positions = np.arange(len(class_names))

    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Wistia)
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.xticks(positions, class_names, rotation=45)
    plt.yticks(positions, class_names)

    # Rows index the true label, columns the predicted label; plt.text takes (x, y),
    # hence the column comes first.
    for row, tags in enumerate(cell_tags):
        for col, tag in enumerate(tags):
            plt.text(col, row, " = ".join((str(tag), str(matrix[row][col]))))

    output_file = 'confusion_matrix.png'
    plt.savefig(output_file)
    mlflow.log_artifact(output_file)
    plt.close()

def main():
    """Train, evaluate and register an XGBoost classifier on the diabetes CSV.

    Command-line arguments:
        --data: path to the input CSV (required). The file must contain a
            'diabetes' target column and the 'gender' and 'smoking_history'
            categorical columns.
        --test_train_ratio: fraction of rows held out for testing (default 0.25).
        --registered_model_name: name under which the model is registered and
            the directory prefix used when saving it locally (required).
        --proba_threshold: positive-class probability at or above which a
            sample is predicted as class 1 (default 0.20).

    Side effects: logs metrics, plots and the model to MLflow, and saves the
    model under '<registered_model_name>/trained_model'.
    """

    ##################################
    #<initialize arguments and mlflow>
    ##################################

    # input and output arguments
    parser = argparse.ArgumentParser()
    # Both of these are needed further down (read_csv / model registration);
    # declaring them required makes a missing value fail before any training.
    parser.add_argument("--data", type=str, required=True, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--registered_model_name", type=str, required=True, help="model name")
    parser.add_argument("--proba_threshold", type=float, required=False, default=0.20)

    args = parser.parse_args()

    # Start Logging. Using the run as a context manager ensures the run is
    # ended even when one of the steps below raises.
    with mlflow.start_run():

        # enable autologging
        mlflow.xgboost.autolog()

        ###################
        #<prepare the data>
        ###################
        print(" ".join(f"{k}={v}" for k, v in vars(args).items()))

        print("input data:", args.data)

        diabetes_df = pd.read_csv(args.data)

        mlflow.log_metric("num_samples", diabetes_df.shape[0])
        # One column is the target, so it is not counted as a feature.
        mlflow.log_metric("num_features", diabetes_df.shape[1] - 1)

        categorical_columns = ['gender','smoking_history']

        label_encoder = LabelEncoder()

        # fit_transform refits the encoder for every column, so each column
        # gets its own integer mapping.
        for col in categorical_columns:
            diabetes_df[col] = label_encoder.fit_transform(diabetes_df[col])

        X = diabetes_df.drop('diabetes', axis=1)
        Y = diabetes_df['diabetes']

        X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=args.test_train_ratio)

        ##################
        #<train the model>
        ###################
        print(f"Training with data of shape {X_train.shape}")

        model = XGBClassifier()
        model.fit(X_train, Y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        positive_proba = model.predict_proba(X_test)[:, 1]
        Y_pred = (positive_proba >= args.proba_threshold).astype(int)

        # Accuracy and F1 are threshold-dependent, so they use the hard labels.
        mlflow.log_metric("accuracy", accuracy_score(Y_test,Y_pred))
        mlflow.log_metric("f1 score", f1_score(Y_test,Y_pred))
        # ROC AUC is a ranking metric: it must be computed from the continuous
        # scores, not from labels already cut at a single threshold.
        mlflow.log_metric("roc auc score", roc_auc_score(Y_test, positive_proba))

        roc_auc_plot(Y_test, positive_proba)
        confusion_matrix_plot(Y_test,Y_pred)

        print(classification_report(Y_test, Y_pred))

        ##########################
        #<save and register model>
        ##########################
        # Registering the model to the workspace
        print("Registering the model via MLFlow")
        mlflow.xgboost.log_model(
            xgb_model=model,
            registered_model_name=args.registered_model_name,
            artifact_path=args.registered_model_name,
        )

        # Saving the model to a file
        mlflow.xgboost.save_model(
            xgb_model=model,
            path=os.path.join(args.registered_model_name, "trained_model"),
        )

    # Leaving the `with` block above stops logging (ends the MLflow run).

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Overwriting ./src/main.py


In [74]:
from azure.ai.ml import command
from azure.ai.ml import Input

# Name under which the trained model is registered; also forwarded to main.py.
registered_model_name = "diabetes_xgboost_model"

# Command job that runs src/main.py on the named compute target.
# NOTE(review): the datastore path and the compute name below refer to
# resources in the author's workspace - replace them with your own.
job = command(
    inputs=dict(
        data=Input(
            type="uri_file",
            # Workspace-relative datastore URI: it is resolved against the
            # workspace the job is submitted to, so no subscription ID,
            # resource group or workspace name is hard-coded here.
            path="azureml://datastores/workspaceblobstore/paths/UI/2023-06-01_132255_UTC/diabetes_prediction_dataset.csv",
        ),
        test_train_ratio=0.25,
        proba_threshold=0.20,
        registered_model_name=registered_model_name,
    ),
    code="./src/",  # location of source code
    # Each ${{inputs.<name>}} placeholder refers to the matching key in `inputs` above.
    command="python main.py --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} --proba_threshold ${{inputs.proba_threshold}} --registered_model_name ${{inputs.registered_model_name}}",
    environment="AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu@latest",
    compute="khammitt1-devlight-ds11",
    experiment_name="train_model_xgboost_diabetes",
    display_name="xgboost_diabetes",
)

In [75]:
# Submit the command job through the workspace client. Kept as the cell's last
# expression so the returned job is rendered as the cell output.
ml_client.create_or_update(job)

[32mUploading src (0.0 MBs):   0%|          | 0/4156 [00:00<?, ?it/s][32mUploading src (0.0 MBs): 100%|██████████| 4156/4156 [00:00<00:00, 345356.57it/s]
[39m



Experiment,Name,Type,Status,Details Page
train_model_xgboost_diabetes,great_lychee_3718564h18,command,Starting,Link to Azure Machine Learning studio
