# Finetuning Huggingface models on Azure ML

This notebook explains how to create an end-to-end lineage for hugging face models on Azure.

In [1]:
from azure.identity import DefaultAzureCredential, AzureCliCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import AmlCompute, ComputeInstance
from azure.ai.ml.entities import Environment, BuildContext
from azure.ai.ml import command, Input, Output

ModuleNotFoundError: No module named 'azure'

### Create AML Compute

In [26]:
# Function to create MLClient
def create_ml_client(credential=None):     
    if credential is None:
        try:
            credential = DefaultAzureCredential()
            # Check if given credential can get token successfully.
            credential.get_token("https://management.azure.com/.default")
        except Exception as ex:
            # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
            credential = AzureCliCredential()

    return MLClient.from_config(credential=credential)

# Function to create AML Compute Cluster
def create_aml_cluster(ml_client, gpu_cluster_name = "AmlComputeCluster", vm_size = "Standard_NC4as_T4_v3", min_nodes = 1, max_nodes = 2):
    # If you already have a gpu cluster, mention it here. Else will create a new one    
    try:
        compute = ml_client.compute.get(gpu_cluster_name)
        print("successfully fetched compute:", compute.name)
    except Exception as ex:
        print("failed to fetch compute:", gpu_cluster_name)
        print(f"creating new {vm_size} compute")
        compute = AmlCompute(
            name=gpu_cluster_name,
            size=vm_size,
            min_instances=min_nodes,
            max_instances=max_nodes,  # For multi node training set this to an integer value more than 1
        )
        ml_client.compute.begin_create_or_update(compute).wait()
        print("successfully created compute:", compute.name)
    return compute

def create_aml_node(ml_client, node_name = "vism-cpu-4c", vm_size = "Standard_E4ds_v4"):
    try:
        compute = ml_client.compute.get(node_name)
        print("successfully fetched compute:", compute.name)
    except Exception as ex:
        print("failed to fetch compute:", node_name)
        print(f"creating new {vm_size} compute")
        compute = ComputeInstance(
            name=node_name,
            size=vm_size
        )
        ml_client.compute.begin_create_or_update(compute).wait()
        print("successfully created compute:", compute.name)
    return compute

### Create Custom Environment

In [27]:
# Function to create Environment
def create_environment(ml_client, env_Name = "finetune_hf_lora"):
    try:
        env = ml_client.environments.get(env_Name, version="latest")
        print("successfully fetched environment:", env.name)
    except Exception as ex:
        print("failed to fetch environment:", env_Name)
        print(f"creating new environment {env_Name}")
        env_docker_context = Environment(
            build=BuildContext(path="src/env"),
            name=env_Name,
            description="Environment created from a Docker context.",
        )
        ml_client.environments.create_or_update(env_docker_context)
        print("successfully created environment:", env_docker_context.name)
    return env_docker_context

### Create Azure ML Dataset

In [5]:
# Create Azure ML dataset from local file
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

VERSION = "1.0"
NAME = "squad_dev_v1"
def create_dataset(ml_client):
    # Create a dataset from local file
    dataset = Data(
        type=AssetTypes.URI_FILE,
        path="./data/squad.json",
        description="SquAD v1.0 dev dataset",
        name=NAME,
        version=VERSION,
    )
    return ml_client.data.create_or_update(dataset)

ml_client = create_ml_client()
try:
   ml_client.data.get(NAME, VERSION)
except:
   create_dataset(ml_client)

Found the config file in: ./config.json


### Create Azure ML Job

In [5]:
# from azure.ai.ml.constants import AssetTypes, InputOutputModes
# # Set the input for the job:
# data_asset = ml_client.data.get(NAME, version=VERSION)
# inputs = Input(path=data_asset.id, mode=InputOutputModes.MOUNT, type=AssetTypes.URI_FILE, destination_path_on_compute="data")
# mlflow_tracking_uri = ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri

# # Function to create Job
# def create_job(compute_cluster, script_file = "finetune_hf_models.py", job_name = "hf_finetuning", env_name = "finetune_hf_lora"):
#     job = command(
#         code=".",
#         command=f"python src/{script_file} \
#             --model_name google/flan-t5-small \
#             --num_epochs 1 \
#             --mlflow_tracking_uri {mlflow_tracking_uri} \
#             --data_path ${{inputs.data}} \
#             --target_input_length=512 \
#             --target_max_length=100 \
#             --train_size=1000",
#     compute=compute_cluster,
#     inputs={
#       "data": inputs
#     },
#     services={
#       "My_jupyterlab": JupyterLabJobService(
#         nodes="all" # For distributed jobs, use the `nodes` property to pick which node you want to enable interactive services on. If `nodes` are not selected, by default, interactive applications are only enabled on the head node. Values are "all", or compute node index (for ex. "0", "1" etc.)
#       ),
#       "My_vscode": VsCodeJobService(
#         nodes="all"
#       ),
#       "My_tensorboard": TensorBoardJobService(
#         nodes="all",
#         log_dir="outputs/runs"  # relative path of Tensorboard logs (same as in your training script)         
#       ),
#     },
#     environment=f"{env_name}@latest",
#     instance_count=1,  
#     display_name=job_name)
#     return job

NameError: name 'ml_client' is not defined

In [58]:
# gpu_cluster_name = "AmlComputeCluster"
# ml_client = create_ml_client()
# compute_cluster = create_aml_cluster(ml_client, gpu_cluster_name = gpu_cluster_name)
# env_docker_context = create_environment(ml_client)
# job = create_job(gpu_cluster_name)
# print(f"Creating job {job.display_name}")
# # Submit the job and wait for completion
# job = ml_client.jobs.create_or_update(job, show_output = True)
# print("Job created successfully")
# print(job.studio_url)   

Found the config file in: ./config.json


successfully fetched compute: AMLComputeCluster
failed to fetch environment: finetune_hf_lora
creating new environment finetune_hf_lora
successfully created environment: finetune_hf_lora
Creating job hf_finetuning


Uploading finetuning_hf_models (85.26 MBs): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85262513/85262513 [00:00<00:00, 115392488.33it/s]


Job created successfully
https://ml.azure.com/runs/willing_avocado_h5g2k7n8gc?wsid=/subscriptions/6a01260f-39d6-415f-a6c9-cf4fd479cbec/resourcegroups/sriks-ml-rg/workspaces/sriks-ml-sea&tid=16b3c013-d300-468d-ac64-7eda0820b6d3


### Create Azure ML pipeline job

In [6]:
from Users.vism.sriksml.azureml.finetuning_hf_models.src.prepare_component import prepare_data_component
from Users.vism.sriksml.azureml.finetuning_hf_models.src.train_component import training_component

help(prepare_data_component)
help(training_component)

2024-04-10 06:01:23.672648: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-10 06:01:26.220176: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-10 06:01:26.930770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /anaconda/envs/azureml_py38/lib/:/anaconda/envs/azureml_py38/lib/:/anaconda/envs/az

Help on function prepare_data_component in module src.prep.prepare_component:

prepare_data_component()

Help on function training_component in module src.train.train_component:

training_component(input_data: <mldesigner._input_output.Input object at 0x7f844d4ac7f0>, output_model: <mldesigner._input_output.Output object at 0x7f844d4acbb0>)



In [29]:
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.constants import AssetTypes, InputOutputModes

gpu_cluster_name = "AmlComputeCluster"
cpu_compute_name = "vism-cpu-4c"
env_name = "finetune_hf_lora"
data_asset = ml_client.data.get(NAME, version=VERSION)

ml_client = create_ml_client()
mlflow_tracking_uri = ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri
compute_cluster = create_aml_cluster(ml_client, gpu_cluster_name = gpu_cluster_name)
compute_instance = create_aml_node(ml_client, node_name = cpu_compute_name)
env_docker_context = create_environment(ml_client)

Found the config file in: ./config.json


successfully fetched compute: AMLComputeCluster
successfully fetched compute: vism-cpu-4c
failed to fetch environment: finetune_hf_lora
creating new environment finetune_hf_lora
successfully created environment: finetune_hf_lora


In [32]:
# Define the data preparation component using python SDK
# Alternatively we can declare components using YAML
# The component is defined using the command function
data_prep_component = command(
    name="data_prep_qna_squad_dev_v1",
    display_name="Squad dataset preparation component",
    description="Reads the SQuAD v1.0 dev dataset and prepares it for fine tuning HF models",
    inputs={
        "data": Input(mode=InputOutputModes.MOUNT, type=AssetTypes.URI_FILE, destination_path_on_compute="data")
    },
    outputs=dict(
        train_data=Output(type="uri_folder", mode="rw_mount")
    ),
    # The source folder of the component
    code='./src',
    command="""python prepare_component.py \
            --data_path ${{inputs.data}} \
            --target_input_length=512 \
            --target_max_length=100 \
            --train_size=1000""",
    environment=f"{env_name}@latest",
)

# Now we register the component to the workspace
data_prep_component = ml_client.create_or_update(data_prep_component.component)
print(f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered")

Component data_prep_qna_squad_dev_v1 with Version 2024-04-10-07-12-34-5797646 is registered


In [36]:
training_component = command(
    name="train_hf_qna_squad_dev_v1",
    display_name="Fine tune HF model",
    description="Fine tune HF model on Squad dataset for QnA task",
    inputs={
        "train_data": Input(mode=InputOutputModes.MOUNT, type="uri_folder")
    },
    outputs=dict(
        train_output=Output(type="uri_folder", mode="rw_mount")
    ),
    # The source folder of the component
    code='./src',
    command=""python prepare_component.py \
            --data_path ${{inputs.train_data}} \
            --target_input_length=512 \
            --model_name "google/flan-t5-small" \
            --num_epochs 1 \
            --mlflow_tracking_uri {mlflow_tracking_uri} \
            --target_max_length=100 \
            --train_size=1000"",
    environment=f"{env_name}@latest",
)

training_component = ml_client.create_or_update(training_component.component)
print(f"Component {training_component.name} with Version {training_component.version} is registered")

Component train_hf_qna_squad_dev_v1 with Version 1 is registered


In [37]:
inputs = Input(path=data_asset.id, mode=InputOutputModes.MOUNT, type=AssetTypes.URI_FILE, destination_path_on_compute="data")


@pipeline(
    default_compute=gpu_cluster_name,
)
def finetune_hfmodels_azureml_pipeline(pipeline_input_data):
    """E2E Hugging face Q and A model using huggingface, peft, azureml and python sdk."""
    prepare_data_node = data_prep_component(data=pipeline_input_data)
    prepare_data_node.compute = cpu_compute_name
    train_node = training_component(train_data=prepare_data_node.outputs.train_data)
    train_node.compute = gpu_cluster_name

# create a pipeline
pipeline_job = finetune_hfmodels_azureml_pipeline(pipeline_input_data=inputs)
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="finetuning_hf")
ml_client.jobs.stream(pipeline_job.name)


RunId: boring_yam_gd699rprr1
Web View: https://ml.azure.com/runs/boring_yam_gd699rprr1?wsid=/subscriptions/6a01260f-39d6-415f-a6c9-cf4fd479cbec/resourcegroups/sriks-ml-rg/workspaces/sriks-ml-sea

Streaming logs/azureml/executionlogs.txt

[2024-04-10 07:14:28Z] Submitting 1 runs, first five are: 251421c1:24b5eb0f-de3a-4e5c-ad50-42b453675075
[2024-04-10 07:20:53Z] Completing processing run id 24b5eb0f-de3a-4e5c-ad50-42b453675075.
[2024-04-10 07:20:54Z] Submitting 1 runs, first five are: 993c760a:f693c5fd-4d03-4b09-8fbd-17f6c82c082a
[2024-04-10 07:21:48Z] Execution of experiment failed, update experiment status and cancel running nodes.

Execution Summary
RunId: boring_yam_gd699rprr1
Web View: https://ml.azure.com/runs/boring_yam_gd699rprr1?wsid=/subscriptions/6a01260f-39d6-415f-a6c9-cf4fd479cbec/resourcegroups/sriks-ml-rg/workspaces/sriks-ml-sea


JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Pipeline has failed child jobs. Failed nodes: /train_node. For more details and logs, please go to the job detail page and check the child jobs.",
        "message_format": "Pipeline has failed child jobs. {0}",
        "message_parameters": {},
        "reference_code": "PipelineHasStepJobFailed",
        "details": []
    },
    "environment": "southeastasia",
    "location": "southeastasia",
    "time": "2024-04-10T07:21:48.302878Z",
    "component_name": ""
} 