# Fine-tuning Hugging Face models on Azure ML

This notebook explains how to create end-to-end lineage for Hugging Face models on Azure Machine Learning.

In [12]:
from azure.identity import DefaultAzureCredential, AzureCliCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import AmlCompute, ComputeInstance
from azure.ai.ml.entities import Environment, BuildContext
from azure.ai.ml import command, Input, Output
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.constants import AssetTypes, InputOutputModes

### Create AML Compute

In [7]:
def create_ml_client(credential=None):
    """Build an ``MLClient`` from the workspace config file.

    Args:
        credential: Credential object to authenticate with. When ``None``,
            ``DefaultAzureCredential`` is tried first and ``AzureCliCredential``
            is used if it cannot obtain a management token.

    Returns:
        The client returned by ``MLClient.from_config``.
    """
    # A caller-supplied credential is used as-is, without probing it.
    if credential is not None:
        return MLClient.from_config(credential=credential)

    try:
        resolved = DefaultAzureCredential()
        # Probe the credential so that a broken default chain is detected here.
        resolved.get_token("https://management.azure.com/.default")
    except Exception:
        # Fall back to the Azure CLI login when the default chain does not work.
        resolved = AzureCliCredential()

    return MLClient.from_config(credential=resolved)

def create_aml_cluster(
    ml_client,
    gpu_cluster_name="AmlComputeCluster",
    vm_size="Standard_NC4as_T4_v3",
    min_nodes=1,
    max_nodes=2,
):
    """Return the named AML compute cluster, creating it when it cannot be fetched.

    Args:
        ml_client: Client whose ``compute`` operations are used.
        gpu_cluster_name: Name of the cluster to fetch or create.
        vm_size: VM size used only when a new cluster is created.
        min_nodes: Minimum instance count for a newly created cluster.
        max_nodes: Maximum instance count for a newly created cluster; for
            multi-node training set this to an integer greater than 1.

    Returns:
        The fetched compute, or the ``AmlCompute`` definition that was
        submitted for creation.
    """
    try:
        existing = ml_client.compute.get(gpu_cluster_name)
        print("successfully fetched compute:", existing.name)
        return existing
    except Exception:
        # Any failure to fetch is treated as "cluster does not exist yet".
        print("failed to fetch compute:", gpu_cluster_name)
        print(f"creating new {vm_size} compute")

    cluster = AmlCompute(
        name=gpu_cluster_name,
        size=vm_size,
        min_instances=min_nodes,
        max_instances=max_nodes,
    )
    # Block until the create/update operation has finished.
    poller = ml_client.compute.begin_create_or_update(cluster)
    poller.wait()
    print("successfully created compute:", cluster.name)
    return cluster

def create_aml_node(ml_client, node_name="vism-cpu-8c", vm_size="Standard_E4ds_v4"):
    """Return the named compute instance, creating it when it cannot be fetched.

    Args:
        ml_client: Client whose ``compute`` operations are used.
        node_name: Name of the compute instance to fetch or create.
        vm_size: VM size used only when a new instance is created.

    Returns:
        The fetched compute, or the ``ComputeInstance`` definition that was
        submitted for creation.
    """
    try:
        existing = ml_client.compute.get(node_name)
        print("successfully fetched compute:", existing.name)
        return existing
    except Exception:
        # Any failure to fetch is treated as "instance does not exist yet".
        print("failed to fetch compute:", node_name)
        print(f"creating new {vm_size} compute")

    instance = ComputeInstance(name=node_name, size=vm_size)
    # Block until the create/update operation has finished.
    poller = ml_client.compute.begin_create_or_update(instance)
    poller.wait()
    print("successfully created compute:", instance.name)
    return instance

### Create Custom Environment

In [8]:
def create_environment(ml_client, env_Name = "finetune_hf_lora"):
    """Return the latest registered environment, building it from ``src/env`` if needed.

    Args:
        ml_client: Client whose ``environments`` operations are used.
        env_Name: Name of the environment to fetch or create.

    Returns:
        The environment fetched from the workspace, or the environment
        returned by ``create_or_update`` when a new one had to be registered.
    """
    try:
        # "latest" is a label, not a version number, so it must be passed as
        # ``label``; passing it as ``version`` looks up a version literally
        # named "latest".
        env = ml_client.environments.get(env_Name, label="latest")
        print("successfully fetched environment:", env.name)
        # Return the fetched environment directly; the Docker-context
        # definition below only exists on the creation path.
        return env
    except Exception:
        # Any failure to fetch is treated as "environment does not exist yet".
        print("failed to fetch environment:", env_Name)
        print(f"creating new environment {env_Name}")

    env_docker_context = Environment(
        build=BuildContext(path="src/env"),
        name=env_Name,
        description="Environment created from a Docker context.",
    )
    env = ml_client.environments.create_or_update(env_docker_context)
    print("successfully created environment:", env.name)
    return env

### Create Azure ML Dataset

In [9]:
# Create Azure ML dataset from local file
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Name and version under which the dataset is registered in the workspace.
VERSION = "1.0"
NAME = "squad_dev_v1"


def create_dataset(ml_client):
    """Register the local SQuAD JSON file as a ``uri_file`` data asset.

    Args:
        ml_client: Client whose ``data`` operations are used.

    Returns:
        Whatever ``ml_client.data.create_or_update`` returns for the asset.
    """
    squad_asset = Data(
        name=NAME,
        version=VERSION,
        description="SquAD v1.0 dev dataset",
        path="./data/squad.json",
        type=AssetTypes.URI_FILE,
    )
    registered = ml_client.data.create_or_update(squad_asset)
    return registered

# Register the dataset only when the expected name/version cannot be fetched.
ml_client = create_ml_client()
try:
    ml_client.data.get(NAME, VERSION)
except Exception:
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not swallowed while the lookup runs.
    create_dataset(ml_client)

Found the config file in: ./config.json


### Create Azure ML pipeline job

In [13]:
# Names of the workspace resources used by the pipeline.
gpu_cluster_name = "AmlComputeCluster"
cpu_compute_name = "vism-cpu-8c"
env_name = "finetune_hf_lora"

# Create the client before anything in this cell uses it, so the cell does not
# depend on an ``ml_client`` left over from an earlier cell.
ml_client = create_ml_client()
data_asset = ml_client.data.get(NAME, version=VERSION)
mlflow_tracking_uri = ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri

# Fetch (or create) the compute targets and the training environment.
compute_cluster = create_aml_cluster(ml_client, gpu_cluster_name=gpu_cluster_name)
compute_instance = create_aml_node(ml_client, node_name=cpu_compute_name)
# Pass env_name explicitly so the environment fetched here is the same one the
# training component references via f"{env_name}@latest".
env_docker_context = create_environment(ml_client, env_Name=env_name)

Found the config file in: ./config.json


successfully fetched compute: AMLComputeCluster
successfully fetched compute: vism-cpu-8c
failed to fetch environment: finetune_hf_lora
creating new environment finetune_hf_lora
successfully created environment: finetune_hf_lora


In [14]:
# Component inputs: the mounted data file and an MLflow tracking URI that
# defaults to an empty string here.
train_inputs = {
    "data": Input(mode=InputOutputModes.MOUNT, type=AssetTypes.URI_FILE, destination_path_on_compute="data"),
    "mlflow_tracking_uri": "",
}

# Component output: a single read/write-mounted folder.
train_outputs = {"train_output": Output(type="uri_folder", mode="rw_mount")}

# Command line executed inside the component; the trailing backslashes join the
# lines of the literal into a single command string.
train_command = """python train_component.py \
            --data_path ${{inputs.data}} \
            --target_input_length=512 \
            --model_name "google/flan-t5-small" \
            --num_epochs 1 \
            --mlflow_tracking_uri "${{inputs.mlflow_tracking_uri}}" \
            --target_max_length=100 \
            --train_size=1000"""

training_component = command(
    name="train_hf_qna_squad_dev_v1",
    display_name="Fine tune HF model",
    description="Fine tune HF model on Squad dataset for QnA task",
    inputs=train_inputs,
    outputs=train_outputs,
    # The source folder of the component
    code="./src",
    command=train_command,
    environment=f"{env_name}@latest",
)

# Register the component in the workspace; the name is rebound to the
# registered component returned by the service.
training_component = ml_client.create_or_update(training_component.component)
print(f"Component {training_component.name} with Version {training_component.version} is registered")

[32mUploading src (0.01 MBs): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12544/12544 [00:00<00:00, 162351.80i

Component train_hf_qna_squad_dev_v1 with Version 2024-06-27-05-01-35-8703317 is registered


In [15]:
# Pipeline input: the registered data asset, referenced by id and mounted as a single file.
inputs = Input(path=data_asset.id, mode=InputOutputModes.MOUNT, type=AssetTypes.URI_FILE, destination_path_on_compute="data")

# Pipeline definition with a single step built from training_component.
# NOTE(review): the function name, docstring, parameter name and the local
# variable name `train_node` may all surface in the submitted pipeline
# definition — confirm before renaming any of them.
@pipeline(
    default_compute=gpu_cluster_name,
)
def finetune_hfmodels_azureml_pipeline(pipeline_input_data):
    """E2E Hugging face Q and A model using huggingface, peft, azureml and python sdk."""
    # Wire the pipeline input and the workspace MLflow tracking URI into the training step.
    train_node = training_component(data=pipeline_input_data, mlflow_tracking_uri=mlflow_tracking_uri)
    # NOTE(review): the step's compute is set to cpu_compute_name even though the
    # decorator names gpu_cluster_name as default_compute — confirm that training
    # on the CPU compute is intended.
    train_node.compute = cpu_compute_name

# create a pipeline
pipeline_job = finetune_hfmodels_azureml_pipeline(pipeline_input_data=inputs)
# Submit the pipeline under the "finetuning_hf" experiment; pipeline_job is
# rebound to the object returned by create_or_update.
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="finetuning_hf")

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


In [18]:
import time

# Seconds to wait between status checks, and the statuses that end the polling.
POLL_INTERVAL_SECONDS = 30
TERMINAL_STATUSES = ("Completed", "Failed", "Canceled")

print(f"Pipeline {pipeline_job.name} created")
job_status = ml_client.jobs.get(pipeline_job.name)
print(f"Pipeline {pipeline_job.name} status: {job_status.status}")
while job_status.status not in TERMINAL_STATUSES:
    # Sleep before re-polling (not after): this avoids an immediate duplicate
    # fetch on the first iteration and lets the loop exit as soon as a terminal
    # status has been observed instead of waiting one more interval.
    time.sleep(POLL_INTERVAL_SECONDS)
    job_status = ml_client.jobs.get(pipeline_job.name)
    print(f"Pipeline {pipeline_job.name} status: {job_status.status}")

# ml_client.jobs.stream(pipeline_job.name)

Pipeline nice_dream_j74y44grmy created
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j74y44grmy status: Running
Pipeline nice_dream_j

KeyboardInterrupt: 