In [None]:
%pip install -U azure-ai-ml>=1.10
%pip install -U 'azureml-rag[azure,cognitive_search]>=0.2.2'

In [2]:
%%writefile config.json
{
    "subscription_id": "6a01260f-39d6-415f-a6c9-cf4fd479cbec",
    "resource_group": "sriks-ml-rg",
    "workspace_name": "sriks-ml-sea"
}

Overwriting config.json


In [3]:
from azure.identity import DefaultAzureCredential, AzureCliCredential
from azure.ai.ml import MLClient

# Try DefaultAzureCredential first and validate it eagerly.
try:
    credential = DefaultAzureCredential()
    # Request a management-plane token up front so an unusable credential
    # fails here instead of on the first MLClient call.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to the Azure CLI login (`az login`) when DefaultAzureCredential
    # does not work; report the reason instead of swallowing it silently.
    print("DefaultAzureCredential failed, falling back to AzureCliCredential:", ex)
    credential = AzureCliCredential()

# Workspace details are picked up from the config.json found by from_config.
ml_client = MLClient.from_config(credential=credential)

Found the config file in: ./config.json


In [3]:
from azure.ai.ml.entities import AmlCompute
experiment_name = "doc-summarization-hf"
# If you already have a gpu cluster, mention it here. Else will create a new one
compute_cluster = "AMLComputeCluster"
# VM size used only when the cluster has to be created. Kept in one variable so
# the log message below can never disagree with the size actually requested.
# Info on the NCasT4_v3 SKU: https://learn.microsoft.com/en-us/azure/virtual-machines/nct4-v3-series
compute_size = "Standard_NC4as_T4_v3"
try:
    compute = ml_client.compute.get(compute_cluster)
    print("successfully fetched compute:", compute.name)
except Exception as ex:
    # Any failure to fetch (including "not found") leads to a create attempt;
    # print the reason so auth/network problems are not mistaken for a missing cluster.
    print("failed to fetch compute:", compute_cluster, "-", ex)
    print(f"creating new {compute_size} compute")
    compute = AmlCompute(
        name=compute_cluster,
        size=compute_size,
        # NOTE(review): min_instances=1 presumably keeps one node allocated while
        # idle; confirm this is intended rather than 0 (scale to zero).
        min_instances=1,
        max_instances=2,  # For multi node training set this to an integer value more than 1
    )
    # Block until the long-running create/update operation has finished.
    ml_client.compute.begin_create_or_update(compute).wait()
    print("successfully created compute:", compute.name)

In [None]:
# Build and register the training environment from the local Docker context in ./env
from azure.ai.ml.entities import Environment, BuildContext

# Name under which the environment is registered in the workspace.
Env_Name = "finetuning_hf"

# Describe an environment that is built from the Docker context in the local "env" folder.
docker_build_context = BuildContext(path="env")
env_docker_context = Environment(
    name=Env_Name,
    description="Environment created from a Docker context.",
    build=docker_build_context,
)

# Last expression of the cell: the returned entity is shown as the cell output.
ml_client.environments.create_or_update(env_docker_context)

In [None]:
from azure.ai.ml import command, Input, Output
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

from azure.ai.ml.entities import (
    VsCodeJobService,
    TensorBoardJobService,
    JupyterLabJobService,
)

# Command line executed on the compute target. The backslashes continue the
# string literal, so the leading spaces of each line are part of the string.
training_command = "python finetune_hf_models.py \
        --model_name google/flan-t5-small \
        --dataset squad \
        --target_input_length=512 \
        --target_max_length=100 \
        --train_size=1000"

# Interactive services attached to the job.
# For distributed jobs, use the `nodes` property to pick which node you want to
# enable interactive services on. If `nodes` are not selected, by default,
# interactive applications are only enabled on the head node. Values are "all",
# or compute node index (for ex. "0", "1" etc.)
interactive_services = {
    "My_jupyterlab": JupyterLabJobService(nodes="all"),
    "My_vscode": VsCodeJobService(nodes="all"),
    "My_tensorboard": TensorBoardJobService(
        nodes="all",
        # relative path of Tensorboard logs (same as in your training script)
        log_dir="outputs/runs",
    ),
}

# Job definition: uploads the current directory as code and runs the command
# on the cluster named by `compute_cluster`.
job_spec = command(
    code=".",
    command=training_command,
    compute=compute_cluster,
    services=interactive_services,
    # Environment referenced by name, using its latest version.
    environment="finetuning_hf@latest",
    instance_count=1,
    display_name="hf_finetuning",
)

# Submit the job; `job` holds the entity returned by the service and is
# displayed as the cell output.
job = ml_client.jobs.create_or_update(job_spec)
job