In [3]:
# Import packages used by the following code snippets
import csv
import json
import os
import requests
import time

import pandas as pd

from azure.ai.ml import Input, MLClient
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml.entities import (
    AmlCompute,
    BatchDeployment,
    BatchEndpoint,
    BatchRetrySettings,
    Model,
)

In [2]:
from azure.identity import (
    DefaultAzureCredential,
    InteractiveBrowserCredential,
    ClientSecretCredential,
)
from azure.ai.ml.entities import AmlCompute
import time

try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    credential = InteractiveBrowserCredential()


ml_client = MLClient(
    credential=credential,
    subscription_id="ed4b0003-99cf-4ec0-88ba-68847f627acf",
    resource_group_name="srijon-resourcegroup1",
    workspace_name="srijon-workspace1",
)

# the models, fine tuning pipelines and environments are available in the AzureML system registry, "azureml"
registry_ml_client = MLClient(credential, registry_name="azureml")

In [4]:
testdata = requests.get(
    "https://datasets-server.huggingface.co/first-rows?dataset=librispeech_asr&config=clean&split=test&offset=0&limit=100"
).text
testdata = json.loads(testdata)

In [5]:
model_name = "openai-whisper-large"
model_version = "10"
foundation_model = registry_ml_client.models.get(model_name, model_version)
print(
    f"Using model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing."
)

Using model name: openai-whisper-large, version: 10, id: azureml://registries/azureml/models/openai-whisper-large/versions/10 for inferencing.


In [6]:
audio_urls_and_text = [
    (row["row"]["audio"][0]["src"], row["row"]["text"]) for row in testdata["rows"]
]

In [7]:
test_df = pd.DataFrame(data=audio_urls_and_text, columns=["audio", "text"])

In [8]:
# Define directories and filenames as variables
dataset_dir = "librispeech-dataset"
test_datafile = "test_100.csv"

batch_dir = "batch"
batch_inputs_dir = os.path.join(batch_dir, "inputs")
batch_input_file = "batch_input.csv"
os.makedirs(dataset_dir, exist_ok=True)
os.makedirs(batch_dir, exist_ok=True)
os.makedirs(batch_inputs_dir, exist_ok=True)

In [9]:
pd.set_option(
    "display.max_colwidth", 0
)  # Set the max column width to 0 to display the full text
test_df.head()

Unnamed: 0,audio,text
0,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/0/audio/audio.wav?Expires=1709442444&Signature=jwLst5hLO29MJkMCLuN8mPRaydYT~Y6psM7hrtWPVqlEBdPIqha35Aqbv9yWdDeTMKdTtynwLRGY~FIzBL0CBCvLLL4NqddjJwXN~O9b1CrIpZciyRRwCl5xrsApW4DehpdsP-i~FQKeQ9jtlcCCCJEBcXsgJkch3oCNtwW~OD-~gSa0VykG0Pwx6ffqXAJIW65VMXacDqPA53R7DaZ4Kvx7em~HWFohzWm~~xiUJ~a-Cgdt116nzj98sZtLnu7n4ItrlYhWIZg38Am6b2Emr9oYvdJ-ouIDl9yUZFgXiJ22hJdq1LJq5mutnHzbWC039Z915v3758jc6pJ0MuaoiQ__&Key-Pair-Id=K3EI6M078Z3AC3,CONCORD RETURNED TO ITS PLACE AMIDST THE TENTS
1,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/1/audio/audio.wav?Expires=1709442444&Signature=OB9sEAB8rbwDlwqKxv18ztq-W-r6cOJZF7WfWF0Unf0zfocd1CdA1W14AkxKVtPsx~egb7k2IRBBWdJDJN6IlOF4vTC-aiYs0g4zTA56R5rmbeHjMqlL86CtS6hI3jRxtTJw8rEuSzimjbpnBaxOKzxwr7cCWAFTrohJrCxiKLspIL9Lp5irB-swRzAasYpThHNJ84ymwXPevj-HCFuZFlbBqJQwAbvBhDvDaXCpuwXAXvfnTjsKi1yCIyCqwxa85GV7pzAK8lA0GgEhcub9bWnyJh6DJzmZzMAmKMK3fhQO~t9XuQCgWy~X1b~u~ikMTnABhoBhLRgAVRAM0QmpIQ__&Key-Pair-Id=K3EI6M078Z3AC3,THE ENGLISH FORWARDED TO THE FRENCH BASKETS OF FLOWERS OF WHICH THEY HAD MADE A PLENTIFUL PROVISION TO GREET THE ARRIVAL OF THE YOUNG PRINCESS THE FRENCH IN RETURN INVITED THE ENGLISH TO A SUPPER WHICH WAS TO BE GIVEN THE NEXT DAY
2,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/2/audio/audio.wav?Expires=1709442444&Signature=Y1TXR8NWSKf5NLiIqaa7WH~UV5JR126HQ6U-5Ovgl7oCNyjK7WNga1NO~5rFgGw5Af0PWI2MMhcCDWfPr7~9AC3dhmgKt9HCh-FbPwKo-OcBxINgKhy6gNdimMO3hBV9kVt5kXGxiHUGkRJYiEoI71iLH-PrchGL8UicUJ~JxLOoawnywsHZMo47~0Hk-kZNNHjL78R4hGDqS5lk3prqzsMbrPOQV14M-aHbPcsLWmLyWv19qpKum4S62q6Wz3BdjX1yD0rqz0vDHoYCMZRj1nbgZbDwKp3LAINOa-fveoN3zi-ykOg-GHx1609CeCOI1AeCSliUmPP0dQ-p-SXLrA__&Key-Pair-Id=K3EI6M078Z3AC3,CONGRATULATIONS WERE POURED IN UPON THE PRINCESS EVERYWHERE DURING HER JOURNEY
3,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/3/audio/audio.wav?Expires=1709442444&Signature=NIfzDxN5iqq5naBbM3TW0uwjbSJ6zJxsgaEzSBBec--dJGVsAHFeHWvhsVU~BAlYs-OAkN7~lqpb-MMeP-u7KBNCwkLnOYo8djLKG8jaHSEZoD3w8C9oO5YbfQtE12JWx~e20dovkS2rVowYcwjfTZ-wqxLz2e9M94AmedFuflkcht~JbP-jTAONYN5F0fqZBVHa8~SZdB--oQeMK-~EP~g4KjJXamHmasBlrc9p2FyG4JNuHS5hP1Hz6TSqGh1WvObMtCm4VZtd8D8kM-RjOZ1OZlueS8RKMT6VRGQsLuG~v8~dnB5FWjSkg8avIIt8fS8Dv0~NUtwsOSNwDNOYNg__&Key-Pair-Id=K3EI6M078Z3AC3,FROM THE RESPECT PAID HER ON ALL SIDES SHE SEEMED LIKE A QUEEN AND FROM THE ADORATION WITH WHICH SHE WAS TREATED BY TWO OR THREE SHE APPEARED AN OBJECT OF WORSHIP THE QUEEN MOTHER GAVE THE FRENCH THE MOST AFFECTIONATE RECEPTION FRANCE WAS HER NATIVE COUNTRY AND SHE HAD SUFFERED TOO MUCH UNHAPPINESS IN ENGLAND FOR ENGLAND TO HAVE MADE HER FORGET FRANCE
4,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/4/audio/audio.wav?Expires=1709442444&Signature=gveEQp1XUlzOIcjZlGx7tGyZd9s4Jc1kk1wIrbF8WhBtC1mRPNMXs~QTZMWQsLF0CEn6Q9TzDval7s949KwVw6G-GHKIWW7LwsUei0j6e38UqeSEzLGNbnlpYfy8UvPKPtz9kNIo72Qr2bf-i1DHs49d~gZ27X4aNzJffQVN9FNxQMaesuhPfFjTpfxuNbHPZmMQZ2qHi~5ShSwL6W2FM5ysVvWzdpuEPo9PBasK3o1pSKToqFXHCSp2zog3ugo5~u2mp2XL0G3YOROYDQ~~A3IAqYMm66HV~Y3Ct3Y4OJ6TSJyjHxmfxCo5S62Q2wdS95jeiy-0PxeL~rJ-LW3GwA__&Key-Pair-Id=K3EI6M078Z3AC3,SHE TAUGHT HER DAUGHTER THEN BY HER OWN AFFECTION FOR IT THAT LOVE FOR A COUNTRY WHERE THEY HAD BOTH BEEN HOSPITABLY RECEIVED AND WHERE A BRILLIANT FUTURE OPENED BEFORE THEM


In [10]:
test_df["language"] = "en"
test_df.to_csv(os.path.join(".", dataset_dir, test_datafile), index=False)

pd.set_option(
    "display.max_colwidth", 0
)  # Set the max column width to 0 to display the full text
test_df.head()

Unnamed: 0,audio,text,language
0,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/0/audio/audio.wav?Expires=1709442444&Signature=jwLst5hLO29MJkMCLuN8mPRaydYT~Y6psM7hrtWPVqlEBdPIqha35Aqbv9yWdDeTMKdTtynwLRGY~FIzBL0CBCvLLL4NqddjJwXN~O9b1CrIpZciyRRwCl5xrsApW4DehpdsP-i~FQKeQ9jtlcCCCJEBcXsgJkch3oCNtwW~OD-~gSa0VykG0Pwx6ffqXAJIW65VMXacDqPA53R7DaZ4Kvx7em~HWFohzWm~~xiUJ~a-Cgdt116nzj98sZtLnu7n4ItrlYhWIZg38Am6b2Emr9oYvdJ-ouIDl9yUZFgXiJ22hJdq1LJq5mutnHzbWC039Z915v3758jc6pJ0MuaoiQ__&Key-Pair-Id=K3EI6M078Z3AC3,CONCORD RETURNED TO ITS PLACE AMIDST THE TENTS,en
1,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/1/audio/audio.wav?Expires=1709442444&Signature=OB9sEAB8rbwDlwqKxv18ztq-W-r6cOJZF7WfWF0Unf0zfocd1CdA1W14AkxKVtPsx~egb7k2IRBBWdJDJN6IlOF4vTC-aiYs0g4zTA56R5rmbeHjMqlL86CtS6hI3jRxtTJw8rEuSzimjbpnBaxOKzxwr7cCWAFTrohJrCxiKLspIL9Lp5irB-swRzAasYpThHNJ84ymwXPevj-HCFuZFlbBqJQwAbvBhDvDaXCpuwXAXvfnTjsKi1yCIyCqwxa85GV7pzAK8lA0GgEhcub9bWnyJh6DJzmZzMAmKMK3fhQO~t9XuQCgWy~X1b~u~ikMTnABhoBhLRgAVRAM0QmpIQ__&Key-Pair-Id=K3EI6M078Z3AC3,THE ENGLISH FORWARDED TO THE FRENCH BASKETS OF FLOWERS OF WHICH THEY HAD MADE A PLENTIFUL PROVISION TO GREET THE ARRIVAL OF THE YOUNG PRINCESS THE FRENCH IN RETURN INVITED THE ENGLISH TO A SUPPER WHICH WAS TO BE GIVEN THE NEXT DAY,en
2,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/2/audio/audio.wav?Expires=1709442444&Signature=Y1TXR8NWSKf5NLiIqaa7WH~UV5JR126HQ6U-5Ovgl7oCNyjK7WNga1NO~5rFgGw5Af0PWI2MMhcCDWfPr7~9AC3dhmgKt9HCh-FbPwKo-OcBxINgKhy6gNdimMO3hBV9kVt5kXGxiHUGkRJYiEoI71iLH-PrchGL8UicUJ~JxLOoawnywsHZMo47~0Hk-kZNNHjL78R4hGDqS5lk3prqzsMbrPOQV14M-aHbPcsLWmLyWv19qpKum4S62q6Wz3BdjX1yD0rqz0vDHoYCMZRj1nbgZbDwKp3LAINOa-fveoN3zi-ykOg-GHx1609CeCOI1AeCSliUmPP0dQ-p-SXLrA__&Key-Pair-Id=K3EI6M078Z3AC3,CONGRATULATIONS WERE POURED IN UPON THE PRINCESS EVERYWHERE DURING HER JOURNEY,en
3,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/3/audio/audio.wav?Expires=1709442444&Signature=NIfzDxN5iqq5naBbM3TW0uwjbSJ6zJxsgaEzSBBec--dJGVsAHFeHWvhsVU~BAlYs-OAkN7~lqpb-MMeP-u7KBNCwkLnOYo8djLKG8jaHSEZoD3w8C9oO5YbfQtE12JWx~e20dovkS2rVowYcwjfTZ-wqxLz2e9M94AmedFuflkcht~JbP-jTAONYN5F0fqZBVHa8~SZdB--oQeMK-~EP~g4KjJXamHmasBlrc9p2FyG4JNuHS5hP1Hz6TSqGh1WvObMtCm4VZtd8D8kM-RjOZ1OZlueS8RKMT6VRGQsLuG~v8~dnB5FWjSkg8avIIt8fS8Dv0~NUtwsOSNwDNOYNg__&Key-Pair-Id=K3EI6M078Z3AC3,FROM THE RESPECT PAID HER ON ALL SIDES SHE SEEMED LIKE A QUEEN AND FROM THE ADORATION WITH WHICH SHE WAS TREATED BY TWO OR THREE SHE APPEARED AN OBJECT OF WORSHIP THE QUEEN MOTHER GAVE THE FRENCH THE MOST AFFECTIONATE RECEPTION FRANCE WAS HER NATIVE COUNTRY AND SHE HAD SUFFERED TOO MUCH UNHAPPINESS IN ENGLAND FOR ENGLAND TO HAVE MADE HER FORGET FRANCE,en
4,https://datasets-server.huggingface.co/assets/librispeech_asr/--/8aeb8cac5ad165fc5574d4e84218154a8f4eca7b/--/clean/test/4/audio/audio.wav?Expires=1709442444&Signature=gveEQp1XUlzOIcjZlGx7tGyZd9s4Jc1kk1wIrbF8WhBtC1mRPNMXs~QTZMWQsLF0CEn6Q9TzDval7s949KwVw6G-GHKIWW7LwsUei0j6e38UqeSEzLGNbnlpYfy8UvPKPtz9kNIo72Qr2bf-i1DHs49d~gZ27X4aNzJffQVN9FNxQMaesuhPfFjTpfxuNbHPZmMQZ2qHi~5ShSwL6W2FM5ysVvWzdpuEPo9PBasK3o1pSKToqFXHCSp2zog3ugo5~u2mp2XL0G3YOROYDQ~~A3IAqYMm66HV~Y3Ct3Y4OJ6TSJyjHxmfxCo5S62Q2wdS95jeiy-0PxeL~rJ-LW3GwA__&Key-Pair-Id=K3EI6M078Z3AC3,SHE TAUGHT HER DAUGHTER THEN BY HER OWN AFFECTION FOR IT THAT LOVE FOR A COUNTRY WHERE THEY HAD BOTH BEEN HOSPITABLY RECEIVED AND WHERE A BRILLIANT FUTURE OPENED BEFORE THEM,en


In [12]:
batch_df = test_df[["audio", "language"]]

# Divide this into files of 10 rows each
batch_size_per_predict = 10
for i in range(0, len(batch_df), batch_size_per_predict):
    j = i + batch_size_per_predict
    batch_df[i:j].to_csv(
        os.path.join(batch_inputs_dir, str(i) + batch_input_file), quoting=csv.QUOTE_ALL
    )

# Check out the first and last file name created
input_files = os.listdir(batch_inputs_dir)
print(f"{input_files[0]} to {str(i)}{batch_input_file}.")

.amlignore to 90batch_input.csv.


In [23]:
compute_name = "test-cpu-cluster"

compute_cluster = AmlCompute(
    name=compute_name,
    description="An AML compute cluster",
    size="Standard_E4as_v4",
    min_instances=1,
    max_instances=2,
    idle_time_before_scale_down=120,
)  # 120 seconds

ml_client.begin_create_or_update(compute_cluster)

<azure.core.polling._poller.LROPoller at 0x7f921f4cf520>

In [14]:
# Endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name
timestamp = int(time.time())
endpoint_name = "speech-recognition-" + str(timestamp)

endpoint = BatchEndpoint(
    name=endpoint_name,
    description="Batch endpoint for "
    + foundation_model.name
    + ", for automatic-speech-recognition task",
)
ml_client.begin_create_or_update(endpoint).result()

<azure.ai.ml._restclient.v2022_05_01.models._models_py3.BatchEndpointData at 0x7f9224ffe950>

In [24]:
deployment_name = "demo"

deployment = BatchDeployment(
    name=deployment_name,
    endpoint_name=endpoint_name,
    model=foundation_model.id,
    compute=compute_name,
    error_threshold=0,
    instance_count=1,
    logging_level="info",
    max_concurrency_per_instance=1,
    mini_batch_size=2,
    output_file_name="predictions.csv",
    retry_settings=BatchRetrySettings(max_retries=3, timeout=600),
)
ml_client.begin_create_or_update(deployment).result()

BatchDeployment({'provisioning_state': 'Succeeded', 'endpoint_name': 'speech-recognition-1709439640', 'type': None, 'name': 'demo', 'description': None, 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/ed4b0003-99cf-4ec0-88ba-68847f627acf/resourceGroups/srijon-resourcegroup1/providers/Microsoft.MachineLearningServices/workspaces/srijon-workspace1/batchEndpoints/speech-recognition-1709439640/deployments/demo', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/srijon-compute1/code/Users/mandal.srijon/GenAICode', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f921f243700>, 'serialize': <msrest.serialization.Serializer object at 0x7f921f174040>, 'model': 'azureml://registries/azureml/models/openai-whisper-large/versions/10', 'code_configuration': None, 'environment': None, 'environment_variables': {}, 'compute': '/subscriptions/ed4b0003-99cf-4ec0-88ba-68847f627acf/resourceGroups/srijon-reso

In [25]:
endpoint = ml_client.batch_endpoints.get(endpoint_name)
endpoint.defaults.deployment_name = deployment_name
ml_client.begin_create_or_update(endpoint).wait()

endpoint = ml_client.batch_endpoints.get(endpoint_name)
print(f"The default deployment is {endpoint.defaults.deployment_name}")

The default deployment is demo


In [None]:
input = Input(path=batch_inputs_dir, type=AssetTypes.URI_FOLDER)

job = ml_client.batch_endpoints.invoke(
    endpoint_name=endpoint.name, input=input
)

ml_client.jobs.stream(job.name)

In [None]:
scoring_job = list(ml_client.jobs.list(parent_job_name=job.name))[0]

ml_client.jobs.download(
    name=scoring_job.name, download_path=batch_dir, output_name="score"
)

predictions_file = os.path.join(batch_dir, "named-outputs", "score", "predictions.csv")

# Load the batch predictions file with no headers into a dataframe and set your column names
score_df = pd.read_csv(
    predictions_file,
    header=None,
    names=["row_number_per_file", "prediction", "batch_input_file_name"],
)
score_df.head()

In [None]:
input_df = []
for file in input_files:
    input = pd.read_csv(os.path.join(batch_inputs_dir, file), index_col=0)
    input.reset_index(inplace=True)
    input["batch_input_file_name"] = file
    input.reset_index(names=["row_number_per_file"], inplace=True)
    input_df.append(input)
input_df = pd.concat(input_df)
input_df.set_index("index", inplace=True)
input_df = input_df.join(test_df.drop(columns=["audio", "language"]))

input_df.head()

In [None]:
df = pd.merge(
    input_df, score_df, how="inner", on=["row_number_per_file", "batch_input_file_name"]
)

# Show the first few rows of the results
df.head(20)

In [None]:
ml_client.batch_endpoints.begin_delete(name=endpoint_name).result()
ml_client.compute.begin_delete(name=compute_name).result()