In [5]:
import json
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path

import pandas as pd
import requests
from jiwer import wer
from tqdm import tqdm

## MP3 to WAV Conversion
- Mainly for GCP input constraints, but also for consistency with the OS implementation

In [None]:
def resample_normalise_audio(in_file, out_file=None, sample_rate=16000):
    """Convert an audio file to mono 16-bit PCM WAV at ``sample_rate`` using ffmpeg.

    Args:
        in_file: Path of the audio file to convert; it must exist.
        out_file: Destination path. When ``None`` the output is written next to
            ``in_file`` as ``<name>_<sample_rate>.wav``.
        sample_rate: Target sample rate in Hz.

    Returns:
        The path ffmpeg was asked to write.

    Raises:
        ValueError: If ``in_file`` does not exist.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not os.path.exists(in_file):
        raise ValueError(f"{in_file} not found")
    if out_file is None:
        # Swap only the trailing extension. str.replace() would also rewrite an
        # earlier occurrence of the same text and misbehaves when the file has
        # no extension at all.
        out_file = f"{os.path.splitext(in_file)[0]}_{sample_rate}.wav"

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and cannot be used for command injection.
    command = [
        "ffmpeg",
        "-y",  # overwrite an existing output file without prompting
        "-i",
        str(in_file),
        "-acodec",
        "pcm_s16le",
        "-ac",
        "1",
        "-af",
        "aresample=resampler=soxr",
        "-ar",
        str(sample_rate),
        str(out_file),
    ]
    # check=True surfaces a failed conversion instead of silently returning a
    # path that was never written.
    subprocess.run(command, check=True)
    return out_file


# Manifest listing the podcast episodes (one row per audio file).
transcript_manifest = pd.read_csv("../output/radio_national_podcasts/manifest.csv")

# Recreate the WAV output directory from scratch on every run.
output_dir = Path("../output/radio_national_podcasts/audio/wav")
if output_dir.exists():
    shutil.rmtree(str(output_dir))
output_dir.mkdir(parents=True, exist_ok=True)

# Convert every source file; the WAV lands in a sibling "wav" folder one level
# above the source file's own directory.
for input_path in map(Path, transcript_manifest.audio_path):
    output_path = input_path.parents[1] / "wav" / f"{input_path.stem}.wav"
    resample_normalise_audio(str(input_path), str(output_path))


## OS

In [None]:
sys.path.append("..")
from asr import transcribe_mono_audio


In [None]:
# Transcribe the first five manifest entries with the OS model, timing each call.
os_transcript_records = []
for audio_path in tqdm(transcript_manifest.head(5).audio_path):
    before = time.time()
    transcript = transcribe_mono_audio(audio_path)
    after = time.time()

    # The transcriber returns a frame-like object; its "transcript" column is
    # joined into a single hypothesis string.
    hypothesis_text = " ".join(transcript.transcript.tolist())
    record = {
        "hypothesis": hypothesis_text,
        "elapsed_time": after - before,
        "provider": "os",
    }
    os_transcript_records.append(record)


In [None]:
# TODO: remove "this is an ABC podcast" intro from hypothesis (not done yet, so
# the intro currently counts towards the word error rate).
def _row_wer(row):
    """Word error rate of one row's hypothesis against its reference transcript."""
    return wer(row.transcript, row.hypothesis)


# Place the first manifest rows side by side with the OS hypotheses (assumes
# both frames carry the same default integer index), score each row, and keep
# only the columns needed for comparison.
os_wer = pd.concat(
    [transcript_manifest.head(), pd.DataFrame(os_transcript_records)],
    axis=1,
).assign(wer=lambda frame: frame.apply(_row_wer, axis=1))[
    ["transcript", "hypothesis", "stem", "transcript_len", "wer"]
]


## GCP
- Huge chunks of the transcript missing when using async methods?
- Probably use telephony model instead
- Consider using streams instead of batch

In [31]:
from google.cloud import speech, storage

# GCP project and the bucket holding the uploaded audio files.
project = "hobby-358221"
bucket_name = "blog-os-asr"

storage_client = storage.Client(project=project)
bucket = storage_client.get_bucket(bucket_name)
blobs = bucket.list_blobs()

# One gs:// URI per object in the bucket.
gcp_uris = ["gs://" + bucket_name + "/" + blob.name for blob in blobs]


09/06/2022 09:45:14 AM AEST Checking None for explicit credentials as part of auth process...
09/06/2022 09:45:14 AM AEST Checking Cloud SDK credentials as part of auth process...
09/06/2022 09:45:14 AM AEST No project ID could be determined. Consider running `gcloud config set project` or setting the GOOGLE_CLOUD_PROJECT environment variable
09/06/2022 09:45:14 AM AEST Making request: POST https://oauth2.googleapis.com/token


In [33]:
# Module-level Speech-to-Text client; transcribe_gcs() below reads this global.
speech_client = speech.SpeechClient()

def transcribe_gcs(gcs_uri, language_code="en-US", sample_rate_hertz=16000):
    """Transcribe a LINEAR16 audio object in Cloud Storage via long-running recognition.

    The request is submitted as a long-running operation and this function
    waits on ``operation.result()`` before returning.

    Args:
        gcs_uri: ``gs://bucket/object`` URI of the audio file.
        language_code: Language tag sent in the recognition config.
        sample_rate_hertz: Sample rate declared for the audio.

    Returns:
        The top alternative of every result, stripped and joined with single
        spaces into one string.
    """
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )
    operation = speech_client.long_running_recognize(config=config, audio=audio)
    response = operation.result()

    # A result without alternatives has no text to contribute; skipping it
    # avoids an IndexError on alternatives[0].
    segments = [
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    ]
    return " ".join(segments).strip()


09/06/2022 09:45:24 AM AEST Checking None for explicit credentials as part of auth process...
09/06/2022 09:45:24 AM AEST Checking Cloud SDK credentials as part of auth process...
09/06/2022 09:45:24 AM AEST No project ID could be determined. Consider running `gcloud config set project` or setting the GOOGLE_CLOUD_PROJECT environment variable


In [34]:
gcp_output_dir = Path("../output/radio_national_podcasts/transcripts/gcp")
# Start from an empty directory so transcripts from an earlier run never mix
# with the ones written below.
if gcp_output_dir.exists():
    shutil.rmtree(str(gcp_output_dir))
gcp_output_dir.mkdir(parents=True, exist_ok=True)

for gcp_uri in tqdm(gcp_uris):
    try:
        print(f"Transcribing {gcp_uri}...")
        before = time.time()
        gcp_res = transcribe_gcs(gcp_uri)
        after = time.time()
        gcp_transcript_record = {
            "hypothesis": gcp_res,
            "elapsed_time": after - before,
            "provider": "gcp",
        }
        # One JSON file per audio object, named after the object's stem.
        (gcp_output_dir / f"{Path(gcp_uri).stem}.json").write_text(
            json.dumps(gcp_transcript_record)
        )
    except Exception as exc:
        # Best effort: keep processing the remaining files, but report the
        # cause so a failure can be diagnosed from the cell output.
        print(f"Unable to transcribe: {gcp_uri} ({exc!r})")


  0%|          | 0/12 [00:00<?, ?it/s]

Transcribing gs://blog-os-asr/aussie-stem-stars-ajay-rane.wav...
09/06/2022 09:45:32 AM AEST Making request: POST https://oauth2.googleapis.com/token
09/06/2022 09:45:35 AM AEST Retrying due to , sleeping 0.8s ...
09/06/2022 09:45:36 AM AEST Retrying due to , sleeping 3.8s ...
09/06/2022 09:45:40 AM AEST Retrying due to , sleeping 0.7s ...
09/06/2022 09:45:41 AM AEST Retrying due to , sleeping 6.4s ...
09/06/2022 09:45:48 AM AEST Retrying due to , sleeping 25.2s ...
09/06/2022 09:46:14 AM AEST Retrying due to , sleeping 60.0s ...
09/06/2022 09:47:15 AM AEST Retrying due to , sleeping 60.0s ...


  8%|▊         | 1/12 [02:47<30:46, 167.82s/it]

Transcribing gs://blog-os-asr/australia-s-marsupial-megafauna-their-discovery-ecology-and-exti.wav...
09/06/2022 09:48:17 AM AEST Retrying due to , sleeping 0.3s ...
09/06/2022 09:48:18 AM AEST Retrying due to , sleeping 2.9s ...
09/06/2022 09:48:21 AM AEST Retrying due to , sleeping 2.2s ...
09/06/2022 09:48:24 AM AEST Retrying due to , sleeping 12.3s ...
09/06/2022 09:48:36 AM AEST Retrying due to , sleeping 12.9s ...
09/06/2022 09:48:50 AM AEST Retrying due to , sleeping 40.6s ...
09/06/2022 09:49:31 AM AEST Retrying due to , sleeping 56.8s ...
09/06/2022 09:50:28 AM AEST Retrying due to , sleeping 60.0s ...
09/06/2022 09:51:28 AM AEST Retrying due to , sleeping 60.0s ...


 17%|█▋        | 2/12 [07:01<36:23, 218.37s/it]

Transcribing gs://blog-os-asr/cancer-risk-diet.wav...
09/06/2022 09:52:31 AM AEST Retrying due to , sleeping 0.8s ...
09/06/2022 09:52:32 AM AEST Retrying due to , sleeping 3.2s ...
09/06/2022 09:52:36 AM AEST Retrying due to , sleeping 4.3s ...
09/06/2022 09:52:40 AM AEST Retrying due to , sleeping 15.4s ...
09/06/2022 09:52:56 AM AEST Retrying due to , sleeping 28.8s ...
09/06/2022 09:53:25 AM AEST Retrying due to , sleeping 48.1s ...
09/06/2022 09:54:14 AM AEST Retrying due to , sleeping 10.0s ...
09/06/2022 09:54:24 AM AEST Retrying due to , sleeping 48.1s ...


 25%|██▌       | 3/12 [09:45<29:00, 193.40s/it]

Transcribing gs://blog-os-asr/concerns-when-humans-and-machines-merge.wav...
09/06/2022 09:55:14 AM AEST Retrying due to , sleeping 1.1s ...
09/06/2022 09:55:16 AM AEST Retrying due to , sleeping 2.2s ...
09/06/2022 09:55:18 AM AEST Retrying due to , sleeping 1.9s ...
09/06/2022 09:55:21 AM AEST Retrying due to , sleeping 13.7s ...
09/06/2022 09:55:35 AM AEST Retrying due to , sleeping 10.8s ...
09/06/2022 09:55:46 AM AEST Retrying due to , sleeping 31.6s ...
09/06/2022 09:56:18 AM AEST Retrying due to , sleeping 53.6s ...
09/06/2022 09:57:12 AM AEST Retrying due to , sleeping 60.0s ...


 33%|███▎      | 4/12 [12:45<25:04, 188.08s/it]

Transcribing gs://blog-os-asr/deadly-falling-coins.wav...
09/06/2022 09:58:15 AM AEST Retrying due to , sleeping 1.4s ...
09/06/2022 09:58:16 AM AEST Retrying due to , sleeping 3.4s ...
09/06/2022 09:58:20 AM AEST Retrying due to , sleeping 6.3s ...
09/06/2022 09:58:27 AM AEST Retrying due to , sleeping 10.5s ...
09/06/2022 09:58:38 AM AEST Retrying due to , sleeping 29.1s ...
09/06/2022 09:59:07 AM AEST Retrying due to , sleeping 23.6s ...
09/06/2022 09:59:31 AM AEST Retrying due to , sleeping 53.3s ...


 42%|████▏     | 5/12 [14:58<19:37, 168.21s/it]

Transcribing gs://blog-os-asr/exhibit-shows-voyages-of-ancient-cultures-around-the-coral-sea.wav...
09/06/2022 10:00:27 AM AEST Retrying due to , sleeping 2.0s ...
09/06/2022 10:00:30 AM AEST Retrying due to , sleeping 0.3s ...
09/06/2022 10:00:30 AM AEST Retrying due to , sleeping 4.2s ...
09/06/2022 10:00:35 AM AEST Retrying due to , sleeping 8.8s ...
09/06/2022 10:00:44 AM AEST Retrying due to , sleeping 22.5s ...
09/06/2022 10:01:07 AM AEST Retrying due to , sleeping 9.0s ...
09/06/2022 10:01:17 AM AEST Retrying due to , sleeping 34.3s ...
09/06/2022 10:01:51 AM AEST Retrying due to , sleeping 6.0s ...
09/06/2022 10:01:58 AM AEST Retrying due to , sleeping 47.8s ...
09/06/2022 10:02:46 AM AEST Retrying due to , sleeping 31.0s ...
09/06/2022 10:03:17 AM AEST Retrying due to , sleeping 19.1s ...
09/06/2022 10:03:37 AM AEST Retrying due to , sleeping 60.0s ...


 50%|█████     | 6/12 [19:10<19:40, 196.71s/it]

Transcribing gs://blog-os-asr/how-dino-feet-supported-the-reptilian-giants.wav...
09/06/2022 10:04:39 AM AEST Retrying due to , sleeping 0.9s ...
09/06/2022 10:04:41 AM AEST Retrying due to , sleeping 2.5s ...
09/06/2022 10:04:44 AM AEST Retrying due to , sleeping 4.9s ...
09/06/2022 10:04:49 AM AEST Retrying due to , sleeping 9.8s ...
09/06/2022 10:04:59 AM AEST Retrying due to , sleeping 7.3s ...
09/06/2022 10:05:07 AM AEST Retrying due to , sleeping 59.5s ...
09/06/2022 10:06:07 AM AEST Retrying due to , sleeping 60.0s ...


 58%|█████▊    | 7/12 [21:39<15:06, 181.30s/it]

Transcribing gs://blog-os-asr/mapping-covid-genes.wav...
09/06/2022 10:07:09 AM AEST Retrying due to , sleeping 1.6s ...
09/06/2022 10:07:11 AM AEST Retrying due to , sleeping 3.0s ...
09/06/2022 10:07:14 AM AEST Retrying due to , sleeping 0.4s ...
09/06/2022 10:07:15 AM AEST Retrying due to , sleeping 10.4s ...
09/06/2022 10:07:26 AM AEST Retrying due to , sleeping 9.3s ...
09/06/2022 10:07:36 AM AEST Retrying due to , sleeping 23.4s ...
09/06/2022 10:07:59 AM AEST Retrying due to , sleeping 60.0s ...


 67%|██████▋   | 8/12 [23:32<10:37, 159.47s/it]

Transcribing gs://blog-os-asr/new-dung-beetles-on-their-way-to-australia.wav...
09/06/2022 10:09:02 AM AEST Retrying due to , sleeping 0.4s ...
09/06/2022 10:09:02 AM AEST Retrying due to , sleeping 0.8s ...
09/06/2022 10:09:03 AM AEST Retrying due to , sleeping 2.3s ...
09/06/2022 10:09:06 AM AEST Retrying due to , sleeping 12.5s ...
09/06/2022 10:09:19 AM AEST Retrying due to , sleeping 11.8s ...
09/06/2022 10:09:31 AM AEST Retrying due to , sleeping 24.2s ...
09/06/2022 10:09:56 AM AEST Retrying due to , sleeping 60.0s ...
09/06/2022 10:10:56 AM AEST Retrying due to , sleeping 60.0s ...


 75%|███████▌  | 9/12 [26:29<08:14, 164.99s/it]

Transcribing gs://blog-os-asr/overusing-vitamins-health-risk.wav...
09/06/2022 10:11:58 AM AEST Retrying due to , sleeping 0.8s ...
09/06/2022 10:12:00 AM AEST Retrying due to , sleeping 1.3s ...
09/06/2022 10:12:01 AM AEST Retrying due to , sleeping 2.5s ...
09/06/2022 10:12:04 AM AEST Retrying due to , sleeping 9.7s ...
09/06/2022 10:12:14 AM AEST Retrying due to , sleeping 12.7s ...
09/06/2022 10:12:27 AM AEST Retrying due to , sleeping 43.9s ...
09/06/2022 10:13:11 AM AEST Retrying due to , sleeping 26.0s ...
09/06/2022 10:13:38 AM AEST Retrying due to , sleeping 60.0s ...


 83%|████████▎ | 10/12 [29:10<05:27, 163.86s/it]

Transcribing gs://blog-os-asr/science-stars-shine-at-the-2022-eureka-science-prizes.wav...
09/06/2022 10:14:40 AM AEST Retrying due to , sleeping 1.1s ...
09/06/2022 10:14:42 AM AEST Retrying due to , sleeping 0.9s ...
09/06/2022 10:14:43 AM AEST Retrying due to , sleeping 2.0s ...
09/06/2022 10:14:45 AM AEST Retrying due to , sleeping 3.8s ...
09/06/2022 10:14:49 AM AEST Retrying due to , sleeping 25.5s ...
09/06/2022 10:15:15 AM AEST Retrying due to , sleeping 53.4s ...
09/06/2022 10:16:09 AM AEST Retrying due to , sleeping 10.3s ...
09/06/2022 10:16:19 AM AEST Retrying due to , sleeping 60.0s ...


 92%|█████████▏| 11/12 [31:52<02:43, 163.21s/it]

Transcribing gs://blog-os-asr/sugar-label-shopping-habits.wav...
09/06/2022 10:17:22 AM AEST Retrying due to , sleeping 1.6s ...
09/06/2022 10:17:24 AM AEST Retrying due to , sleeping 2.5s ...
09/06/2022 10:17:27 AM AEST Retrying due to , sleeping 1.5s ...
09/06/2022 10:17:28 AM AEST Retrying due to , sleeping 0.4s ...
09/06/2022 10:17:29 AM AEST Retrying due to , sleeping 22.7s ...
09/06/2022 10:17:52 AM AEST Retrying due to , sleeping 37.2s ...
09/06/2022 10:18:30 AM AEST Retrying due to , sleeping 60.0s ...


100%|██████████| 12/12 [34:03<00:00, 170.26s/it]


In [None]:
# potentially use CLI?
# NOTE(review): the lines below are shell commands, not Python. As written this
# cell cannot run in a Python kernel; run them in a terminal or a %%bash cell.
# Submits a long-running recognition request for the given gs:// object
# (--async presumably returns without waiting for the result - confirm).
gcloud ml speech recognize-long-running \
    'gs://blog-os-asr/test.wav' \
     --language-code='en-US' --async

# poll result
# NOTE(review): the numeric argument looks like an operation id from an earlier
# submission - replace it with the id of the job being polled.
gcloud ml speech operations describe 1558607248830316847

## AWS

In [25]:
import s3fs

# List the contents of the "blog-os-asr" S3 bucket; the entries come back as
# "bucket/key" paths (see the cell output below).
fs = s3fs.S3FileSystem()
aws_uris = fs.ls("blog-os-asr")

# Show the listing as the cell's output.
aws_uris

['blog-os-asr/aussie-stem-stars-ajay-rane.wav',
 'blog-os-asr/australia-s-marsupial-megafauna-their-discovery-ecology-and-exti.wav',
 'blog-os-asr/cancer-risk-diet.wav',
 'blog-os-asr/concerns-when-humans-and-machines-merge.wav',
 'blog-os-asr/deadly-falling-coins.wav',
 'blog-os-asr/exhibit-shows-voyages-of-ancient-cultures-around-the-coral-sea.wav',
 'blog-os-asr/how-dino-feet-supported-the-reptilian-giants.wav',
 'blog-os-asr/mapping-covid-genes.wav',
 'blog-os-asr/new-dung-beetles-on-their-way-to-australia.wav',
 'blog-os-asr/overusing-vitamins-health-risk.wav',
 'blog-os-asr/science-stars-shine-at-the-2022-eureka-science-prizes.wav',
 'blog-os-asr/sugar-label-shopping-habits.wav']

In [26]:
import boto3
import requests
from botocore.exceptions import ClientError
import logging

from transcribe_util import CustomWaiter, WaitState

logger = logging.getLogger(__name__)


class TranscribeCompleteWaiter(CustomWaiter):
    """Waiter over ``GetTranscriptionJob`` keyed on the job's status field.

    A status of ``COMPLETED`` is mapped to success and ``FAILED`` to failure;
    how other statuses are treated is decided by ``CustomWaiter``.
    """

    def __init__(self, client):
        """Configure the waiter for the given Transcribe client."""
        status_outcomes = {
            "COMPLETED": WaitState.SUCCESS,
            "FAILED": WaitState.FAILURE,
        }
        super().__init__(
            "TranscribeComplete",
            "GetTranscriptionJob",
            "TranscriptionJob.TranscriptionJobStatus",
            status_outcomes,
            client,
        )

    def wait(self, job_name):
        """Wait on the transcription job called ``job_name``."""
        self._wait(TranscriptionJobName=job_name)


def start_job(
    job_name,
    media_uri,
    media_format,
    language_code,
    transcribe_client,
    vocabulary_name=None,
):
    """Start a transcription job and return the job description from the response.

    Args:
        job_name: Name to give the transcription job.
        media_uri: URI of the media file to transcribe.
        media_format: Media format string sent as ``MediaFormat``.
        language_code: Language code sent as ``LanguageCode``.
        transcribe_client: Client exposing ``start_transcription_job``.
        vocabulary_name: Optional custom vocabulary; sent under ``Settings``
            only when provided.

    Returns:
        The ``TranscriptionJob`` entry of the service response.

    Raises:
        ClientError: Re-raised after being logged when the request fails.
    """
    request = {
        "TranscriptionJobName": job_name,
        "Media": {"MediaFileUri": media_uri},
        "MediaFormat": media_format,
        "LanguageCode": language_code,
    }
    if vocabulary_name is not None:
        request["Settings"] = {"VocabularyName": vocabulary_name}

    try:
        response = transcribe_client.start_transcription_job(**request)
    except ClientError:
        logger.exception("Couldn't start transcription job %s.", job_name)
        raise

    job = response["TranscriptionJob"]
    logger.info("Started transcription job %s.", job_name)
    return job


def get_job(job_name, transcribe_client):
    """Fetch the description of a transcription job.

    Args:
        job_name: Name of the job to look up.
        transcribe_client: Client exposing ``get_transcription_job``.

    Returns:
        The ``TranscriptionJob`` entry of the service response.

    Raises:
        ClientError: Re-raised after being logged when the lookup fails.
    """
    try:
        job = transcribe_client.get_transcription_job(
            TranscriptionJobName=job_name
        )["TranscriptionJob"]
        logger.info("Got job %s.", job["TranscriptionJobName"])
    except ClientError:
        logger.exception("Couldn't get job %s.", job_name)
        raise
    return job


def list_jobs(job_filter, transcribe_client):
    """List every transcription job whose name contains ``job_filter``.

    Follows ``NextToken`` until the service stops returning one, so the result
    spans all pages.

    Args:
        job_filter: Substring passed as ``JobNameContains``.
        transcribe_client: Client exposing ``list_transcription_jobs``.

    Returns:
        The accumulated list of ``TranscriptionJobSummaries`` entries.

    Raises:
        ClientError: Re-raised after being logged when a request fails.
    """
    try:
        page = transcribe_client.list_transcription_jobs(JobNameContains=job_filter)
        jobs = page["TranscriptionJobSummaries"]
        while True:
            token = page.get("NextToken")
            if token is None:
                break
            page = transcribe_client.list_transcription_jobs(
                JobNameContains=job_filter, NextToken=token
            )
            jobs.extend(page["TranscriptionJobSummaries"])
        logger.info("Got %s jobs with filter %s.", len(jobs), job_filter)
    except ClientError:
        logger.exception("Couldn't get jobs with filter %s.", job_filter)
        raise
    return jobs


def delete_job(job_name, transcribe_client):
    """Delete the transcription job called ``job_name``.

    Args:
        job_name: Name of the job to delete.
        transcribe_client: Client exposing ``delete_transcription_job``.

    Raises:
        ClientError: Re-raised after being logged when the deletion fails.
    """
    try:
        transcribe_client.delete_transcription_job(TranscriptionJobName=job_name)
    except ClientError:
        logger.exception("Couldn't delete job %s.", job_name)
        raise
    else:
        logger.info("Deleted job %s.", job_name)

In [15]:
transcribe_client = boto3.client("transcribe")
aws_output_dir = Path("../output/radio_national_podcasts/transcripts/aws")
# Start from an empty directory so transcripts from an earlier run never mix
# with the ones written below.
if aws_output_dir.exists():
    shutil.rmtree(str(aws_output_dir))
aws_output_dir.mkdir(parents=True, exist_ok=True)

for aws_uri in tqdm(aws_uris):
    try:
        print(f"Transcribing {aws_uri}...")
        before = time.time()
        job_name_simple = Path(aws_uri).name

        # ensure a job doesn't already exist
        for job in list_jobs(job_name_simple, transcribe_client):
            delete_job(job["TranscriptionJobName"], transcribe_client)

        print(f"Starting transcription job {job_name_simple}.")
        start_job(job_name_simple,
                  f"s3://{aws_uri}", "wav", "en-US", transcribe_client)
        transcribe_waiter = TranscribeCompleteWaiter(transcribe_client)
        transcribe_waiter.wait(job_name_simple)
        job_simple = get_job(job_name_simple, transcribe_client)

        # Download the transcript document the finished job points at. Fail
        # loudly on an HTTP error rather than trying to parse an error body.
        transcript_response = requests.get(
            job_simple["Transcript"]["TranscriptFileUri"]
        )
        transcript_response.raise_for_status()
        transcript_simple = transcript_response.json()
        after = time.time()

        aws_transcript_record = {
            "hypothesis": transcript_simple["results"]["transcripts"][0]["transcript"],
            "elapsed_time": after - before,
            "provider": "aws",
        }
        # One JSON file per audio object, named after the object's stem.
        (aws_output_dir / f"{Path(aws_uri).stem}.json").write_text(
            json.dumps(aws_transcript_record)
        )
        # clean-up jobs
        for job in list_jobs(job_name_simple, transcribe_client):
            delete_job(job["TranscriptionJobName"], transcribe_client)

    except Exception as exc:
        # Best effort: keep processing the remaining files, but report the
        # cause so a failure can be diagnosed from the cell output.
        print(f"Unable to transcribe: {aws_uri} ({exc!r})")


  0%|          | 0/1 [00:00<?, ?it/s]

Transcribing blog-os-asr/sugar-label-shopping-habits.wav...
Starting transcription job sugar-label-shopping-habits.wav.


{'TranscriptionJobName': 'sugar-label-shopping-habits.wav',
 'TranscriptionJobStatus': 'IN_PROGRESS',
 'LanguageCode': 'en-US',
 'MediaFormat': 'wav',
 'Media': {'MediaFileUri': 's3://blog-os-asr/sugar-label-shopping-habits.wav'},
 'StartTime': datetime.datetime(2022, 9, 6, 10, 46, 18, 48000, tzinfo=tzlocal()),
 'CreationTime': datetime.datetime(2022, 9, 6, 10, 46, 18, 30000, tzinfo=tzlocal())}

100%|██████████| 1/1 [00:52<00:00, 52.36s/it]


## Azure

- Batch transcription instructions: https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/batch/python
- Generate swagger, download python package, install python package
- Example code via: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/python-client/main.py

In [19]:
# Azure Speech service settings, read from environment variables so no secret
# is stored in the notebook. A missing variable raises KeyError here.
speech_key = os.environ["azure_asr_key"]
service_region = os.environ["azure_asr_region"]
endpoint = os.environ["azure_asr_endpoint"]

# Azure Blob Storage settings for the container that holds the audio files.
blob_key = os.environ["azure_blob_key"]
blob_connection_string = os.environ["azure_blob_connection_string"]
blob_container_name = os.environ["azure_blob_container_name"]
storage_account_name = os.environ["azure_storage_account_name"]

In [20]:
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
import swagger_client as cris_client

# Route log records to stdout so they appear in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format="%(asctime)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p %Z",
)

# Your subscription key and region for the speech service
SUBSCRIPTION_KEY = speech_key
SERVICE_REGION = service_region

NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"

LOCALE = "en-US"

# URI of a blob container with audio files, for transcribing all of them with
# a single request. At least 'read' and 'list' (rl) permissions are required.
# SECURITY: a SAS URI carries a signature and must not be committed. Supply it
# through the environment; the fallback is the bare container URL without a
# token. The SAS token that used to be hard-coded here should be treated as
# leaked.
RECORDINGS_CONTAINER_URI = os.environ.get(
    "azure_recordings_container_uri",
    "https://blogosasr.blob.core.windows.net/blog-os-asr",
)

# Set model information when doing transcription with custom models
MODEL_REFERENCE = None  # guid of a custom model


def transcribe_from_single_blob(uri, properties):
    """
    Build a transcription definition for the single audio file at `uri`.

    The definition uses the module-level NAME, DESCRIPTION and LOCALE together
    with the given `properties`; no custom model is attached.
    """
    definition = {
        "display_name": NAME,
        "description": DESCRIPTION,
        "locale": LOCALE,
        "content_urls": [uri],
        "properties": properties,
    }
    return cris_client.Transcription(**definition)


def transcribe_with_custom_model(api, uri, properties):
    """
    Build a transcription definition for the single audio file at `uri` that
    uses the custom model identified by MODEL_REFERENCE.

    The model is fetched through `api.get_model`. When MODEL_REFERENCE is not
    set, an error is logged and the process exits via `sys.exit()`.
    """
    if MODEL_REFERENCE is None:
        logging.error("Custom model ids must be set when using custom models")
        sys.exit()

    definition = {
        "display_name": NAME,
        "description": DESCRIPTION,
        "locale": LOCALE,
        "content_urls": [uri],
        "model": api.get_model(MODEL_REFERENCE),
        "properties": properties,
    }
    return cris_client.Transcription(**definition)


def transcribe_from_container(uri, properties):
    """
    Build a transcription definition covering the whole container at `uri`.

    The definition uses the module-level NAME, DESCRIPTION and LOCALE together
    with the given `properties`; no custom model is attached.
    """
    definition = {
        "display_name": NAME,
        "description": DESCRIPTION,
        "locale": LOCALE,
        "content_container_url": uri,
        "properties": properties,
    }
    return cris_client.Transcription(**definition)


def _paginate(api, paginated_object):  # sourcery skip: raise-specific-error
    """
    The autogenerated client does not support pagination. This function returns a generator over
    all items of the array that the paginated object `paginated_object` is part of.
    """
    yield from paginated_object.values
    typename = type(paginated_object).__name__
    auth_settings = ["apiKeyHeader", "apiKeyQuery"]
    while paginated_object.next_link:
        link = paginated_object.next_link[len(api.api_client.configuration.host) :]
        paginated_object, status, headers = api.api_client.call_api(
            link, "GET", response_type=typename, auth_settings=auth_settings
        )

        if status == 200:
            yield from paginated_object.values
        else:
            raise Exception(f"could not receive paginated data: status {status}")


def delete_all_transcriptions(api):
    """
    Try to delete every transcription returned by `api.get_transcriptions()`.

    A deletion that raises `ApiException` is logged and skipped, so the
    remaining transcriptions are still attempted.
    """
    logging.info("Deleting all existing completed transcriptions.")

    # Collect the full listing first, then delete one by one.
    # If transcriptions are still running or not started, they will not be deleted.
    for transcription in list(_paginate(api, api.get_transcriptions())):
        # The id is the last path segment of the transcription's self link.
        transcription_id = transcription._self.rsplit("/", 1)[-1]
        logging.debug(f"Deleting transcription with id {transcription_id}")
        try:
            api.delete_transcription(transcription_id)
        except cris_client.rest.ApiException as exc:
            logging.error(f"Could not delete transcription {transcription_id}: {exc}")


def transcribe_azure(blob_sas, poll_interval=5):
    """Submit one audio blob for batch transcription and wait for the outcome.

    Args:
        blob_sas: URL of the audio blob to transcribe (the caller passes a URL
            that already includes a SAS query string).
        poll_interval: Seconds to sleep before each status check.

    Returns:
        A dict with ``hypothesis`` (display text of the first combined
        recognized phrase, or ``""`` if the result has none), ``provider``
        (``"azure"``) and ``elapsed_time`` (seconds from the start of this call
        until the result file was downloaded). Returns ``None`` when the
        transcription fails or produces no file of kind ``Transcription``.

    Raises:
        requests.HTTPError: If downloading the result file returns an HTTP
            error status.
    """
    logging.info("Starting transcription client...")
    before = time.time()

    # configure API key authorization: subscription_key
    configuration = cris_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = (
        f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.0"
    )

    # create the client object and authenticate
    client = cris_client.ApiClient(configuration)

    # create an instance of the transcription api class
    api = cris_client.CustomSpeechTranscriptionsApi(api_client=client)

    properties = {
        # "punctuationMode": "DictatedAndAutomatic",
        # "profanityFilterMode": "Masked",
        # "wordLevelTimestampsEnabled": True,
        # "diarizationEnabled": True,
        # "destinationContainerUrl": "<SAS Uri with at least write (w) permissions for an Azure Storage blob container that results should be written to>",
        # "timeToLive": "PT1H"
    }

    transcription_definition = transcribe_from_single_blob(blob_sas, properties)

    # Uncomment this block to transcribe all files from a container.
    # transcription_definition = transcribe_from_container(
    #     RECORDINGS_CONTAINER_URI, properties)

    _, _, headers = api.create_transcription_with_http_info(
        transcription=transcription_definition
    )

    # get the transcription Id from the location URI
    transcription_id = headers["location"].split("/")[-1]

    logging.info(
        f"Created new transcription with id '{transcription_id}' in region {SERVICE_REGION}"
    )

    logging.info("Checking status.")

    while True:
        # wait before refreshing the transcription status
        time.sleep(poll_interval)

        transcription = api.get_transcription(transcription_id)
        logging.info(f"Transcriptions status: {transcription.status}")

        if transcription.status == "Failed":
            logging.error(
                f"Transcription failed: {transcription.properties.error.message}"
            )
            return None
        if transcription.status != "Succeeded":
            continue

        # Succeeded: look for the result file among the job's files.
        pag_files = api.get_transcription_files(transcription_id)
        for file_data in _paginate(api, pag_files):
            if file_data.kind != "Transcription":
                continue

            results = requests.get(file_data.links.content_url)
            # Fail loudly on an HTTP error instead of parsing an error body.
            results.raise_for_status()
            after = time.time()

            phrases = results.json()["combinedRecognizedPhrases"]
            return {
                # An empty phrase list yields an empty hypothesis rather than
                # an IndexError.
                "hypothesis": phrases[0]["display"] if phrases else "",
                "provider": "azure",
                "elapsed_time": after - before,
            }

        logging.error(
            f"Transcription {transcription_id} succeeded but returned no result file"
        )
        return None


In [21]:
from azure.storage.blob import (
    BlobClient,
    BlobSasPermissions,
    BlobServiceClient,
    ContainerClient,
    __version__,
    generate_blob_sas,
)

# Connect to the storage account. Note that `blob_client` is a container
# client for `blob_container_name`, not a client for a single blob.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
blob_client = blob_service_client.get_container_client(blob_container_name)

# Recreate the Azure transcript output directory from scratch.
azure_output_dir = Path("../output/radio_national_podcasts/transcripts/azure")
if azure_output_dir.exists():
    shutil.rmtree(str(azure_output_dir))
azure_output_dir.mkdir(parents=True, exist_ok=True)


In [24]:
from datetime import datetime, timedelta, timezone
from urllib.parse import quote

# How long each read-only SAS link stays valid.
SAS_TTL = timedelta(hours=6)

# Transcribe every blob in the container and store one JSON record per blob
# in `azure_output_dir`, named after the blob's stem.
for blob_file in tqdm(list(blob_client.list_blobs())):
    blob_name = blob_file["name"]
    print(f"Transcribing {blob_name}...")

    # Read-only, time-limited token for this single blob.
    blob_sas = generate_blob_sas(
        account_name=storage_account_name,
        container_name=blob_container_name,
        blob_name=blob_name,
        account_key=blob_key,
        permission=BlobSasPermissions(read=True),
        # Timezone-aware equivalent of the deprecated `datetime.utcnow()`.
        expiry=datetime.now(timezone.utc) + SAS_TTL,
    )

    # Percent-encode the blob name ("/" is left intact) so names containing
    # spaces or other reserved characters still produce a valid URL.
    blob_sas_formatted = (
        f"https://{storage_account_name}.blob.core.windows.net/"
        f"{blob_container_name}/{quote(blob_name)}?{blob_sas}"
    )

    azure_transcript_record = transcribe_azure(blob_sas_formatted)
    if azure_transcript_record is None:
        # `transcribe_azure` falls through without a return value when the job
        # fails; skip instead of persisting a JSON file that only holds `null`.
        logging.warning(f"No transcript returned for {blob_name}; skipping.")
        continue

    (azure_output_dir / f"{Path(blob_name).stem}.json").write_text(
        json.dumps(azure_transcript_record)
    )


09/06/2022 10:57:21 AM AEST Request URL: 'https://blogosasr.blob.core.windows.net/blog-os-asr?restype=REDACTED&comp=REDACTED'
Request method: 'GET'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.13.1 Python/3.8.13 (macOS-10.16-x86_64-i386-64bit)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': 'dccd3274-2d7e-11ed-89a8-068c0081e3bb'
    'Authorization': 'REDACTED'
No body was attached to the request
09/06/2022 10:57:21 AM AEST Response status: 200
Response headers:
    'Transfer-Encoding': 'chunked'
    'Content-Type': 'application/xml'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'cfcab60c-e01e-0012-368b-c1ad00000000'
    'x-ms-client-request-id': 'dccd3274-2d7e-11ed-89a8-068c0081e3bb'
    'x-ms-version': 'REDACTED'
    'Date': 'Tue, 06 Sep 2022 00:57:21 GMT'


  0%|          | 0/12 [00:00<?, ?it/s]

Transcribing aussie-stem-stars-ajay-rane.wav...
09/06/2022 10:57:21 AM AEST Starting transcription client...
09/06/2022 10:57:21 AM AEST Created new transcription with id 'b437d522-7a99-4c9a-bc64-e054e7d4ab67' in region australiaeast
09/06/2022 10:57:21 AM AEST Checking status.
09/06/2022 10:57:27 AM AEST Transcriptions status: Running
09/06/2022 10:57:32 AM AEST Transcriptions status: Running
09/06/2022 10:57:37 AM AEST Transcriptions status: Running
09/06/2022 10:57:42 AM AEST Transcriptions status: Running
09/06/2022 10:57:47 AM AEST Transcriptions status: Running
09/06/2022 10:57:52 AM AEST Transcriptions status: Running
09/06/2022 10:57:57 AM AEST Transcriptions status: Running
09/06/2022 10:58:02 AM AEST Transcriptions status: Running
09/06/2022 10:58:07 AM AEST Transcriptions status: Running
09/06/2022 10:58:12 AM AEST Transcriptions status: Running
09/06/2022 10:58:17 AM AEST Transcriptions status: Running
09/06/2022 10:58:22 AM AEST Transcriptions status: Running
09/06/2022 10

  8%|▊         | 1/12 [03:44<41:10, 224.63s/it]

Transcribing australia-s-marsupial-megafauna-their-discovery-ecology-and-exti.wav...
09/06/2022 11:01:06 AM AEST Starting transcription client...
09/06/2022 11:01:06 AM AEST Created new transcription with id '64bdce26-759c-4490-8571-7d9ae3346440' in region australiaeast
09/06/2022 11:01:06 AM AEST Checking status.
09/06/2022 11:01:11 AM AEST Transcriptions status: Running
09/06/2022 11:01:16 AM AEST Transcriptions status: Running
09/06/2022 11:01:21 AM AEST Transcriptions status: Running
09/06/2022 11:01:27 AM AEST Transcriptions status: Running
09/06/2022 11:01:32 AM AEST Transcriptions status: Running
09/06/2022 11:01:37 AM AEST Transcriptions status: Running
09/06/2022 11:01:42 AM AEST Transcriptions status: Running
09/06/2022 11:01:47 AM AEST Transcriptions status: Running
09/06/2022 11:01:52 AM AEST Transcriptions status: Running
09/06/2022 11:01:57 AM AEST Transcriptions status: Running
09/06/2022 11:02:02 AM AEST Transcriptions status: Running
09/06/2022 11:02:07 AM AEST Transcr

 17%|█▋        | 2/12 [07:30<37:31, 225.19s/it]

Transcribing cancer-risk-diet.wav...
09/06/2022 11:04:51 AM AEST Starting transcription client...
09/06/2022 11:04:52 AM AEST Created new transcription with id '6464c625-e6c4-4ef6-bcd3-cde9a323be39' in region australiaeast
09/06/2022 11:04:52 AM AEST Checking status.
09/06/2022 11:04:57 AM AEST Transcriptions status: Running
09/06/2022 11:05:02 AM AEST Transcriptions status: Running
09/06/2022 11:05:07 AM AEST Transcriptions status: Running
09/06/2022 11:05:12 AM AEST Transcriptions status: Running
09/06/2022 11:05:17 AM AEST Transcriptions status: Running
09/06/2022 11:05:22 AM AEST Transcriptions status: Running
09/06/2022 11:05:27 AM AEST Transcriptions status: Running
09/06/2022 11:05:32 AM AEST Transcriptions status: Running
09/06/2022 11:05:37 AM AEST Transcriptions status: Running
09/06/2022 11:05:43 AM AEST Transcriptions status: Running
09/06/2022 11:05:48 AM AEST Transcriptions status: Running
09/06/2022 11:05:53 AM AEST Transcriptions status: Running
09/06/2022 11:05:58 AM A

 25%|██▌       | 3/12 [11:52<36:18, 242.01s/it]

Transcribing concerns-when-humans-and-machines-merge.wav...
09/06/2022 11:09:13 AM AEST Starting transcription client...
09/06/2022 11:09:14 AM AEST Created new transcription with id 'f55d5866-8008-4dfe-870d-fa5958f73808' in region australiaeast
09/06/2022 11:09:14 AM AEST Checking status.
09/06/2022 11:09:19 AM AEST Transcriptions status: Running
09/06/2022 11:09:24 AM AEST Transcriptions status: Running
09/06/2022 11:09:29 AM AEST Transcriptions status: Running
09/06/2022 11:09:34 AM AEST Transcriptions status: Running
09/06/2022 11:09:40 AM AEST Transcriptions status: Running
09/06/2022 11:09:45 AM AEST Transcriptions status: Running
09/06/2022 11:09:50 AM AEST Transcriptions status: Running
09/06/2022 11:09:55 AM AEST Transcriptions status: Running
09/06/2022 11:10:00 AM AEST Transcriptions status: Running
09/06/2022 11:10:05 AM AEST Transcriptions status: Running
09/06/2022 11:10:10 AM AEST Transcriptions status: Running
09/06/2022 11:10:15 AM AEST Transcriptions status: Running
0

 33%|███▎      | 4/12 [16:08<33:01, 247.71s/it]

Transcribing deadly-falling-coins.wav...
09/06/2022 11:13:30 AM AEST Starting transcription client...
09/06/2022 11:13:30 AM AEST Created new transcription with id 'ef73ad33-2eba-4167-a788-d047865d8207' in region australiaeast
09/06/2022 11:13:30 AM AEST Checking status.
09/06/2022 11:13:35 AM AEST Transcriptions status: Running
09/06/2022 11:13:41 AM AEST Transcriptions status: Running
09/06/2022 11:13:46 AM AEST Transcriptions status: Running
09/06/2022 11:13:51 AM AEST Transcriptions status: Running
09/06/2022 11:13:56 AM AEST Transcriptions status: Running
09/06/2022 11:14:01 AM AEST Transcriptions status: Running
09/06/2022 11:14:06 AM AEST Transcriptions status: Running
09/06/2022 11:14:11 AM AEST Transcriptions status: Running
09/06/2022 11:14:16 AM AEST Transcriptions status: Running
09/06/2022 11:14:21 AM AEST Transcriptions status: Running
09/06/2022 11:14:26 AM AEST Transcriptions status: Running
09/06/2022 11:14:31 AM AEST Transcriptions status: Running
09/06/2022 11:14:37 

 42%|████▏     | 5/12 [18:02<23:14, 199.27s/it]

Transcribing exhibit-shows-voyages-of-ancient-cultures-around-the-coral-sea.wav...
09/06/2022 11:15:23 AM AEST Starting transcription client...
09/06/2022 11:15:24 AM AEST Created new transcription with id '4f9b3a2b-743d-4845-8bd6-369150282fd0' in region australiaeast
09/06/2022 11:15:24 AM AEST Checking status.
09/06/2022 11:15:29 AM AEST Transcriptions status: Running
09/06/2022 11:15:34 AM AEST Transcriptions status: Running
09/06/2022 11:15:39 AM AEST Transcriptions status: Running
09/06/2022 11:15:44 AM AEST Transcriptions status: Running
09/06/2022 11:15:49 AM AEST Transcriptions status: Running
09/06/2022 11:15:54 AM AEST Transcriptions status: Running
09/06/2022 11:15:59 AM AEST Transcriptions status: Running
09/06/2022 11:16:04 AM AEST Transcriptions status: Running
09/06/2022 11:16:09 AM AEST Transcriptions status: Running
09/06/2022 11:16:14 AM AEST Transcriptions status: Running
09/06/2022 11:16:19 AM AEST Transcriptions status: Running
09/06/2022 11:16:25 AM AEST Transcrip

 50%|█████     | 6/12 [23:28<24:14, 242.38s/it]

Transcribing how-dino-feet-supported-the-reptilian-giants.wav...
09/06/2022 11:20:49 AM AEST Starting transcription client...
09/06/2022 11:20:50 AM AEST Created new transcription with id 'd7b2e3b6-3861-43cc-aff3-4c2189500234' in region australiaeast
09/06/2022 11:20:50 AM AEST Checking status.
09/06/2022 11:20:55 AM AEST Transcriptions status: Running
09/06/2022 11:21:00 AM AEST Transcriptions status: Running
09/06/2022 11:21:05 AM AEST Transcriptions status: Running
09/06/2022 11:21:10 AM AEST Transcriptions status: Running
09/06/2022 11:21:15 AM AEST Transcriptions status: Running
09/06/2022 11:21:20 AM AEST Transcriptions status: Running
09/06/2022 11:21:25 AM AEST Transcriptions status: Running
09/06/2022 11:21:30 AM AEST Transcriptions status: Running
09/06/2022 11:21:35 AM AEST Transcriptions status: Running
09/06/2022 11:21:40 AM AEST Transcriptions status: Running
09/06/2022 11:21:46 AM AEST Transcriptions status: Running
09/06/2022 11:21:51 AM AEST Transcriptions status: Runn

 58%|█████▊    | 7/12 [26:57<19:17, 231.54s/it]

Transcribing mapping-covid-genes.wav...
09/06/2022 11:24:18 AM AEST Starting transcription client...
09/06/2022 11:24:19 AM AEST Created new transcription with id '4210c5d1-2a71-4048-91d7-6dff0ba4347e' in region australiaeast
09/06/2022 11:24:19 AM AEST Checking status.
09/06/2022 11:24:24 AM AEST Transcriptions status: Running
09/06/2022 11:24:29 AM AEST Transcriptions status: Running
09/06/2022 11:24:34 AM AEST Transcriptions status: Running
09/06/2022 11:24:39 AM AEST Transcriptions status: Running
09/06/2022 11:24:44 AM AEST Transcriptions status: Running
09/06/2022 11:24:49 AM AEST Transcriptions status: Running
09/06/2022 11:24:55 AM AEST Transcriptions status: Running
09/06/2022 11:25:00 AM AEST Transcriptions status: Running
09/06/2022 11:25:05 AM AEST Transcriptions status: Running
09/06/2022 11:25:10 AM AEST Transcriptions status: Running
09/06/2022 11:25:15 AM AEST Transcriptions status: Running
09/06/2022 11:25:20 AM AEST Transcriptions status: Running
09/06/2022 11:25:25 A

 67%|██████▋   | 8/12 [29:15<13:27, 201.79s/it]

Transcribing new-dung-beetles-on-their-way-to-australia.wav...
09/06/2022 11:26:36 AM AEST Starting transcription client...
09/06/2022 11:26:37 AM AEST Created new transcription with id '24ee8fcc-f91a-485c-ac7d-277ecfa83f79' in region australiaeast
09/06/2022 11:26:37 AM AEST Checking status.
09/06/2022 11:26:42 AM AEST Transcriptions status: Running
09/06/2022 11:26:47 AM AEST Transcriptions status: Running
09/06/2022 11:26:52 AM AEST Transcriptions status: Running
09/06/2022 11:26:57 AM AEST Transcriptions status: Running
09/06/2022 11:27:02 AM AEST Transcriptions status: Running
09/06/2022 11:27:08 AM AEST Transcriptions status: Running
09/06/2022 11:27:13 AM AEST Transcriptions status: Running
09/06/2022 11:27:18 AM AEST Transcriptions status: Running
09/06/2022 11:27:23 AM AEST Transcriptions status: Running
09/06/2022 11:27:28 AM AEST Transcriptions status: Running
09/06/2022 11:27:33 AM AEST Transcriptions status: Running
09/06/2022 11:27:38 AM AEST Transcriptions status: Runnin

 75%|███████▌  | 9/12 [34:17<11:39, 233.15s/it]

Transcribing overusing-vitamins-health-risk.wav...
09/06/2022 11:31:39 AM AEST Starting transcription client...
09/06/2022 11:31:39 AM AEST Created new transcription with id '47eb44c1-0017-4e01-9982-f296e091c371' in region australiaeast
09/06/2022 11:31:39 AM AEST Checking status.
09/06/2022 11:31:44 AM AEST Transcriptions status: Running
09/06/2022 11:31:49 AM AEST Transcriptions status: Running
09/06/2022 11:31:54 AM AEST Transcriptions status: Running
09/06/2022 11:31:59 AM AEST Transcriptions status: Running
09/06/2022 11:32:04 AM AEST Transcriptions status: Running
09/06/2022 11:32:09 AM AEST Transcriptions status: Running
09/06/2022 11:32:15 AM AEST Transcriptions status: Running
09/06/2022 11:32:20 AM AEST Transcriptions status: Running
09/06/2022 11:32:25 AM AEST Transcriptions status: Running
09/06/2022 11:32:30 AM AEST Transcriptions status: Running
09/06/2022 11:32:35 AM AEST Transcriptions status: Running
09/06/2022 11:32:40 AM AEST Transcriptions status: Running
09/06/2022

 83%|████████▎ | 10/12 [36:00<06:25, 192.81s/it]

Transcribing science-stars-shine-at-the-2022-eureka-science-prizes.wav...
09/06/2022 11:33:21 AM AEST Starting transcription client...
09/06/2022 11:33:21 AM AEST Created new transcription with id '51b990ea-dda5-4878-ab94-99e8b286c146' in region australiaeast
09/06/2022 11:33:21 AM AEST Checking status.
09/06/2022 11:33:27 AM AEST Transcriptions status: Running
09/06/2022 11:33:32 AM AEST Transcriptions status: Running
09/06/2022 11:33:37 AM AEST Transcriptions status: Running
09/06/2022 11:33:42 AM AEST Transcriptions status: Running
09/06/2022 11:33:47 AM AEST Transcriptions status: Running
09/06/2022 11:33:52 AM AEST Transcriptions status: Running
09/06/2022 11:33:57 AM AEST Transcriptions status: Running
09/06/2022 11:34:02 AM AEST Transcriptions status: Running
09/06/2022 11:34:07 AM AEST Transcriptions status: Running
09/06/2022 11:34:12 AM AEST Transcriptions status: Running
09/06/2022 11:34:17 AM AEST Transcriptions status: Running
09/06/2022 11:34:22 AM AEST Transcriptions sta

 92%|█████████▏| 11/12 [38:58<03:08, 188.57s/it]

Transcribing sugar-label-shopping-habits.wav...
09/06/2022 11:36:20 AM AEST Starting transcription client...
09/06/2022 11:36:20 AM AEST Created new transcription with id '7ac5ea68-8f01-4933-88e4-6e684a6927cc' in region australiaeast
09/06/2022 11:36:20 AM AEST Checking status.
09/06/2022 11:36:26 AM AEST Transcriptions status: Running
09/06/2022 11:36:31 AM AEST Transcriptions status: Running
09/06/2022 11:36:36 AM AEST Transcriptions status: Running
09/06/2022 11:36:41 AM AEST Transcriptions status: Running
09/06/2022 11:36:46 AM AEST Transcriptions status: Running
09/06/2022 11:36:51 AM AEST Transcriptions status: Running
09/06/2022 11:36:56 AM AEST Transcriptions status: Running
09/06/2022 11:37:01 AM AEST Transcriptions status: Running
09/06/2022 11:37:06 AM AEST Transcriptions status: Running
09/06/2022 11:37:11 AM AEST Transcriptions status: Running
09/06/2022 11:37:16 AM AEST Transcriptions status: Running
09/06/2022 11:37:21 AM AEST Transcriptions status: Running
09/06/2022 11

100%|██████████| 12/12 [40:51<00:00, 204.32s/it]
