In [None]:
import json
import time
from tqdm import tqdm
import sys
import shutil
from pathlib import Path
import logging

import pandas as pd
from jiwer import wer

import os


## MP3 Conversion
- Convert the source audio to 16 kHz mono 16-bit PCM WAV, mainly for GCP input constraints, but also for consistency with the OS implementation

In [None]:
def resample_normalise_audio(in_file, out_file=None, sample_rate=16000):
    """Convert an audio file to mono 16-bit PCM WAV at ``sample_rate`` via ffmpeg.

    Args:
        in_file: Path of the audio file to convert.
        out_file: Destination path. When ``None``, the output is written next
            to ``in_file`` as ``<name-without-extension>_<sample_rate>.wav``.
        sample_rate: Target sample rate in Hz.

    Returns:
        The output path (``out_file`` as passed in, or the derived default).
        The path is returned even when ffmpeg fails; a warning is logged in
        that case, so conversion stays best-effort for batch callers.

    Raises:
        ValueError: If ``in_file`` does not exist.
    """
    import subprocess  # local import so the function does not depend on cell order

    if not os.path.exists(in_file):
        raise ValueError(f"{in_file} not found")
    if out_file is None:
        # splitext strips only the final extension; a plain str.replace on the
        # extension would also hit earlier occurrences (or every character
        # boundary when the file has no extension at all).
        out_file = f"{os.path.splitext(os.fspath(in_file))[0]}_{sample_rate}.wav"

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and prevents command injection.
    command = [
        "ffmpeg",
        "-y",  # overwrite an existing output file without prompting
        "-i", os.fspath(in_file),
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-af", "aresample=resampler=soxr",
        "-ar", str(sample_rate),
        os.fspath(out_file),
    ]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as exc:
        # Typically raised when the ffmpeg executable cannot be launched.
        logging.warning("Could not run ffmpeg for %s: %s", in_file, exc)
    else:
        if completed.returncode != 0:
            logging.warning(
                "ffmpeg exited with status %s while converting %s",
                completed.returncode, in_file)
    return out_file


# Manifest of episodes; the `audio_path` column drives the conversion below.
transcript_manifest = pd.read_csv('../output/radio_national_podcasts/manifest.csv')

# Recreate the WAV output directory from scratch on every run.
output_dir = Path("../output/radio_national_podcasts/audio/wav")
if output_dir.exists():
    shutil.rmtree(str(output_dir))
output_dir.mkdir(parents=True, exist_ok=True)

# Each file is written to a `wav` directory that is a sibling of the source
# file's own directory, keeping the original stem.
for source_audio in transcript_manifest.audio_path:
    input_path = Path(source_audio)
    output_path = input_path.parents[1] / "wav" / f"{input_path.stem}.wav"
    resample_normalise_audio(str(input_path), str(output_path))


## OS

In [None]:
# Add the parent directory to the import path before importing `asr`;
# presumably that is where the local `asr` module lives (verify against the repo layout).
sys.path.append('..')
from asr import transcribe_mono_audio

In [None]:
# Run `transcribe_mono_audio` over the first five manifest entries, recording
# the joined transcript text and the wall-clock time of each call.
os_transcript_records = []
for audio_path in tqdm(transcript_manifest.head(5).audio_path):
    started_at = time.time()
    transcript = transcribe_mono_audio(audio_path)
    elapsed = time.time() - started_at
    hypothesis = ' '.join(transcript.transcript.tolist())
    os_transcript_records.append(
        {'hypothesis': hypothesis, 'elapsed_time': elapsed, 'provider': 'os'}
    )

In [None]:
# TODO: remove "this is an ABC podcast" intro from hypothesis
# Pair the first manifest rows with the hypotheses column-wise, score each row
# with jiwer's `wer`, and keep only the columns needed for inspection.
os_wer = (
    pd.concat(
        [transcript_manifest.head(), pd.DataFrame(os_transcript_records)],
        axis=1,
    )
    .assign(
        wer=lambda frame: frame.apply(
            lambda row: wer(row.transcript, row.hypothesis), axis=1
        )
    )
    .loc[:, ['transcript', 'hypothesis', 'stem', 'transcript_len', 'wer']]
)

## GCP
- Huge chunks of the transcript appear to be missing when using the async (long-running) method?
- Probably use telephony model instead
- Consider using streams instead of batch

In [None]:
from google.cloud import storage
from google.cloud import speech

project = 'hobby-358221'
bucket_name = 'blog-os-asr'

storage_client = storage.Client(project=project)
bucket = storage_client.get_bucket(bucket_name)
blobs = bucket.list_blobs()

# Build a gs:// URI for every object listed in the bucket.
gcp_uris = []
for blob in blobs:
    gcp_uris.append(f"gs://{bucket_name}/{blob.name}")

In [None]:
speech_client = speech.SpeechClient()


def transcribe_gcs(gcs_uri, sample_rate_hertz=16000, language_code="en-US", timeout=None):
    """Transcribe a LINEAR16 audio file stored in Cloud Storage.

    Submits a long-running recognition request and then blocks until the
    operation has finished, so the call itself is synchronous for the caller.

    Args:
        gcs_uri: ``gs://bucket/object`` URI of the audio file.
        sample_rate_hertz: Sample rate declared in the recognition config.
        language_code: Language tag declared in the recognition config.
        timeout: Optional number of seconds to wait for the operation. When
            ``None`` the client library's default wait behaviour is used.

    Returns:
        The top alternative of every result, stripped and joined with single
        spaces. Results that carry no alternatives are skipped.
    """
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )
    operation = speech_client.long_running_recognize(config=config, audio=audio)
    # Forward a timeout only when one was requested, so the library's own
    # default is left untouched otherwise.
    if timeout is None:
        response = operation.result()
    else:
        response = operation.result(timeout=timeout)
    # Guard against results without alternatives instead of raising IndexError.
    pieces = (
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    )
    return ' '.join(pieces).strip()

In [None]:
# Recreate the GCP transcript directory from scratch on every run.
gcp_output_dir = Path("../output/radio_national_podcasts/transcripts/gcp")
if gcp_output_dir.exists():
    shutil.rmtree(str(gcp_output_dir))
gcp_output_dir.mkdir(parents=True, exist_ok=True)

# Transcribe every bucket object and store one JSON record per file.
for gcp_uri in tqdm(gcp_uris):
    try:
        print(f"Transcribing {gcp_uri}...")
        before = time.time()
        gcp_res = transcribe_gcs(gcp_uri)
        after = time.time()
        gcp_transcript_record = {'hypothesis': gcp_res,
                                 'elapsed_time': after - before,
                                 'provider': 'gcp'}
        gcp_transcript_path = gcp_output_dir / f"{Path(gcp_uri).stem}.json"
        gcp_transcript_path.write_text(json.dumps(gcp_transcript_record))
    except Exception as exc:
        # Best-effort batch: carry on with the remaining files, but report the
        # cause so failures can be diagnosed.
        print(f"Unable to transcribe: {gcp_uri} ({exc!r})")


In [None]:
# Alternative under consideration: run the long-running recognition through the gcloud CLI.
# NOTE(review): these lines are shell commands, not Python; inside a notebook
# code cell they need a `%%bash` cell magic or a leading `!` to execute.
# Submit the job; --async is passed, so the transcript is fetched separately below.
gcloud ml speech recognize-long-running \
    'gs://blog-os-asr/test.wav' \
     --language-code='en-US' --async

# poll result
# NOTE(review): the operation ID is hard-coded, presumably copied from an
# earlier submission; substitute the ID of the job you want to inspect.
gcloud ml speech operations describe 1558607248830316847

## AWS

In [None]:
import s3fs
# List the contents of the 'blog-os-asr' S3 bucket; each entry is later
# prefixed with "s3://" to form the media URI for Amazon Transcribe.
fs = s3fs.S3FileSystem()
aws_uris = fs.ls('blog-os-asr')


In [None]:
import boto3
from botocore.exceptions import ClientError
import requests
from transcribe_util import CustomWaiter, WaitState

# Module-level logger used by the Transcribe helper functions in this cell.
logger = logging.getLogger(__name__)


class TranscribeCompleteWaiter(CustomWaiter):
    """Waiter keyed on the status of an Amazon Transcribe job.

    Configures `CustomWaiter` to query `GetTranscriptionJob` and to treat the
    job status `COMPLETED` as success and `FAILED` as failure.
    """

    def __init__(self, client):
        """Set up the waiter for the given Transcribe client."""
        # Job statuses mapped onto the waiter's outcome states.
        terminal_states = {
            'COMPLETED': WaitState.SUCCESS,
            'FAILED': WaitState.FAILURE,
        }
        super().__init__(
            'TranscribeComplete',
            'GetTranscriptionJob',
            'TranscriptionJob.TranscriptionJobStatus',
            terminal_states,
            client,
        )

    def wait(self, job_name):
        """Wait on the transcription job called `job_name`."""
        self._wait(TranscriptionJobName=job_name)


def start_job(
        job_name, media_uri, media_format, language_code, transcribe_client,
        vocabulary_name=None):
    """Start a transcription job and return its description.

    :param job_name: Name given to the transcription job.
    :param media_uri: URI of the media file to transcribe.
    :param media_format: Format of the media file (for example 'wav').
    :param language_code: Language code of the audio.
    :param transcribe_client: Client whose `start_transcription_job` is called.
    :param vocabulary_name: Optional custom vocabulary; sent in `Settings`
                            only when it is not None.
    :return: The 'TranscriptionJob' entry of the service response.
    :raises ClientError: Logged, then re-raised, when the request fails.
    """
    request = {
        'TranscriptionJobName': job_name,
        'Media': {'MediaFileUri': media_uri},
        'MediaFormat': media_format,
        'LanguageCode': language_code,
    }
    if vocabulary_name is not None:
        request['Settings'] = {'VocabularyName': vocabulary_name}

    try:
        response = transcribe_client.start_transcription_job(**request)
    except ClientError:
        logger.exception("Couldn't start transcription job %s.", job_name)
        raise

    job = response['TranscriptionJob']
    logger.info("Started transcription job %s.", job_name)
    return job


def get_job(job_name, transcribe_client):
    """Fetch the description of an existing transcription job.

    :param job_name: Name of the job to look up.
    :param transcribe_client: Client whose `get_transcription_job` is called.
    :return: The 'TranscriptionJob' entry of the service response.
    :raises ClientError: Logged, then re-raised, when the request fails.
    """
    try:
        response = transcribe_client.get_transcription_job(
            TranscriptionJobName=job_name)
    except ClientError:
        logger.exception("Couldn't get job %s.", job_name)
        raise

    job = response['TranscriptionJob']
    logger.info("Got job %s.", job['TranscriptionJobName'])
    return job


def list_jobs(job_filter, transcribe_client):
    """Collect the summaries of all jobs whose name contains `job_filter`.

    Keeps requesting pages for as long as the response carries a 'NextToken'.

    :param job_filter: Substring passed as `JobNameContains`.
    :param transcribe_client: Client whose `list_transcription_jobs` is called.
    :return: List of the 'TranscriptionJobSummaries' entries from every page.
    :raises ClientError: Logged, then re-raised, when a request fails.
    """
    request = {'JobNameContains': job_filter}
    jobs = []
    try:
        while True:
            page = transcribe_client.list_transcription_jobs(**request)
            jobs.extend(page['TranscriptionJobSummaries'])
            next_token = page.get('NextToken')
            if next_token is None:
                break
            # Ask for the following page on the next iteration.
            request['NextToken'] = next_token
        logger.info("Got %s jobs with filter %s.", len(jobs), job_filter)
    except ClientError:
        logger.exception("Couldn't get jobs with filter %s.", job_filter)
        raise
    return jobs


def delete_job(job_name, transcribe_client):
    """Delete the transcription job called `job_name`.

    :param job_name: Name of the job to delete.
    :param transcribe_client: Client whose `delete_transcription_job` is called.
    :raises ClientError: Logged, then re-raised, when the request fails.
    """
    try:
        transcribe_client.delete_transcription_job(TranscriptionJobName=job_name)
    except ClientError:
        logger.exception("Couldn't delete job %s.", job_name)
        raise
    else:
        logger.info("Deleted job %s.", job_name)


# Amazon Transcribe client shared by the helper calls in the cells below;
# no region or credentials are passed here, so boto3's default configuration applies.
transcribe_client = boto3.client('transcribe')

In [None]:
# Recreate the AWS transcript directory from scratch on every run.
aws_output_dir = Path("../output/radio_national_podcasts/transcripts/aws")
if aws_output_dir.exists():
    shutil.rmtree(str(aws_output_dir))
aws_output_dir.mkdir(parents=True, exist_ok=True)

# Transcribe every S3 object and store one JSON record per file.
for aws_uri in tqdm(aws_uris):
    # The job is named after the file, so it has to be deleted afterwards or
    # the same name cannot be reused on a later run.
    job_name_simple = Path(aws_uri).name
    job_started = False
    try:
        print(f"Transcribing {aws_uri}...")
        before = time.time()

        print(f"Starting transcription job {job_name_simple}.")
        start_job(job_name_simple,
                  f's3://{aws_uri}', 'wav', 'en-US', transcribe_client)
        job_started = True
        transcribe_waiter = TranscribeCompleteWaiter(transcribe_client)
        transcribe_waiter.wait(job_name_simple)
        job_simple = get_job(job_name_simple, transcribe_client)
        # Bound the download and fail loudly on an HTTP error instead of
        # trying to parse an error page as the transcript.
        transcript_response = requests.get(
            job_simple['Transcript']['TranscriptFileUri'], timeout=60)
        transcript_response.raise_for_status()
        transcript_simple = transcript_response.json()
        after = time.time()
        aws_transcript_record = {'hypothesis': transcript_simple['results']['transcripts'][0]['transcript'],
                                 'elapsed_time': after - before,
                                 'provider': 'aws'}
        aws_transcript_path = aws_output_dir / f"{Path(aws_uri).stem}.json"
        aws_transcript_path.write_text(json.dumps(aws_transcript_record))
    except Exception as exc:
        # Best-effort batch: carry on with the remaining files, but report the
        # cause so failures can be diagnosed.
        print(f"Unable to transcribe: {aws_uri} ({exc!r})")
    finally:
        # Delete the job this iteration created, whether or not transcription
        # succeeded. The previous clean-up filtered on the literal 'test.wav'
        # and therefore left the jobs of every other file behind.
        if job_started:
            try:
                delete_job(job_name_simple, transcribe_client)
            except Exception as exc:
                print(f"Unable to delete job {job_name_simple}: ({exc!r})")


In [None]:
# Remove every transcription job whose name contains 'test.wav'.
stale_jobs = list_jobs('test.wav', transcribe_client)
for stale_job in stale_jobs:
    delete_job(stale_job['TranscriptionJobName'], transcribe_client)


## Azure

In [None]:
# Azure Speech settings are read from environment variables so that no
# credentials are stored in the notebook; a missing variable raises KeyError.
speech_key = os.environ['azure_asr_key']
service_region = os.environ['azure_asr_region']
endpoint = os.environ['azure_asr_endpoint']

In [None]:
# def speech_recognise_continuous_from_file():
#     speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
#     audio_config = speechsdk.audio.AudioConfig(filename="/Users/samhardyhey/Desktop/blog/blog-os-asr/output/radio_national_podcasts/audio/wav/sugar-label-shopping-habits.wav")

#     speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

#     done = False

#     def stop_cb(evt: speechsdk.SessionEventArgs):
#         """callback that signals to stop continuous recognition upon receiving an event `evt`"""
#         print('CLOSING on {}'.format(evt))
#         nonlocal done
#         done = True

#     # Connect callbacks to the events fired by the speech recognizer
#     speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
#     speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
#     speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
#     speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
#     speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
#     # stop continuous recognition on either session stopped or canceled events
#     speech_recognizer.session_stopped.connect(stop_cb)
#     speech_recognizer.canceled.connect(stop_cb)

#     # Start continuous speech recognition
#     speech_recognizer.start_continuous_recognition()
#     while not done:
#         time.sleep(.5)

#     speech_recognizer.stop_continuous_recognition()
#     return speech_recognition_result.text
#     # </SpeechContinuousRecognitionWithFile>

# res = speech_recognise_continuous_from_file()


In [35]:
import azure.cognitiveservices.speech as speechsdk

def azure_transcribe_audio(file_path, language="en-US"):
    """Transcribe an audio file with the Azure Speech SDK.

    Uses the module-level `speech_key` and `service_region` read from the
    environment in the Azure configuration cell.

    Args:
        file_path: Path of the audio file to recognise.
        language: Recognition language tag.

    Returns:
        The recognised text, or ``None`` when the result reason is anything
        other than ``RecognizedSpeech``.
    """
    # These names must match the configuration cell; the previous
    # `azure_asr_key` / `azure_asr_region` were never defined in the notebook.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    speech_config.speech_recognition_language = language
    audio_config = speechsdk.audio.AudioConfig(filename=file_path)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # NOTE(review): recognize_once() is documented as single-utterance
    # recognition, so a long recording presumably yields only its first
    # utterance here. Confirm, and consider continuous recognition for
    # full-episode transcripts.
    speech_recognition_result = speech_recognizer.recognize_once()
    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return speech_recognition_result.text
    return None

# Smoke-test on a single converted episode. The path is relative to the
# notebook, like every other path in this file, rather than tied to one
# machine's home directory.
res = azure_transcribe_audio("../output/radio_national_podcasts/audio/wav/sugar-label-shopping-habits.wav")

In [36]:
# Show the value returned by azure_transcribe_audio in the previous cell.
res

