## Echo360 Transcribe Project

This notebook was tested in Amazon SageMaker Studio on an ml.t3.medium instance with the Python 3 (Data Science) kernel.




**asr-evaluation** is a Python module for evaluating ASR hypotheses (i.e. word error rate and word recognition rate).

You can read more here:
https://github.com/belambert/asr-evaluation
    

In [16]:
!pip install asr-evaluation



## File Descriptions
- 180108_GT.txt                          - The text of the mp3 file as produced via Ground Truth
- 180108-echo360-140-wiki.txt            - The text of the mp3 file from standard Transcribe model
- 180108-echo360-140-wiki-textbook.txt   - The text of the mp3 file from Transcribe with Custom Language Model


In [17]:
import json
import time
import sys
import urllib.request

In the next cells we create the variables and functions we will use later.


In [18]:
# region handed to the aws cli commands, and the S3 prefix of the audio files
region = 'us-east-1'
media_source_path = 's3://am-echo360/mp3/'

# recording to transcribe (the other recording used so far is '180108')
job_number = '180326'
job_version = 'm'
media_format = 'mp3'

# S3 uri of the audio file: <prefix><job_number>.<media_format>
input_file_uri = media_source_path + job_number + '.' + media_format

# the epoch-seconds suffix gives every run its own job name
uid = int(time.time())
job_name = '-'.join(['job', job_number, media_format, job_version, str(uid)])

# reference text for the comparison, and the file the transcript is saved to
reference_filename = './reference/' + job_number + '_GT.txt'
results_filename = './output/' + job_name + '.txt'


#### Functions


In [19]:
def start_transcription_job(input_file_uri, media_format, job_number, job_name):
    bucket = 'am-echo360'
    job_filename = f'./jobs/{job_name}.json'
    job = {
        "TranscriptionJobName": job_name, 
        "LanguageCode": "en-US", 
        "MediaFormat": media_format, 
        "Media": {
            "MediaFileUri": input_file_uri
        }
    }

    # save the json file for passing to the aws cli transcribe command
    with open(job_filename, "w") as write_file:
        json.dump(job, write_file)

    # and start the transcription job
    !aws transcribe start-transcription-job \
         --region $region \
         --cli-input-json file://$job_filename 
        
        
def get_transcription_results(job_name, results_fname):
    
    out_fname = f'./output/{job_name}.txt'

    # don't run this cell until the job is finished
    results = !aws transcribe get-transcription-job \
       --region $region \
       --transcription-job-name $job_name
    
    if (results[0] == ''):
        return(results[1])
    
    else:
        
        # flatten the list of strings so we can load json string into an var
        s_results = ''
        for r in results:
            s_results = s_results + r
    
        # load json string into a var
        json_results = json.loads(s_results)
        job_status = json_results['TranscriptionJob']['TranscriptionJobStatus']

        if job_status == 'COMPLETED':

            # get the URL of the file containing the transcription results
            transcript = json_results['TranscriptionJob']['Transcript']['TranscriptFileUri']

            # download, note the transcription file is in json format
            with urllib.request.urlopen(transcript) as url:
                data = json.loads(url.read().decode())

            # save the text of the transcript to a file
            transcript_data = data['results']['transcripts'][0]['transcript']
            outfile = open(results_fname, 'w')
            outfile.write(transcript_data)
            outfile.close()
            
        return(job_status)
    

In [20]:
# submit the transcription job for the mp3 file
start_transcription_job(
    input_file_uri=input_file_uri,
    media_format=media_format,
    job_number=job_number,
    job_name=job_name,
)


{
    "TranscriptionJob": {
        "TranscriptionJobName": "job-180326-mp3-m-1618014273",
        "TranscriptionJobStatus": "IN_PROGRESS",
        "LanguageCode": "en-US",
        "MediaFormat": "mp3",
        "Media": {
            "MediaFileUri": "s3://am-echo360/mp3/180326.mp3"
        },
        "StartTime": 1618014273.959,
        "CreationTime": 1618014273.935
    }
}


In [21]:
%%time

# now wait for the transcription job to complete
# A job can still be QUEUED before it is IN_PROGRESS, so keep polling in both
# states; any other value returned by get_transcription_results ends the wait.
while True:
    status = get_transcription_results(job_name, results_filename)
    if status not in ('QUEUED', 'IN_PROGRESS'):
        break

    # one dot per poll as a progress indicator, then wait before asking again
    sys.stdout.write('.')
    time.sleep(10)

print(f'\nJob Name: {job_name}')
print(status)


....................................................
Job Name: job-180326-mp3-m-1618014273
COMPLETED
CPU times: user 189 ms, sys: 115 ms, total: 304 ms
Wall time: 9min 15s


In [22]:
%%time

# now perform the Word Error Rate analysis
if status == 'COMPLETED':
    !wer $reference_filename $results_filename

Sentence count: 1
WER:    33.094% (      2164 /       6539)
WRR:    70.408% (      4604 /       6539)
SER:   100.000% (         1 /          1)
CPU times: user 1.16 s, sys: 138 ms, total: 1.3 s
Wall time: 1min 31s


# Now repeat with a WAV file 
The .wav file is the original mp3 loaded into Audacity, amplified to a peak of 0.0 dB, and then saved as a 16-bit WAV file.


In [23]:
# switch the job settings over to the wav file of the same recording
media_format = 'wav'
input_file_uri = media_source_path + job_number + '.' + media_format
job_name = '-'.join(['job', job_number, media_format, job_version, str(uid)])
results_filename = './output/' + job_name + '.txt'

# submit the transcription job for the wav file
start_transcription_job(
    input_file_uri=input_file_uri,
    media_format=media_format,
    job_number=job_number,
    job_name=job_name,
)


{
    "TranscriptionJob": {
        "TranscriptionJobName": "job-180326-wav-m-1618014273",
        "TranscriptionJobStatus": "IN_PROGRESS",
        "LanguageCode": "en-US",
        "MediaFormat": "wav",
        "Media": {
            "MediaFileUri": "s3://am-echo360/mp3/180326.wav"
        },
        "StartTime": 1618014921.839,
        "CreationTime": 1618014921.819
    }
}


In [None]:
%%time

# now wait for the transcription job to complete
# A job can still be QUEUED before it is IN_PROGRESS, so keep polling in both
# states; any other value returned by get_transcription_results ends the wait.
while True:
    status = get_transcription_results(job_name, results_filename)
    if status not in ('QUEUED', 'IN_PROGRESS'):
        break

    # one dot per poll as a progress indicator, then wait before asking again
    sys.stdout.write('.')
    time.sleep(10)

print(f'\nJob Name: {job_name}')
print(status)


..............

In [None]:
%%time

# now perform the Word Error Rate analysis
if status == 'COMPLETED':
    !wer $reference_filename $results_filename