# Your Guide to AWS Transcribe in Python!

**Run several Transcribe jobs at once using the code below:**

In [None]:
#importing needed libraries

import datetime
import json
import logging
import os
import time
from multiprocessing import Pool

import boto3
import pandas as pd
from boto.s3.connection import S3Connection
from botocore.exceptions import ClientError

In [None]:
#specify information below
# NOTE: these values are read as module-level globals by the functions in
# this notebook. Avoid committing real credentials to version control.

aws_access_key_id = ''      # AWS access key ID
aws_secret_access_key = ''  # AWS secret access key
region = ''                 # AWS region used for S3 and Transcribe
bucket_name = ''            # S3 bucket that will hold the audio files
audio_file_path = ''        # local directory containing the audio files
output_path = ''            # local directory for the transcript text files

In [None]:
# Make sure the transcript output directory exists. makedirs also creates
# missing parent directories and does not fail if the directory is already
# there, which avoids the check-then-create race of isdir() + mkdir().
os.makedirs(output_path, exist_ok=True)

In [None]:
#first step is to create an S3 bucket
def create_bucket(bucket_name):
    """Create an S3 bucket in the module-level ``region``.

    Uses the module-level ``aws_access_key_id``, ``aws_secret_access_key``
    and ``region`` settings to build the S3 client.

    Args:
        bucket_name: Name of the bucket to create.

    Returns:
        True if the bucket was created, False if boto3 raised a
        ``ClientError`` (the error is logged).
    """
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region,
        )
        if region == 'us-east-1':
            # us-east-1 is S3's default region; sending it as an explicit
            # LocationConstraint is rejected, so omit the configuration.
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': region},
            )
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [None]:
#upload local audio files in S3 bucket

def upload_files(path, bucket_name):
    """Upload every file under ``path`` (recursively) to an S3 bucket.

    Each object key is the file's path relative to ``path``, using ``/`` as
    the separator regardless of the local operating system.

    Args:
        path: Local directory to walk.
        bucket_name: Name of the destination S3 bucket.
    """
    session = boto3.Session(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region,
    )
    bucket = session.resource('s3').Bucket(bucket_name)

    print("Uploading files...")

    for subdir, _dirs, files in os.walk(path):
        for file_name in files:
            full_path = os.path.join(subdir, file_name)
            # relpath is correct whether or not ``path`` ends with a
            # separator (slicing by len(path) + 1 was not), and S3 keys
            # always use forward slashes.
            key = os.path.relpath(full_path, path).replace(os.sep, '/')
            with open(full_path, 'rb') as data:
                bucket.put_object(Key=key, Body=data)

In [None]:
#function to get all the file names from an S3 bucket for Transcribe

def get_file_names(bucket_name):
    """Return the keys of all objects in an S3 bucket.

    Args:
        bucket_name: Name of the bucket to list.

    Returns:
        A list of object keys; empty if the bucket has no objects.
    """
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region,
    )
    # A single list_objects_v2 response is capped, so walk every page.
    paginator = s3.get_paginator('list_objects_v2')
    file_names = []
    for page in paginator.paginate(Bucket=bucket_name):
        # 'Contents' is absent from the response when nothing matches.
        file_names.extend(item['Key'] for item in page.get('Contents', []))
    return file_names

In [None]:
#run AWS Transcribe
#Job name will be the name of the Audio file
#max_speakers is the number of speakers in the audio file

def amazon_transcribe(audio_file_name, LangCode, bucket_name, max_speakers = -1):
    """Start a Transcribe job for one S3 object and wait until it finishes.

    The job name is the object name without its extension and with spaces
    removed. The media format is the (lower-cased) file extension. Uses the
    module-level ``transcribe`` client and ``region`` setting.

    Args:
        audio_file_name: Key of the audio object inside ``bucket_name``.
        LangCode: Language code passed to Transcribe (e.g. ``'en-US'``).
        bucket_name: S3 bucket holding the audio object.
        max_speakers: Maximum number of speakers to label. ``-1`` (default)
            sends no ``MaxSpeakerLabels`` setting.

    Returns:
        The final ``get_transcription_job`` response, whose job status is
        either ``'COMPLETED'`` or ``'FAILED'``.

    Raises:
        ValueError: If ``max_speakers`` exceeds 10 or the file name has no
            extension to derive the media format from.
    """
    if max_speakers > 10:
        raise ValueError("Maximum detected speakers is 10.")

    # splitext uses the LAST dot, so names such as "call.2021.mp3" yield the
    # real extension instead of the text after the first dot.
    base_name, extension = os.path.splitext(audio_file_name)
    media_format = extension[1:].lower()
    if not media_format:
        raise ValueError(
            "Cannot determine media format of '" + audio_file_name + "'."
        )

    job_uri = "https://" + bucket_name + ".s3." + region + ".amazonaws.com/" + audio_file_name
    job_name = base_name.replace(" ", "")

    settings = {'ShowSpeakerLabels': True}
    if max_speakers != -1:
        settings['MaxSpeakerLabels'] = max_speakers
    # NOTE(review): with max_speakers == -1 speaker labels are requested
    # without MaxSpeakerLabels; confirm the service accepts that combination.

    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': job_uri},
        MediaFormat=media_format,
        LanguageCode=LangCode,
        Settings=settings,
    )

    # Poll until the job reaches a terminal state, pausing between requests
    # so the API is not called in a tight loop.
    while True:
        result = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        if result['TranscriptionJob']['TranscriptionJobStatus'] in ('COMPLETED', 'FAILED'):
            break
        time.sleep(15)

    return result

In [None]:
#read output from AWS transcribe and store output in a text file

def read_output(data, audio_file_name):
    """Write a speaker-labelled transcript to ``<output_path>/<name>.txt``.

    Consecutive words from the same speaker are merged into one line. Each
    line is written as ``[H:MM:SS] <speaker>: <text>`` followed by a blank
    line, ordered by start time.

    Args:
        data: Parsed Transcribe result. It is indexed as
            ``data['results']['speaker_labels']['segments']`` and
            ``data['results']['items']``.
        audio_file_name: Name of the audio file; the text before the first
            ``.`` becomes the output file name.
    """
    filename = audio_file_name.split('.')[0]
    # Write into the configured output directory (created on demand).
    os.makedirs(output_path, exist_ok=True)

    print('Saving ' + filename+'.txt')
    text_file_path = os.path.join(output_path, filename + '.txt')

    # Map every word's start time to the speaker it was attributed to.
    speaker_start_times = {}
    for segment in data['results']['speaker_labels']['segments']:
        for word in segment['items']:
            speaker_start_times[word['start_time']] = word['speaker_label']

    lines = []
    line = ''
    start_time = 0
    speaker = None          # speaker of the line being built; None = none yet
    current_speaker = None  # speaker of the most recent timed item

    for item in data['results']['items']:
        content = item['alternatives'][0]['content']
        if item.get('start_time'):
            current_speaker = speaker_start_times[item['start_time']]
        elif item['type'] == 'punctuation':
            # Punctuation items carry no start time; attach them directly
            # to the current line without a leading space.
            line = line + content

        if current_speaker != speaker:
            # Speaker changed: flush the finished line and start a new one.
            if speaker is not None:
                lines.append({'speaker': speaker, 'line': line, 'time': start_time})
            line = content
            speaker = current_speaker
            start_time = item['start_time']
        elif item['type'] != 'punctuation':
            line = line + ' ' + content

    # Flush the last line, unless no timed item was seen at all.
    if speaker is not None:
        lines.append({'speaker': speaker, 'line': line, 'time': start_time})

    sorted_lines = sorted(lines, key=lambda entry: float(entry['time']))

    with open(text_file_path, 'w', encoding='utf-8') as w:
        for entry in sorted_lines:
            stamp = datetime.timedelta(seconds=int(round(float(entry['time']))))
            w.write('[' + str(stamp) + '] ' + entry['speaker'] + ': ' + entry['line'] + '\n\n')

In [None]:
#run AWS Transcribe for several audio files at once!

def run_concurrent_files(file_name):
    """Transcribe one S3 audio object and save its transcript as text.

    Runs ``amazon_transcribe`` with language ``en-US`` and at most two
    speakers against the module-level ``bucket_name``, then hands the
    downloaded transcript to ``read_output``. A job that does not complete
    is logged and skipped, so one failure does not abort the other files
    being processed in the same pool.

    Args:
        file_name: Key of the audio object in the bucket.
    """
    print(file_name)

    result = amazon_transcribe(
        audio_file_name=file_name,
        LangCode='en-US',
        bucket_name=bucket_name,
        max_speakers=2,
    )

    job = result['TranscriptionJob']
    if job['TranscriptionJobStatus'] != 'COMPLETED':
        # Only a completed job is read for its transcript.
        logging.error(
            "Transcription job for %s did not complete: %s",
            file_name,
            job.get('FailureReason'),
        )
        return

    data = pd.read_json(job['Transcript']['TranscriptFileUri'])
    audio_filename = file_name.split('.')[0]
    read_output(data, audio_filename)


In [None]:
# Module-level Amazon Transcribe client, built from the credentials and
# region configured above. The transcription and job-deletion code in this
# notebook uses this global.
transcribe = boto3.client(
    'transcribe',
    region_name=region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [None]:
# Main Function
# Creates the bucket, uploads the local audio files, then transcribes every
# object in the bucket using a pool of 5 worker processes.
if __name__ == "__main__":
    if create_bucket(bucket_name):
        print ("Bucket Sucessfully Created")
    else:
        print ("Bucket Creation Unsuccesful")
        # Stop only when the bucket could not be created; exiting
        # unconditionally made everything below unreachable.
        raise SystemExit(1)

    upload_files(audio_file_path, bucket_name)
    file_names = get_file_names(bucket_name)
    print (file_names)
    print (len(file_names))
    with Pool(5) as p:
        print(p.map(run_concurrent_files, file_names))

To run this code again for the same audio files, you must first delete the old transcription jobs. Job names must be unique, and this code uses the name of the audio file as the job name.

In [None]:
#Use the below code to delete jobs
# WARNING: this deletes EVERY transcription job returned by the listing
# call, not only the jobs started by this notebook.

total_jobs = []
list_kwargs = {}
while True:
    # The listing is paged: keep requesting until no NextToken is returned,
    # otherwise only the first page of jobs would be deleted.
    existed_jobs = transcribe.list_transcription_jobs(**list_kwargs)
    total_jobs.extend(
        job['TranscriptionJobName']
        for job in existed_jobs['TranscriptionJobSummaries']
    )
    next_token = existed_jobs.get('NextToken')
    if not next_token:
        break
    list_kwargs = {'NextToken': next_token}

print (total_jobs)

for job_name in total_jobs:
    transcribe.delete_transcription_job(TranscriptionJobName=job_name)