In [None]:
# !pip install pydub
# !pip install ffmpeg-python
# !pip install librosa

In [None]:
# !sudo yum install -y unzip
# # Download ffmpeg and ffprobe from https://ffbinaries.com/downloads
# # The ffmpeg and ffprobe files should each be placed in a folder of the same name.

# ! unzip ffmpeg-6.1-linux-64.zip ffmpeg
# ! unzip ffprobe-6.1-linux-64.zip ffprobe


In [None]:
import os
import librosa
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from pydub import AudioSegment
import boto3
import os
from uuid import uuid4
from botocore.exceptions import ClientError
import time
import pandas as pd
import requests
import time

In [None]:
# Append <cwd>/ffmpeg and <cwd>/ffprobe to PATH, in that order.
for tool_dir in ("ffmpeg", "ffprobe"):
    os.environ["PATH"] += os.pathsep + f'{os.getcwd()}/{tool_dir}'

---

# Audio splitting: split the audio into 40-second chunks.

In [None]:
# Length of each audio chunk in seconds; used when splitting the audio and
# when computing the timestamps written alongside each transcript.
time_duration = 40

In [None]:
audio_dir = "/data/audio"
split_audio_dir = "/data/split_audio"

# The chunks are exported into this folder, so make sure it exists first.
os.makedirs(split_audio_dir, exist_ok=True)

audio_file_list = []
# Sorted so that chunk order (and therefore transcript order) is reproducible.
for audio_file in tqdm(sorted(os.listdir(audio_dir))):
    source_path = f"{audio_dir}/{audio_file}"
    # Skip folders such as ".ipynb_checkpoints" and anything that is not an mp3,
    # since the file is decoded with from_mp3 below.
    if not os.path.isfile(source_path) or not audio_file.lower().endswith(".mp3"):
        continue

    audio = AudioSegment.from_mp3(source_path)
    total_duration = audio.duration_seconds
    print(f"total_duration - {total_duration}")

    # End offsets, in seconds, of every full chunk: [40, 80, 120, ...] for
    # time_duration = 40.
    # NOTE(review): audio after the last full multiple of `time_duration` is not
    # exported (a file shorter than `time_duration` yields no chunks) — confirm
    # that dropping the tail is intended.
    list_of_timestamps = [
        time_duration * x
        for x in range(1, int(total_duration / time_duration) + 1)
    ]
    print(list_of_timestamps)

    base_name = os.path.splitext(audio_file)[0].replace(" ", "_")
    start = 0
    for t in list_of_timestamps:
        end = t * 1000  # pydub works in millisec
        print("split at [ {}:{}] ms".format(start, end))
        audio_chunk = audio[start:end]
        # The end offset in milliseconds is part of the file name.
        file_name = "{}_audio_chunk_{}.wav".format(base_name, end)
        audio_chunk.export("{}/{}".format(split_audio_dir, file_name), format="wav")
        audio_file_list.append(file_name)
        start = end

---

# Transcribe: use Amazon Transcribe to fetch text from the audio chunks.

In [None]:
def start_job(
    job_name,
    media_uri,
    media_format,
    language_code,
    transcribe_client,
    vocabulary_name=None,
):
    """
    Starts a transcription job. This function returns as soon as the job is started.
    To get the current status of the job, call get_transcription_job. The job is
    successfully completed when the job status is 'COMPLETED'.

    :param job_name: The name of the transcription job. This must be unique for
                     your AWS account.
    :param media_uri: The URI where the audio file is stored. This is typically
                      in an Amazon S3 bucket.
    :param media_format: The format of the audio file. For example, mp3 or wav.
    :param language_code: The language code of the audio file.
                          For example, en-US or ja-JP
    :param transcribe_client: The Boto3 Transcribe client.
    :param vocabulary_name: The name of a custom vocabulary to use when transcribing
                            the audio file. Omitted from the request when None.
    :return: Data about the job (the "TranscriptionJob" entry of the response).
    :raises ClientError: Re-raised unchanged, after printing a message, when the
                         client rejects the request.
    """
    job_args = {
        "TranscriptionJobName": job_name,
        "Media": {"MediaFileUri": media_uri},
        "MediaFormat": media_format,
        "LanguageCode": language_code,
    }
    if vocabulary_name is not None:
        job_args["Settings"] = {"VocabularyName": vocabulary_name}

    try:
        response = transcribe_client.start_transcription_job(**job_args)
    except ClientError:
        # print() does not apply %-style arguments, so build the message explicitly.
        print(f"Couldn't start transcription job {job_name}.")
        raise

    print(f"Started transcription job {job_name}.")
    return response["TranscriptionJob"]


In [None]:
transcribe_client = boto3.client("transcribe")

# S3 prefix that mirrors the local split-audio folder. Defined once so the sync
# target and the media URIs passed to Transcribe cannot drift apart.
s3_split_audio_uri = "s3://sagemaker-us-east-1-296512243111/shubham_pandey/split_audio"

# Sync the local chunk files to the S3 bucket so the transcribe jobs can read them.
# A non-zero status means the upload did not succeed, so stop before submitting
# jobs that would point at missing objects.
sync_status = os.system(f"aws s3 sync {split_audio_dir} {s3_split_audio_uri}")
if sync_status != 0:
    raise RuntimeError(f"aws s3 sync failed with status {sync_status}")

job_name_list = []

for counter, item in enumerate(tqdm(audio_file_list), start=1):
    job_name = f"nptel{uuid4().hex}"
    print(item)
    # Pause before every 30th submission (presumably to stay under the service's
    # request rate limit — confirm).
    if counter % 30 == 0:
        print("sleeping for 10 seconds")
        time.sleep(10)
    start_job(
        job_name,
        f"{s3_split_audio_uri}/{item}",
        media_format="wav",
        language_code="en-GB",
        transcribe_client=transcribe_client,
    )
    # Recorded only after the job was accepted, so the list never contains a job
    # name that was not started.
    job_name_list.append(job_name)

---

# Wait for transcribe jobs to finish

Otherwise an error will be encountered when the transcripts are fetched.

In [None]:
output_list = []
poll_interval_seconds = 5

for job in tqdm(job_name_list):
    # Poll until the job reaches a terminal state; the transcript URI read below
    # is only available once the job has completed.
    while True:
        response = transcribe_client.get_transcription_job(
            TranscriptionJobName=job
        )
        job_info = response["TranscriptionJob"]
        job_status = job_info["TranscriptionJobStatus"]
        if job_status in ("COMPLETED", "FAILED"):
            break
        time.sleep(poll_interval_seconds)

    if job_status == "FAILED":
        # Fail loudly: silently skipping a job would misalign transcripts with
        # the chunk file names they are later paired with.
        failure_reason = job_info.get("FailureReason", "unknown reason")
        raise RuntimeError(f"Transcription job {job} failed: {failure_reason}")

    transcript_response = requests.get(
        job_info["Transcript"]["TranscriptFileUri"], timeout=60
    )
    transcript_response.raise_for_status()
    transcript_simple = transcript_response.json()
    output_list.append(transcript_simple["results"]["transcripts"][0]["transcript"])

  0%|          | 0/424 [00:00<?, ?it/s]

In [None]:
# Preview only the first few transcripts instead of dumping the whole list.
output_list[:5]

Add the file name and the timestamp to include in the transcript.

In [None]:
# Each chunk file name ends in "_<end offset in ms>.wav"; dropping the last seven
# characters of that final piece ("000.wav" for whole-second offsets) leaves the
# offset in seconds, kept as a string.
chunk_seconds = [name.split("_")[-1][:-7] for name in audio_file_list]

# One row per chunk: file name, transcript text, and the offset parsed above.
df = pd.DataFrame().assign(
    file_name=audio_file_list,
    text=output_list,
    time=chunk_seconds,
)

In [None]:
# Show the first row's first column (the chunk file name) as a quick sanity check.
df.iloc[0,0]

In [None]:
def formattedtime(seconds):
    """Render a number of seconds as an "HH:MM:SS" string.

    :param seconds: Anything `float()` accepts, e.g. 40, 40.0 or "40".
    :return: The value formatted with `time.strftime("%H:%M:%S", ...)` after
             conversion through `time.gmtime`.
    """
    broken_down = time.gmtime(float(seconds))
    return time.strftime("%H:%M:%S", broken_down)

In [None]:
text_file_path = "/data/text_files"

os.makedirs(text_file_path, exist_ok=True)

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    chunk_file_name = row["file_name"]

    # row["time"] is parsed from the end-millisecond suffix of the chunk file
    # name, so it is the chunk's END offset in seconds; the chunk therefore
    # starts one `time_duration` earlier.
    end_seconds = int(row["time"])
    start_seconds = max(end_seconds - time_duration, 0)
    start_time = formattedtime(start_seconds)
    end_time = formattedtime(end_seconds)

    # Strip the "_audio_chunk_<ms>.wav" suffix to recover the source name; the
    # ".mp4" extension presumably refers to the original video — confirm.
    file_name = chunk_file_name.rsplit("_audio_chunk_", 1)[0] + ".mp4"
    text = f"File_name : {file_name} \n \n start_time : {start_time} \n\n end_time : {end_time} \n\n {row['text']}"

    # One text file per chunk, named after the chunk's .wav file.
    filename = f"{chunk_file_name[:-4]}.txt"
    with open(f"{text_file_path}/{filename}", "w", encoding="utf-8") as f:
        f.write(text)


In [None]:
text_file_path = "/data/text_files"

os.makedirs(text_file_path, exist_ok=True)

# Concatenate every chunk transcript in DataFrame order, each preceded by "\n ".
s = "".join(f"\n {text}" for text in df["text"])

# NOTE(review): this writes to the current working directory, not to
# `text_file_path` — confirm that is intended.
with open("full_text.txt", "w", encoding="utf-8") as f:
    f.write(s)
