In [1]:
import os
import shutil

from dotenv import load_dotenv

from utils import download_youtube_audio, split_audio_on_silence, process_chunks_and_collect_transcripts, create_srt_file


In [2]:
# Load environment variables through python-dotenv (typically from a local .env file).
load_dotenv()

# API key handed to the transcription step later on; None when the variable is not set.
SARVAM_KEY = os.environ.get('SARVAM_KEY')

# Empty the audio chunks

In [None]:
# Empty the audio_chunks folder (its contents only; the folder itself is kept).
# Done in pure Python instead of shelling out to `rm -rf`, so it also works on
# platforms without `rm` and a failed deletion raises instead of being ignored.
if os.path.isdir('audio_chunks'):
    # Snapshot the entries first so nothing is deleted while the directory is being iterated.
    with os.scandir('audio_chunks') as scanned_entries:
        stale_entries = list(scanned_entries)

    for stale_entry in stale_entries:
        # The original shell glob `audio_chunks/*` never matched dotfiles (e.g. .gitkeep),
        # so they are left in place here as well.
        if stale_entry.name.startswith('.'):
            continue
        if stale_entry.is_dir(follow_symlinks=False):
            shutil.rmtree(stale_entry.path)
        else:
            os.remove(stale_entry.path)

# Inputs

In [3]:
# Take the YouTube link as input from the user.
# Surrounding whitespace (common when a URL is pasted) is stripped, and an empty answer
# is rejected here rather than surfacing later as a failed download.
youtube_link = input("Enter the youtube link: ").strip()
if not youtube_link:
    raise ValueError("No YouTube link was entered")

In [4]:
# Ask the user for the language spoken in the video.
# Language codes for reference:
# hi-IN: Hindi, bn-IN: Bengali, kn-IN: Kannada, ml-IN: Malayalam, mr-IN: Marathi, od-IN: Odia, pa-IN: Punjabi, ta-IN: Tamil, te-IN: Telugu, gu-IN: Gujarati, en-IN: English
num_to_language_mapping = {
    0: 'Unknown',
    1: 'Hindi',
    2: 'Telugu',
    3: 'Malayalam',
    4: 'Kannada',
    5: 'Bengali',
    6: 'Marathi',
    7: 'Odia',
    8: 'Punjabi',
    9: 'Tamil',
    10: 'English',
    11: 'Gujarati'
}

# Build the menu from the mapping so the printed options can never drift from the dict.
menu_lines = ["Language of the video and the number to be entered:"]
for menu_number, menu_language in num_to_language_mapping.items():
    menu_label = "Unknown language" if menu_number == 0 else menu_language
    menu_lines.append(f"    {menu_label}: {menu_number}")
print("\n" + "\n".join(menu_lines) + "\n")

# Any answer that is not one of the listed numbers -- including empty or non-numeric
# input -- falls back to 0 ("Unknown") instead of raising ValueError.
try:
    language_num = int(input("Enter the language number: "))
except ValueError:
    language_num = 0

if language_num not in num_to_language_mapping:
    language_num = 0

# No language hint is built when the language is unknown.
if language_num == 0:
    prompt = None
else:
    prompt = f"This is a {num_to_language_mapping[language_num]} language audio clip from a Youtube Video"


print('The prompt to assist is:')
print(prompt)


Language of the video and the number to be entered:
    Unknow language: 0
    Hindi: 1
    Telugu: 2
    Malayalam: 3
    Kannada: 4
    Bengali: 5
    Marathi: 6
    Odia: 7
    Punjabi: 8
    Tamil: 9
    English: 10
    Gujarati: 11
      
The prompt to assist is:
This is a Telugu language audio clip from a Youtube Video


# Download audio

In [5]:
# Download the video's audio as mp3 into audio_files/.
# A falsy return value from the helper is treated as a failed download.
downloaded_audio = download_youtube_audio(
    youtube_link,
    output_path='audio_files',
    audio_format='mp3',
)
if not downloaded_audio:
    raise Exception("Failed to download audio file")
print(f"Downloaded audio file path: {downloaded_audio}")

[youtube] Extracting URL: https://www.youtube.com/watch?v=e9o86GbVAgQ
[youtube] e9o86GbVAgQ: Downloading webpage
[youtube] e9o86GbVAgQ: Downloading ios player API JSON
[youtube] e9o86GbVAgQ: Downloading web creator player API JSON
[youtube] e9o86GbVAgQ: Downloading m3u8 information
[info] e9o86GbVAgQ: Downloading 1 format(s): 251
[download] Destination: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.webm
[download] 100% of   12.74MiB in 00:00:00 at 18.18MiB/s    
[ExtractAudio] Destination: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.mp3
Deleting original file audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.webm (pass -k to keep)
Audio downloaded successfully: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.mp3
Downloaded audio file path: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.mp3


# Split audio into Chunks

In [32]:
from pydub import AudioSegment, silence
import os

def split_audio_on_silence(input_audio_path, output_dir='audio_chunks',
                           min_silence_len=1000, silence_thresh=-40,
                           keep_silence=500, max_chunk_duration=10000):
    """
    Splits audio into chunks based on silence and maximum duration.

    The returned start/end times are positions in the ORIGINAL audio, so they can be
    used directly as subtitle timings. They are taken from the detected non-silent
    ranges rather than from a running sum of chunk lengths, because the silent gaps
    between chunks are dropped and a running sum would drift away from the real
    positions.

    Parameters:
    - input_audio_path (str): Path to the input audio file.
    - output_dir (str): Directory to save the audio chunks (created if missing).
    - min_silence_len (int): Minimum length of silence in ms.
    - silence_thresh (int): Silence threshold in dBFS.
    - keep_silence (int | bool): Amount of silence to keep at the beginning and end of
      each chunk in ms. True keeps all silence, False keeps none.
    - max_chunk_duration (int): Maximum duration of each chunk in ms (e.g., 10000 ms for 10 seconds).

    Returns:
    - List of tuples: Each tuple contains (chunk_path, start_time, end_time), times in ms.

    Raises:
    - ValueError: If max_chunk_duration is not positive.
    """
    if max_chunk_duration <= 0:
        raise ValueError("max_chunk_duration must be a positive number of milliseconds")

    os.makedirs(output_dir, exist_ok=True)
    audio = AudioSegment.from_file(input_audio_path)
    total_duration = len(audio)  # in milliseconds

    # Accept the boolean form of keep_silence as well as a millisecond count.
    if isinstance(keep_silence, bool):
        keep_silence = total_duration if keep_silence else 0

    print("Splitting audio into chunks based on silence...")
    nonsilent_ranges = silence.detect_nonsilent(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    # Pad every non-silent range with the kept silence. Where two padded ranges would
    # overlap, meet at the midpoint so no audio is duplicated across chunks.
    # NOTE(review): intended to mirror how pydub's split_on_silence builds its chunks --
    # confirm against the installed pydub version.
    padded_ranges = [[start - keep_silence, end + keep_silence]
                     for start, end in nonsilent_ranges]
    for current_range, next_range in zip(padded_ranges, padded_ranges[1:]):
        if next_range[0] < current_range[1]:
            midpoint = (current_range[1] + next_range[0]) // 2
            current_range[1] = midpoint
            next_range[0] = midpoint

    final_chunks = []

    def export_piece(segment, filename, piece_start, piece_end):
        """Write one segment to output_dir as mp3 and record its original-audio timing."""
        piece_path = os.path.join(output_dir, filename)
        segment.export(piece_path, format="mp3")
        final_chunks.append((piece_path, piece_start, piece_end))
        print(f"Created {filename}: {piece_start/1000:.2f}s to {piece_end/1000:.2f}s")

    for i, (range_start, range_end) in enumerate(padded_ranges):
        # Clamp the padding to the bounds of the recording.
        start_time = max(range_start, 0)
        end_time = min(range_end, total_duration)
        chunk = audio[start_time:end_time]
        chunk_duration = end_time - start_time

        if chunk_duration > max_chunk_duration:
            # Split an over-long chunk into equal parts; integer boundaries keep the
            # parts contiguous and free of float rounding.
            num_subchunks = -(-chunk_duration // max_chunk_duration)  # Ceiling division
            for j in range(num_subchunks):
                sub_start = chunk_duration * j // num_subchunks
                sub_end = chunk_duration * (j + 1) // num_subchunks
                export_piece(
                    chunk[sub_start:sub_end],
                    f"chunk_{i+1}_{j+1}.mp3",
                    start_time + sub_start,
                    start_time + sub_end,
                )
        else:
            # Export the chunk as is
            export_piece(chunk, f"chunk_{i+1}.mp3", start_time, end_time)

    return final_chunks


In [60]:
from pydub import AudioSegment
import os

def split_audio_fixed_duration(input_audio_path, output_dir='audio_chunks_fixed',
                               chunk_duration_ms=7000):
    """
    Cut an audio file into back-to-back chunks of equal length, ignoring silences.

    Every chunk is exported as chunk_<n>.mp3 (n starts at 1) inside output_dir.
    The last chunk is shorter when the audio length is not a multiple of
    chunk_duration_ms.

    Parameters:
    - input_audio_path (str): Path to the input audio file.
    - output_dir (str): Directory to save the audio chunks (created if missing).
    - chunk_duration_ms (int): Duration of each chunk in milliseconds.

    Returns:
    - List of tuples: Each tuple contains (chunk_path, start_time, end_time), times in ms.
    """
    os.makedirs(output_dir, exist_ok=True)
    source_audio = AudioSegment.from_file(input_audio_path)
    audio_length_ms = len(source_audio)

    # Ceiling division: a trailing partial chunk still counts as a chunk.
    chunk_count = (audio_length_ms + chunk_duration_ms - 1) // chunk_duration_ms

    exported = []
    for index in range(chunk_count):
        window_start = index * chunk_duration_ms
        window_end = min(window_start + chunk_duration_ms, audio_length_ms)

        file_name = f"chunk_{index+1}.mp3"
        file_path = os.path.join(output_dir, file_name)
        source_audio[window_start:window_end].export(file_path, format="mp3")

        exported.append((file_path, window_start, window_end))
        print(f"Created {file_name}: {format_timestamp(window_start)} to {format_timestamp(window_end)}")

    return exported

def format_timestamp(ms):
    """
    Formats milliseconds to SRT timestamp format (HH:MM:SS,mmm).

    Parameters:
    - ms (int | float): Time in milliseconds. Fractional values are rounded to the
      nearest millisecond so the fields are always zero-padded integers.

    Returns:
    - str: Formatted timestamp, e.g. "00:06:18,000".

    Raises:
    - ValueError: If ms is negative.
    """
    # Without this coercion a float input would render as e.g. "0.0:0.0:1.0,500.0".
    total_ms = int(round(ms))
    if total_ms < 0:
        raise ValueError(f"Timestamp must be non-negative, got {ms!r}")

    seconds, milliseconds = divmod(total_ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


In [73]:
from pydub import AudioSegment
import os
import math
import logging

def format_timestamp(ms):
    """
    Formats milliseconds to SRT timestamp format (HH:MM:SS,mmm).

    Parameters:
    - ms (int | float): Time in milliseconds. Fractional values are rounded to the
      nearest millisecond so the fields are always zero-padded integers.

    Returns:
    - str: Formatted timestamp, e.g. "00:06:18,000".

    Raises:
    - ValueError: If ms is negative.
    """
    # Without this coercion a float input would render as e.g. "0.0:0.0:1.0,500.0".
    total_ms = int(round(ms))
    if total_ms < 0:
        raise ValueError(f"Timestamp must be non-negative, got {ms!r}")

    seconds, milliseconds = divmod(total_ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

def split_audio_with_sliding_window(input_audio_path, output_dir='audio_chunks_sliding',
                                   chunk_duration_ms=7000, context_duration_ms=2000):
    """
    Cut audio into fixed-length subtitle windows, each exported with extra context.

    The timeline is divided into consecutive windows of chunk_duration_ms. The audio
    exported for a window is widened by context_duration_ms on both sides (clamped to
    the bounds of the recording), while the times returned for it are those of the
    un-widened subtitle window.

    Parameters:
    - input_audio_path (str): Path to the input audio file.
    - output_dir (str): Directory to save the audio chunks (created if missing).
    - chunk_duration_ms (int): Length of each subtitle window in milliseconds.
    - context_duration_ms (int): Extra audio included before and after each window, in ms.

    Returns:
    - List of tuples: Each tuple contains (chunk_path, subtitle_start_time, subtitle_end_time)
    """
    # Set up root logging; basicConfig does nothing if handlers are already installed.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    os.makedirs(output_dir, exist_ok=True)

    source_audio = AudioSegment.from_file(input_audio_path)
    audio_length_ms = len(source_audio)

    # A trailing partial window still gets its own chunk.
    window_count = math.ceil(audio_length_ms / chunk_duration_ms)

    exported = []
    for number in range(1, window_count + 1):
        # Subtitle window on the original timeline.
        window_start = (number - 1) * chunk_duration_ms
        window_end = min(number * chunk_duration_ms, audio_length_ms)

        # Audio actually exported: the window plus context, kept inside the recording.
        padded_start = max(window_start - context_duration_ms, 0)
        padded_end = min(window_end + context_duration_ms, audio_length_ms)

        file_name = f"chunk_{number}.mp3"
        file_path = os.path.join(output_dir, file_name)
        source_audio[padded_start:padded_end].export(file_path, format="mp3")

        exported.append((file_path, window_start, window_end))
        logging.info(f"Created {file_name}: {format_timestamp(window_start)} to {format_timestamp(window_end)}")

    return exported


In [74]:
# Split the downloaded audio into sliding-window chunks (function defaults are used).
if not downloaded_audio:
    raise Exception("Download audio file not found")

try:
    chunks = split_audio_with_sliding_window(downloaded_audio)
except Exception as split_error:
    # Re-raise with a message that names the failing pipeline step.
    raise Exception(f"Failed to split audio into chunks: {split_error}")

2024-09-20 01:13:33,449 - INFO - Created chunk_1.mp3: 00:00:00,000 to 00:00:07,000
2024-09-20 01:13:33,674 - INFO - Created chunk_2.mp3: 00:00:07,000 to 00:00:14,000
2024-09-20 01:13:33,924 - INFO - Created chunk_3.mp3: 00:00:14,000 to 00:00:21,000
2024-09-20 01:13:34,156 - INFO - Created chunk_4.mp3: 00:00:21,000 to 00:00:28,000
2024-09-20 01:13:34,384 - INFO - Created chunk_5.mp3: 00:00:28,000 to 00:00:35,000
2024-09-20 01:13:34,605 - INFO - Created chunk_6.mp3: 00:00:35,000 to 00:00:42,000
2024-09-20 01:13:34,826 - INFO - Created chunk_7.mp3: 00:00:42,000 to 00:00:49,000
2024-09-20 01:13:35,047 - INFO - Created chunk_8.mp3: 00:00:49,000 to 00:00:56,000
2024-09-20 01:13:35,282 - INFO - Created chunk_9.mp3: 00:00:56,000 to 00:01:03,000
2024-09-20 01:13:35,503 - INFO - Created chunk_10.mp3: 00:01:03,000 to 00:01:10,000
2024-09-20 01:13:35,732 - INFO - Created chunk_11.mp3: 00:01:10,000 to 00:01:17,000
2024-09-20 01:13:35,946 - INFO - Created chunk_12.mp3: 00:01:17,000 to 00:01:24,000
2

# Testing chunk audio timing against the video timing

In [23]:
def format_timestamp(ms):
    """
    Formats milliseconds to SRT timestamp format (HH:MM:SS,mmm).

    Parameters:
    - ms (int | float): Time in milliseconds. Fractional values are rounded to the
      nearest millisecond so the fields are always zero-padded integers.

    Returns:
    - str: Formatted timestamp, e.g. "00:06:18,000".

    Raises:
    - ValueError: If ms is negative.
    """
    # Without this coercion a float input would render as e.g. "0.0:0.0:1.0,500.0".
    total_ms = int(round(ms))
    if total_ms < 0:
        raise ValueError(f"Timestamp must be non-negative, got {ms!r}")

    seconds, milliseconds = divmod(total_ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

In [75]:
from IPython.display import Audio, display

# 0-based index into `chunks` of the chunk to spot-check.
CHUNK_NUM = 54

# Render an inline player for that chunk's audio file.
audio = Audio(filename=chunks[CHUNK_NUM][0])
display(audio)

# Elements 1 and 2 of each chunk tuple are its start and end, in milliseconds.
start_time, end_time = chunks[CHUNK_NUM][1:3]

# Show both times in SRT timestamp format (HH:MM:SS,mmm).
print(
    f"Start Time: {format_timestamp(start_time)}",
    f"End Time: {format_timestamp(end_time)}",
    sep="\n",
)

Start Time: 00:06:18,000
End Time: 00:06:25,000


In [76]:
# Display every (chunk_path, subtitle_start_ms, subtitle_end_ms) tuple produced by the split step.
chunks

[('audio_chunks_sliding/chunk_1.mp3', 0, 7000),
 ('audio_chunks_sliding/chunk_2.mp3', 7000, 14000),
 ('audio_chunks_sliding/chunk_3.mp3', 14000, 21000),
 ('audio_chunks_sliding/chunk_4.mp3', 21000, 28000),
 ('audio_chunks_sliding/chunk_5.mp3', 28000, 35000),
 ('audio_chunks_sliding/chunk_6.mp3', 35000, 42000),
 ('audio_chunks_sliding/chunk_7.mp3', 42000, 49000),
 ('audio_chunks_sliding/chunk_8.mp3', 49000, 56000),
 ('audio_chunks_sliding/chunk_9.mp3', 56000, 63000),
 ('audio_chunks_sliding/chunk_10.mp3', 63000, 70000),
 ('audio_chunks_sliding/chunk_11.mp3', 70000, 77000),
 ('audio_chunks_sliding/chunk_12.mp3', 77000, 84000),
 ('audio_chunks_sliding/chunk_13.mp3', 84000, 91000),
 ('audio_chunks_sliding/chunk_14.mp3', 91000, 98000),
 ('audio_chunks_sliding/chunk_15.mp3', 98000, 105000),
 ('audio_chunks_sliding/chunk_16.mp3', 105000, 112000),
 ('audio_chunks_sliding/chunk_17.mp3', 112000, 119000),
 ('audio_chunks_sliding/chunk_18.mp3', 119000, 126000),
 ('audio_chunks_sliding/chunk_19.mp3

# Send API request

In [7]:
# Send the chunks for transcription; `prompt` is None when the language is unknown.
try:
    transcripts = process_chunks_and_collect_transcripts(
        chunks,
        SARVAM_KEY,
        prompt=prompt,
    )
except Exception as transcription_error:
    # Re-raise with a message that names the failing pipeline step.
    raise Exception(f"Failed to collect transcripts: {transcription_error}")

Processing chunk: audio_chunks/chunk_1_1.mp3 from 0.00s to 6.88s
Transcription successful for chunk_1_1.mp3
Processing chunk: audio_chunks/chunk_1_2.mp3 from 6.88s to 13.76s
Transcription successful for chunk_1_2.mp3
Processing chunk: audio_chunks/chunk_1_3.mp3 from 13.76s to 20.65s
Transcription successful for chunk_1_3.mp3
Processing chunk: audio_chunks/chunk_1_4.mp3 from 20.65s to 27.53s
Transcription successful for chunk_1_4.mp3
Processing chunk: audio_chunks/chunk_1_5.mp3 from 27.53s to 34.41s
Transcription successful for chunk_1_5.mp3
Processing chunk: audio_chunks/chunk_1_6.mp3 from 34.41s to 41.29s
Transcription successful for chunk_1_6.mp3
Processing chunk: audio_chunks/chunk_1_7.mp3 from 41.29s to 48.17s
Transcription successful for chunk_1_7.mp3
Processing chunk: audio_chunks/chunk_1_8.mp3 from 48.17s to 55.06s
Transcription successful for chunk_1_8.mp3
Processing chunk: audio_chunks/chunk_1_9.mp3 from 55.06s to 61.94s
Transcription successful for chunk_1_9.mp3
Processing ch

# Create subtitle file

In [8]:
# Write the collected transcripts to subtitles.srt; stop early if there are none.
if not transcripts:
    raise Exception("No transcripts found")

try:
    create_srt_file(transcripts, output_file='subtitles.srt', max_chars=42)
except Exception as srt_error:
    # Re-raise with a message that names the failing pipeline step.
    raise Exception(f"Failed to create srt file: {srt_error}")

SRT file created at subtitles.srt
