In [3]:
from dotenv import load_dotenv
import os
from utils import download_youtube_audio,split_audio_with_sliding_window, process_chunks_and_collect_transcripts, create_srt_file


In [4]:
# Load environment variables via python-dotenv (by default from a local .env file)
load_dotenv()

# Key handed to process_chunks_and_collect_transcripts further down;
# this is None when the SARVAM_KEY variable is not set.
SARVAM_KEY = os.environ.get('SARVAM_KEY')

# Empty the audio chunks folder

In [5]:
# Empty the audio_chunks folder so chunks from a previous run are not mixed with new ones.
# This uses the standard library instead of shelling out to `rm -rf audio_chunks/*`, which
# needs a POSIX shell and whose exit status was never checked. Unlike the shell glob,
# this also removes dot-files inside the folder. The folder itself is kept.
import shutil

if os.path.isdir('audio_chunks'):
    with os.scandir('audio_chunks') as entries:
        for entry in entries:
            # Real sub-directories are removed recursively; files and symlinks are unlinked
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                os.remove(entry.path)

0

# Inputs

In [6]:
# Take the YouTube link as input from the user.
# Surrounding whitespace (easy to pick up when pasting) is stripped so it is not
# passed on to the downloader as part of the URL.
youtube_link = input("Enter the youtube link: ").strip()

In [7]:
# Ask the user for the language spoken in the video and build the prompt from it.
# Language codes for reference (not used by this cell):
# hi-IN: Hindi, bn-IN: Bengali, kn-IN: Kannada, ml-IN: Malayalam, mr-IN: Marathi, od-IN: Odia, pa-IN: Punjabi, ta-IN: Tamil, te-IN: Telugu, gu-IN: Gujarati, en-IN: English
num_to_language_mapping = {
    0: 'Unknown',
    1: 'Hindi',
    2: 'Telugu',
    3: 'Malayalam',
    4: 'Kannada',
    5: 'Bengali',
    6: 'Marathi',
    7: 'Odia',
    8: 'Punjabi',
    9: 'Tamil',
    10: 'English',
    11: 'Gujarati'
}

# The menu is generated from the mapping so the two cannot drift apart.
print("\nLanguage of the video and the number to be entered:")
for number, language in num_to_language_mapping.items():
    label = "Unknown language" if number == 0 else language
    print(f"    {label}: {number}")

# Non-numeric input (including an empty answer) falls back to 0 ("unknown"),
# the same fallback already used for out-of-range numbers, instead of
# aborting the cell with a ValueError.
try:
    language_num = int(input("Enter the language number: "))
except ValueError:
    language_num = 0

if language_num not in num_to_language_mapping:
    language_num = 0

# No prompt is built when the language is unknown
if language_num == 0:
    prompt = None
else:
    prompt = f"This is a {num_to_language_mapping[language_num]} language audio clip from a Youtube Video"


print('The prompt to assist is:')
print(prompt)


Language of the video and the number to be entered:
    Unknow language: 0
    Hindi: 1
    Telugu: 2
    Malayalam: 3
    Kannada: 4
    Bengali: 5
    Marathi: 6
    Odia: 7
    Punjabi: 8
    Tamil: 9
    English: 10
    Gujarati: 11
      
The prompt to assist is:
This is a Telugu language audio clip from a Youtube Video


# Download audio

In [8]:
# Download the audio of the requested video, asking for mp3 output under audio_files/
downloaded_audio = download_youtube_audio(youtube_link, output_path='audio_files', audio_format='mp3')

# A falsy return value is treated as a failed download and stops the notebook here
if not downloaded_audio:
    raise Exception("Failed to download audio file")
print(f"Downloaded audio file path: {downloaded_audio}")

[youtube] Extracting URL: https://www.youtube.com/watch?v=e9o86GbVAgQ
[youtube] e9o86GbVAgQ: Downloading webpage
[youtube] e9o86GbVAgQ: Downloading ios player API JSON
[youtube] e9o86GbVAgQ: Downloading web creator player API JSON
[youtube] e9o86GbVAgQ: Downloading m3u8 information
[info] e9o86GbVAgQ: Downloading 1 format(s): 251
[download] Destination: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.webm
[download] 100% of   12.74MiB in 00:00:00 at 18.66MiB/s    
[ExtractAudio] Destination: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.mp3
Deleting original file audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.webm (pass -k to keep)
Audio downloaded successfully: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.mp3
Downloaded audio file path: audio_files/Bramayugam Movie Micro Details ｜ Bramayugam Movie Breakdown ｜ Vithin cine.mp3


# Split audio into Chunks

In [None]:
# Split the downloaded audio into chunk files written to audio_chunks/.
# NOTE(review): the exact meaning of chunk_duration_ms / context_duration_ms is defined
# in utils.split_audio_with_sliding_window — presumably 10 s chunks with 500 ms of
# extra context; confirm there.
if downloaded_audio:
    try:
        chunks = split_audio_with_sliding_window(downloaded_audio, output_dir='audio_chunks', 
                                   chunk_duration_ms=10000, context_duration_ms=500)
    except Exception as e:
        # Chain explicitly so the original error is reported as the direct cause
        raise Exception(f"Failed to split audio into chunks: {e}") from e
else:
    raise Exception("Download audio file not found")

# Testing chunk audio timing against video timing

In [10]:
def format_timestamp(ms):
    """
    Formats milliseconds to SRT timestamp format (HH:MM:SS,mmm).

    Parameters:
    - ms (int | float): Time in milliseconds. Fractional values are rounded
      to the nearest whole millisecond.

    Returns:
    - str: Formatted timestamp, e.g. "00:00:40,000".

    Raises:
    - ValueError: If ms is negative after rounding.
    """
    # Work on a whole number of milliseconds: a float input would otherwise
    # leak into the f-string and render fields such as "1.0" or "500.0".
    total_ms = int(round(ms))
    if total_ms < 0:
        # divmod on a negative value would yield a nonsensical timestamp
        raise ValueError(f"Timestamp must be non-negative, got {ms}")

    seconds, milliseconds = divmod(total_ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

In [34]:
from IPython.display import Audio, display
# Index (into `chunks`) of the chunk to listen to
CHUNK_NUM = 4

# Each entry is indexed as [0] file path, [1] start, [2] end
# (presumably milliseconds, since both are passed to format_timestamp)
selected_chunk = chunks[CHUNK_NUM]

# Embed an audio player for the chunk so it can be compared with the video by ear
audio = Audio(filename=selected_chunk[0])
display(audio)

start_time, end_time = selected_chunk[1], selected_chunk[2]

# Show the chunk boundaries as HH:MM:SS,mmm timestamps
print(f"Start Time: {format_timestamp(start_time)}")
print(f"End Time: {format_timestamp(end_time)}")

Start Time: 00:00:40,000
End Time: 00:00:50,000


In [32]:
from pydub import AudioSegment, silence


def split_audio_into_sentences(chunk_audio_path, output_dir='sentences', 
                              min_silence_len=100, silence_thresh=-40, keep_silence=500):
    """
    Splits an audio chunk into individual sentences based on silences.

    The non-silent ranges are detected directly, so the returned start and end
    times are real offsets into the chunk rather than estimates accumulated
    from segment lengths.

    Parameters:
    - chunk_audio_path (str): Path to the input audio chunk file.
    - output_dir (str): Directory to save the sentence audio files (created if missing).
    - min_silence_len (int): Minimum length of silence in ms to consider as a split point (default: 100 ms).
    - silence_thresh (int): Silence threshold in dBFS (default: -40 dBFS).
    - keep_silence (int | bool): Amount of silence to retain at the beginning and end of each
      sentence in ms (default: 500 ms). True keeps all silence, False keeps none.

    Returns:
    - List of tuples: Each tuple contains (sentence_audio_path, start_time, end_time),
      with both times in ms measured from the start of the chunk.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load the audio chunk
    audio = AudioSegment.from_file(chunk_audio_path)
    audio_len = len(audio)
    print(f"Loaded audio file: {chunk_audio_path} | Duration: {audio_len/1000:.2f}s")

    print("Detecting sentences based on silence...")

    # Boolean keep_silence means "keep everything" / "keep nothing"
    if isinstance(keep_silence, bool):
        padding = audio_len if keep_silence else 0
    else:
        padding = keep_silence

    # Pad every non-silent range with the retained silence.
    # NOTE(review): this follows the padding/overlap rules of recent pydub
    # split_on_silence implementations — confirm against the installed version.
    spans = [
        [start - padding, end + padding]
        for start, end in silence.detect_nonsilent(
            audio,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh
        )
    ]

    # Where the padding of two neighbours overlaps, split the overlap at its midpoint
    for previous, current in zip(spans, spans[1:]):
        if current[0] < previous[1]:
            boundary = (previous[1] + current[0]) // 2
            previous[1] = boundary
            current[0] = boundary

    print(f"Detected {len(spans)} sentences.")

    final_sentences = []

    for i, (span_start, span_end) in enumerate(spans, start=1):
        # Clamp the padded range to the bounds of the chunk
        start_time = max(span_start, 0)
        end_time = min(span_end, audio_len)

        # Export the sentence audio; export() returns the open output file, so close it
        sentence_filename = f"sentence_{i}.mp3"
        sentence_path = os.path.join(output_dir, sentence_filename)
        audio[start_time:end_time].export(sentence_path, format="mp3").close()

        final_sentences.append((sentence_path, start_time, end_time))
        print(f"Created {sentence_filename}: {format_timestamp(start_time)} to {format_timestamp(end_time)}")

    return final_sentences


In [40]:
# Run the sentence splitter on the file of the chunk selected by CHUNK_NUM.
# min_silence_len=500 overrides the function's default of 100 ms.
fname = chunks[CHUNK_NUM][0]
sentences = split_audio_into_sentences(fname, output_dir='sentences', min_silence_len=500)

Loaded audio file: audio_chunks/chunk_5.mp3 | Duration: 11.00s
Detecting sentences based on silence...
Detected 1 sentences.
Created sentence_1.mp3: 00:00:00,000 to 00:00:11,000


In [36]:
# Inspect the (sentence_audio_path, start_time, end_time) tuples returned by the splitter
sentences

[('sentences/sentence_1.mp3', 0, 11000)]

# Send API request

In [7]:
# Send the chunks to the transcription helper, passing the API key and the optional
# language prompt built earlier (prompt is None when the language is unknown).
try:
    transcripts = process_chunks_and_collect_transcripts(chunks, SARVAM_KEY, prompt=prompt)
except Exception as e:
    # Chain explicitly so the original error is reported as the direct cause
    raise Exception(f"Failed to collect transcripts: {e}") from e

Processing chunk: audio_chunks/chunk_1_1.mp3 from 0.00s to 6.88s
Transcription successful for chunk_1_1.mp3
Processing chunk: audio_chunks/chunk_1_2.mp3 from 6.88s to 13.76s
Transcription successful for chunk_1_2.mp3
Processing chunk: audio_chunks/chunk_1_3.mp3 from 13.76s to 20.65s
Transcription successful for chunk_1_3.mp3
Processing chunk: audio_chunks/chunk_1_4.mp3 from 20.65s to 27.53s
Transcription successful for chunk_1_4.mp3
Processing chunk: audio_chunks/chunk_1_5.mp3 from 27.53s to 34.41s
Transcription successful for chunk_1_5.mp3
Processing chunk: audio_chunks/chunk_1_6.mp3 from 34.41s to 41.29s
Transcription successful for chunk_1_6.mp3
Processing chunk: audio_chunks/chunk_1_7.mp3 from 41.29s to 48.17s
Transcription successful for chunk_1_7.mp3
Processing chunk: audio_chunks/chunk_1_8.mp3 from 48.17s to 55.06s
Transcription successful for chunk_1_8.mp3
Processing chunk: audio_chunks/chunk_1_9.mp3 from 55.06s to 61.94s
Transcription successful for chunk_1_9.mp3
Processing ch

# Create subtitle file

In [8]:
# Write the collected transcripts to subtitles.srt.
# NOTE(review): max_chars=42 is presumably the maximum characters per subtitle line —
# confirm in utils.create_srt_file.
if transcripts:
    try:
        create_srt_file(transcripts, output_file='subtitles.srt', max_chars=42)
    except Exception as e:
        # Chain explicitly so the original error is reported as the direct cause
        raise Exception(f"Failed to create srt file: {e}") from e
else:
    raise Exception("No transcripts found")

SRT file created at subtitles.srt


In [52]:
# Scratch cell: check how tqdm behaves together with enumerate
import numpy as np
from tqdm import tqdm

# Draw 10 samples from a normal distribution with mean 0 and standard deviation 1
x = np.random.normal(0, 1, 10)

# tqdm wraps the array itself rather than the enumerate object —
# presumably so the bar can read the length and show a total
for i, j in enumerate(tqdm(x)):
    print(i, j)



100%|██████████| 10/10 [00:00<00:00, 9650.95it/s]

0 -0.5036483928746249
1 -0.8869058842911447
2 0.12936126797629105
3 -0.268202274667024
4 0.7002179087689994
5 -1.2643607339959881
6 0.4495470676364118
7 0.29625573387349474
8 0.39923446884245073
9 -0.190756665061196





In [56]:
from tqdm import trange
# Scratch check of tqdm's trange progress bar; the loop body does no work
for i in trange(10):
    continue

100%|██████████| 10/10 [00:00<00:00, 97090.37it/s]
