### Project: Subtitle generator
Takes a YouTube link or a direct video download URL as input.

#### Installing packages

In [1]:
# %pip install wget yt-dlp pydub SpeechRecognition pycaption googletrans > /dev/null

In [2]:
# !sudo apt update && sudo apt install -y ffmpeg

#### Importing packages

In [3]:
import os
import subprocess
from urllib.parse import parse_qs, urlparse

import wget
from yt_dlp import YoutubeDL

#### Defining constants

In [4]:
# Working directories: downloaded media at the top level, audio chunks underneath.
media_dir = 'media'
chunks_dir = media_dir + '/chunks'

# Make sure both directories exist before any later cell writes into them.
for required_dir in (media_dir, chunks_dir):
	os.makedirs(required_dir, exist_ok=True)

#### Downloading video

In [5]:
# Fallback output path; overwritten below once a name has been derived from the URL.
filename = f'{media_dir}/Telugu_trailer.mp4'
video_url = 'https://www.youtube.com/watch?v=7kpGQ_hbU30'
available_video_formats = ['mp4', 'mkv', 'webm', 'flv', 'avi', 'mov', 'wmv', 'mpg', 'mpeg', '3gp']

# urlparse only recognises the host part when the URL carries a scheme
if '://' not in video_url:
	video_url = f'https://{video_url}'
parsed_url = urlparse(video_url)
url_host = (parsed_url.hostname or '').lower()

# convert youtu.be format to youtube.com format
if url_host == 'youtu.be' or url_host.endswith('.youtu.be'):
	# The id is the first path segment. Anything in the query string
	# (e.g. ?si=... or ?t=...) must not leak into the id.
	video_id = parsed_url.path.strip('/').split('/')[0]
	video_url = f'https://www.youtube.com/watch?v={video_id}'
	parsed_url = urlparse(video_url)
	url_host = (parsed_url.hostname or '').lower()

# Last path component without the query string, and its (lower-cased) extension.
# For a name without a dot the "extension" is the whole name, e.g. 'watch'.
url_filename = parsed_url.path.split('/')[-1]
url_file_format = url_filename.split('.')[-1].lower()

is_youtube_watch = (
	(url_host == 'youtube.com' or url_host.endswith('.youtube.com'))
	and url_filename == 'watch'
)

if is_youtube_watch:
	# create filename from video id
	# parse_qs reads exactly the v parameter, regardless of its position or of
	# other parameters whose names happen to end in "v"
	video_id = parse_qs(parsed_url.query).get('v', [''])[0]
	if not video_id:
		raise ValueError(f'Could not find a video id in: {video_url}')
	filename = f'{media_dir}/{video_id}.mp4'
elif url_file_format in available_video_formats:
	filename = f'{media_dir}/{url_filename}'
else:
	raise ValueError(f'Unsupported video format: {url_file_format}')


# Skip the download when the file is already present from an earlier run.
if not os.path.exists(filename):
	if is_youtube_watch:
		# 18/best to try for 480p if available, else best available format
		ydl_opts = {
			'format': '18/best',
			'outtmpl': filename,
		}
		with YoutubeDL(ydl_opts) as ydl:
			ydl.download([video_url])
	else:
		# direct link to a video file in one of the supported formats
		wget.download(video_url, filename)

[youtube] Extracting URL: https://www.youtube.com/watch?v=7kpGQ_hbU30
[youtube] 7kpGQ_hbU30: Downloading webpage
[youtube] 7kpGQ_hbU30: Downloading ios player API JSON
[youtube] 7kpGQ_hbU30: Downloading android player API JSON




[youtube] 7kpGQ_hbU30: Downloading android player API JSON




[youtube] 7kpGQ_hbU30: Downloading android player API JSON




[youtube] 7kpGQ_hbU30: Downloading android player API JSON




[youtube] 7kpGQ_hbU30: Downloading player 9383995e
[youtube] 7kpGQ_hbU30: Downloading m3u8 information
[info] 7kpGQ_hbU30: Downloading 1 format(s): 18
[download] Destination: media/7kpGQ_hbU30.mp4
[download] 100% of    2.20MiB in 00:00:03 at 642.19KiB/s 


#### Extracting audio

In [6]:
# extract audio
# Swap only the trailing extension. str.replace() would also rewrite the same
# text anywhere else in the path (e.g. a video id that contains "mp4").
filename_stem, filename_ext = os.path.splitext(filename)
file_format = filename_ext.lstrip('.')
audio_filename = f'{filename_stem}.mp3'

# ffmpeg would otherwise stop to ask whether to overwrite an existing output file
if os.path.exists(audio_filename):
    os.remove(audio_filename)

# Argument list instead of a shell string: the file name comes from a URL, so it
# must never be interpreted by a shell.
ffmpeg_cmd = ['ffmpeg', '-i', filename, '-vn', '-acodec', 'libmp3lame', audio_filename]
try:
    subprocess.run(
        ffmpeg_cmd,
        check=True,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,  # kept so a failure can be diagnosed
        text=True,
        errors='replace',
    )
except FileNotFoundError:
    print("ffmpeg executable not found; install ffmpeg and make sure it is on PATH")
except subprocess.CalledProcessError as e:
    print(f"Command '{e.cmd}' returned non-zero exit status {e.returncode}")
    if e.stderr:
        # the end of ffmpeg's log is where the actual error is reported
        print(e.stderr[-2000:])

#### Splitting audio into chunks by silence

In [7]:
# split by silence or change in voice
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

MIN_SILENCE_LEN_MS = 200    # a pause must last at least this long to split on
SILENCE_THRESH_DBFS = -30   # silence threshold in dB
KEEP_SILENCE_MS = 100       # padding kept around each chunk (split_on_silence's default)

# delete all files in chunks_dir
for file in os.listdir(chunks_dir):
    os.remove(f"{chunks_dir}/{file}")

audio = AudioSegment.from_file(audio_filename)

# Ask for the non-silent ranges directly instead of using split_on_silence.
# The chunks alone do not say where they sat in the original audio, and summing
# their lengths ignores the removed silence, so subtitle times would drift.
nonsilent_ranges = detect_nonsilent(
    audio,
    min_silence_len=MIN_SILENCE_LEN_MS,
    silence_thresh=SILENCE_THRESH_DBFS,
)

# save chunks to files and note down timestamps
chunks = []
timestamps = []  # (start_ms, end_ms) of every chunk, measured in the original audio

audio_file_format = audio_filename.split('.')[-1]
audio_length_ms = len(audio)

for i, (speech_start, speech_end) in enumerate(nonsilent_ranges):
    # pad the detected speech a little, without leaving the audio bounds
    chunk_start = max(0, speech_start - KEEP_SILENCE_MS)
    chunk_end = min(audio_length_ms, speech_end + KEEP_SILENCE_MS)

    chunk = audio[chunk_start:chunk_end]
    chunks.append(chunk)

    chunk_filename = f"{chunks_dir}/chunk{i}.{audio_file_format}"
    print(f"exporting chunk {i} to {chunk_filename}")
    chunk.export(chunk_filename, format=audio_file_format)

    # note down timestamp
    timestamps.append((chunk_start, chunk_end))

# print timestamps
for i, (start_time, end_time) in enumerate(timestamps):
    print(f"Chunk {i}: Start Time = {start_time} ms, End Time = {end_time} ms")

ModuleNotFoundError: No module named 'pydub'

##### Attempt to split by change in voice

In [None]:
# %pip install pyAudioAnalysis hmmlearn eyed3 imblearn tqdm

In [None]:
# import warnings
# from pyAudioAnalysis import audioSegmentation as aS
# from pydub import AudioSegment

# warnings.filterwarnings("ignore")

# audio_file_format = audio_filename.split('.')[-1]
# audio = AudioSegment.from_file(audio_filename, format=audio_file_format)

# print(f"Loaded audio file {audio_filename} with format {audio_file_format}")

# best_acc = 0
# best_chunks = None
# best_n_speakers = None

# # try different numbers of speakers
# for n_speakers in range(2, 5):  # start the range from 2 instead of 1
# 	print(f"Trying with {n_speakers} speakers")
# 	try:
# 		# perform speaker diarization
# 		flags, class_names, acc, CM = aS.speaker_diarization(audio_filename, n_speakers, mid_window=2.0, mid_step=0.2, short_window=0.05, lda_dim=min(35, n_speakers-1, 10))  # reduce lda_dim to a smaller value
# 	except ValueError as e:
# 		print(f"ValueError encountered with {n_speakers} speakers: {e}")
# 		continue

# 	print(f"Accuracy with {n_speakers} speakers: {acc}")

# 	# if this number of speakers gives a better accuracy, save the chunks
# 	if acc > best_acc:
# 		print(f"Better accuracy found with {n_speakers} speakers")
# 		best_acc = acc
# 		best_n_speakers = n_speakers

# 		# flags contains the speaker for each frame, split the audio based on change in speaker
# 		chunks = []
# 		start = 0
# 		current_speaker = flags[0]
# 		for i, speaker in enumerate(flags):
# 			if speaker != current_speaker:
# 				# save the chunk of the previous speaker
# 				chunk = audio[start:i]
# 				chunks.append(chunk)
# 				# start a new chunk for the new speaker
# 				start = i
# 				current_speaker = speaker
# 		# save the last chunk
# 		chunks.append(audio[start:])

# 		best_chunks = chunks

# # delete all files in chunks_dir
# for file in os.listdir(chunks_dir):
# 	print(f"Deleting file {file} in {chunks_dir}")
# 	os.remove(f"{chunks_dir}/{file}")

# # save chunks to files
# if best_chunks is not None:
# 	for i, chunk in enumerate(best_chunks):
# 		chunk_filename = f"{chunks_dir}/chunk{i}.{audio_file_format}"
# 		print(f"Saving chunk {i} to {chunk_filename}")
# 		chunk.export(chunk_filename, format=audio_file_format)
# else:
# 	print("No chunks to save")

#### Generating subtitles

In [None]:
import os
from datetime import timedelta  # not used in this cell; kept because the translation cell may rely on it

import speech_recognition as sr
from pycaption import Caption, CaptionNode, CaptionSet, SRTWriter
from pydub import AudioSegment

# Initialize the recognizer
r = sr.Recognizer()

lang_code = 'te-IN'

# Captions are collected here and attached to the CaptionSet once the loop is done
captions = []

# Iterate over the chunks
for i, (start_time, end_time) in enumerate(timestamps):
	# Load the chunk
	audio_file_path = f"{chunks_dir}/chunk{i}.{audio_file_format}"
	wav_audio_file_path = f"{chunks_dir}/chunk{i}.wav"

	text = ""
	try:
		# Convert audio file to wav
		AudioSegment.from_file(audio_file_path, format=audio_file_format).export(wav_audio_file_path, format="wav")

		# Transcribe the audio chunk.
		# A separate name is used so the AudioSegment called `audio` from the
		# splitting cell is not overwritten.
		with sr.AudioFile(wav_audio_file_path) as source:
			audio_data = r.record(source)
		try:
			text = r.recognize_google(audio_data, language=lang_code)
			print(f"chunk {i} --> {text}")
		except sr.UnknownValueError:
			print(f"Could not understand audio from chunk {i}")
		except sr.RequestError as e:
			# network / API failure: skip this chunk instead of aborting the whole run
			print(f"Speech recognition request failed for chunk {i}: {e}")
	finally:
		# Delete the wav file after processing, also when something above failed
		if os.path.exists(wav_audio_file_path):
			os.remove(wav_audio_file_path)

	# Add the transcription only if text is not empty
	if text:
		captions.append(
			Caption(
				# timestamps are in milliseconds, these values are passed as microseconds
				start=start_time * 1000,
				end=end_time * 1000,
				nodes=[CaptionNode.create_text(text)],
			)
		)

# Initialize the CaptionSet and attach the collected captions
caption_set = CaptionSet({})
caption_set.set_captions(lang_code, captions)

# Write the CaptionSet to an SRT file.
# Explicit UTF-8: the text is Telugu and the platform default encoding may not be able to store it.
with open(f"{media_dir}/subtitles.srt", "w", encoding="utf-8") as subtitle_file:
	subtitle_file.write(SRTWriter().write(caption_set))

Could not understand audio from chunk 0
chunk 1 --> మీరు ఈరోజు ఇల్లు ఖాళీ చేయాలి ప్రాణంగా చూసుకుంటూ నెయ్యి ఇంటిని ఎవరు తీసుకుంటారు ఈ గౌతమ్ పాట
chunk 2 --> 13 కోట్ 25 రోడ్డు
Could not understand audio from chunk 3
Could not understand audio from chunk 4


#### Translation to English

In [None]:
# Translate the subtitles generated above to English
from googletrans import Translator
from pycaption import Caption, CaptionNode, CaptionSet, SRTReader, SRTWriter

# NOTE(review): recent googletrans releases appear to expose `translate` as a
# coroutine. This cell assumes the synchronous API; confirm the installed version.
translator = Translator()
lang_code = 'te-IN'  # already defined above
translated_lang_code = 'en'
# googletrans expects a bare language code ('te'), not a regional tag ('te-IN')
source_lang_code = lang_code.split('-')[0].lower()

# Read the SRT file (UTF-8, because it contains Telugu text)
with open("media/subtitles.srt", "r", encoding="utf-8") as subtitle_file:
	srt_content = subtitle_file.read()

# Let pycaption parse the SRT structure (index lines, timestamps, text, blank
# separators) instead of walking the raw lines by hand.
# The reader is presumably unable to handle caption-less input, so only parse when there is content.
if srt_content.strip():
	source_captions = SRTReader().read(srt_content, lang=lang_code).get_captions(lang_code)
else:
	source_captions = []

# Build the translated captions, reusing the timing of each source caption
translated_captions = []
for caption in source_captions:
	original_text = caption.get_text().replace('\n', ' ').strip()
	if not original_text:
		continue

	# Translate the caption text
	translated_text = translator.translate(original_text, src=source_lang_code, dest=translated_lang_code).text

	translated_captions.append(
		Caption(
			start=caption.start,
			end=caption.end,
			nodes=[CaptionNode.create_text(translated_text)],
		)
	)

# Initialize the translated caption set
translated_caption_set = CaptionSet({})
translated_caption_set.set_captions(translated_lang_code, translated_captions)

# Write the translated CaptionSet to an SRT file
with open("media/translated_subtitles.srt", "w", encoding="utf-8") as translated_subtitle_file:
	translated_subtitle_file.write(SRTWriter().write(translated_caption_set))

ModuleNotFoundError: No module named 'googletrans'