In [None]:
!pip3 install youtube_transcript_api

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
import re

# Locates the video ID in the common YouTube URL shapes: a ``v=`` query
# parameter (``watch?v=<id>``), ``youtu.be/<id>`` short links, and the
# ``/embed/``, ``/shorts/``, ``/live/`` and ``/v/`` path forms.  The ``v=``
# branch requires a preceding ``?``/``&`` (or start of string) so that other
# parameters whose name merely ends in "v" are not mistaken for the ID.
_VIDEO_ID_RE = re.compile(
    r"(?:(?:^|[?&])v=|youtu\.be/|/(?:embed|shorts|live|v)/)([\w-]+)"
)


def get_subtitle(url):
    """Fetch the transcript of the YouTube video referenced by *url*.

    Args:
        url: A YouTube video URL (``watch?v=``, ``youtu.be/``, ``/embed/``,
            ``/shorts/``, ``/live/`` or ``/v/`` form).

    Returns:
        The transcript as returned by ``youtube_transcript_api`` (the rest of
        this file reads it as a list of dicts with ``"text"``, ``"start"`` and
        ``"duration"`` keys), or ``None`` when no video ID can be extracted
        from *url* or the transcript request fails.  Failures are reported on
        stdout rather than raised.
    """
    id_match = _VIDEO_ID_RE.search(url) if isinstance(url, str) else None
    if id_match is None:
        print("Unable to extract video ID from the URL.")
        return None

    video_id = id_match.group(1)
    try:
        # Newer youtube_transcript_api releases expose an instance-based
        # ``fetch`` API and drop the static ``get_transcript``; support both
        # because the package is installed unpinned.
        if hasattr(YouTubeTranscriptApi, "fetch"):
            return YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Best-effort boundary: any retrieval problem (no captions, network
        # error, ...) is reported and signalled to the caller as ``None``.
        print(f"An error occurred: {e}")
        return None

def _count_words(text):
    """Return the number of word tokens (runs of word characters) in *text*."""
    return len(re.findall(r"\w+", text))


def process_subtitles(url):
    """Fetch a video's transcript, split it into sentence segments and print them.

    Each caption line is split on ``". "``.  Every resulting fragment gets an
    estimated duration of ``word_count * seconds_per_word``, where
    ``seconds_per_word`` is the end time of the last caption divided by the
    total number of words in the transcript.  The first fragment of a caption
    line starts at that line's own ``"start"`` time; later fragments of the
    same line start where the previous fragment ended.

    Args:
        url: YouTube video URL, passed to ``get_subtitle``.

    Returns:
        None.  Segments are printed to stdout; a message is printed instead
        when the transcript cannot be retrieved or contains no words.
    """
    transcript = get_subtitle(url)
    if transcript is None:
        print("Subtitle retrieval failed.")
        return

    total_words = sum(_count_words(line["text"]) for line in transcript)
    if total_words == 0:
        # Also covers an empty transcript; avoids indexing transcript[-1] and
        # dividing by zero below.
        print("Transcript contains no words to segment.")
        return

    # End time of the last caption approximates the total spoken duration.
    total_duration = transcript[-1]["start"] + transcript[-1]["duration"]
    seconds_per_word = total_duration / total_words

    segmented_subtitles = []
    for line in transcript:
        cleaned_text = re.sub(r"\s+", " ", line["text"]).strip()

        # Re-anchor on every caption line so estimated times cannot drift
        # away from the timestamps the transcript actually provides.
        current_start = line["start"]

        for part in cleaned_text.split(". "):
            if not part:
                # Empty fragments (blank caption, doubled separators) carry
                # no text and would only produce zero-length segments.
                continue

            duration = _count_words(part) * seconds_per_word
            current_end = round(current_start + duration, 2)

            segmented_subtitles.append({
                "text": part,
                "start": current_start,
                "end": current_end,
            })

            current_start = current_end

    for subtitle in segmented_subtitles:
        print(f"Text: {subtitle['text']}")
        print(f"Start Time: {subtitle['start']}, End Time: {subtitle['end']}\n")

# Example usage.  Guarded so the network request only runs when this file is
# executed directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=okvZUE5j4R8&t=17s"
    process_subtitles(video_url)


