## Select / download video

In [None]:
!pip install yt-dlp moviepy openai-whisper==20230124 whisper-timestamped



In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import os
import json
import numpy as np
import whisper_timestamped
from moviepy.editor import VideoFileClip
import yt_dlp

In [None]:
def download_audio_from_youtube(url):
    """Download the audio track of a YouTube video and convert it to WAV.

    Args:
        url: URL of the video. Only the single video is fetched because
            ``noplaylist`` is enabled.

    Returns:
        The name of the WAV file (``<video id>.wav`` in the working
        directory), or ``None`` when the download or conversion fails.
        Failures are reported on stdout; an error message is never returned
        in place of a file name, so ``if audio_file:`` is a reliable guard.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        # Name the download after the video id so the result is predictable.
        'outtmpl': '%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192'
        }],
        'noplaylist': True,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            return f"{info_dict['id']}.wav"
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(f"Failed to download audio from {url}: {e}")
        return None

In [None]:
def extract_audio_from_video(video_file):
    """Extract the audio track of a local video into ``extracted_audio.wav``.

    The audio is written as 16-bit PCM, mono, 16 kHz.

    Args:
        video_file: Path to the video file.

    Returns:
        ``"extracted_audio.wav"`` on success, or ``None`` when the video
        cannot be opened, has no audio track, or the export fails. Failures
        are reported on stdout; an error message is never returned in place
        of a file name.
    """
    audio_file = "extracted_audio.wav"
    try:
        video_clip = VideoFileClip(video_file)
        try:
            if video_clip.audio is None:
                print(f"No audio track found in {video_file}")
                return None
            video_clip.audio.write_audiofile(audio_file, codec='pcm_s16le', ffmpeg_params=["-ac", "1", "-ar", "16000"])
        finally:
            # Release the reader held by the clip.
            video_clip.close()
        return audio_file

    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(f"Failed to extract audio from {video_file}: {e}")
        return None

In [None]:
from pydub import AudioSegment

In [None]:
def process_audio(audio_file):
    """Transcribe the first three minutes of an audio file with word timestamps.

    The audio is trimmed to 3 minutes, peak-normalised and transcribed with
    the whisper-timestamped "base" model on CPU (English, VAD enabled). The
    result is written to ``output.srt`` and ``output.json`` in the working
    directory.

    Args:
        audio_file: Path to the audio file to transcribe.

    Returns:
        A status string: ``"Transcription completed"`` on success,
        ``"Audio file does not exist."`` when the path is missing, or the
        text of the exception that interrupted processing.
    """
    temp_file = "temp_audio.wav"
    temp_written = False
    try:
        print("Processing audio file:", os.path.abspath(audio_file))
        if not os.path.exists(audio_file):
            return "Audio file does not exist."
        audio = AudioSegment.from_file(audio_file)
        duration_ms = 3 * 60 * 1000
        audio = audio[:duration_ms]

        # Round-trip through a WAV file so whisper-timestamped can load it.
        temp_written = True
        audio.export(temp_file, format="wav")

        audio = whisper_timestamped.load_audio(temp_file)
        # Peak-normalise, but leave silent audio untouched: dividing by a
        # zero peak would fill the signal with NaN.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak
        model = whisper_timestamped.load_model("base", device="cpu")
        result = whisper_timestamped.transcribe(model, audio, vad = True, language="en")

        # Save results to SRT file
        writer = whisper_timestamped.utils.get_writer("srt", ".")
        writer(result, "output")
        print("SRT file saved: output.srt")

        # Save results to JSON file
        json_output_file = "output.json"
        with open(json_output_file, 'w', encoding='utf-8') as json_file:
            json.dump(result, json_file, indent=2, ensure_ascii=False)
        print(f"JSON file saved: {json_output_file}")

        return "Transcription completed"

    except Exception as e:
        return str(e)

    finally:
        # Remove the scratch file created by this call, on success or failure.
        if temp_written and os.path.exists(temp_file):
            os.remove(temp_file)

Option 1: use a YouTube link

In [None]:
# Download the audio of the video below and transcribe it.
# process_audio writes output.srt / output.json and returns a status string.
youtube_url = 'https://www.youtube.com/watch?v=XdqPpRRewrE&list=PLBmriQSLAuRJvLNLhRBNso054qvpsqv3t&index=12'  # Change this to YouTube URL
audio_file = download_audio_from_youtube(youtube_url)
# Only transcribe when a non-empty value came back from the download step.
if audio_file:
    result = process_audio(audio_file)
    print(result)

[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=XdqPpRRewrE&list=PLBmriQSLAuRJvLNLhRBNso054qvpsqv3t&index=12
[youtube:tab] Downloading just the video XdqPpRRewrE because of --no-playlist
[youtube] Extracting URL: https://www.youtube.com/watch?v=XdqPpRRewrE
[youtube] XdqPpRRewrE: Downloading webpage
[youtube] XdqPpRRewrE: Downloading ios player API JSON
[youtube] XdqPpRRewrE: Downloading mweb player API JSON
[youtube] XdqPpRRewrE: Downloading m3u8 information
[info] XdqPpRRewrE: Downloading 1 format(s): 251
[download] Destination: XdqPpRRewrE.webm
[download] 100% of   15.98MiB in 00:00:00 at 46.56MiB/s  
[ExtractAudio] Destination: XdqPpRRewrE.wav
Deleting original file XdqPpRRewrE.webm (pass -k to keep)
Processing audio file: /content/XdqPpRRewrE.wav



100%|██████████| 17723/17723 [00:53<00:00, 328.40frames/s]

SRT file saved: output.srt
JSON file saved: output.json
Transcription completed





Use local path

In [None]:
# Alternative to the YouTube cell: extract the audio from a local video and transcribe it.
# NOTE: this rebinds the same `audio_file` and `result` names as the YouTube cell.
video_file = 'path_to_your_video_file.mp4'  # Change this to video file path
audio_file = extract_audio_from_video(video_file)
# Only transcribe when a non-empty value came back from the extraction step.
if audio_file:
    result = process_audio(audio_file)
    print(result)

Processing audio file: /content/MoviePy error: the file path_to_your_video_file.mp4 could not be found!
Please check that you entered the correct path.
Audio file does not exist.


## Extract timestamp for each word

In [None]:
import json
import re

In [None]:
# Load the transcription that process_audio saved as JSON.
file_path = 'output.json'

# The file is written as UTF-8 with ensure_ascii=False, so read it back with
# the same encoding instead of the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

print(data["text"])

 Okay, let's talk about why we make bad decisions. So if you're like me, then you've probably been in that situation where it's the end of the day or it's the morning and you're thinking like, okay, I'm gonna make a solid plan for myself. Today, I'm gonna be disciplined. I'm gonna come home from work and I'm gonna get all these errands done and I'm gonna treat myself really well. I'm gonna make a nice healthy, proper meal. I'm gonna go to the gym afterwards. I'm gonna exercise and then maybe when I come home, I'll like start reading a book or something. I'll do something a little different, something that makes me feel like I'm gonna advance my life in a new positive direction. And this is our real life volition and it's there to lead you on your way to making good life decisions. But this doesn't always work out that way. Sometimes you get to the end of the day and you realize that you haven't been very disciplined at all. A lot of those errands are still incomplete or like half-start

In [None]:
# Lowercase the whole text (easier to re-match against the word timings)
# while keeping the pronoun "I" and its contractions capitalised.
def lowercase(text):
    """Lowercase ``text`` except for "I", "I'm", "I'll", "I've" and "I'd".

    Words are re-joined with single spaces. The punctuation marks
    ``? . ! , : ;`` are attached to the preceding word and followed by
    exactly one space (so a sentence-final mark leaves one trailing space).
    Words joined by internal apostrophes or hyphens ("you're",
    "half-started") are kept as a single token, so whitespace-splitting the
    result yields the same number of words as the input.

    Args:
        text: The text to normalise.

    Returns:
        The normalised text.
    """
    exceptions = {"I", "I'm", "I'll", "I've", "I'd"}
    # A word is a run of word characters, optionally continued by
    # apostrophe- or hyphen-joined parts; any other non-space character is
    # a token of its own.
    words = re.findall(r"\w+(?:['-]\w+)*|[^\w\s]", text)
    processed_words = [
        word if word in exceptions else word.lower()
        for word in words
    ]
    result = " ".join(processed_words)
    # Exactly one space after each punctuation mark ...
    result = re.sub(r'([?.!,:;])\s*', r'\1 ', result)
    # ... and none before it.
    result = re.sub(r'\s([?.!,:;])', r'\1', result)

    return result

In [None]:
# Normalise the transcript in place; later cells read data["text"] in this lowercased form.
data["text"] = lowercase(data["text"])
print(data["text"])

okay, let's talk about why we make bad decisions. so if you're like me, then you've probably been in that situation where it's the end of the day or it's the morning and you're thinking like, okay, I'm gonna make a solid plan for myself. today, I'm gonna be disciplined. I'm gonna come home from work and I'm gonna get all these errands done and I'm gonna treat myself really well. I'm gonna make a nice healthy, proper meal. I'm gonna go to the gym afterwards. I'm gonna exercise and then maybe when I come home, I'll like start reading a book or something. I'll do something a little different, something that makes me feel like I'm gonna advance my life in a new positive direction. and this is our real life volition and it's there to lead you on your way to making good life decisions. but this doesn't always work out that way. sometimes you get to the end of the day and you realize that you haven't been very disciplined at all. a lot of those errands are still incomplete or like half - star

In [None]:
def clean_word(word):
    """Return ``word`` with every comma, period, exclamation mark and question mark removed."""
    return re.sub(r'[,.!?]', '', word)

In [None]:
# Flatten the words of every segment into one list of
# {'text', 'start', 'end'} records, with punctuation stripped from the text.
word_timings = [
    {
        'text': clean_word(word['text']),
        'start': word['start'],
        'end': word['end'],
    }
    for segment in data['segments']
    for word in segment['words']
]

# Report how many words were collected
print(f"Total words extracted: {len(word_timings)}")

# Show the tail of the list as a quick sanity check
print("Last 5 entries:")
for entry in word_timings[-5:]:
    print(entry)

# Keep a copy on disk for later inspection
with open('extracted_word_timings.json', 'w') as outfile:
    json.dump(word_timings, outfile, indent=2)

print("Extracted data saved to 'extracted_word_timings.json'")

Total words extracted: 496
Last 5 entries:
{'text': 'I', 'start': 178.39, 'end': 178.55}
{'text': 'shift', 'start': 178.55, 'end': 178.85}
{'text': 'myself', 'start': 178.85, 'end': 179.27}
{'text': 'around', 'start': 179.27, 'end': 179.73}
{'text': 'I', 'start': 179.97, 'end': 179.99}
Extracted data saved to 'extracted_word_timings.json'


## LLM segmentation

In [None]:
# Import the Python SDK
import getpass
import os

import google.generativeai as genai

# Never store an API key in the notebook: read it from the GOOGLE_API_KEY
# environment variable and fall back to an interactive prompt.
# NOTE(review): the key that used to be hard-coded in this cell has been
# shared together with the notebook and should be revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: "))

In [None]:
# Gemini model handle passed to the segmentation and critic calls below.
model = genai.GenerativeModel("gemini-1.5-flash")

In [None]:
# Transcript text loaded from output.json; this is what gets sent to the LLM.
text = data["text"]

In [None]:
# Segmentation function
def segment_text_gemini(model, text, max_length=65):
    """Ask the model to split long sentences into subtitle-sized lines.

    Args:
        model: Object exposing ``generate_content(prompt, generation_config=...)``.
        text: The text to segment.
        max_length: Maximum number of characters allowed per output line.

    Returns:
        The response object returned by ``model.generate_content``; callers
        read the segmented lines from its ``.text`` attribute.
    """
    # The worked example and the real input are kept in separate, labelled
    # sections so the model cannot mistake the input for part of the example.
    prompt = f"""
You are a helpful assistant. Your task is to segment sentences which is longer than {max_length} characters, including spaces and punctuation, into shorter sentences.
Each segmented sentence MUST NOT exceed {max_length} characters.
Each segmented sentence must be independent, complete, and clear, suitable for direct translation or subtitle creation.
NOTE: All connecting words (e.g., where, which, and, but, that) MUST remain intact. They should NOT be omitted, split, or modified in any way.

Example:
Input:
so one of the biggest challenges when talking about anything related to mental health whether depression mental illness addiction is it can sometimes be really hard to explain how can you want something
Output:
so one of the biggest challenges when talking about
anything related to mental health
whether depression mental illness addiction
is it can sometimes be really hard to explain
how can you want something

Now segment the following text in the same way.
Input:
{text}
Output:
"""

    # Send request to model (temperature 0.0 is passed for every call)
    response = model.generate_content(prompt, generation_config=genai.types.GenerationConfig(temperature=0.0))

    return response

# Get segmented text
# `segmented_output` is the raw response object; later cells read its `.text`.
segmented_output = segment_text_gemini(model, text)

In [None]:
# Critic function
def critic_text_gemini(model, ori_text, text):
    """Ask the model to check segmented sentences against the original text.

    Args:
        model: Object exposing ``generate_content(prompt, generation_config=...)``.
        ori_text: The original, unsegmented text.
        text: The segmented sentences to verify.

    Returns:
        The response object returned by ``model.generate_content``.
    """
    critic_prompt = f"""
    You are a critic.
    Your task is to make sure that the content of these sentences is the same as the original,
    and if there are differences, revise them according to the original({ori_text}), output should only contain corrected sentences:{text}
    """

    # Temperature 0.0 is passed with the request.
    zero_temperature = genai.types.GenerationConfig(temperature=0.0)
    return model.generate_content(critic_prompt, generation_config=zero_temperature)

# Ask the critic to reconcile the segmented lines with the original transcript.
# NOTE(review): `segmented` is not read by any later cell shown in this notebook.
segmented = critic_text_gemini(model, text, segmented_output.text)

## SaT segmentation

In [None]:
!pip install wtpsplit

Successfully installed adapters-1.0.1 cached-property-2.0.1 coloredlogs-15.0.1 docopt-0.6.2 huggingface-hub-0.25.2 humanfriendly-10.0 mosestokenizer-1.2.1 onnxruntime-1.20.1 openfile-0.0.7 skops-0.10.0 toolwrapper-2.1.0 transformers-4.45.2 uctools-1.3.0 wtpsplit-2.1.1


In [None]:
from wtpsplit import SaT
import torch

Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instrad `pip install optimum-quanto`


In [None]:
# Load the SaT sentence-segmentation model.
sat = SaT("sat-3l-sm")
# Half precision on the GPU is an optional speed-up; stay on the CPU when no
# CUDA device is present so the notebook also runs on CPU-only runtimes.
if torch.cuda.is_available():
    sat.half().to("cuda")

In [None]:
# Transcript text loaded from output.json; input for the SaT splitter.
text = data["text"]

In [None]:
# Split the transcript into sentences with SaT.
# NOTE(review): `result` is rebound to a list of timing dicts by the
# "Re-match timestamp" cell further down, so re-run this cell before running
# `output_sentences = result` again.
result = sat.split(text)

In [None]:
# Display the sentences produced by sat.split(text).
result

["okay, let's talk about why we make bad decisions. ",
 "so if you're like me, then you've probably been in that situation where it's the end of the day or it's the morning and you're thinking like, okay, I'm gonna make a solid plan for myself. ",
 "today, I'm gonna be disciplined. ",
 "I'm gonna come home from work and I'm gonna get all these errands done and I'm gonna treat myself really well. ",
 "I'm gonna make a nice healthy, proper meal. ",
 "I'm gonna go to the gym afterwards. ",
 "I'm gonna exercise and then maybe when I come home, I'll like start reading a book or something. ",
 "I'll do something a little different, something that makes me feel like I'm gonna advance my life in a new positive direction. ",
 "and this is our real life volition and it's there to lead you on your way to making good life decisions. ",
 "but this doesn't always work out that way. ",
 "sometimes you get to the end of the day and you realize that you haven't been very disciplined at all. ",
 'a lot 

In [None]:
# Instructions shared by every request; the sentence to split is appended
# per request inside the loop.
segment_instructions = """You are a helpful assistant. Your task is to segment sentences which is longer than 60 characters(including spaces and punctuation) into several shorter sentences.
    Each shorter sentence MUST NOT exceed 60 characters. Each segmented sentence must be independent, complete, and clear, suitable for direct translation or subtitle creation.

    NOTE: All connecting words (e.g., where, which, and, but, that) MUST remain intact. They should NOT be omitted, split, or modified in any way.

    Example:
    Input:
    so one of the biggest challenges when talking about anything related to mental health, whether depression mental illness addiction, is it can sometimes be really hard to explain how can you want something.
    Output:
    so one of the biggest challenges when talking about
    anything related to mental health,
    whether depression mental illness addiction,
    is it can sometimes be really hard to explain
    how can you want something.

    Make sure you strictly follow the above NOTE and instructions, before you give the answer, check
    whether the new sentence is shorter than 60 characters and whether the words in new sentences are the same
    as the words in original text, output should only contains shorter sentences. Think step by step:"""

# [index, sentence] pairs for every sentence longer than 65 characters.
on_progress = []
for i, sentence in enumerate(result):
    if len(sentence) > 65:
        on_progress.append([i, sentence])
        # Append the sentence to split: without it the model only receives
        # the instructions and the worked example.
        prompt = f"{segment_instructions}\n\n    Input:\n    {sentence.strip()}\n    Output:\n"

        response = model.generate_content(prompt, generation_config=genai.types.GenerationConfig(temperature=0.0))
        print(response.text)

In [None]:
import getpass
import os

import google.generativeai as genai

# Never store an API key in the notebook: read it from the GOOGLE_API_KEY
# environment variable and fall back to an interactive prompt.
# NOTE(review): the key that used to be hard-coded in this cell has been
# shared together with the notebook and should be revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: "))

In [None]:
# Gemini model handle used by the segmentation call below.
model = genai.GenerativeModel("gemini-1.5-flash")

In [None]:
# NOTE(review): this draft cell ended in a bare `prompt =`, which is a
# SyntaxError. Until a dedicated prompt is written, the sentence itself is
# used as the prompt text so the cell runs; replace with the real prompt.
for i, sentence in on_progress:
    prompt = sentence

In [None]:
# Segmentation function
def segment_text_gemini(model, text, max_length=50):
    """Ask the model to split long sentences into subtitle-sized lines.

    Args:
        model: Object exposing ``generate_content(prompt, generation_config=...)``.
        text: Either a string, or an iterable whose items are sentences or
            ``[index, sentence]`` pairs (the layout of ``on_progress``). The
            sentences of an iterable are sent one per line.
        max_length: Maximum number of characters allowed per output line.

    Returns:
        The response object returned by ``model.generate_content``; callers
        read the segmented lines from its ``.text`` attribute.
    """
    if not isinstance(text, str):
        # Reduce [index, sentence] pairs (or plain sentences) to one
        # sentence per line.
        text = "\n".join(
            item.strip() if isinstance(item, str) else str(item[1]).strip()
            for item in text
        )

    prompt = f"""
You are a helpful assistant. Your task is to segment sentences which is longer than {max_length} characters, including spaces and punctuation, into shorter sentences.
Each segmented sentence MUST NOT exceed {max_length} characters.
Each segmented sentence must be independent, complete, and clear, suitable for direct translation or subtitle creation.
NOTE: All connecting words (e.g., where, which, and, but, that) MUST remain intact. They should NOT be omitted, split, or modified in any way.

Example:
Input:
so one of the biggest challenges when talking about anything related to mental health whether depression mental illness addiction is it can sometimes be really hard to explain how can you want something
Output:
so one of the biggest challenges when talking about
anything related to mental health
whether depression mental illness addiction
is it can sometimes be really hard to explain
how can you want something

Now segment the following text in the same way.
Input:
{text}
Output:
"""
    response = model.generate_content(prompt, generation_config=genai.types.GenerationConfig(temperature=0.0))

    return response

# Segment the over-long sentences collected in `on_progress` ([index, sentence] pairs).
# NOTE: this rebinds `segmented_output`, which the "Re-match timestamp" section reads.
segmented_output = segment_text_gemini(model, on_progress)

In [None]:
"""
Okay let's talk about why we make bad decisions
So if you're like me then you've probably been in that situation
where it's the end of the day or it's the morning
and you're thinking like okay I'm going to make a solid plan for myself
Today I'm going to be disciplined
I'm going to come home from work and I'm going to get all these errands done
and I'm going to treat myself really well
I'm going to make a nice healthy proper meal
I'm going to go to the gym afterwards
I'm going to exercise and th
"""

## Train SaT

In [None]:
!git clone https://github.com/segment-any-text/wtpsplit

In [None]:
%cd wtpsplit
!pip install -r requirements.txt
!pip install adapters==0.2.1 --no-dependencies
%cd ..

In [None]:
"""
Change:

/content/wtpsplit/wtpsplit/train/train_lora.py

with the NEW train_lora.py

Put:

lora_dummy_config.json

to: /content/wtpsplit/configs/lora

"""

In [None]:
import glob

def extract_srt_text(srt_file_path):
    """Return the text of every subtitle entry in an SRT file.

    Args:
        srt_file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        One string per entry, in file order, with the line breaks inside an
        entry replaced by spaces and surrounding whitespace removed.
    """
    with open(srt_file_path, 'r', encoding='utf-8') as srt_handle:
        raw_srt = srt_handle.read()

    # Entry = counter, timing line, then text up to a blank line or the end
    # of the file; only the text is captured.
    entry_pattern = re.compile(
        r'\d+\s+\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\s+(.+?)(?=\n\n|\Z)',
        re.DOTALL,
    )

    texts = []
    for entry_text in entry_pattern.findall(raw_srt):
        texts.append(entry_text.replace('\n', ' ').strip())
    return texts

def format_srt_to_list(srt_file_paths):
    """Collect the subtitle texts of several SRT files into one flat list.

    Args:
        srt_file_paths: Iterable of paths to ``.srt`` files.

    Returns:
        The subtitle strings of all files, in the order the paths are given.
    """
    return [
        subtitle
        for srt_path in srt_file_paths
        for subtitle in extract_srt_text(srt_path)
    ]

# Locate the subtitle files used as training and evaluation text.
# NOTE: glob.glob returns an empty list when a path does not exist, so a
# missing file silently yields an empty dataset instead of an error.
train_srt_files = glob.glob('/content/Analysis03.srt')
test_srt_files = glob.glob('/content/Analysis01.srt')

# One list entry per subtitle line.
train_text = format_srt_to_list(train_srt_files)
test_text = format_srt_to_list(test_srt_files)

In [None]:
# Sanity check: prints the complete list of training sentences.
print(train_text)

In [None]:
import torch

In [None]:
# Assemble the nested dataset dictionary level by level and save it.
# NOTE(review): presumably this nesting is the dataset layout that
# train_lora.py reads; confirm against the wtpsplit documentation.
dataset_entry = {
    "meta": {
        "train_data": train_text,
    },
    "data": test_text,
}

dataset_payload = {
    "language_code": {
        "sentence": {
            "dummy-dataset": dataset_entry,
        }
    }
}

torch.save(dataset_payload, "dummy-dataset.pth")

In [None]:
import os
# Sets CUDA_LAUNCH_BLOCKING=1 in this process's environment.
# NOTE(review): presumably intended to make CUDA errors in the training run
# below easier to trace; confirm it is still needed.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
!pip install --upgrade huggingface_hub

In [None]:
!pip install huggingface_hub

In [None]:
!pip install --upgrade transformers

In [None]:
!pip install --upgrade adapters

In [None]:
!pip install accelerate -U

In [None]:
# %cd wtpsplit
!python3 wtpsplit/train/train_lora.py configs/lora/lora_dummy_config.json

In [None]:
# Load the base "sat-3l" model together with the LoRA weights found under lora_path.
sat_lora_adapted = SaT("sat-3l", lora_path="/content/wtpsplit/sat-3l-my/dummy-dataset/language_code/")

In [None]:
# Display the split produced by the adapted model (the value is not stored).
sat_lora_adapted.split(text)

## Re-match timestamp

In [None]:
# Option A: use the LLM segmentation, one sentence per line of the response text.
output_sentences = segmented_output.text.split("\n")

In [None]:
# Option B: use the SaT sentences instead; this overrides the assignment in the previous cell.
# NOTE(review): `result` must still hold the list returned by sat.split(text);
# the re-match cell below rebinds `result`, so re-run the SaT split first
# when repeating this step.
output_sentences = result

In [None]:
# Strip , . ! ? from every sentence and normalise its case so the sentence
# text lines up with the cleaned words in word_timings.
cleaned_output = [
    lowercase(re.sub(r'[,.!?]', '', sentence))
    for sentence in output_sentences
]

In [None]:
# Walk through the sentences in order and give each one the start time of its
# first word and the end time of its last word, consuming word_timings
# sequentially (sentence words are counted by whitespace splitting).
result = []
current_id = 0  # number of word_timings entries already assigned

for sentence in cleaned_output:
    num_words = len(sentence.split())
    if not num_words:
        # Empty lines carry no words and consume no timings.
        continue

    print(f"Processing sentence: '{sentence}'")
    print(f"Total words in sentence: {num_words}, Current total words: {current_id}")

    # 1-based positions of the sentence's first and last word.
    start_word_id = current_id + 1
    end_word_id = current_id + num_words

    # start_word_id <= end_word_id, so checking the last position is enough.
    if end_word_id > len(word_timings):
        print(f"Warning: Index out of range. start_word_id: {start_word_id}, end_word_id: {end_word_id}")
        print(f"word_timings length: {len(word_timings)}")
        continue

    first_word = word_timings[start_word_id - 1]
    last_word = word_timings[end_word_id - 1]
    result.append({
        'sentence': sentence,
        'start': first_word['start'],
        'end': last_word['end'],
    })

    current_id = end_word_id

Processing sentence: 'okay let's talk about why we make bad decisions'
Total words in sentence: 9, Current total words: 0
Processing sentence: 'so if you're like me then you've probably been in that situation where it's the end of the day or it's the morning and you're thinking like okay I'm gonna make a solid plan for myself'
Total words in sentence: 36, Current total words: 9
Processing sentence: 'today I'm gonna be disciplined'
Total words in sentence: 5, Current total words: 45
Processing sentence: 'I'm gonna come home from work and I'm gonna get all these errands done and I'm gonna treat myself really well'
Total words in sentence: 21, Current total words: 50
Processing sentence: 'I'm gonna make a nice healthy proper meal'
Total words in sentence: 8, Current total words: 71
Processing sentence: 'I'm gonna go to the gym afterwards'
Total words in sentence: 7, Current total words: 79
Processing sentence: 'I'm gonna exercise and then maybe when I come home I'll like start reading a b

In [None]:
# Preview the first five timed sentences.
for entry in result[:5]:
    print(entry)

{'sentence': "okay let's talk about why we make bad decisions", 'start': 3.31, 'end': 6.37}
{'sentence': "so if you're like me then you've probably been in that situation where it's the end of the day or it's the morning and you're thinking like okay I'm gonna make a solid plan for myself", 'start': 6.87, 'end': 17.67}
{'sentence': "today I'm gonna be disciplined", 'start': 18.33, 'end': 20.29}
{'sentence': "I'm gonna come home from work and I'm gonna get all these errands done and I'm gonna treat myself really well", 'start': 21.17, 'end': 27.02}
{'sentence': "I'm gonna make a nice healthy proper meal", 'start': 27.02, 'end': 30.53}


# Transform into .SRT

In [None]:
def format_time(seconds):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before it is split into
    fields. Truncating the fractional part instead loses a millisecond for
    values that binary floats cannot represent exactly (20.29 would become
    ``00:00:20,289``). Rounding first also lets 59.9996 carry over cleanly
    to ``00:01:00,000``.

    Args:
        seconds: Non-negative time in seconds (int or float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"00:00:20,290"``.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

In [None]:
def generate_srt(data):
    """Render timed sentences as the text of an SRT file.

    Args:
        data: Iterable of dicts with the keys ``'start'`` and ``'end'``
            (passed to ``format_time``) and ``'sentence'``.

    Returns:
        The SRT text: for each item a 1-based counter line, a
        ``start --> end`` line, the sentence and an empty line, all joined
        with newlines. An empty input gives an empty string.
    """
    lines = []
    for number, item in enumerate(data, start=1):
        timing = f"{format_time(item['start'])} --> {format_time(item['end'])}"
        lines.extend((f"{number}", timing, item['sentence'], ""))
    return "\n".join(lines)

In [None]:
def save_srt(filename, srt_content):
    """Write ``srt_content`` to ``filename`` as UTF-8, replacing any existing file."""
    with open(filename, "w", encoding="utf-8") as srt_handle:
        srt_handle.write(srt_content)

In [None]:
# Render the re-timed sentences as SRT text.
srt_content = generate_srt(result)

# NOTE: process_audio also writes an "output.srt" into the working directory;
# this call replaces that file with the re-segmented subtitles.
save_srt("output.srt", srt_content)

print("SRT file has been saved as 'output.srt'")

SRT file has been saved as 'output.srt'
