In [None]:
# Mount Google Drive inside the Colab runtime; the project files used below are
# read from and written to paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder so that the relative paths
# used in later cells (output JSON, SRT folder, video files) resolve inside it.
%cd /content/drive/MyDrive/NLP/TranscribeAI/

/content/drive/MyDrive/NLP/TranscribeAI


In [None]:
import json

# Specify the path to your JSON file
file_path = '/content/drive/MyDrive/NLP/TranscribeAI/whisper_jsons/part10.json'

# Read the transcription JSON. The encoding is pinned to UTF-8 (the JSON interchange
# encoding, and the one save_srt uses in this notebook) so that non-ASCII transcript
# text is decoded the same way regardless of the runtime's locale default.
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Display the full transcript text as the cell output
data["text"]

" Hello and welcome back to my therapist plays disco elisium analysis series. On today's episode we're going to address something kind of unavoidable in a game like this, which is what are some of the crossovers between politics and mental health and kind of some personal struggles I have about integrating those two subjects. I'm sure there will be a lot of discussion about this in the comments, just know that this is all recorded previously. I think I'm going to probably make a dedicated video at some point kind of like compiling my thoughts a little more, but yeah, to get an early look at kind of where my, my, my, my stomach's going to be problemless. Aside from that, we have a very special moment in this episode, which is finally we're going to reach the end of our first day as a detective. It's taken us a while and we're going to take another look at how Kim and I's relationship has been developing over this very challenging day and see if there's actually some things that we've be

In [None]:
# Extract the desired features: flatten every word of every segment into one list of
# {'text', 'start', 'end'} records, keeping the original word order.
word_timings = [
    {
        'text': word['text'],
        'start': word['start'],
        'end': word['end']
    }
    for segment in data['segments']
    for word in segment['words']
]

# Print the total number of words extracted
print(f"Total words extracted: {len(word_timings)}")

# Print the first few entries to verify. The label is derived from the slice so it can
# no longer disagree with what is actually printed (it used to announce the
# "Last 5 entries" while listing the first ten).
preview = word_timings[:10]
print(f"First {len(preview)} entries:")
for entry in preview:
    print(entry)

# Optionally, save to a new JSON file (relative to the current working directory)
with open('extracted_word_timings.json', 'w', encoding='utf-8') as outfile:
    json.dump(word_timings, outfile, indent=2)

print("Extracted data saved to 'extracted_word_timings.json'")


Total words extracted: 455
Last 5 entries:
{'text': 'Hello', 'start': 0.58, 'end': 0.78}
{'text': 'and', 'start': 0.78, 'end': 1.44}
{'text': 'welcome', 'start': 1.44, 'end': 1.62}
{'text': 'back', 'start': 1.62, 'end': 1.82}
{'text': 'to', 'start': 1.82, 'end': 2.0}
{'text': 'my', 'start': 2.0, 'end': 2.22}
{'text': 'therapist', 'start': 2.22, 'end': 2.62}
{'text': 'plays', 'start': 2.62, 'end': 2.96}
{'text': 'disco', 'start': 2.96, 'end': 3.32}
{'text': 'elisium', 'start': 3.32, 'end': 3.84}
Extracted data saved to 'extracted_word_timings.json'


In [None]:
# captions_list = []

# for i,segment in enumerate(data["segments"]):
#   #print(segment["text"])
#   # print(segment["words"])
#   captions_list.append(segment["text"])

In [None]:
import re

def split_long_segments(captions, max_words=8, gap_threshold=1.0):
    """Break transcript segments into short caption chunks with estimated timings.

    Each segment's text is cut at the punctuation marks ``. , ! ?`` (the mark stays
    attached to the phrase before it), and any phrase longer than ``max_words`` words
    is cut again into runs of at most ``max_words`` words. Timings are estimated by
    giving every whitespace-separated word of a segment an equal share of that
    segment's duration.

    Args:
        captions: iterable of dicts with the keys 'text', 'start' and 'end'.
        max_words: maximum number of words allowed in one chunk.
        gap_threshold: when the silence between two consecutive chunks is strictly
            larger than this value, an empty-text chunk spanning the silence is
            inserted between them.

    Returns:
        A list of dicts with the keys 'start', 'end' and 'sentence', in playback order.
    """
    timed_chunks = []

    for caption in captions:
        total_words = len(caption['text'].split())
        seg_start = caption['start']
        seg_end = caption['end']
        span = seg_end - seg_start

        # Uniform per-word duration; a segment without words contributes no time.
        if total_words > 0:
            per_word = span / total_words
        else:
            per_word = 0

        # The capturing group makes re.split return text and punctuation alternately,
        # so pairing even and odd positions glues each mark back onto its phrase.
        pieces = re.split(r'([.,!?])', caption['text'])
        if len(pieces) > 1:
            phrases = [
                (body + mark).strip()
                for body, mark in zip(pieces[0::2], pieces[1::2])
            ]
            # Whatever follows the last punctuation mark (possibly an empty string).
            phrases.append(pieces[-1])
        else:
            phrases = [pieces[0]]

        cursor = seg_start
        for phrase in phrases:
            tokens = phrase.split()
            phrase_stop = cursor + per_word * len(tokens)

            # Walk the phrase in windows of max_words words.
            for offset in range(0, len(tokens), max_words):
                timed_chunks.append({
                    'start': cursor + offset * per_word,
                    # The last window may be short, so never run past the phrase end.
                    'end': min(cursor + (offset + max_words) * per_word, phrase_stop),
                    'sentence': ' '.join(tokens[offset:offset + max_words])
                })

            cursor = phrase_stop

    # Second pass: make long silences explicit as empty chunks.
    with_pauses = []
    previous = None
    for chunk in timed_chunks:
        if previous is not None and chunk['start'] - previous['end'] > gap_threshold:
            with_pauses.append({
                'start': previous['end'],
                'end': chunk['start'],
                'sentence': ''
            })
        with_pauses.append(chunk)
        previous = chunk

    return with_pauses


In [None]:
# Split every transcript segment into short caption chunks with estimated timings.
# Note: despite its name, captions_dict is the list returned by split_long_segments,
# whose items are dicts with the keys 'start', 'end' and 'sentence'.
captions_dict = split_long_segments(data["segments"])

# Transform into .SRT

In [None]:
# Collect only the caption text of every chunk, in order. Punctuation stripping is
# currently switched off, so each text is carried over unchanged.
cleaned_output = [chunk["sentence"] for chunk in captions_dict]

In [None]:
# Align every caption with the extracted word timestamps by walking both lists in word
# order: a caption of N words consumes the next N entries of word_timings, starting at
# its first word's 'start' and finishing at its last word's 'end'.
result = []
current_id = 0  # number of words already consumed by earlier captions
last_index = len(word_timings) - 1

for sentence in cleaned_output:
    num_words = len(sentence.split())

    # Empty pause placeholders carry no words and therefore produce no cue.
    if num_words == 0:
        continue

    if last_index < 0:
        raise ValueError("word_timings is empty; captions cannot be aligned")

    # Caption words come from re-splitting the text, so their count can run past the
    # number of timed words. Clamp both indices to the last timed word instead of
    # catching every exception and shifting the window by a fixed two words: that
    # fallback could still fail, or wrap around to negative indices and pick timings
    # from the wrong end of the list.
    start_index = min(current_id, last_index)
    end_index = min(current_id + num_words - 1, last_index)

    result.append({
        'sentence': sentence,
        'start': word_timings[start_index]['start'],
        'end': word_timings[end_index]['end']
    })

    current_id += num_words

In [None]:
def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded once to whole milliseconds and then split with integer
    arithmetic. Truncating the fractional part separately loses a millisecond to
    floating-point error (3.32 would come out as ``,319``) and cannot carry into the
    seconds field.

    Args:
        seconds: offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"00:00:03,320"``.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

In [None]:
def generate_srt(data):
    """Render caption records as the text of an SRT subtitle file.

    Args:
        data: iterable of dicts with the keys 'start', 'end' (both passed to
            format_time) and 'sentence' (the cue text).

    Returns:
        One string in which every cue is written as its 1-based number, a
        ``start --> end`` timing line, the text and an empty separator line.
    """
    pieces = []
    for number, cue in enumerate(data, start=1):
        begin = format_time(cue['start'])
        finish = format_time(cue['end'])
        text = cue['sentence']
        # Four lines per cue; the trailing empty string becomes the blank separator.
        pieces.extend((str(number), f"{begin} --> {finish}", text, ""))

    return "\n".join(pieces)

In [None]:
def save_srt(filename, srt_content):
    """Write SRT text to ``filename`` as UTF-8, creating missing parent folders.

    Args:
        filename: destination path of the .srt file; an existing file is overwritten.
        srt_content: the complete subtitle text to write.
    """
    import os  # local import so the function does not depend on another cell

    # A target such as "Segment_fn_srt/captions.srt" used to fail with
    # FileNotFoundError when the folder had not been created beforehand.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as srt_file:
        srt_file.write(srt_content)

In [None]:
# Render the aligned captions as SRT text.
srt_content = generate_srt(result)

# Write the SRT file; the relative path is resolved against the current working directory.
save_srt("Segment_fn_srt/captions_part10.srt", srt_content)

print("SRT file has been saved")  # written to Segment_fn_srt/captions_part10.srt

SRT file has been saved


In [None]:
!sudo apt-get install ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
import time

# Start timing the conversion process (wall-clock seconds)
start_time = time.time()

# Run the ffmpeg command with the updated subtitle style: the `subtitles` video filter
# draws the .srt onto the frames using the force_style overrides, and the video is
# re-encoded with libx264.
# NOTE(review): this command takes Gemini_async/srts/pa.srt as a second input and burns
# captions_srt_output.srt; neither is the Segment_fn_srt/captions_part10.srt file written
# earlier in this notebook, and the output is named part3.mp4 although the transcript
# loaded above is part10 — confirm which files are intended.
# NOTE(review): the second -i input is not referenced by the filter — presumably a leftover; verify.
!ffmpeg -i video.mp4 -i Gemini_async/srts/pa.srt -c:v libx264 -vf "subtitles=captions_srt_output.srt:force_style='Fontname=Tahoma,Fontsize=24,PrimaryColour=&H0018B2FF& ,SecondaryColour=&H00FFFFFF& ,BorderStyle=1,Outline=1.0,Shadow=0.0'" caption_videos/part3.mp4

# Calculate and print the elapsed time in minutes
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Conversion took {elapsed_time/60:.2f} mins.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[0m[1;36m[libx264 @ 0x5b023d3a0680] [0m[0;32mframe=25462 QP=23.31 NAL=0 Slice:B Poc:142 I:30   P:205  SKIP:685  size=825 bytes
[0m[1;32m[Parsed_subtitles_0 @ 0x5b023d32df40] [0m[0;32mCopying data in avfilter.
[0m[1;32m[Parsed_subtitles_0 @ 0x5b023d32df40] [0m[0;32mline break at 31[0m[0;32m
[0m[1;36m[libx264 @ 0x5b023d3a0680] [0m[0;32mframe=25463 QP=19.88 NAL=2 Slice:P Poc:152 I:28   P:421  SKIP:471  size=2553 bytes
[0m[0;36m[h264 @ 0x5b023d3a61c0] [0m[0;32mnal_unit_type: 1(Coded slice of a non-IDR picture), nal_ref_idc: 2
[0m[0;36m[h264 @ 0x5b023d3a61c0] [0m[0;32mslice:1 F mb:0 P fix frame:12 poc:312/312 ref:3/1 qp:26 loop:1:0:0 weight:0 
[0m[1;32m[Parsed_subtitles_0 @ 0x5b023d32df40] [0m[0;32mCopying data in avfilter.
[0m[1;32m[Parsed_subtitles_0 @ 0x5b023d32df40] [0m[0;32mline break at 31[0m[0;32m
[0m[0;36m[h264 @ 0x5b023d2f5740] [0m[0;32mnal_unit_type: 1(Coded slice of a non-IDR 