In [11]:
# Install Whisper and MoviePy
!pip install git+https://github.com/openai/whisper.git
!pip install moviepy
!apt-get install -y ffmpeg  # Install ffmpeg for audio/video processing
!pip install moviepy==2.0.0.dev2
!pip install imageio==2.25.1
!pip install pysrt==1.1.2
!pip install pysrt
!apt-get install -y imagemagick


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-smnv1u7v
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-smnv1u7v
  Resolved https://github.com/openai/whisper.git to commit 5979f03701209bb035a0a466f14131aeb1116cbb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 50 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
imagemagick is already the newest version (8:6.9.11.60+dfsg-1.3ubuntu0.22.04.5).
0 upgraded, 0 newly installed, 0 to remove and 50 not upgraded.


In [17]:
import os
import pysrt
from google.colab import drive
from moviepy.editor import VideoFileClip, CompositeVideoClip, ImageClip
import whisper
from PIL import Image, ImageDraw, ImageFont
import numpy as np

# Mount Google Drive to access your files
drive.mount('/content/drive')

# Replace with the path to your video file in Google Drive
video_path = "/content/drive/MyDrive/Video for testing/videoplayback2.mp4"
print(f"Video path: {video_path}")

# Step 1: Extract audio from the video file
video = VideoFileClip(video_path)
if video.audio is None:
    # A clip without an audio track has audio == None; fail here with a clear
    # message instead of an AttributeError on write_audiofile below.
    raise ValueError(f"No audio track found in {video_path}; nothing to transcribe.")
audio_filename = "extracted_audio.wav"
video.audio.write_audiofile(audio_filename)
print(f"Audio extracted to: {audio_filename}")

# Step 2: Load the Whisper model for transcription
print("Loading Whisper model...")
model = whisper.load_model("small")

# Step 3: Transcribe the audio
# verbose=True makes Whisper print each segment as it is decoded.
print("Transcribing audio...")
result = model.transcribe(audio_filename, verbose=True, word_timestamps=True)

# Extract the transcription text and timestamps
transcription_text = result["text"]  # full transcript; not used further in this cell
segments = result["segments"]        # per-segment dicts with 'start', 'end', 'text'

# Print transcription for reference (start/end in seconds)
print("\nTranscription:\n")
for segment in segments:
    print(f"[{segment['start']:.3f} --> {segment['end']:.3f}] {segment['text']}")

def time_to_seconds(time_obj):
    """Convert a time object to a number of seconds.

    Args:
        time_obj: Any object exposing ``hours``, ``minutes``, ``seconds`` and
            ``milliseconds`` attributes.

    Returns:
        The total time in seconds, with milliseconds as the fractional part.
    """
    whole_seconds = time_obj.hours * 3600 + time_obj.minutes * 60 + time_obj.seconds
    fraction = time_obj.milliseconds / 1000
    return whole_seconds + fraction

def save_as_srt(segments, filename='transcription.srt'):
    """Write transcription segments to an SRT subtitle file.

    Args:
        segments: Iterable of mappings with ``'start'`` and ``'end'`` (times in
            seconds, may be fractional) and ``'text'`` keys.
        filename: Path of the SRT file to write.
    """
    def to_srt_time(seconds):
        # Convert to whole milliseconds once, then split into components.
        # Passing a fractional seconds value together with a separate
        # milliseconds value counts the fraction twice and shifts every cue.
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return pysrt.SubRipTime(hours, minutes, secs, millis)

    subs = pysrt.SubRipFile()
    for index, segment in enumerate(segments, start=1):
        subs.append(pysrt.SubRipItem(
            index=index,
            start=to_srt_time(segment['start']),
            end=to_srt_time(segment['end']),
            # Drop surrounding whitespace (segment text is printed with a leading space)
            text=segment['text'].strip(),
        ))

    subs.save(filename)
    print(f"Subtitles saved to {filename}")

# Persist the Whisper segments as an SRT file (read back below via pysrt.open)
save_as_srt(segments, filename='transcription.srt')


# Function to create an image with text
def create_text_image(text, fontsize=24, color='yellow', height_margin=20,
                      font_path='IBMPlexSans-Bold.ttf'):
    """Render ``text`` on a transparent RGBA canvas and return it as an array.

    Args:
        text: The string to draw.
        fontsize: Font size handed to ``ImageFont.truetype``.
        color: Fill colour of the text.
        height_margin: Extra canvas height in pixels, split above and below
            the text.
        font_path: TrueType font file to load.

    Returns:
        A numpy array of the rendered image. The canvas is as wide as the
        text's bounding box (at least 1 pixel) and ``height_margin`` pixels
        taller than it.
    """
    # Load the font; truetype raises if the file cannot be opened
    font = ImageFont.truetype(font_path, fontsize)

    # Throwaway 1x1 canvas, only needed to measure the text
    measure = ImageDraw.Draw(Image.new('RGBA', (1, 1), (0, 0, 0, 0)))

    # Bounding box of the text when drawn at (0, 0); left/top are usually
    # non-zero because of the font's side bearing and ascender gap.
    left, top, right, bottom = measure.textbbox((0, 0), text, font=font)
    text_width = max(right - left, 1)  # keep the canvas non-degenerate for empty text
    text_height = bottom - top

    # Create the image with the correct size, adding extra height margin
    img = Image.new('RGBA', (text_width, text_height + height_margin), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)

    # Offset by the bounding-box origin so the glyphs land exactly inside the
    # canvas, vertically centred, instead of being shifted down/right and
    # potentially clipped.
    draw.text((-left, height_margin // 2 - top), text, fill=color, font=font)

    # Convert to numpy array
    return np.array(img)

# Create subtitle clips
def create_subtitle_clips(subtitles, videosize, fontsize=24, color='red', margin=10,
                          words_per_chunk=3):
    """Build timed text overlays, a few words at a time, for each subtitle.

    Each subtitle's text is split into chunks of up to ``words_per_chunk``
    words. The subtitle's time span is divided evenly between its chunks, and
    every chunk becomes an ``ImageClip`` centred horizontally near the bottom
    of the frame.

    Args:
        subtitles: Iterable of items with ``text``, ``start`` and ``end``
            attributes (``start``/``end`` are converted with ``time_to_seconds``).
        videosize: ``(width, height)`` of the video; only the height is used.
        fontsize: Font size passed to ``create_text_image``.
        color: Text colour passed to ``create_text_image``.
        margin: Gap in pixels between the text image and the bottom edge.
        words_per_chunk: Maximum number of words shown at once.

    Returns:
        A list of positioned, timed ``ImageClip`` objects (empty if there is
        nothing to show).

    Raises:
        ValueError: If ``words_per_chunk`` is smaller than 1.
    """
    if words_per_chunk < 1:
        raise ValueError("words_per_chunk must be at least 1")

    subtitle_clips = []

    for subtitle in subtitles:
        words = subtitle.text.split()
        if not words:
            # Empty or whitespace-only cue: nothing to draw, and dividing the
            # duration by zero chunks would raise ZeroDivisionError.
            continue

        # Split subtitle text into chunks of up to `words_per_chunk` words
        chunks = [' '.join(words[i:i + words_per_chunk])
                  for i in range(0, len(words), words_per_chunk)]

        start_time = time_to_seconds(subtitle.start)
        end_time = time_to_seconds(subtitle.end)

        # Every chunk gets an equal share of the subtitle's duration
        chunk_duration = (end_time - start_time) / len(chunks)

        for i, chunk in enumerate(chunks):
            # Render the chunk as an RGBA image array
            img_array = create_text_image(chunk, fontsize=fontsize, color=color)

            # Calculate the start time for the current chunk
            chunk_start_time = start_time + (i * chunk_duration)

            # Convert image to video clip
            text_clip = ImageClip(img_array).set_start(chunk_start_time).set_duration(chunk_duration)

            # Centered horizontally; top edge placed so the image ends
            # `margin` pixels above the bottom of the frame.
            text_clip = text_clip.set_position(('center', videosize[1] - img_array.shape[0] - margin))
            subtitle_clips.append(text_clip)

    return subtitle_clips



# Read back the cues from the SRT file written earlier
subtitles = pysrt.open("transcription.srt")

# Turn every cue into timed text overlays sized against the source video
subtitle_clips = create_subtitle_clips(subtitles, video.size)

# Layer the overlays on top of the original footage
final_video = CompositeVideoClip([video, *subtitle_clips])

# Encode the composited result as H.264
final_video.write_videofile("final_video_with_subtitles.mp4", codec='libx264')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Video path: /content/drive/MyDrive/Video for testing/videoplayback2.mp4
MoviePy - Writing audio in extracted_audio.wav




MoviePy - Done.
Audio extracted to: extracted_audio.wav
Loading Whisper model...


  checkpoint = torch.load(fp, map_location=device)


Transcribing audio...
Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:03.420]  Ash Seraphine, which would be such a problematic duo to deal with.
[00:03.740 --> 00:10.100]  And I also feel like if you're looking at FlyQuest, you're looking at a team who has not had strong early games so far,
[00:10.300 --> 00:15.980]  so you're probably not as worried about allowing that jinx the time to scale up and really become a powerhouse as we're still waiting.
[00:16.180 --> 00:16.800]  Maybe Seraphine mid.

Transcription:

[0.000 --> 3.420]  Ash Seraphine, which would be such a problematic duo to deal with.
[3.740 --> 10.100]  And I also feel like if you're looking at FlyQuest, you're looking at a team who has not had strong early games so far,
[10.300 --> 15.980]  so you're probably not as worried about allowing that jinx the time to scale up and really become a powerhouse as we're still waiting.
[16.180



MoviePy - Done.
Moviepy - Writing video final_video_with_subtitles.mp4





Moviepy - Done !
Moviepy - video ready final_video_with_subtitles.mp4
