In [1]:
import cv2
import os
import base64
import openai
import time
from pathlib import Path
import whisper
from moviepy.editor import *

from openai import OpenAI

# Read the key from the environment instead of embedding it in the notebook.
# A blank literal key makes every request fail authentication, and a real one
# would be leaked whenever the notebook is shared. When the variable is unset
# the value passed here is None.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def extract_frames(video_path, output_folder, frame_interval=60):
    """Save every ``frame_interval``-th frame of a video as a JPEG file.

    Args:
        video_path: Path of the video handed to ``cv2.VideoCapture``.
        output_folder: Directory that receives the ``frame_<index>.jpg``
            files; it is created when missing.
        frame_interval: Keep one frame out of every ``frame_interval`` frames,
            starting with frame 0. Must be positive.

    Returns:
        The list of written file paths in frame order, or an empty list when
        the video cannot be opened.

    Raises:
        ValueError: If ``frame_interval`` is zero or negative.
    """
    # Validate before touching the filesystem; a zero interval would otherwise
    # surface as a ZeroDivisionError in the middle of the read loop.
    if frame_interval <= 0:
        raise ValueError("frame_interval must be a positive integer")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return []

        frame_count = 0
        extracted_frame_paths = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                frame_filename = f"{output_folder}/frame_{frame_count}.jpg"
                # Only report frames whose write succeeded, so callers never
                # try to read a file that was not created.
                if cv2.imwrite(frame_filename, frame):
                    extracted_frame_paths.append(frame_filename)
                else:
                    print(f"Warning: Could not write frame to {frame_filename}.")

            frame_count += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()

    print(f"Frame extraction complete. {len(extracted_frame_paths)} frames extracted.")

    return extracted_frame_paths

def get_video_duration(video_path):
    """Return the video's length in whole seconds.

    The length is the frame count divided by the frame rate, rounded to the
    nearest integer. Returns 0 when the video cannot be opened or when the
    reported frame rate is not positive.
    """
    capture = cv2.VideoCapture(video_path)

    if capture.isOpened():
        frames_per_second = capture.get(cv2.CAP_PROP_FPS)
        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

        # Guard the division: a non-positive frame rate yields a duration of 0.
        if frames_per_second > 0:
            seconds = round(total_frames / frames_per_second)
        else:
            seconds = 0

        capture.release()
        return seconds

    print("Error: Could not open video.")
    return 0

def image_to_base64(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def get_frame_descriptions(frame_paths, final_prompt, max_retries=3, retry_delay=2):
    """Ask the chat model to describe a sequence of frames.

    Args:
        frame_paths: Paths of the image files to send, in order.
        final_prompt: Instruction text appended after the images.
        max_retries: Maximum number of request attempts.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The text of the first choice of the completion, or ``None`` when no
        frames were supplied or every attempt ended in a server error.
    """
    # Without frames there is nothing to describe; skip the request rather
    # than letting the model answer from the prompt alone.
    if not frame_paths:
        print("No frames supplied; skipping description request.")
        return None

    base64_frames = [image_to_base64(path) for path in frame_paths]

    # NOTE(review): the {"image": ..., "resize": ...} entries and the bare
    # prompt string look like a legacy shorthand for image input; the payload
    # is kept unchanged here - confirm against the current API reference.
    prompt_messages = [
        {
            "role": "user",
            "content": [
                *({"image": encoded, "resize": 768} for encoded in base64_frames),
                final_prompt,
            ],
        }
    ]

    params = {
        "model": "gpt-4o-mini",
        "messages": prompt_messages,
        "max_tokens": 1000,
        "temperature": 0,
    }

    for attempt in range(max_retries):
        try:
            result = client.chat.completions.create(**params)
            return result.choices[0].message.content
        except openai.InternalServerError as e:
            # Only wait when another attempt will actually follow; sleeping
            # after the final failure just delays the error report.
            if attempt == max_retries - 1:
                print(f"Server error on attempt {attempt + 1}: {e}.")
                break
            print(f"Server error on attempt {attempt + 1}: {e}. Retrying after {retry_delay} seconds...")
            time.sleep(retry_delay)

    print("Failed to obtain descriptions after several retries.")
    return None


# this is only in version2
# Function to extract audio from video
def extract_audio(video_path, audio_output_path):
    """Write the audio track of a video to ``audio_output_path``.

    Args:
        video_path: Path of the source video.
        audio_output_path: Destination audio file passed to
            ``write_audiofile``.

    Raises:
        ValueError: If the opened clip has no audio track.
    """
    video_clip = VideoFileClip(video_path)
    try:
        # A clip without sound exposes ``audio`` as None; fail with a clear
        # message instead of an AttributeError on None.
        if video_clip.audio is None:
            raise ValueError(f"No audio track found in {video_path!r}.")
        video_clip.audio.write_audiofile(audio_output_path)
    finally:
        # Always close the clip so it does not stay open after the export.
        video_clip.close()


# this is only in version2 
# Function to transcribe audio using Whisper (OpenAI's speech-recognition model, loaded and run locally through the `whisper` package rather than called as a hosted API)
def transcribe_audio_with_whisper(audio_file, model_name="base"):
    """Transcribe an audio file with a Whisper model.

    Args:
        audio_file: Path of the audio file passed to ``model.transcribe``.
        model_name: Name handed to ``whisper.load_model``; defaults to
            ``"base"``, the value that was previously hard-coded.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_file)
    return result["text"]


# Main execution: measure the video, build the prompts, then extract frames,
# extract and transcribe the audio, and request a description of the frames.
# The names assigned here (video_path, prompt, extracted_frames, descriptions,
# combined_text) are read again by code further down.
# video_path = "video.mp4"
video_path = "v2.mp4"
output_folder = "extracted_frames"

# Calculate video duration and adjust prompt
video_duration = get_video_duration(video_path)
print(f"Video duration: {video_duration} seconds.")

# Word budget for the voiceover: duration in seconds times a words-per-second
# factor. The product is a float (the recorded run printed 92.4).
# NOTE(review): the limit is therefore stated to the model as a fractional
# number of words - confirm whether it should be rounded.
word_count = video_duration * 2.2 # factor to tune; values tried: 2, 2.2 and 2.5
print(f"Word Count: {word_count}")

# Length constraint appended to both the frame prompt below and the system
# message built in get_rewritten_description.
prompt = f"(This video is ONLY {video_duration} seconds long, so make sure the voiceover MUST be less than {word_count} words)"

# final_prompt = "ACT as an commentator. In a conversational style, explain step-by-step what is happening in match the frames suitable for a voiceover." + prompt
# The prompt could also say "act as an engaging commentator".
# NOTE(review): `prompt` is concatenated with no separating space, so the
# text sent reads "...voiceover.(This video is ONLY ...".
final_prompt = "ACT as an commentator. In a conversational style, explain step-by-step what is happening in valorant game the frames suitable for a voiceover." + prompt

# Extract frames (default interval: one frame out of every 60)
extracted_frames = extract_frames(video_path, output_folder)
print("extracted_frames",extracted_frames)

# Extract audio from the video
audio_output_path = 'extracted_audio.mp3'
extract_audio(video_path, audio_output_path)


# Transcribe audio
audio_transcription = transcribe_audio_with_whisper(audio_output_path)


# Get the model's description of the extracted frames.
# get_frame_descriptions returns None when every attempt fails, so
# `descriptions` may be None from here on.
descriptions = get_frame_descriptions(extracted_frames, final_prompt)

print(descriptions)



# Combine audio transcription with visual descriptions
combined_text = f"Video Description:\n{descriptions}\n\nAudio Transcription:\n{audio_transcription}"

# Function to get rewritten description from Chat Completions API
def get_rewritten_description(text):
    """Rewrite the combined description and transcription as a spoken report.

    Sends ``text`` as the user message, preceded by a system instruction that
    ends with the module-level ``prompt`` string, and returns the content of
    the first choice in the completion.
    """
    system_instruction = (
        "Use the Video Description to describe what happening in the video and also use the Audio Transcription to complement the analysis into a spoken report"
        + prompt
    )
    chat_messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": text},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
    )
    return completion.choices[0].message.content

# Rewrite the combined description and transcription, then show the result
# under a heading line.
revised_description = get_rewritten_description(combined_text)
print("Revised Description from Chat Completions API:", revised_description, sep="\n")


Video duration: 42 seconds.
Word Count: 92.4
Frame extraction complete. 22 frames extracted.
extracted_frames ['extracted_frames/frame_0.jpg', 'extracted_frames/frame_60.jpg', 'extracted_frames/frame_120.jpg', 'extracted_frames/frame_180.jpg', 'extracted_frames/frame_240.jpg', 'extracted_frames/frame_300.jpg', 'extracted_frames/frame_360.jpg', 'extracted_frames/frame_420.jpg', 'extracted_frames/frame_480.jpg', 'extracted_frames/frame_540.jpg', 'extracted_frames/frame_600.jpg', 'extracted_frames/frame_660.jpg', 'extracted_frames/frame_720.jpg', 'extracted_frames/frame_780.jpg', 'extracted_frames/frame_840.jpg', 'extracted_frames/frame_900.jpg', 'extracted_frames/frame_960.jpg', 'extracted_frames/frame_1020.jpg', 'extracted_frames/frame_1080.jpg', 'extracted_frames/frame_1140.jpg', 'extracted_frames/frame_1200.jpg', 'extracted_frames/frame_1260.jpg']
MoviePy - Writing audio in extracted_audio.mp3


                                                                                                                                                                                              

MoviePy - Done.


100%|███████████████████████████████████████| 139M/139M [00:11<00:00, 12.5MiB/s]


In this intense Valorant match, we see a player strategically peeking around corners, keeping an eye on the enemy's movements. With only 50 seconds left, they spot an opponent and take a shot, landing a headshot! The tension rises as they navigate through tight spaces, using cover effectively. 

With the score at 4-5, every move counts. The player quickly assesses their surroundings, preparing for a potential ambush. As the clock ticks down, they make a decisive play, securing a clutch moment just in time! What a thrilling finish!
Revised Description from Chat Completions API:
In this thrilling 42-second Valorant match, a player exemplifies strategic gameplay as they cautiously peek around corners, closely monitoring enemy movements. With only 30 seconds left on the clock, they spot an opponent, make a precise shot, and secure a headshot. The score is tight at 4-5, intensifying the stakes. 

As the player maneuvers through the map, they utilize cover effectively and prepare for a possi

In [2]:
def create_voiceover(text, output_audio_path, model="tts-1", voice="echo"):
    """Synthesize ``text`` to speech and save it to ``output_audio_path``.

    Args:
        text: The narration to speak; must be non-empty.
        output_audio_path: Destination path of the audio file.
        model: Text-to-speech model name.
        voice: Voice name passed to the speech endpoint.

    Raises:
        ValueError: If ``text`` is empty or ``None``.
    """
    # Fail before any request is made; an upstream step can hand over None
    # when no description was produced.
    if not text:
        raise ValueError("text must be a non-empty string to synthesize a voiceover.")

    # Use the streaming-response variant: calling stream_to_file on a regular
    # (non-streaming) response is deprecated in the OpenAI client.
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
    ) as response:
        response.stream_to_file(Path(output_audio_path))


In [3]:
from moviepy.editor import VideoFileClip, AudioFileClip

# Function to merge audio with video
def merge_audio_video(video_path, audio_path, output_video_path):
    """Replace a video's audio track and write the result to a new file.

    Args:
        video_path: Path of the source video.
        audio_path: Path of the audio file to attach.
        output_video_path: Destination file, encoded with the ``libx264``
            video codec and the ``aac`` audio codec.
    """
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = AudioFileClip(audio_path)
        try:
            final_clip = video_clip.set_audio(audio_clip)
            final_clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        finally:
            # Close the audio clip even when the export fails.
            audio_clip.close()
    finally:
        # Close the source video last; the clip being written is derived from it.
        video_clip.close()


In [4]:
# Build the narrated video only when both the frames and their description
# are available.
if not extracted_frames:
    print("No frames were extracted.")
elif not descriptions:
    # get_frame_descriptions returns None when every attempt fails; there is
    # nothing to turn into speech in that case.
    print("No description was generated; skipping voiceover.")
else:
    # Create and save voiceover
    # NOTE(review): this narrates `descriptions`, not `revised_description`
    # computed earlier - confirm which text is intended.
    output_audio_path = 'voiceover.mp3'
    create_voiceover(descriptions, output_audio_path)

    # Merge audio with video and save as new file
    output_video_path = 'openai2.mp4'
    merge_audio_video(video_path, output_audio_path, output_video_path)


Moviepy - Building video openai2.mp4.
MoviePy - Writing audio in openai2TEMP_MPY_wvf_snd.mp4


                                                                                                                                                                                              

MoviePy - Done.
Moviepy - Writing video openai2.mp4



                                                                                                                                                                                              

Moviepy - Done !
Moviepy - video ready openai2.mp4
