# 04-5: GPT-4 and Gemini for video and TTS API

In [None]:
!pip install opencv-python openai

In [None]:
from IPython.display import display, Image, Audio

import cv2        # We're using OpenCV to read video
import base64
import time
from openai import OpenAI
import os
import requests

import getpass

# Read the secret interactively so it is never written into the notebook.
openai_api_key = getpass.getpass(prompt="OPENAI API Key:")

# Client built from the key that was just entered.
client = OpenAI(
    api_key=openai_api_key,
)

## 1. Using OpenCV to extract frames


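First, we use OpenCV to extract frames from a video. The example referenced here is a nature [video](https://www.youtube.com/watch?v=kQ_7GtE529M) containing bison and wolves; note that the code cell below reads the local file `./chicago2.mp4`, so make sure that file exists or change the path: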


In [None]:
# Decode the clip and keep every frame as a base64-encoded JPEG string.
VIDEO_PATH = "./chicago2.mp4"

video = cv2.VideoCapture(VIDEO_PATH)
if not video.isOpened():
    # Without this guard a missing or unreadable file just prints "0 frames read."
    raise OSError(f"Could not open video file: {VIDEO_PATH}")

base64Frames = []
try:
    while True:
        success, frame = video.read()
        if not success:
            break  # no more frames could be read
        encoded_ok, buffer = cv2.imencode(".jpg", frame)
        if not encoded_ok:
            raise RuntimeError(
                f"Could not JPEG-encode frame {len(base64Frames)} of {VIDEO_PATH}"
            )
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
finally:
    # Always free the capture handle, even if reading or encoding raised.
    video.release()

print(len(base64Frames), "frames read.")

Display frames to make sure we've read them in correctly:


In [None]:
# Replay the decoded frames inside a single, repeatedly updated output area.
display_handle = display(None, display_id=True)
for encoded_frame in base64Frames:
    jpeg_bytes = base64.b64decode(encoded_frame.encode("utf-8"))
    display_handle.update(Image(data=jpeg_bytes))
    time.sleep(0.025)  # short pause between consecutive frames

## 2. Using GPT's visual capabilities to get a description of a video

In [None]:
from openai import OpenAI

import getpass

# Reuse the key typed into the setup cell when it is still in the kernel, so the
# reader is not asked for the same secret twice. Otherwise prompt with an explicit
# label (a bare getpass() only shows the generic "Password: " prompt).
openai_api_key = globals().get("openai_api_key") or getpass.getpass("OPENAI API Key:")

openai_client = OpenAI(api_key=openai_api_key)

In [None]:
# Text part of the request: what we want the model to write.
narration_request = {
    "type": "input_text",
    "text": "These are frames of a video. Create a short voiceover script in the style of David Attenborough. Only include the narration.",
}

# Image parts: only every 25th extracted frame is sent, each as a JPEG data URL.
frame_parts = [
    {"type": "input_image", "image_url": f"data:image/jpeg;base64,{frame}"}
    for frame in base64Frames[0::25]
]

# A single user message carrying the text part followed by all image parts.
result = openai_client.responses.create(
    model="gpt-4.1-mini",
    input=[{"role": "user", "content": [narration_request, *frame_parts]}],
)

print(result.output_text)

## 3. Generating a voiceover using TTS API


Let's create a voiceover for this video in the style of David Attenborough. We pass the script that GPT generated in the previous section to the TTS API, together with instructions describing the desired voice:


In [None]:
# Free-text guidance passed to the TTS model describing how the narration should sound.
instructions = """
Voice Affect: Calm, measured, and warmly engaging; convey awe and quiet reverence for the natural world.

Tone: Inquisitive and insightful, with a gentle sense of wonder and deep respect for the subject matter.

Pacing: Even and steady, with slight lifts in rhythm when introducing a new species or unexpected behavior; natural pauses to allow the viewer to absorb visuals.

Emotion: Subtly emotive—imbued with curiosity, empathy, and admiration without becoming sentimental or overly dramatic.

Emphasis: Highlight scientific and descriptive language (“delicate wings shimmer in the sunlight,” “a symphony of unseen life,” “ancient rituals played out beneath the canopy”) to enrich imagery and understanding.

Pronunciation: Clear and articulate, with precise enunciation and slightly rounded vowels to ensure accessibility and authority.

Pauses: Insert thoughtful pauses before introducing key facts or transitions (“And then... with a sudden rustle...”), allowing space for anticipation and reflection.
"""

# Synthesize the script produced by the previous request (`result.output_text`).
# Only `audio_response` is bound here: the original chained assignment also bound
# `response`, a name this notebook later uses for the Gemini result, so a skipped
# Gemini cell would have silently left this unrelated object under that name.
audio_response = openai_client.audio.speech.create(
    model="gpt-4o-mini-tts",
    voice="echo",
    instructions=instructions,
    input=result.output_text,
    response_format="wav",
)

# Raw WAV bytes; the trailing expression renders an inline audio player.
audio_bytes = audio_response.content
Audio(data=audio_bytes)

## 4. Gemini 2.5 Flash for video summary

In [None]:
from google import genai

import getpass

# Label the prompt explicitly: a bare getpass() only shows "Password: ", which does
# not tell the reader which of the notebook's keys is being requested.
google_api_key = getpass.getpass("Google API Key:")

# NOTE: this rebinds the notebook-level name `client` (previously the OpenAI client);
# `upload_video` below reads this global.
client = genai.Client(api_key=google_api_key)

# NOTE(review): this is a dated preview model ID — confirm it is still served, or
# switch to a stable ID.
MODEL_ID = "gemini-2.5-flash-preview-05-20"
# Spanish prompt asking for a three-sentence summary of the video, in Spanish.
PROMPT = "Por favor, haz un resumen de este video en 3 frases en español."

import time

def upload_video(video_file_name, *, genai_client=None, poll_interval=10, timeout=None):
    """Upload a video file and block until the service has finished processing it.

    Args:
        video_file_name: Path of the local video file to upload.
        genai_client: Object exposing ``files.upload(file=...)`` and
            ``files.get(name=...)``. Defaults to the notebook-level ``client``.
        poll_interval: Seconds to sleep between status checks (default 10).
        timeout: Maximum number of seconds to keep waiting while the file is in
            the ``PROCESSING`` state. ``None`` (default) waits indefinitely.

    Returns:
        The file object from the last status check (no longer ``PROCESSING``).

    Raises:
        ValueError: If the file ends up in the ``FAILED`` state.
        TimeoutError: If ``timeout`` elapses while the file is still processing.
    """
    # Resolve the client at call time so the default follows the notebook global.
    files_api = (client if genai_client is None else genai_client).files
    video_file = files_api.upload(file=video_file_name)

    deadline = None if timeout is None else time.monotonic() + timeout
    while video_file.state == "PROCESSING":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Video {video_file_name!r} still processing after {timeout} seconds"
            )
        print('Waiting for video to be processed.')
        time.sleep(poll_interval)
        # Re-fetch the file to observe its current state.
        video_file = files_api.get(name=video_file.name)

    if video_file.state == "FAILED":
        raise ValueError(
            f"Video processing failed for {video_file_name!r}: {video_file.state}"
        )
    # Interpolate instead of concatenating so a missing uri cannot raise TypeError.
    print(f"Video processing complete: {video_file.uri}")

    return video_file

# Upload the clip and wait for processing to finish.
chicago_video = upload_video("./chicago2.mp4")

# Send the uploaded file followed by the text prompt in a single request.
summary_inputs = [chicago_video, PROMPT]
response = client.models.generate_content(model=MODEL_ID, contents=summary_inputs)

print(response.text)

## 5. TTS with Chirp3-HD

In [None]:
#!pip install --upgrade --quiet google-cloud-texttospeech
from google.cloud import texttospeech_v1beta1 as texttospeech
from google.api_core.client_options import ClientOptions


# Text to synthesize. NOTE(review): `response` is presumably the result of the
# Gemini `generate_content` call made earlier in the notebook — confirm that cell ran.
prompt = response.text

# Short voice name; the `# @param` annotation lists the selectable values.
voice = "Aoede"  # @param ["Aoede", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Zephyr"]

# Language/locale code; the `# @param` annotation lists the selectable values.
language_code = "es-ES"  # @param [ "de-DE", "en-AU", "en-GB", "en-IN", "en-US", "fr-FR", "hi-IN", "pt-BR", "ar-XA", "es-ES", "fr-CA", "id-ID", "it-IT", "ja-JP", "tr-TR", "vi-VN", "bn-IN", "gu-IN", "kn-IN", "ml-IN", "mr-IN", "ta-IN", "te-IN", "nl-NL", "ko-KR", "cmn-CN", "pl-PL", "ru-RU", "th-TH"]

# Full voice identifier has the form "<language_code>-Chirp3-HD-<voice>".
voice_name = f"{language_code}-Chirp3-HD-{voice}"
# NOTE: `voice` is rebound here from the short name (str) to the selection-params
# object; the synthesis cell passes this object as `voice=voice`.
voice = texttospeech.VoiceSelectionParams(
    name=voice_name,
    language_code=language_code,
)

In [None]:
TTS_LOCATION = "global"
# Any location other than "global" uses a "<location>-" prefixed host name.
API_ENDPOINT = (
    f"{TTS_LOCATION}-texttospeech.googleapis.com"
    if TTS_LOCATION != "global"
    else "texttospeech.googleapis.com"
)

# Use a dedicated name instead of rebinding `client`: `upload_video` reads the
# notebook-level `client` and would break once it pointed at a Text-to-Speech client.
# NOTE(review): no credentials are passed here; presumably the library picks up
# application default credentials — confirm for your environment.
tts_client = texttospeech.TextToSpeechClient(
    client_options=ClientOptions(api_endpoint=API_ENDPOINT)
)

# `response` is read by the next cell (`response.audio_content`).
response = tts_client.synthesize_speech(
    input=texttospeech.SynthesisInput(text=prompt),
    voice=voice,
    # Select the type of audio file you want returned
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    ),
)

In [None]:
from IPython.display import Audio, display

# Wrap the synthesized bytes in an audio player and render it in the cell output.
tts_audio = Audio(response.audio_content)
display(tts_audio)

## 6. Veo3

* This example generates video with audio.
* Veo 3 costs $0.75 per second of video and is not included in the Free Tier.
* All Veo 3 videos carry a SynthID watermark.

In [None]:
import time
from google import genai
from google.genai import types

# Reuse the Google key entered earlier in the notebook when it is available.
# With api_key=None the SDK falls back to its environment-variable configuration,
# which is what the original bare `genai.Client()` relied on.
client = genai.Client(api_key=globals().get("google_api_key"))

operation = client.models.generate_videos(
    model="veo-3.0-generate-preview",
    prompt="Panning wide shot of a purring kitten sleeping in the sunshine",
    config=types.GenerateVideosConfig(
        #person_generation="allow_all",  # "allow_adult" and "dont_allow" for Veo 2 only
        aspect_ratio="16:9",  # "16:9", and "9:16" for Veo 2 only
    ),
)

# Generation runs as a long-running operation: poll every 20 seconds until done.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

# Surface a failed operation explicitly instead of failing on a missing response.
if getattr(operation, "error", None):
    raise RuntimeError(f"Video generation failed: {operation.error}")

generated_videos = getattr(operation.response, "generated_videos", None) or []
if not generated_videos:
    print("The operation finished but returned no videos.")

# Download each result and save it as video0.mp4, video1.mp4, ...
for n, generated_video in enumerate(generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"video{n}.mp4")
