In [None]:
%pip install google-genai
%pip install os
%pip install dotenv
%pip install pydantic

In [None]:
from google import genai
from dotenv import load_dotenv
from google.genai import types
from pydantic import BaseModel, Field

In [None]:
# One time span of the video cited in support of an answer.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a model's docstring into its JSON schema, and that schema is
# what gets sent to the model as the response schema.
class Segment(BaseModel):
    # Where the span begins, as text produced by the model.
    start: str = Field(description="Start of a segment.")
    # Where the span ends, as text produced by the model.
    end: str = Field(description="End of a segment.")
    # Optional justification; defaults to None when the model omits it.
    reason: str | None = Field(
        default=None,
        description="Reason why this segment is important.",
    )

# Structured reply expected from the model: free-text answer plus the video
# segments that back it up. Both fields are required.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a model's docstring into its JSON schema, and that schema is
# what gets sent to the model as the response schema.
class GroundedVideoAnswer(BaseModel):
    # The model's textual answer to the question.
    answer: str = Field(description="answer")
    # Supporting segments; may be an empty list but must be present.
    segments: list[Segment] = Field(description="a list of video segments")

In [None]:
def create_vertexai_client():
    """Build a google-genai client in Vertex AI mode from an API key.

    The key is read from the ``GOOGLE_CLOUD_API_KEY`` environment variable.

    Returns:
        A ``genai.Client`` created with ``vertexai=True`` and the key.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    import os

    api_key = os.environ.get("GOOGLE_CLOUD_API_KEY")
    if api_key:
        return genai.Client(vertexai=True, api_key=api_key)

    # Fail before touching the SDK so the cause is obvious to the reader.
    raise ValueError("GOOGLE_CLOUD_API_KEY not found in .env file")

In [None]:
# Load variables from a local .env file (if there is one) into the process
# environment, so GOOGLE_CLOUD_API_KEY is visible to create_vertexai_client().
load_dotenv()

# Create the notebook-wide client from that API key; ask_my_youtube() below
# uses this global `client` for its requests.
client = create_vertexai_client()

In [None]:
def _strip_markdown_fence(raw_text: str) -> str:
    """Return ``raw_text`` without a surrounding Markdown code fence.

    Accepts both a tagged opener (```json) and a bare opener (```); text
    without a fence only has its surrounding whitespace removed.
    """
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop the optional language tag that follows the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def ask_my_youtube(
    file_uri: str,
    question: str,
    video_metadata: types.VideoMetadata | None = None,
) -> GroundedVideoAnswer:
    """Ask a question about a video and return a structured, grounded answer.

    Sends the video reference and the question to the module-level ``client``
    and asks for a JSON reply matching the ``GroundedVideoAnswer`` schema.

    Args:
        file_uri: URI of the video, passed to the model as ``video/mp4`` file data.
        question: The question to ask about the video.
        video_metadata: Optional metadata (for example clip offsets). Its set
            fields are kept, except ``fps``, which is always forced to 2.

    Returns:
        The model's reply parsed into a ``GroundedVideoAnswer``.

    Raises:
        ValueError: If the response carries no text to parse.
        pydantic.ValidationError: If the text is not valid JSON for the schema.
    """
    # Copy only the fields the caller actually set, via the public pydantic API
    # rather than the instance __dict__.
    caller_fields = (
        video_metadata.model_dump(exclude_none=True)
        if video_metadata is not None
        else {}
    )
    # custom frame rate and clipping interval; "fps" deliberately comes last so
    # it overrides any caller-supplied value.
    sampled_metadata = types.VideoMetadata(**{**caller_fields, "fps": 2})

    response = client.models.generate_content(
        model='gemini-3-flash-preview',
        contents=types.Content(
            role="user",
            parts=[
                types.Part(
                    file_data=types.FileData(file_uri=file_uri, mime_type="video/mp4"),
                    video_metadata=sampled_metadata,
                ),
                types.Part(text=question),
            ],
        ),
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            response_json_schema=GroundedVideoAnswer.model_json_schema(),
            media_resolution=types.MediaResolution.MEDIA_RESOLUTION_LOW,
        ),
    )

    # `response.text` is None when the response has no text parts; fail with a
    # clear message instead of an AttributeError from `.strip()`.
    raw_text = response.text
    if not raw_text:
        raise ValueError(
            "The model response contains no text, so no GroundedVideoAnswer can be parsed."
        )

    return GroundedVideoAnswer.model_validate_json(_strip_markdown_fence(raw_text))

def print_result(file_uri: str, question: str, video_metadata: types.VideoMetadata | None = None):
    """Query the video via ``ask_my_youtube`` and print the outcome.

    Prints the question, the answer, and then one line per returned segment
    showing its start, end and reason.

    Args:
        file_uri: URI of the video to ask about.
        question: The question to ask.
        video_metadata: Optional metadata forwarded unchanged to ``ask_my_youtube``.
    """
    grounded = ask_my_youtube(
        file_uri=file_uri,
        question=question,
        video_metadata=video_metadata,
    )

    print("Question: ", question)
    print("Answer: ", grounded.answer)
    print ("Similarities:")

    segment_lines = [
        f"[{segment.start} - {segment.end}], reason: {segment.reason}"
        for segment in grounded.segments
    ]
    for line in segment_lines:
        print("- ", line)

In [None]:
# YouTube share link of the first video; the questions in the following cells
# are asked about this video.
file_uri = "https://youtu.be/v6B44n1V9no?si=lupM2r3kDGxFzzHr"

In [None]:
# Ask for a three-sentence summary of the video currently bound to file_uri.
print_result(file_uri=file_uri, question="Summarize the video in three sentences")

In [None]:
# Ask about the multi-image generation demo shown in the same video.
print_result(file_uri=file_uri, question="Explain her demo that generates multiple images using Nano Banana Pro")

In [None]:
# Ask how the prompt was built step by step, still against the same video.
print_result(file_uri=file_uri, question="How did she build the prompt at each step?")

In [None]:
# A long YouTube video (26:53) about Gemini 2.5 Flash TTS mode.
# NOTE: this rebinds file_uri, so cells run after this one query this video
# instead of the first one.
file_uri = "https://youtu.be/R9LZrysSil0?si=pSXGE1fKCPVKLdWc"

In [None]:
# Clip window from 22:49 to 26:40, expressed as whole seconds followed by "s"
# and handed to the request through VideoMetadata.
start_offset = "{}s".format(22 * 60 + 49)
end_offset = "{}s".format(26 * 60 + 40)

video_metadata = types.VideoMetadata(
    start_offset=start_offset,
    end_offset=end_offset,
)

# Ask about the clipped portion only.
print_result(
    file_uri=file_uri,
    question="What was Connie saying in this video clip?",
    video_metadata=video_metadata,
)