In [None]:
%pip install google-genai
%pip install os
%pip install dotenv
%pip install pydantic

In [None]:
from google import genai
from dotenv import load_dotenv
from google.genai import types
from pydantic import BaseModel, Field

In [None]:
class Segment(BaseModel):
    # One slice of the video referenced by an answer.
    # Documented with `#` comments (not a docstring) so the JSON schema
    # generated from this model stays exactly as it is today.
    # `start` / `end` are plain strings; their format is not validated here.
    start: str = Field(description="Start of a segment.")
    end: str = Field(description="End of a segment.")
    # Optional: defaults to None when the response omits a reason.
    reason: str | None = Field(None, description="Reason why this segment is important.")

class GroundedVideoAnswer(BaseModel):
    # Structured reply to a question about a video: the answer text plus the
    # video segments returned alongside it. Both fields are required (`...`).
    # Its JSON schema is passed to the model as the response schema.
    answer: str = Field(..., description="answer")
    segments: list[Segment] = Field(..., description="a list of video segments")


In [None]:
def create_vertexai_client():
    """Create a ``genai.Client`` in Vertex AI mode, authenticated by API key.

    The key is read from the ``GOOGLE_CLOUD_API_KEY`` environment variable.

    Returns:
        The configured ``genai.Client``.

    Raises:
        ValueError: If ``GOOGLE_CLOUD_API_KEY`` is unset or empty.
    """
    import os

    api_key = os.environ.get("GOOGLE_CLOUD_API_KEY")
    if api_key:
        # Key is present: hand it to the client in Vertex AI mode.
        return genai.Client(vertexai=True, api_key=api_key)

    raise ValueError("GOOGLE_CLOUD_API_KEY not found in .env file")

In [None]:
# Load variables from a .env file into the environment first, so that
# create_vertexai_client() can find GOOGLE_CLOUD_API_KEY via os.getenv.
load_dotenv()

# Configure the client with your API key.
# `client` is a module-level object used by the request helpers in later cells.
client = create_vertexai_client()

In [None]:
def clean_json_string(raw_string):
    """Remove a surrounding Markdown code fence from a model response.

    Accepts an opening fence with or without a ``json`` language tag (matched
    case-insensitively) and an optional closing fence, so that inputs such as
    "```json {...} ```", "```JSON {...} ```" and "``` {...} ```" all reduce to
    the bare JSON text.

    Args:
        raw_string: Raw response text. May be ``None`` or empty.

    Returns:
        The text without the fence, with surrounding whitespace stripped.
        An empty string is returned for ``None`` or empty input.
    """
    if not raw_string:
        return ""

    text = raw_string.strip()
    if text.startswith("```"):
        text = text[3:]
        # Optional language tag immediately after the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()

In [None]:
def ask_my_youtube(file_uri: str, question: str, video_metadata: types.VideoMetadata | None = None) -> GroundedVideoAnswer:
    """Ask the model a question about a video and parse the structured answer.

    The video and the question are sent in a single user turn through the
    module-level ``client``. The response is requested as JSON matching the
    ``GroundedVideoAnswer`` schema, at low media resolution.

    Args:
        file_uri: URI of the video; sent with MIME type ``video/mp4``.
        question: Question to ask about the video.
        video_metadata: Optional clipping / sampling settings. When omitted, or
            when its ``fps`` is not set, a reduced frame rate of 0.5 is used.
            An explicitly provided ``fps`` is left untouched.

    Returns:
        The validated ``GroundedVideoAnswer``.

    Raises:
        ValueError: If the model response contains no text.
    """
    # Custom frame rate: values in 0 < fps < 1 decrease the sampling rate.
    default_fps = 0.5
    if video_metadata is None:
        effective_metadata = types.VideoMetadata(fps=default_fps)
    elif video_metadata.fps is None:
        # Copy instead of mutating so the caller's object can be reused as-is.
        effective_metadata = video_metadata.model_copy(update={"fps": default_fps})
    else:
        effective_metadata = video_metadata

    response = client.models.generate_content(
        model='gemini-3-flash-preview',
        contents=types.Content(
            role="user",
            parts=[
                types.Part(
                    file_data=types.FileData(file_uri=file_uri, mime_type="video/mp4"),
                    video_metadata=effective_metadata
                ),
                types.Part(text=question)
            ]
        ),
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            response_json_schema=GroundedVideoAnswer.model_json_schema(),
            media_resolution=types.MediaResolution.MEDIA_RESOLUTION_LOW,
        )
    )

    # Fail with a clear message instead of an AttributeError on None.
    if not response.text:
        raise ValueError("The model returned an empty response; nothing to parse.")

    return GroundedVideoAnswer.model_validate_json(clean_json_string(response.text))

def print_result(file_uri: str, question: str, video_metadata: types.VideoMetadata | None = None):
    """Run ``ask_my_youtube`` and print the question, answer and segments.

    Args:
        file_uri: URI of the video to query.
        question: Question to ask about the video.
        video_metadata: Optional metadata forwarded unchanged to ``ask_my_youtube``.

    Returns:
        The answer text, so it can be fed into a follow-up request.
    """
    grounded = ask_my_youtube(file_uri=file_uri, question=question, video_metadata=video_metadata)

    print("Question: ", question)
    print("Answer: ", grounded.answer)
    print("Segments:")
    for segment in grounded.segments:
        line = f"[{segment.start} - {segment.end}], reason: {segment.reason}"
        print("- ", line)

    return grounded.answer

In [None]:
# A short video (~14 minutes) about Nano Banana generation
# file_uri = "https://youtu.be/v6B44n1V9no?si=lupM2r3kDGxFzzHr"

In [None]:
# print_result(file_uri=file_uri, question="Summarize the video in three sentences")

In [None]:
# print_result(file_uri=file_uri, question="Explain her demo that generates multiple images using Nano Banana Pro")

In [None]:
# print_result(file_uri=file_uri, question="How did she build the prompt at each step?")

In [None]:
# A long YouTube video (26:53) about gemini 2.5 flash tts mode.
# `file_uri` is the video queried by the print_result / get_followup_questions
# calls in the cells below.
file_uri = "https://youtu.be/R9LZrysSil0?si=pSXGE1fKCPVKLdWc"

In [None]:
from enum import Enum

class QuestionType(Enum):
    # Category of a follow-up question. Each member's value is the string
    # accepted for the `type` field of a FollowUpQuestion.
    Technical="Technical"
    Visual="Visual"
    Concept="Concept"

class FollowUpQuestion(BaseModel):
    # One follow-up question returned by the model. All fields are required.
    id: int
    # Category of the question; validated against QuestionType.
    type: QuestionType
    # The question itself.
    text: str
    reason: str

class FollowUpQuestions(BaseModel):
    # Top-level response for a follow-up request: a short analysis text plus
    # the list of proposed questions. Its JSON schema is passed to the model
    # as the response schema by get_followup_questions.
    follow_up_analysis: str
    questions: list[FollowUpQuestion]

def get_followup_questions(file_uri: str, previous_answer: str, video_metadata: types.VideoMetadata | None = None) -> FollowUpQuestions:
    """Generate three follow-up questions grounded in a video and a prior answer.

    The instructions are passed as the request's system instruction; the video
    and the previous answer form a single user turn. The response is requested
    as JSON matching the ``FollowUpQuestions`` schema, at low media resolution.

    Args:
        file_uri: URI of the video; sent with MIME type ``video/mp4``.
        previous_answer: Answer text already given to the user, so the model
            can avoid repeating what has been covered.
        video_metadata: Optional clipping / sampling settings, forwarded as-is.

    Returns:
        The validated ``FollowUpQuestions``.

    Raises:
        ValueError: If the model response contains no text.
    """
    # The JSON example below uses the same field names and enum values as the
    # FollowUpQuestion / QuestionType schema ("text", "Concept") so the prompt
    # does not contradict the enforced response schema.
    system_prompt = """
### ROLE
You are the "Expert Insight Architect," a specialist in Socratic inquiry and multimodal analysis. Your expertise lies in identifying "logical gaps" and "hidden nuances" within video content to help users explore a topic beyond the surface-level answer.

### OBJECTIVE
Based on a provided YouTube video and a previous answer, generate three (3) highly targeted follow-up questions. These questions should encourage the user to think deeper about the visual, technical, or conceptual evidence presented in the video.

### OPERATING PRINCIPLES
1. MULTIMODAL GROUNDING: Every question must be directly related to something seen or heard in the provided video. Do not ask generic questions that could apply to any video on the topic.
2. THE "THINKING" PHASE: Use your internal reasoning (Thinking Mode) to cross-reference the previous answer against the video's full timeline. Identify what was missed, glossed over, or requires further proof.
3. DIVERSITY OF INQUIRY: Provide three distinct types of questions:
   - THE TECHNICAL DRILL-DOWN: Focus on a specific detail, data point, or instruction mentioned.
   - THE VISUAL CONTEXT: Focus on something shown on screen (charts, demos, body language, or environment).
   - THE CONCEPTUAL EXTENSION: Link a point made in the video to a broader implication or real-world application.

### OUTPUT FORMAT
You must return the response as a structured JSON object for seamless UI integration.
{
  "follow_up_analysis": "A brief internal thought on why these questions were chosen.",
  "questions": [
    {
      "id": 1,
      "type": "Technical",
      "text": "string",
      "reason": "A small clue about where in the video this is addressed."
    },
    {
      "id": 2,
      "type": "Visual",
      "text": "string",
      "reason": "string"
    },
    {
      "id": 3,
      "type": "Concept",
      "text": "string",
      "reason": "string"
    }
  ]
}

### CRITICAL CONSTRAINT
Never suggest a question that has already been fully answered by the provided "Previous Answer" text. Always push for "Next-Level" understanding.
    """

    response = client.models.generate_content(
        model='gemini-3-flash-preview',
        contents=types.Content(
            role="user",
            parts=[
                types.Part(
                    file_data=types.FileData(file_uri=file_uri, mime_type="video/mp4"),
                    video_metadata=video_metadata
                ),
                types.Part(text=f"Previous answer: {previous_answer}")
            ]
        ),
        config=types.GenerateContentConfig(
            # Instructions belong in the system instruction rather than in a
            # fabricated "model" turn at the start of the conversation.
            system_instruction=system_prompt,
            response_mime_type="application/json",
            response_json_schema=FollowUpQuestions.model_json_schema(),
            media_resolution=types.MediaResolution.MEDIA_RESOLUTION_LOW,
        )
    )

    # Fail with a clear message instead of an AttributeError on None.
    if not response.text:
        raise ValueError("The model returned an empty response; nothing to parse.")

    return FollowUpQuestions.model_validate_json(clean_json_string(response.text))

In [None]:
# Clip boundaries as "<seconds>s" strings: 22:49 through 26:40 of the video.
start_offset = f"{22 * 60 + 49}s"
end_offset = f"{26 * 60 + 40}s"

video_metadata = types.VideoMetadata(start_offset=start_offset, end_offset=end_offset)
previous_answer = print_result(
    file_uri=file_uri, 
    question="What was Connie saying in this video clip? Provide the top 3 most relevant segments.",
    video_metadata=video_metadata
)

# Feed the answer back in so the follow-ups do not repeat what was covered.
followup_questions = get_followup_questions(
    file_uri=file_uri, 
    previous_answer=previous_answer,
    video_metadata=video_metadata
)

print(f"Follow up analysis: {followup_questions.follow_up_analysis}")
for q in followup_questions.questions:
    # `q.type` is an Enum member; print its value ("Technical") rather than
    # the member repr ("QuestionType.Technical").
    print(f"Id: {q.id}, Type: {q.type.value}")
    print(f"Text: {q.text}")
    print(f"Reason: {q.reason}")

In [None]:
# Second question about the same clip; reuses `file_uri` and `video_metadata`
# defined in the earlier cells.
previous_answer = print_result(
    file_uri=file_uri, 
    question="How did Connie calculate the duration when the playback rate was randomized?",
    video_metadata=video_metadata
)

followup_questions = get_followup_questions(
    file_uri=file_uri, 
    previous_answer=previous_answer,
    video_metadata=video_metadata
)

print(f"Follow up analysis: {followup_questions.follow_up_analysis}")
for q in followup_questions.questions:
    # `q.type` is an Enum member; print its value ("Technical") rather than
    # the member repr ("QuestionType.Technical").
    print(f"Id: {q.id}, Type: {q.type.value}")
    print(f"Text: {q.text}")
    print(f"Reason: {q.reason}")