In [51]:
import os
from dotenv import load_dotenv
from IPython.display import Markdown

# Load variables from a local .env file into the process environment, then
# read the Gemini API key from it (None when the variable is not set).
load_dotenv()
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

from typing import List
from google import genai
from google.genai import types

# Gemini API client, created with the key read from the environment above.
client = genai.Client(api_key=GOOGLE_API_KEY)

# Model used for every request in this notebook.
# NOTE(review): the trailing "@param" annotation looks like a Colab form field
# listing the selectable model IDs — confirm before editing it.
MODEL_ID = "gemini-2.0-flash-exp" # @param ["gemini-1.5-flash-8b","gemini-1.5-flash-002","gemini-1.5-pro-002","gemini-2.0-flash-exp"] {"allow-input":true}

In [52]:
# Collect every .mp4 file in the dataset folder so one can be picked for upload
import pathlib

# Root folder of the video dataset, relative to the notebook's working directory.
# Written without a Windows-only ".\\" prefix: on POSIX systems a backslash is a
# legal file-name character, so '.\\folder' would name a non-existent file
# instead of the dataset directory.
DATASET_DIR = pathlib.Path('panda70m_hq6m_formatted_humansOnly_v2.1')

# Recursively collect every .mp4 file below the dataset root.
all_mp4_files = list(DATASET_DIR.rglob('*.mp4'))

In [54]:
# Peek at the stem (file name without extension) of the first collected video.
all_mp4_files[0].stem

'0000000_00000'

In [55]:
# Stem (file name without extension) of the clip to analyse.
video_filename = '0000104_00000'

# Exactly one collected file must carry that stem; anything else is an error.
matching_files = [candidate for candidate in all_mp4_files if candidate.stem == video_filename]
assert len(matching_files) == 1, f"Expected 1 video name match, but found {len(matching_files)} files."
video_path = matching_files[0]

In [56]:
# Show the path that was selected for the chosen video.
video_path

WindowsPath('panda70m_hq6m_formatted_humansOnly_v2.1/00001/0000104_00000.mp4')

In [57]:
# NOTE(review): despite the "img" prefix, this is the path of the selected video file.
img_path = video_path
# Upload the file using the API
# NOTE(review): newer google-genai releases appear to take `file=` instead of
# `path=` here — confirm against the installed SDK version before upgrading.
file_upload = client.files.upload(path=img_path)

In [58]:
import time

# Poll the Files API until the uploaded video has left the PROCESSING state.
POLL_INTERVAL_SECONDS = 10   # pause between two status checks
MAX_WAIT_SECONDS = 600       # give up instead of looping forever if processing stalls

_processing_deadline = time.monotonic() + MAX_WAIT_SECONDS
while file_upload.state == "PROCESSING":
    if time.monotonic() >= _processing_deadline:
        raise TimeoutError(
            f'Video {file_upload.name} is still PROCESSING after {MAX_WAIT_SECONDS} seconds.'
        )
    print('Waiting for video to be processed.')
    time.sleep(POLL_INTERVAL_SECONDS)
    # Re-fetch the file metadata to pick up the latest state.
    file_upload = client.files.get(name=file_upload.name)

# A FAILED upload cannot be referenced in a request, so stop here with a clear message.
if file_upload.state == "FAILED":
    raise ValueError(f'Video processing failed for {file_upload.name} (state: {file_upload.state})')
print('Video processing complete: ' + file_upload.uri)

Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/slzcrppaupfv


In [59]:
# Show the upload's state once the polling cell above has finished.
print(file_upload.state)

ACTIVE


In [60]:
# System instruction: tells the model to answer with exactly one function call
# carrying the timecodes and text for the video.
SYSTEM_PROMPT = "When given a video and a query, call the relevant function only once with the appropriate timecodes and text for the video"
# User prompt for checking human presence: asks for a per-scene count of visible
# people, delivered through a single set_timecodes_with_numeric_values call.
USER_PROMPT = 'Generate chart data for this video based on the following instructions: for each scene, count the number of people visible. Call set_timecodes_with_numeric_values once with the list of data values and timecodes.'

In [61]:


# response = client.models.generate_content(
#     model=MODEL_ID,
#     contents=[
#         types.Content(
#             role="user",
#             parts=[
#                 types.Part.from_uri(
#                     file_uri=file_upload.uri,
#                     mime_type=file_upload.mime_type),
#                 ]),
#         USER_PROMPT,
#     ],
#     config=types.GenerateContentConfig(
#         system_instruction=SYSTEM_PROMPT,
#         temperature=0.0,
#     ),
# )
#
# Markdown(response.text)

## Add function declarations so the model returns the data in the structure we expect

In [62]:
# Schema of one entry in the "timecodes" array: a timestamp plus free text.
_set_timecodes_item = {
    "type": "OBJECT",
    "properties": {
        "time": {"type": "STRING"},
        "text": {"type": "STRING"},
    },
    "required": ["time", "text"],
}

# Tool declaration: the model passes a required "timecodes" array of {time, text}.
set_timecodes = types.FunctionDeclaration(
    name="set_timecodes",
    description="Set the timecodes for the video with associated text",
    parameters={
        "type": "OBJECT",
        "properties": {
            "timecodes": {"type": "ARRAY", "items": _set_timecodes_item},
        },
        "required": ["timecodes"],
    },
)

# Schema of one entry in the "timecodes" array: timestamp, free text and a
# list of object names (strings).
_set_timecodes_with_objects_item = {
    "type": "OBJECT",
    "properties": {
        "time": {"type": "STRING"},
        "text": {"type": "STRING"},
        "objects": {"type": "ARRAY", "items": {"type": "STRING"}},
    },
    "required": ["time", "text", "objects"],
}

# Tool declaration: the model passes a required "timecodes" array of
# {time, text, objects}.
set_timecodes_with_objects = types.FunctionDeclaration(
    name="set_timecodes_with_objects",
    description="Set the timecodes for the video with associated text and object list",
    parameters={
        "type": "OBJECT",
        "properties": {
            "timecodes": {"type": "ARRAY", "items": _set_timecodes_with_objects_item},
        },
        "required": ["timecodes"],
    },
)

# Schema of one entry in the "timecodes" array: a timestamp plus a number.
_set_timecodes_with_numeric_values_item = {
    "type": "OBJECT",
    "properties": {
        "time": {"type": "STRING"},
        "value": {"type": "NUMBER"},
    },
    "required": ["time", "value"],
}

# Tool declaration: the model passes a required "timecodes" array of {time, value}.
set_timecodes_with_numeric_values = types.FunctionDeclaration(
    name="set_timecodes_with_numeric_values",
    description="Set the timecodes for the video with associated numeric values",
    parameters={
        "type": "OBJECT",
        "properties": {
            "timecodes": {"type": "ARRAY", "items": _set_timecodes_with_numeric_values_item},
        },
        "required": ["timecodes"],
    },
)

# Schema of one entry in the "timecodes" array: timestamp, spoken text and a
# visual description.
_set_timecodes_with_descriptions_item = {
    "type": "OBJECT",
    "properties": {
        "time": {"type": "STRING"},
        "spoken_text": {"type": "STRING"},
        "visual_description": {"type": "STRING"},
    },
    "required": ["time", "spoken_text", "visual_description"],
}

# Tool declaration: the model passes a required "timecodes" array of
# {time, spoken_text, visual_description}.
set_timecodes_with_descriptions = types.FunctionDeclaration(
    name="set_timecodes_with_descriptions",
    description="Set the timecodes for the video with associated spoken text and visual descriptions",
    parameters={
        "type": "OBJECT",
        "properties": {
            "timecodes": {"type": "ARRAY", "items": _set_timecodes_with_descriptions_item},
        },
        "required": ["timecodes"],
    },
)

# Bundle the declarations the model is allowed to call into a single Tool.
# NOTE(review): set_timecodes_with_descriptions is not part of this list.
video_tools = types.Tool(
    function_declarations=[
        set_timecodes,
        set_timecodes_with_objects,
        set_timecodes_with_numeric_values,
    ],
)

def set_timecodes_func(timecodes):
    """Return copies of the timecode entries with apostrophes in "text" unescaped.

    Every entry is copied into a new dict whose "text" value has each
    backslash-escaped apostrophe replaced by a plain apostrophe. The input
    entries are not modified.
    """
    cleaned_entries = []
    for entry in timecodes:
        unescaped_text = entry["text"].replace("\\'", "'")
        cleaned_entries.append({**entry, "text": unescaped_text})
    return cleaned_entries

def set_timecodes_with_objects_func(timecodes):
    """Return copies of the timecode entries with apostrophes in "text" unescaped.

    Every entry is copied into a new dict (all other keys, such as "objects",
    are carried over unchanged) whose "text" value has each backslash-escaped
    apostrophe replaced by a plain apostrophe. The input entries are not modified.
    """
    cleaned_entries = []
    for entry in timecodes:
        unescaped_text = entry["text"].replace("\\'", "'")
        cleaned_entries.append({**entry, "text": unescaped_text})
    return cleaned_entries

def set_timecodes_with_descriptions_func(timecodes):
    """Return copies of the timecode entries with a "text" key derived from "spoken_text".

    Every entry is copied into a new dict; its "text" value is the entry's
    "spoken_text" with each backslash-escaped apostrophe replaced by a plain
    apostrophe. The "spoken_text" value itself is carried over unchanged and
    the input entries are not modified.
    """
    cleaned_entries = []
    for entry in timecodes:
        unescaped_speech = entry["spoken_text"].replace("\\'", "'")
        cleaned_entries.append({**entry, "text": unescaped_speech})
    return cleaned_entries

In [63]:
# Reference the uploaded video by URI and wrap it in a user message.
video_part = types.Part.from_uri(
    file_uri=file_upload.uri,
    mime_type=file_upload.mime_type,
)
video_message = types.Content(role="user", parts=[video_part])

# System instruction plus the function-calling tools; temperature 0 is passed
# as the sampling temperature.
generation_config = types.GenerateContentConfig(
    system_instruction=SYSTEM_PROMPT,
    tools=[video_tools],
    temperature=0,
)

# Send the video followed by the text prompt.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[video_message, USER_PROMPT],
    config=generation_config,
)


In [64]:
# Name of the function called in the first part of the first candidate.
first_part = response.candidates[0].content.parts[0]
first_part.function_call.name

'set_timecodes_with_numeric_values'

In [65]:
# Collect the function calls in the first candidate instead of assuming the
# call sits in parts[0]: a part without a function call has function_call set
# to None, which would otherwise surface as an opaque AttributeError.
_function_calls = [
    part.function_call
    for part in response.candidates[0].content.parts
    if part.function_call
]
if not _function_calls:
    raise ValueError('The model response contains no function call; cannot extract timecodes.')

# Arguments of the first function call (the "timecodes" payload).
results = _function_calls[0].args

In [66]:
from pprint import pprint
import datetime


def _timecode_to_seconds(timecode):
    """Convert a colon-separated timecode such as 'MM:SS' or 'HH:MM:SS' to seconds.

    Fields are read most-significant first in base 60, so '01:30' -> 90.0 and
    '1:02:03' -> 3723.0. Raises ValueError when a field is not numeric.
    """
    total_seconds = 0.0
    for field in str(timecode).strip().split(':'):
        total_seconds = total_seconds * 60 + float(field)
    return total_seconds


# Sort the list of dictionaries by the 'time' key.
# Timecodes are compared as elapsed seconds rather than parsed as clock times,
# so values such as '24:00' (24 minutes) or 'HH:MM:SS' sort instead of raising.
sorted_timecodes = sorted(results['timecodes'], key=lambda item: _timecode_to_seconds(item['time']))

# Ensure 'time' key is first in each dictionary
sorted_timecodes = [
    {'time': item['time'], **{key: value for key, value in item.items() if key != 'time'}}
    for item in sorted_timecodes
]

results['timecodes'] = sorted_timecodes

print(results)

{'timecodes': [{'time': '00:00', 'value': 1}, {'time': '00:01', 'value': 1}]}


In [67]:
import json
# Save the JSON to a file with proper formatting

# Write next to the source video as "<video stem>_tc_humans.json".
# with_name() lets pathlib choose the path separator, so the location is not
# tied to a hard-coded Windows "\\"; str() keeps the variable a plain string.
json_output_path = str(video_path.with_name(f'{video_path.stem}_tc_humans.json'))

with open(json_output_path, 'w', encoding='utf-8') as json_file:
    json.dump(results, json_file, indent=4)

print(f"Video analyzer results saved to [{json_output_path}]")

Video analyzer results saved to [panda70m_hq6m_formatted_humansOnly_v2.1\00001\0000104_00000_tc_humans.json]
