<a href="https://colab.research.google.com/github/prisar/ai_notebooks/blob/main/nb_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authentication and service account setup
from google.colab import auth
from google.auth import default
import os

# Authenticate with Google Cloud via the google.colab `auth` helper imported above.
auth.authenticate_user()

# Set project ID: publish it through the GOOGLE_CLOUD_PROJECT environment
# variable of this kernel process.
os.environ['GOOGLE_CLOUD_PROJECT'] = 'mrc-quant-ml'


In [None]:

# Install required packages
!pip install -q google-genai google-cloud-aiplatform

In [None]:
# Import and initialize
import asyncio
import os
import random
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor

import nest_asyncio
from google.cloud import storage
from google.genai import Client
from google.genai.types import Part, VideoMetadata, FileData
# import moviepy.editor as mp # You might need to install moviepy


def summarize_video_chunk(
    video_uri: str,
    start_offset: str,
    end_offset: str,
    prompt: str = "Analyze this video and provide a summary.",
    *,
    model: str = "gemini-2.0-flash-exp",
    project: str = "mrc-quant-ml",
    location: str = "us-central1",
    max_retries: int = 5,
    initial_backoff_s: float = 2.0,
):
    """Summarizes a video chunk using the Gemini API (Vertex AI backend).

    Args:
        video_uri: URI of the video, passed as ``FileData.file_uri``.
        start_offset: Start of the chunk, passed to ``VideoMetadata`` (e.g. "0s").
        end_offset: End of the chunk, passed to ``VideoMetadata`` (e.g. "1800s").
        prompt: Text instruction sent alongside the video part.
        model: Model name passed to ``generate_content``.
        project: Google Cloud project used to build the client.
        location: Vertex AI location used to build the client.
        max_retries: How many times a rate-limited request is retried before
            the error is re-raised. ``0`` disables retrying.
        initial_backoff_s: Base delay in seconds before the first retry; it
            doubles on every subsequent retry.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        Exception: Any error raised by the client. Errors that are not rate
            limits are re-raised immediately; rate-limit errors are re-raised
            once ``max_retries`` retries have been used up.
    """
    client = Client(
        vertexai=True,
        project=project,
        location=location,
    )

    contents = [
        Part(
            video_metadata=VideoMetadata(
                fps=1,
                start_offset=start_offset,
                end_offset=end_offset,
            ),
            file_data=FileData(
                file_uri=video_uri,
                mime_type="video/mp4",
            ),
        ),
        prompt,
    ]

    attempt = 0
    while True:
        try:
            response = client.models.generate_content(model=model, contents=contents)
            return response.text
        except Exception as exc:
            # Duck-typed rate-limit detection: the failure seen in this notebook
            # renders as "429 RESOURCE_EXHAUSTED. {...}".
            rate_limited = (
                getattr(exc, "code", None) == 429
                or "RESOURCE_EXHAUSTED" in str(exc)
            )
            if not rate_limited or attempt >= max_retries:
                raise
            # Exponential backoff with jitter so that parallel workers which
            # were throttled together do not all retry at the same instant.
            delay = initial_backoff_s * (2 ** attempt)
            time.sleep(delay + random.uniform(0, delay / 2))
            attempt += 1

# Function to get video duration
async def get_video_duration(video_uri: str) -> int:
    """Gets the duration of a video from a GCS URI.

    The object is downloaded to a temporary file, opened with moviepy's
    ``VideoFileClip`` to read its duration, and the temporary file is removed
    again whether or not reading succeeded.

    Args:
        video_uri: Location of the video as ``gs://<bucket>/<object>``. A
            leading ``gs://`` is optional.

    Returns:
        The duration truncated to whole seconds, or ``0`` when the duration
        could not be determined (the error is printed, not raised).
    """
    temp_file = None
    try:
        # Imported here because the notebook has no module-level moviepy
        # import. moviepy 2.x exposes VideoFileClip at the package root,
        # while 1.x only offers it through moviepy.editor.
        try:
            from moviepy import VideoFileClip
        except ImportError:
            from moviepy.editor import VideoFileClip

        # Strip only a leading scheme instead of every "gs://" occurrence.
        path = video_uri[len("gs://"):] if video_uri.startswith("gs://") else video_uri
        bucket_name, _, blob_name = path.partition("/")
        if not bucket_name or not blob_name:
            raise ValueError(f"Expected gs://<bucket>/<object>, got {video_uri!r}")

        # Assuming the video is in a GCS bucket
        client = storage.Client()
        bucket = client.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)

        # Download the video temporarily to get duration (consider optimizing this).
        # A unique temp path avoids collisions between objects sharing a basename.
        fd, temp_file = tempfile.mkstemp(suffix=os.path.splitext(blob_name)[1])
        os.close(fd)
        blob.download_to_filename(temp_file)

        clip = VideoFileClip(temp_file)
        try:
            return int(clip.duration)
        finally:
            # Close the clip so it stops holding the file before it is deleted.
            clip.close()
    except Exception as e:
        print(f"Error getting video duration: {e}")
        return 0 # Callers treat 0 as "duration unavailable"
    finally:
        # Clean up the temporary file on success and on failure alike.
        if temp_file is not None and os.path.exists(temp_file):
            os.remove(temp_file)

# Batch processing optimization
async def process_video_chunks_parallel(video_uri: str, chunk_duration_minutes: int = 30, max_workers: int = 4, prompt=None):
    """Process video chunks in parallel for better throughput.

    The video is split into consecutive windows of ``chunk_duration_minutes``
    (the last window is clipped to the video length) and each window is
    summarized by ``summarize_video_chunk`` on a thread pool.

    Args:
        video_uri: URI of the video, forwarded to ``get_video_duration`` and
            ``summarize_video_chunk``.
        chunk_duration_minutes: Window length in minutes; must be positive.
        max_workers: Size of the thread pool running the summarization calls.
        prompt: Optional prompt forwarded to ``summarize_video_chunk``. When
            ``None`` that function's own default prompt is used.

    Returns:
        A list of summaries in chunk order, or ``[]`` when the video duration
        could not be determined.

    Raises:
        ValueError: If ``chunk_duration_minutes`` does not yield a positive
            number of seconds.
        Exception: The first (in chunk order) error raised by a chunk, after
            all chunks have finished.
    """
    chunk_duration = int(chunk_duration_minutes * 60) # Convert minutes to whole seconds
    if chunk_duration <= 0:
        # Fail before the video is downloaded: a zero step would make range()
        # raise a cryptic error, and a negative one would yield no chunks.
        raise ValueError(
            f"chunk_duration_minutes must be positive, got {chunk_duration_minutes!r}"
        )

    # Get video duration first
    total_duration = await get_video_duration(video_uri)
    print(f"Total video duration: {total_duration} seconds")
    if total_duration <= 0:
        print("Could not get video duration. Aborting processing.")
        return []

    chunks = [(i, min(i + chunk_duration, total_duration))
              for i in range(0, total_duration, chunk_duration)]

    # Only forward the prompt when one was given, so the default prompt
    # declared by summarize_video_chunk stays the single source of truth.
    extra_args = () if prompt is None else (prompt,)

    # Inside a coroutine the running loop is the one to schedule work on.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tasks = [
            loop.run_in_executor(
                executor,
                summarize_video_chunk,
                video_uri,
                f"{start}s",
                f"{end}s",
                *extra_args
            ) for start, end in chunks
        ]

        # Collect every outcome first: leaving the executor block waits for
        # all workers anyway, and this keeps failures of the remaining chunks
        # from being left unretrieved.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    for result in results:
        if isinstance(result, Exception):
            raise result

    return list(results)

# Example usage with error handling
video_uri = "gs://mrc-quant-ml-video-analysis/videoplayback.mp4"

# Example of how to use the parallel processing function

nest_asyncio.apply() # Apply this if running in Colab

# Start from an empty list so that a failed run cannot leave `all_summaries`
# undefined (fresh kernel) or holding results from an earlier run.
all_summaries = []
try:
    all_summaries = asyncio.run(process_video_chunks_parallel(video_uri, chunk_duration_minutes=30))
    for chunk_number, summary in enumerate(all_summaries, start=1):
        print(f"Summary for chunk {chunk_number}:\n{summary}\n")
except Exception as e:
    print(f"Error during parallel processing: {e}")



Total video duration: 7302 seconds
Error during parallel processing: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.', 'status': 'RESOURCE_EXHAUSTED'}}


In [None]:
# Display the chunk summaries stored in `all_summaries` by the processing cell above.
# NOTE(review): the recorded output of that cell is a 429 error, so the list shown
# here presumably comes from an earlier successful run - re-run to confirm.
all_summaries

['Here is a summary of the video:\n\nThe creator starts the video by performing some audio testing on YouTube and X (formerly Twitter).  After, he transitions into a discussion on a scientific paper, "Diffusion Beats Autoregressive in Data-Constrained Settings." The thumbnail was generated using an AI image generator, although it misspells the word "autoregressive" with three s\'s.\n\nHe then reviews the main points of the paper. Diffusion models have had better results due to their ability to reuse data. The creator references a video on YouTube that further discusses how the noise that is added to training images causes them to become diffused. However, the results from the study are not definitive since they may be the result of models overfitting on specific niches. The creator also looks into compute power. Finally, he touches on the "Chinchilla scaling laws" which attempt to compute the optimal model.',
 'Here is a summary of the video in question.\n\nThe speaker in this video is