In [1]:
import os
import time
import pandas as pd
import whisper
import subprocess
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import cv2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_parquet("hf://datasets/lmms-lab/AISG_Challenge/data/test-00000-of-00001.parquet")

In [3]:
df.head()

Unnamed: 0,qid,video_id,question_type,capability,question,duration,question_prompt,answer,youtube_url
0,0008-0,sj81PWrerDk,Primary Open-ended Question,Plot Attribute (Montage),What is the difference between the action of t...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
1,0008-1,sj81PWrerDk,Paraphrased Open-ended Question,Plot Attribute (Montage),Can you describe how the actions of the last p...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
2,0008-2,sj81PWrerDk,Correctly-led Open-ended Question,Plot Attribute (Montage),Did the last person open the bottle without us...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
3,0008-3,sj81PWrerDk,Wrongly-led Open-ended Question,Plot Attribute (Montage),Did the last person in the video open the bott...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
4,0008-7,sj81PWrerDk,Multiple-choice Question with a Single Correct...,Plot Attribute (Montage),How does the last person in the video open the...,8.85,E. None of the above\nSelect one best answer t...,,https://www.youtube.com/shorts/sj81PWrerDk


In [3]:
whisper_model = whisper.load_model("large-v3")
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

KeyboardInterrupt: 

### Number of unique videos


In [4]:
unique_videos = df[["video_id", "youtube_url"]].drop_duplicates()
print(f"Total unique videos: {len(unique_videos)}")

Total unique videos: 292


# Download videos from Youtube


In [7]:
def download_video(video_id, youtube_url):
    video_path = f"videos/{video_id}.mp4"
    
    # Skip if already downloaded
    if os.path.exists(video_path):
        print(f"Skipping {video_id}, already downloaded.")
        return video_path

    try:
        command = f'yt-dlp -f "bestvideo[ext=mp4]+bestaudio[ext=m4a]" --merge-output-format mp4 "{youtube_url}" -o "{video_path}"'

        subprocess.run(command, shell=True, check=True)
        print(f"Downloaded: {video_path}")
    except subprocess.CalledProcessError as e:
        print(f"Error downloading {video_id}: {e}")
        with open("download_errors.log", "a") as f:
            f.write(f"{video_id},{youtube_url}\n")
        time.sleep(5)  # Wait before retrying the next video
    
    return video_path

In [6]:
# Process full dataset, handling errors
for i, row in unique_videos.iterrows():
    download_video(row["video_id"], row["youtube_url"])

Downloaded: videos/sj81PWrerDk.mp4
Error downloading AGCyLqLuUJ0: Command 'yt-dlp -f "bestvideo[ext=mp4]+bestaudio[ext=m4a]" --merge-output-format mp4 "https://www.youtube.com/shorts/AGCyLqLuUJ0" -o "videos/AGCyLqLuUJ0.mp4"' returned non-zero exit status 1.


KeyboardInterrupt: 

# Extracting frames

In [15]:
def extract_frames(video_path, video_id, interval=5):
    """
    Extract frames from a video every `interval` seconds.
    Saves frames in 'frames/{video_id}/' directory.
    """
    output_folder = f"frames/{video_id}"
    os.makedirs(output_folder, exist_ok=True)  # Ensure directory exists

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        return
    
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get video FPS
    if fps is None or fps == 0:
        print(f"Error: Could not retrieve FPS for {video_path}")
        return
    
    frame_interval = int(fps * interval)  # Convert seconds to frame count
    
    count = 0
    saved_frames = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # Stop if video ends
        
        if count % frame_interval == 0:
            frame_filename = os.path.join(output_folder, f"{video_id}_frame_{saved_frames}.jpg")
            Image.fromarray(frame).save(frame_filename)  # Save frame as image
            saved_frames += 1

        count += 1

    cap.release()
    print(f"Extracted {saved_frames} frames for {video_id}")


In [None]:
video_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/videos"
video_files = [f for f in os.listdir(video_folder) if f.endswith(".mp4")]

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]  # Get video ID (filename without extension)
    video_path = os.path.join(video_folder, video_file)  # Full path to video

    extract_frames(video_path, video_id)


Processing: XeUbzTntoW0 - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/XeUbzTntoW0.mp4
Extracted 12 frames for XeUbzTntoW0
Processing: tyz-WC0YVco - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/tyz-WC0YVco.mp4
Extracted 13 frames for tyz-WC0YVco
Processing: b7h58LOBMcE - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/b7h58LOBMcE.mp4
Extracted 6 frames for b7h58LOBMcE
Processing: 1SiTuYb506o - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/1SiTuYb506o.mp4
Extracted 4 frames for 1SiTuYb506o
Processing: Z-x3kXiyOFc - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/Z-x3kXiyOFc.mp4
Extracted 9 frames for Z-x3kXiyOFc
Processing: fyq0pd_pFvE - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/fyq0pd_pFvE.mp4
Extracted 10 frames for fyq0pd_pFvE
Processing: 3Ts9BX6AKuU - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/3Ts9BX6AKuU.mp4
Extracted 3 frames for 3Ts9BX6AKuU
Processing: eU5I-p1HcUs - /Users/ryan/Documents/GitHub/AISG_Challenge/videos/eU5I-p1HcUs.mp4
Extracte