In [90]:
import os
import time
import pandas as pd
import whisper
import subprocess
import pytesseract
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import cv2
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import ollama
import requests


os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
df = pd.read_parquet("hf://datasets/lmms-lab/AISG_Challenge/data/test-00000-of-00001.parquet")

In [4]:
df.head()

Unnamed: 0,qid,video_id,question_type,capability,question,duration,question_prompt,answer,youtube_url
0,0008-0,sj81PWrerDk,Primary Open-ended Question,Plot Attribute (Montage),What is the difference between the action of t...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
1,0008-1,sj81PWrerDk,Paraphrased Open-ended Question,Plot Attribute (Montage),Can you describe how the actions of the last p...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
2,0008-2,sj81PWrerDk,Correctly-led Open-ended Question,Plot Attribute (Montage),Did the last person open the bottle without us...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
3,0008-3,sj81PWrerDk,Wrongly-led Open-ended Question,Plot Attribute (Montage),Did the last person in the video open the bott...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
4,0008-7,sj81PWrerDk,Multiple-choice Question with a Single Correct...,Plot Attribute (Montage),How does the last person in the video open the...,8.85,E. None of the above\nSelect one best answer t...,,https://www.youtube.com/shorts/sj81PWrerDk


In [None]:
whisper_model = whisper.load_model("large-v3")

### Number of unique videos


In [4]:
unique_videos = df[["video_id", "youtube_url"]].drop_duplicates()
print(f"Total unique videos: {len(unique_videos)}")

Total unique videos: 292


# Download videos from Youtube


In [None]:
def download_video(video_id, youtube_url):
    video_path = f"videos/{video_id}.mp4"
    
    # Skip if already downloaded
    if os.path.exists(video_path):
        print(f"Skipping {video_id}, already downloaded.")
        return video_path

    try:
        command = f'yt-dlp -f "bestvideo[ext=mp4]+bestaudio[ext=m4a]" --merge-output-format mp4 "{youtube_url}" -o "{video_path}"'

        subprocess.run(command, shell=True, check=True)
        print(f"Downloaded: {video_path}")
    except subprocess.CalledProcessError as e:
        print(f"Error downloading {video_id}: {e}")
        with open("download_errors.log", "a") as f:
            f.write(f"{video_id},{youtube_url}\n")
        time.sleep(5)  # Wait before retrying the next video
    
    return video_path

In [None]:
# Process full dataset, handling errors
for i, row in unique_videos.iterrows():
    download_video(row["video_id"], row["youtube_url"])

# Extracting frames


In [43]:
def extract_frames(video_path, video_id, interval=2):
    """
    Extract frames from a video every `interval` seconds.
    Saves frames in 'frames/{video_id}/' directory.
    """
    output_folder = f"frames/{video_id}"
    os.makedirs(output_folder, exist_ok=True)  # Ensure directory exists

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        return
    
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get video FPS
    if fps is None or fps == 0:
        print(f"Error: Could not retrieve FPS for {video_path}")
        return
    
    frame_interval = int(fps * interval)  # Convert seconds to frame count
    
    count = 0
    saved_frames = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # Stop if video ends
        
        if count % frame_interval == 0:
            frame_filename = os.path.join(output_folder, f"{video_id}_frame_{saved_frames}.jpg")
            Image.fromarray(frame).save(frame_filename)  # Save frame as image
            saved_frames += 1

        count += 1

    cap.release()
    print(f"Extracted {saved_frames} frames for {video_id}")


In [None]:
# Testing on 1 video
video_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/videos"
video_id = "XeUbzTntoW0"
video_path = os.path.join(video_folder, f"{video_id}.mp4") 

extract_frames(video_path, video_id)

Extracted 30 frames for XeUbzTntoW0


In [None]:
# Processing all videos
video_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/videos"
video_files = [f for f in os.listdir(video_folder) if f.endswith(".mp4")]

for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]  # Get video ID (filename without extension)
    video_path = os.path.join(video_folder, video_file)  # Full path to video

    extract_frames(video_path, video_id)


# Generating image captions


In [91]:
def extract_text_ocr(image_path):
    """Extract on-frame text using OCR (Tesseract)."""
    image = Image.open(image_path).convert("RGB")
    extracted_text = pytesseract.image_to_string(image, lang="eng") 
    return extracted_text.strip()


In [34]:
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

In [71]:
def blip_generate_caption(image_path):
    image = Image.open(image_path)
    inputs = blip_processor(image, return_tensors="pt")
    caption = blip_model.generate(**inputs)
    return blip_processor.decode(caption[0], skip_special_tokens=True)

In [88]:
def get_combined_caption(image_path):
    """Extract text from image (OCR) and generate a caption (BLIP)."""
    ocr_text = extract_text_ocr(image_path)
    blip_caption = blip_generate_caption(image_path)

    if ocr_text:
        return f"OCR Extracted: '{ocr_text}' | BLIP Caption: '{blip_caption}'"
    return f"BLIP Caption: '{blip_caption}'"

In [92]:
# Testing on 1 video
frames_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/frames"
video_id = "XeUbzTntoW0"  # Specify the single video ID you want to process

video_frames_path = os.path.join(frames_folder, video_id)

# Check if the directory exists
if not os.path.isdir(video_frames_path):
    print(f"Error: Video frames folder {video_frames_path} does not exist.")
else:
    captions = {}
    video_captions = {}

    # Process each frame
    for frame in sorted(os.listdir(video_frames_path)):  # Sort to maintain order
        frame_path = os.path.join(video_frames_path, frame)

        if not frame.lower().endswith((".jpg", ".jpeg", ".png")):  # Skip non-image files
            continue

        caption = get_combined_caption(frame_path)
        video_captions[frame] = caption  # Store caption

    captions[video_id] = video_captions  # Store all captions for the video

    print("Captions Generated!")

Captions Generated!


In [93]:
# Testing on 1 video
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/test_video_captions.json", "w") as f:
    json.dump(captions, f, indent=4)

print("Captions saved to video_captions.json!")

Captions saved to video_captions.json!


In [None]:
frames_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/frames"
captions = {}

# Loop through all video frame folders
for video_id in os.listdir(frames_folder):
    video_frames_path = os.path.join(frames_folder, video_id)
    
    # Skip non-folder items
    if not os.path.isdir(video_frames_path):
        continue
    
    # Process each frame
    video_captions = {}
    for frame in sorted(os.listdir(video_frames_path)):  # Sort to maintain order
        frame_path = os.path.join(video_frames_path, frame)
        
        if not frame.lower().endswith((".jpg", ".jpeg", ".png")):  # Skip non-image files
            continue
        
        caption = blip_generate_caption(frame_path)
        video_captions[frame] = caption  # Store caption
    
    captions[video_id] = video_captions  # Store all captions for the video

print("Captions Generated!")

In [24]:
# Save captions to a JSON file
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_captions.json", "w") as f:
    json.dump(captions, f, indent=4)

print("Captions saved to video_captions.json!")

Captions saved to video_captions.json!


# Transcribing audio

In [20]:
model = whisper.load_model("turbo")

def transcribe_audio(video_path):
    return model.transcribe(video_path)["text"]

In [None]:
video_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/videos"
video_files = [f for f in os.listdir(video_folder) if f.endswith(".mp4")]

json_path = "/Users/ryan/Documents/GitHub/AISG_Challenge/video_transcriptions.json"

# Load existing transcriptions if the file exists
if os.path.exists(json_path):
    with open(json_path, "r") as f:
        transcriptions = json.load(f)
else:
    transcriptions = {}  # Initialize empty dictionary if no file exists

# Get all video files
video_files = [f for f in os.listdir(video_folder) if f.endswith(".mp4")]

# Loop through each video and transcribe only if not already transcribed
for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]  # Extract filename without extension

    if video_id in transcriptions:
        print(f"Skipping {video_id} (already transcribed).")
        continue  # Skip if already transcribed

    video_path = os.path.join(video_folder, video_file)
    
    print(f"Processing: {video_id} ...")
    transcriptions[video_id] = transcribe_audio(video_path)  # Store transcription

# Save updated transcriptions to JSON
with open(json_path, "w") as f:
    json.dump(transcriptions, f, indent=4)

print("Transcription process complete!")

# Merging transcriptions and captions from frame

In [None]:
# Load transcriptions
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_transcriptions.json", "r") as f:
    transcriptions = json.load(f)

# Load captions
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_captions.json", "r") as f:
    captions = json.load(f)

# Merge data for 5 test videos
video_context = {}

for video_id in transcriptions.keys():
    if video_id in captions:
        # Combine transcription + captions
        video_context[video_id] = {
            "transcription": transcriptions[video_id],
            "captions": captions[video_id]
        }

# Save combined data
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_context.json", "w") as f:
    json.dump(video_context, f, indent=4)

Merged transcriptions and captions for testing!


In [94]:
# Testing on 1 video
# Load transcriptions
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_transcriptions.json", "r") as f:
    transcriptions = json.load(f)

# Load captions
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/test_video_captions.json", "r") as f:
    captions = json.load(f)

# Merge data for 5 test videos
video_context = {}

for video_id in transcriptions.keys():
    if video_id in captions:
        # Combine transcription + captions
        video_context[video_id] = {
            "transcription": transcriptions[video_id],
            "captions": captions[video_id]
        }

# Save combined data
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/test_video_context.json", "w") as f:
    json.dump(video_context, f, indent=4)

# LLM for Video Question and Answering


## Combining audio transcribe and frame captioning 

In [5]:
# Define the file path
json_path = "/Users/ryan/Documents/GitHub/AISG_Challenge/video_context.json"

# Load JSON data
with open(json_path, "r", encoding="utf-8") as file:
    video_data = json.load(file)

def get_combined_context(video_id):
    """Combine transcript and captions for a given video ID."""
    if video_id not in video_data:
        return "Video ID not found."

    # Get transcription
    transcription = video_data[video_id].get("transcription", "")

    # Get captions
    captions_dict = video_data[video_id].get("captions", {})
    captions = " ".join(captions_dict.values())  # Combine all captions

    # Combine both transcript and captions
    combined_text = f"Transcript: {transcription}\n\nCaptions: {captions}"
    
    return combined_text

# Example: Choose a video ID
video_id = list(video_data.keys())[0]  # Pick first available video ID
video_context = get_combined_context(video_id)

print("\nCombined Context Preview:")
print(video_context[:1000])  # Show first 1000 characters


Combined Context Preview:
Transcript:  Hey Daniel, would you mind cleaning your room? Sure. Okay Daniel, time to clean your room. No! Hi! Hey Daniel. Yep. Lunch will be ready in a minute. Okay. Hey Daniel, what's up? Hey Daniel, what's up? Hey Daniel, what's up? Lunch will be ready in a minute. Okay. Hey Daniel, look in mommy's eyes. Hey, one more minute and we're gonna have lunch, okay? Wake up baby brother! Here you go. Thank you. Here you go! Thank you. Thank you. Here you go! Thank you. All done. Okay. Alright. All done. This place will be better.

Captions: there is a man that is standing in front of a counter there is a man standing in a kitchen looking at a counter there is a man that is sitting on a chair with a book araffe man in a living room with a pile of trash arafed man sitting in a chair reading a book there is a person laying on a bed with a pillow doorway view of a bathroom with a person standing in the doorway araffe in a mask is standing in front of a glass door the

In [6]:
questions = df[df["video_id"] == "XeUbzTntoW0"]["question"]

for q in questions:
    print(q)

How many scenarios did the man show between the adult and the toddler?
How many different scenarios did the man demonstrate between the adult and the toddler?
Did the man show ten scenarios between the adult and the toddler?
Did the man show six scenarios between the adult and the toddler?
How many scenarios did the man show between the adult and the toddler?
A. Ten
B. Twelve
C. Six
D. Eight


In [95]:
# Testing on 1 video
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/test_video_context.json", "r", encoding="utf-8") as file:
    test_video_data = json.load(file)

test_video_id = list(test_video_data.keys())[0]
test_video_context = get_combined_context(test_video_id)

## Deepseek model

In [53]:
def ask_deepseek_r1(question, context):
    """Ask DeepSeek R1 a question using the combined transcript + captions."""
    prompt = f"{question}\n\nHere is the video context:\n{context}"
    
    try:
        result = subprocess.run(
            ["ollama", "run", "deepseek-r1", prompt], 
            capture_output=True, text=True
        )
        return result.stdout.strip()
    except Exception as e:
        return f"Error: {str(e)}"

In [None]:
# Ask a question about the video
question = "How many scenarios did the man show between the adult and the toddler?"
response = ask_deepseek_r1(question, video_context)

print("\n DeepSeek R1's Answer:")
print(response)

## Mistral model

# Summarising caption context 

In [60]:
def summarize_context(context):
    """Summarize with proper prompt formatting and input handling."""
    # Create a clean, single-line prompt with [INST] tags
    summary_prompt = (
        f"[INST] Summarize this video transcript concisely:\n\n{context}\n[/INST]"
    )

    try:
        process = subprocess.Popen(
            ["ollama", "run", "mistral"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        # Send prompt via stdin instead of command argument
        output, error = process.communicate(input=summary_prompt, timeout=120)
        return output.strip() if output else context
    except Exception as e:
        print(f"Summarization error: {str(e)}")
        return context

In [20]:
print(summarize_context(video_context))

1. A man (Daniel) is asked multiple times by family members to clean his room, but resists.
2. Lunch is prepared and served to someone else.
3. Daniel looks at someone's eyes and wakes up another person.
4. A person thanks someone for something.
5. Daniel finishes cleaning his room, stating it will be better.
6. Various scenes show other family members in different parts of the house (eating, reading, combing hair, skateboarding).


# Mistral answering based on summarised context

In [28]:
import requests

def ask_mistral(question, context):
    """Robust Mistral implementation using HTTP API"""
    summarized_context = summarize_context(context)
    
    prompt = f"""<s>[INST] <<SYS>>
    You are an expert video analysis and Q&A model. Your task is to analyze video content based on:
    {summarized_context}
    <</SYS>>
    
    {question} [/INST]"""
    
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "mistral",
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.3,
                    "max_tokens": 500,
                    "stop": ["</s>", "[INST]"]
                }
            },
            timeout=120
        )
        response.raise_for_status()
        return response.json()["response"].strip()
        
    except requests.exceptions.RequestException as e:
        return f"API Error: {str(e)}"
    except KeyError:
        return "Error: Invalid response format from API"

In [29]:
question = "How many scenarios did the man show between the adult and the toddler?"
response = ask_mistral(question, video_context)

print("\nMistral's Answer:")
print(response)


Mistral's Answer:
1 scenario: The video shows a scenario involving an adult (presumably Daniel's parents or siblings) taking care of a baby brother, indicating that there is interaction between Daniel and his younger sibling. However, it does not provide specific instances where Daniel directly interacts with the toddler in a caregiving role, so I would count this as 1 scenario.


# Mistral without summarising

In [98]:
# Initialize tokenizer once
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

def ask_mistral2(question, context, max_context_tokens=6000):
    """Use full context with token limit management"""

    # Truncate context to fit model's token window while keeping meaningful information
    context_tokens = tokenizer.encode(context)
    truncated_context = tokenizer.decode(
        context_tokens[:min(len(context_tokens), max_context_tokens)]
    )

    prompt = f"""<s>[INST] <<SYS>>
    You are an expert in video analysis, capable of reconstructing a video's context based on extracted frames and captions. 
    The frames may contain partial information, so infer missing details using logical continuity while avoiding assumptions 
    beyond what is reasonable.

    Given the following extracted frame descriptions, analyze the video holistically by considering:
    - The sequence of events across frames (temporal continuity).
    - Key actions, objects, and interactions present.
    - Any patterns or logical conclusions that can be drawn from multiple frames.

    If there are gaps in information, explicitly mention uncertainty rather than making unsupported assumptions.

    Context:
    {truncated_context}
    <</SYS>>

    Question: {question}
    Think step by step, linking each observation across frames to construct a holistic understanding of the video. 
    Answer the question directly, relying only on the provided context. [/INST]"""
    
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "mistral",
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.1,
                    "max_tokens": 500,
                    "stop": ["</s>", "[INST]"]
                }
            },
            timeout=180 
        )
        response.raise_for_status()
        return response.json()["response"].strip()
        
    except requests.exceptions.RequestException as e:
        return f"API Error: {str(e)}"
    except KeyError:
        return "Error: Invalid response format"

In [99]:
question = "Did the man show ten scenarios between the adult and the toddler?"
response = ask_mistral2(question, test_video_context)

print("\nMistral's Answer:")
print(response)


Mistral's Answer:
Based on the given transcript and captions, it appears that there are at least six scenarios between the adult (presumably a parent) and the toddler (Daniel) in the video:

1. The adult asks Daniel to clean his room (transcript: "Hey Daniel, would you mind cleaning your room?").
2. The adult instructs Daniel again about cleaning his room (transcript: "Okay Daniel, time to clean your room"). Daniel initially refuses (transcript: "No!").
3. The adult tries to engage with Daniel in a different context (transcript: "Hi!").
4. The adult informs Daniel that lunch will be ready soon (transcript: "Lunch will be ready in a minute").
5. The adult asks Daniel what he is doing, possibly while waiting for lunch (transcript: "Hey Daniel, what's up?").
6. The adult tries to get Daniel's attention and encourages him to look at them (transcript: "Hey Daniel, look in mommy's eyes"). The adult also tells the toddler that lunch will be ready soon (transcript: "Here you go. One more minu