In [None]:
import os
import time
import pandas as pd
import whisper
import subprocess
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import cv2
import json
from transformers import AutoTokenizer
import torch
import ollama
import requests
import numpy as np
import easyocr
from PIL import Image, ImageEnhance
from paddleocr import PaddleOCR
import re

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Load the challenge test split (one parquet file) through the hf:// dataset URL.
df = pd.read_parquet("hf://datasets/lmms-lab/AISG_Challenge/data/test-00000-of-00001.parquet")

In [3]:
# Preview the first rows to check the question/metadata columns.
df.head()

Unnamed: 0,qid,video_id,question_type,capability,question,duration,question_prompt,answer,youtube_url
0,0008-0,sj81PWrerDk,Primary Open-ended Question,Plot Attribute (Montage),What is the difference between the action of t...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
1,0008-1,sj81PWrerDk,Paraphrased Open-ended Question,Plot Attribute (Montage),Can you describe how the actions of the last p...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
2,0008-2,sj81PWrerDk,Correctly-led Open-ended Question,Plot Attribute (Montage),Did the last person open the bottle without us...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
3,0008-3,sj81PWrerDk,Wrongly-led Open-ended Question,Plot Attribute (Montage),Did the last person in the video open the bott...,8.85,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/sj81PWrerDk
4,0008-7,sj81PWrerDk,Multiple-choice Question with a Single Correct...,Plot Attribute (Montage),How does the last person in the video open the...,8.85,E. None of the above\nSelect one best answer t...,,https://www.youtube.com/shorts/sj81PWrerDk


In [None]:
# Load the Whisper "large-v3" checkpoint.
# NOTE(review): no later cell visible in this notebook reads `whisper_model`;
# the transcription cell loads its own `model` ("turbo"). Confirm whether this
# load is still needed before keeping it in a full run.
whisper_model = whisper.load_model("large-v3")

### Number of unique videos


In [None]:
# Collapse the question-level table to one row per (video_id, youtube_url) pair.
video_key_columns = ["video_id", "youtube_url"]
unique_videos = df.loc[:, video_key_columns].drop_duplicates()
print(f"Total unique videos: {len(unique_videos)}")

Total unique videos: 292


# Download videos from YouTube


In [None]:
def download_video(video_id, youtube_url):
    """Download one video with yt-dlp to ``videos/{video_id}.mp4``.

    Args:
        video_id: Identifier used for the output filename.
        youtube_url: URL handed to yt-dlp.

    Returns:
        The target path ``videos/{video_id}.mp4``. The path is returned even
        when the download failed, so callers that need the file must check
        that it exists.

    Failures (non-zero yt-dlp exit, or yt-dlp not being launchable) are printed,
    appended to ``download_errors.log`` as ``video_id,youtube_url`` and followed
    by a 5 second pause; they are never raised.
    """
    video_path = f"videos/{video_id}.mp4"

    # Skip if already downloaded
    if os.path.exists(video_path):
        print(f"Skipping {video_id}, already downloaded.")
        return video_path

    # Make sure the target folder exists before yt-dlp writes into it.
    os.makedirs(os.path.dirname(video_path), exist_ok=True)

    # Pass an argument list instead of a shell string: the URL comes from the
    # dataset, and interpolating it into a shell command would let quotes or
    # shell metacharacters in it change the command that is executed.
    command = [
        "yt-dlp",
        "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]",
        "--merge-output-format", "mp4",
        youtube_url,
        "-o", video_path,
    ]

    try:
        subprocess.run(command, check=True)
        print(f"Downloaded: {video_path}")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing yt-dlp executable, which the shell-based
        # version reported as a non-zero exit status.
        print(f"Error downloading {video_id}: {e}")
        with open("download_errors.log", "a") as f:
            f.write(f"{video_id},{youtube_url}\n")
        time.sleep(5)  # Back off before the caller moves on to the next video

    return video_path

In [None]:
# Fetch every unique video; download_video reports failures itself and keeps going.
for video_id, youtube_url in zip(unique_videos["video_id"], unique_videos["youtube_url"]):
    download_video(video_id, youtube_url)

# Extracting frames


In [None]:
def extract_frames(video_path, video_id, interval=2):
    """
    Extract frames from a video every `interval` seconds.

    Frames are written as RGB JPEGs to 'frames/{video_id}/' and described in
    'frames/{video_id}/frame_metadata.json'. If that metadata file already
    exists, nothing is decoded and its contents are returned instead.

    Args:
        video_path: Path of the video file to read.
        video_id: Identifier used for the output folder and frame filenames.
        interval: Seconds between saved frames (default 2).

    Returns:
        A list of dicts with keys "video_id", "frame", "frame_index" and
        "timestamp" (seconds, rounded to 2 decimals). An empty list is returned
        when the video cannot be opened or reports an unusable FPS.
    """
    output_folder = os.path.join("frames", video_id)
    metadata_path = os.path.join(output_folder, "frame_metadata.json")

    # If frames were already extracted, skip processing
    if os.path.exists(metadata_path):
        print(f"Skipping '{video_id}' — frames already extracted.")
        with open(metadata_path, "r") as f:
            return json.load(f)

    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    count = 0
    saved_frames = 0
    frame_metadata = []

    # try/finally so the capture handle is released on every exit path,
    # including the early returns and any error while saving a frame.
    try:
        if not cap.isOpened():
            print(f"Error: Cannot open video file {video_path}")
            return []

        fps = cap.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN, which `fps <= 0` lets through.
        if not fps or not fps > 0:
            print(f"Error: Invalid FPS for {video_path}")
            return []

        # Convert interval in seconds to interval in frame count.
        # Clamp to 1 so a small fps * interval cannot produce a zero modulus.
        frame_interval = max(1, int(fps * interval))

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break  # Stop at end of video or error

            # Only save frames whose index is a multiple of the interval
            if count % frame_interval == 0:
                # Timestamp in seconds based on current frame index
                timestamp_sec = count / fps

                # Convert from OpenCV's BGR to standard RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Build filename and path
                frame_filename = f"{video_id}_frame_{saved_frames}.jpg"
                frame_path = os.path.join(output_folder, frame_filename)

                # Save frame as RGB JPEG
                Image.fromarray(frame_rgb).save(frame_path)

                # Append metadata for later use
                frame_metadata.append({
                    "video_id": video_id,
                    "frame": frame_filename,
                    "frame_index": count,
                    "timestamp": round(timestamp_sec, 2)
                })

                saved_frames += 1

            count += 1
    finally:
        cap.release()

    print(f"Extracted {saved_frames} frames for '{video_id}'")

    # Save metadata to a JSON file in the same folder
    with open(metadata_path, "w") as f:
        json.dump(frame_metadata, f, indent=2)

    return frame_metadata

In [None]:
# Run frame extraction over every .mp4 in the videos folder.
video_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/videos"
video_files = [name for name in os.listdir(video_folder) if name.endswith(".mp4")]

for video_file in video_files:
    # The filename stem doubles as the video ID.
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(video_folder, video_file)
    extract_frames(video_path, video_id)


# Generating image captions


In [86]:
def preprocess_for_ocr(image_path):
    """Return a grayscale copy of the image, upscaled 4x with contrast and sharpness boosted by 2.5."""
    scale = 4
    boost = 2.5

    gray = Image.open(image_path).convert("L")
    enlarged = gray.resize((gray.width * scale, gray.height * scale))
    contrasted = ImageEnhance.Contrast(enlarged).enhance(boost)
    return ImageEnhance.Sharpness(contrasted).enhance(boost)

In [102]:
# PaddleOCR setup: a single English reader with the angle classifier enabled,
# created once here and reused by extract_text_ocr_paddle for every frame.
ocr_reader = PaddleOCR(use_angle_cls=True, lang='en')

def extract_text_ocr_paddle(image_path, min_conf=0.2):
    """Run PaddleOCR on the preprocessed image and return the kept text joined by spaces.

    A detection is kept when its confidence is at least `min_conf`, its
    stripped text has 3 or more characters, and it is not purely digits.
    Any exception is printed and turned into an empty string.
    """
    try:
        pixels = np.array(preprocess_for_ocr(image_path))
        pages = ocr_reader.ocr(pixels, cls=True)

        # Nothing detected (None, empty list, or empty first page)
        if not pages or not pages[0]:
            return ""

        kept = []
        for detection in pages[0]:
            text, conf = detection[1]
            if not conf >= min_conf:
                continue
            candidate = text.strip()
            if len(candidate) < 3 or candidate.isdigit():
                continue
            kept.append(candidate)

        return " ".join(kept)

    except Exception as e:
        print(f"[OCR ERROR] Failed on {image_path}: {e}")
        return ""


[2025/04/14 00:24:29] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/ryan/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/ryan/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6

In [None]:
# Load the BLIP captioning processor and model from one checkpoint name.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

def generate_blip_caption(image_path):
    """Caption one image with the module-level BLIP model (CPU, at most 30 new tokens)."""
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = blip_processor(rgb_image, return_tensors="pt").to("cpu")
    generated = blip_model.generate(**model_inputs, max_new_tokens=30)
    first_sequence = generated[0]
    return blip_processor.decode(first_sequence, skip_special_tokens=True)

In [88]:
def get_combined_caption(image_path, verbose=False):
    """Build the 'OCR: ... | BLIP: ...' caption string for one frame.

    Both parts are stripped and lower-cased. When no OCR text was found the
    OCR part reads '[None detected]'. With `verbose=True`, frames that have no
    OCR text and a generic-sounding BLIP caption are printed for inspection.
    """
    raw_ocr = extract_text_ocr_paddle(image_path)
    raw_blip = generate_blip_caption(image_path)

    ocr_norm = raw_ocr.strip().lower()
    blip_norm = raw_blip.strip().lower()

    if verbose and not ocr_norm:
        # Phrases that suggest the caption carries little information
        generic_markers = ("a photo of", "a person", "someone is", "there is")
        if any(marker in blip_norm for marker in generic_markers):
            print(f"Weak caption: {os.path.basename(image_path)}")
            print(f"  OCR: {raw_ocr}")
            print(f"  BLIP: {raw_blip}")

    if ocr_norm:
        ocr_part = f"OCR: {ocr_norm}"
    else:
        ocr_part = "OCR: [None detected]"
    return f"{ocr_part} | BLIP: {blip_norm}"

## Generating captions for all the videos

In [153]:
# Every distinct video_id that appears in df, as a plain Python list.
test_video_ids = df["video_id"].unique().tolist()

In [154]:
frames_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/frames"
captions = {}

# Caption each video's extracted frames, driven by that video's frame metadata.
for video_id in test_video_ids:
    video_frames_path = os.path.join(frames_folder, video_id)
    metadata_path = os.path.join(video_frames_path, "frame_metadata.json")

    # Videos without a frames folder or without metadata are reported and skipped.
    if not os.path.isdir(video_frames_path):
        print(f"Skipping {video_id}: Folder {video_frames_path} not found.")
        continue
    if not os.path.exists(metadata_path):
        print(f"Skipping {video_id}: Metadata not found at {metadata_path}")
        continue

    print(f"Processing video: {video_id}")

    # The metadata lists each saved frame together with its timestamp.
    with open(metadata_path, "r") as f:
        frame_metadata = json.load(f)

    video_captions = {}
    for frame_info in frame_metadata:
        frame_file = frame_info["frame"]
        timestamp = frame_info["timestamp"]
        frame_path = os.path.join(video_frames_path, frame_file)

        # Keep the caption next to its timestamp, keyed by frame filename.
        video_captions[frame_file] = {
            "caption": get_combined_caption(frame_path),
            "timestamp": timestamp,
        }

    captions[video_id] = video_captions

Processing video: sj81PWrerDk
[2025/04/14 01:49:42] ppocr DEBUG: dt_boxes num : 1, elapsed : 0.31121373176574707
[2025/04/14 01:49:42] ppocr DEBUG: cls num  : 1, elapsed : 0.022112131118774414
[2025/04/14 01:49:42] ppocr DEBUG: rec_res num  : 1, elapsed : 0.10810613632202148
[2025/04/14 01:49:49] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.23117685317993164
[2025/04/14 01:49:49] ppocr DEBUG: cls num  : 4, elapsed : 0.02298903465270996
[2025/04/14 01:49:49] ppocr DEBUG: rec_res num  : 4, elapsed : 0.34238600730895996
[2025/04/14 01:49:51] ppocr DEBUG: dt_boxes num : 1, elapsed : 0.22858595848083496
[2025/04/14 01:49:51] ppocr DEBUG: cls num  : 1, elapsed : 0.00634002685546875
[2025/04/14 01:49:51] ppocr DEBUG: rec_res num  : 1, elapsed : 0.0883781909942627
[2025/04/14 01:49:53] ppocr DEBUG: dt_boxes num : 1, elapsed : 0.24655723571777344
[2025/04/14 01:49:53] ppocr DEBUG: cls num  : 1, elapsed : 0.006602048873901367
[2025/04/14 01:49:53] ppocr DEBUG: rec_res num  : 1, elapsed : 0.0896561

In [155]:
# Write the per-video caption dictionary to disk as indented JSON.
output_path = "/Users/ryan/Documents/GitHub/AISG_Challenge/video_captions.json"
with open(output_path, "w") as captions_file:
    json.dump(captions, captions_file, indent=4)

print(f"Captions saved to {output_path}")

Captions saved to /Users/ryan/Documents/GitHub/AISG_Challenge/video_captions.json


# Transcribing audio

In [None]:
# Whisper "turbo" checkpoint; transcribe_audio below reads this module-level `model`.
model = whisper.load_model("turbo")

def transcribe_audio(video_path):
    """Transcribe the file with the module-level Whisper `model` and return the "text" field of the result."""
    result = model.transcribe(video_path)
    return result["text"]

In [None]:
video_folder = "/Users/ryan/Documents/GitHub/AISG_Challenge/videos"
json_path = "/Users/ryan/Documents/GitHub/AISG_Challenge/video_transcriptions.json"


def _save_transcriptions(data, path):
    """Write `data` as JSON to `path` via a temp file so an interrupted write cannot corrupt the existing file."""
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(data, f, indent=4)
    os.replace(tmp_path, path)


# Load existing transcriptions if the file exists
if os.path.exists(json_path):
    with open(json_path, "r") as f:
        transcriptions = json.load(f)
else:
    transcriptions = {}  # Initialize empty dictionary if no file exists

# Get all video files (listed once)
video_files = [f for f in os.listdir(video_folder) if f.endswith(".mp4")]

# Loop through each video and transcribe only if not already transcribed
for video_file in video_files:
    video_id = os.path.splitext(video_file)[0]  # Extract filename without extension

    if video_id in transcriptions:
        print(f"Skipping {video_id} (already transcribed).")
        continue  # Skip if already transcribed

    video_path = os.path.join(video_folder, video_file)

    print(f"Processing: {video_id} ...")
    transcriptions[video_id] = transcribe_audio(video_path)  # Store transcription

    # Checkpoint after every video: the skip logic above can only resume an
    # interrupted run if finished transcripts have already reached the file.
    _save_transcriptions(transcriptions, json_path)

# Save updated transcriptions to JSON (also covers the case where nothing new was transcribed)
_save_transcriptions(transcriptions, json_path)

print("Transcription process complete!")

# Merging transcriptions and frame captions

In [156]:
# Read back the transcripts written by the transcription step
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_transcriptions.json", "r") as f:
    transcriptions = json.load(f)

# Read back the frame captions written by the captioning step
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_captions.json", "r") as f:
    captions = json.load(f)

# Keep only videos that have both a transcript and captions, in transcript order
video_context = {
    video_id: {
        "transcription": transcript,
        "captions": captions[video_id],
    }
    for video_id, transcript in transcriptions.items()
    if video_id in captions
}

# Write the merged context
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_context.json", "w") as f:
    json.dump(video_context, f, indent=4)

# LLM for Video Question and Answering


## Creates a Readable and Structured Prompt

In [123]:
def is_gibberish(text):
    """Return True when under 40% of the characters are letters, or when any character falls outside letters, digits, whitespace and ``: . , ! ? ' -``."""
    letter_count = len([ch for ch in text if ch.isalpha()])
    if letter_count / max(len(text), 1) < 0.4:
        return True
    return re.search(r"[^a-zA-Z0-9\s:.,!?'-]", text) is not None

def clean_ocr(text):
    """Return `text` unchanged unless it is missing, the '[None detected]' placeholder, or gibberish; those become '[Low-quality text removed]'."""
    placeholder = "[Low-quality text removed]"
    if text in ("[None detected]", "", None):
        return placeholder
    if is_gibberish(text):
        return placeholder
    return text

def clean_blip(text):
    """Remove the stray token 'arafed' from a BLIP caption and tidy the spacing.

    The whole word 'arafed' is deleted (case-sensitive, as before). Runs of
    whitespace are then collapsed to a single space so that removing the token
    from the middle of a caption does not leave a double space behind, and the
    result is stripped.
    """
    without_artifact = re.sub(r'\barafed\b', '', text)
    return re.sub(r'\s+', ' ', without_artifact).strip()

def clean_transcription(text):
    """Drop standalone fillers (ah/uh/oh, any length or case), collapse whitespace, and trim leading/trailing dots, commas and spaces."""
    filler_words = re.compile(r'\b(ah+|uh+|oh+)\b', flags=re.IGNORECASE)
    whitespace_run = re.compile(r'\s+')

    without_fillers = filler_words.sub('', text.strip())
    collapsed = whitespace_run.sub(' ', without_fillers)
    return collapsed.strip("., ")

In [158]:
def clean_video_context(video_data, frames_root):
    """Clean the transcript and every frame caption of each video.

    Args:
        video_data: Mapping of video_id to a dict with "transcription" and
            "captions" (frame name -> {"caption", "timestamp"}).
        frames_root: Folder holding '{video_id}/frame_metadata.json'; when the
            file exists its timestamps take precedence over the stored ones.

    Returns:
        A new mapping with the same layout. Captions are rebuilt as
        'OCR: ... | BLIP: ...' from the cleaned parts, and a BLIP text equal to
        the last distinct one seen is replaced by a "repeated" marker.
    """
    def _without_tag(text, tag):
        return text.replace(tag, "").strip()

    cleaned = {}

    for video_id, content in video_data.items():
        # Timestamps from the frame metadata, keyed by (video_id, frame name)
        timestamp_lookup = {}
        metadata_path = os.path.join(frames_root, video_id, "frame_metadata.json")
        if os.path.exists(metadata_path):
            with open(metadata_path, "r", encoding="utf-8") as f:
                for entry in json.load(f):
                    timestamp_lookup[(entry["video_id"], entry["frame"])] = entry["timestamp"]

        cleaned_transcript = clean_transcription(content.get("transcription", ""))

        cleaned_captions = {}
        previous_blip = None
        for frame, data in content.get("captions", {}).items():
            raw_caption = data.get("caption", "")
            timestamp = timestamp_lookup.get((video_id, frame), data.get("timestamp", 0.0))

            # Split at the first ' | ' only; everything after it is the BLIP part.
            ocr_part, separator, blip_part = raw_caption.partition(" | ")
            if separator:
                ocr_text = _without_tag(ocr_part, "OCR:")
                blip_text = _without_tag(blip_part, "BLIP:")
            else:
                ocr_text = ""
                blip_text = _without_tag(raw_caption, "BLIP:")

            # The tag removal is applied a second time here, as in the original flow.
            ocr_clean = clean_ocr(_without_tag(ocr_text, "OCR:"))
            blip_clean = clean_blip(_without_tag(blip_text, "BLIP:"))

            is_repeat = blip_clean == previous_blip
            if not is_repeat:
                previous_blip = blip_clean
            shown_blip = "[Repeated frame description removed]" if is_repeat else blip_clean

            cleaned_captions[frame] = {
                "caption": f"OCR: {ocr_clean} | BLIP: {shown_blip}",
                "timestamp": timestamp,
            }

        cleaned[video_id] = {
            "transcription": cleaned_transcript,
            "captions": cleaned_captions,
        }

    return cleaned

In [159]:
# Read the merged (uncleaned) context
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_context.json", "r", encoding="utf-8") as raw_file:
    raw_video_data = json.load(raw_file)

# Clean it, taking timestamps from each video's frame metadata where available
frames_root = "/Users/ryan/Documents/GitHub/AISG_Challenge/frames"
cleaned_video_data = clean_video_context(raw_video_data, frames_root)

# Write the cleaned context next to the raw one
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_context_cleaned.json", "w", encoding="utf-8") as cleaned_file:
    json.dump(cleaned_video_data, cleaned_file, indent=2)

In [None]:
# Load the cleaned context into the module-level `video_data`, which
# get_combined_context reads when building the text passed to the LLM.
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_context_cleaned.json", "r", encoding="utf-8") as file:
    video_data = json.load(file)

def get_combined_context(video_id, video_context=None):
    """Combine transcript and structured captions for a given video ID.

    Args:
        video_id: Key of the video to look up.
        video_context: Optional mapping of video_id -> {"transcription",
            "captions"}. When omitted, the module-level `video_data` is used,
            which is the previous behaviour.

    Returns:
        A string of the form 'Transcript: ...\\n\\nCaptions:\\n...' with one
        block per frame ordered by timestamp, or 'Video ID not found.' when the
        ID is missing from the mapping.
    """
    source = video_data if video_context is None else video_context
    if video_id not in source:
        return "Video ID not found."

    entry = source[video_id]
    transcription = entry.get("transcription", "")
    captions_dict = entry.get("captions", {})

    def _timestamp_of(frame_data):
        # A missing or null timestamp counts as 0.0 for both sorting and
        # display, so the ':.2f' format below can never receive None.
        value = frame_data.get("timestamp")
        return 0.0 if value is None else value

    # Convert to a list and sort by timestamp
    sorted_captions = sorted(
        captions_dict.items(),
        key=lambda item: _timestamp_of(item[1])
    )

    structured_captions = []
    for frame, frame_data in sorted_captions:
        caption_text = frame_data.get("caption", "")
        timestamp = _timestamp_of(frame_data)

        # Split OCR and BLIP at the first ' | ' only, matching how
        # clean_video_context splits; a caption that contains the separator
        # again no longer fails tuple unpacking.
        if " | " in caption_text:
            ocr_caption, blip_caption = caption_text.split(" | ", maxsplit=1)
        else:
            ocr_caption, blip_caption = "", caption_text

        ocr_text = ocr_caption.replace("OCR:", "").strip()
        blip_text = blip_caption.replace("BLIP:", "").strip()

        caption_block = f"{frame} (t={timestamp:.2f}s):\n  OCR: {ocr_text}\n  BLIP: {blip_text}"
        structured_captions.append(caption_block)

    captions_str = "\n\n".join(structured_captions)

    combined_text = f"Transcript: {transcription}\n\nCaptions:\n{captions_str}"
    return combined_text


## Mistral model

In [108]:
# Initialize the Mistral-7B tokenizer once; ask_mistral2 uses it to cap the
# context length (in tokens) before building the prompt.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

In [160]:
def ask_mistral2(question, context, question_type=None, capability=None, max_context_tokens=6000):
    """
    Ask the local Ollama "mistral" model a question about one video.

    Args:
        question: Question text.
        context: Combined transcript/caption text for the video.
        question_type: Dataset question type label, e.g.
            'Multiple-choice Question with a Single Correct Answer'. It selects
            the answer-format instruction by prefix.
        capability: Dataset capability label; selects an extra hint line.
        max_context_tokens: Maximum number of tokenizer tokens of `context`
            kept in the prompt.

    Returns:
        The stripped model response. Failures are returned as strings rather
        than raised: 'API Error: <status>' for a non-200 reply and
        'Error: <message>' for any exception during the request.

    NOTE(review): the prompt hand-writes [INST]/<<SYS>> markers and the request
    does not set Ollama's `raw` flag, so the server presumably applies the
    model's own template on top of them — confirm against the Ollama docs.
    """

    # Step 1: Truncate context
    # Encode without special tokens so no BOS marker is decoded back into the
    # middle of the prompt, and only round-trip through the tokenizer when the
    # context is actually too long.
    context_tokens = tokenizer.encode(context, add_special_tokens=False)
    if len(context_tokens) > max_context_tokens:
        truncated_context = tokenizer.decode(
            context_tokens[:max_context_tokens], skip_special_tokens=True
        )
    else:
        truncated_context = context

    # Step 2: Define answer style
    # The dataset labels are long ('Wrongly-led Open-ended Question', ...), so
    # match on the label prefix; an exact dictionary lookup never hit.
    answer_styles = {
        "Multiple-choice": "Respond with just the correct letter (A, B, C, D, or E).",
        "Wrongly-led": "Respond 'Yes' or 'No' only. Briefly justify if needed.",
        "Correctly-led": "Respond 'Yes' or 'No' only. Briefly justify if needed."
    }
    question_format = "Respond in one precise sentence."
    if isinstance(question_type, str):
        for prefix, instruction in answer_styles.items():
            if question_type.startswith(prefix):
                question_format = instruction
                break

    # Step 3: Define capability focus
    capability_map = {
        "reasoning": "Use logical reasoning, even if some frames are missing.",
        "temporal": "Track sequence and timing of events.",
        "emotional": "Interpret emotional tone, reactions, or changes.",
        "action": "Focus on physical movement or gestures shown."
    }

    capability_hint = ""
    # Non-string values (e.g. NaN from a missing cell) are ignored instead of
    # failing on .lower().
    if isinstance(capability, str) and capability:
        capability_lower = capability.lower()
        for key, val in capability_map.items():
            if key in capability_lower:
                capability_hint = val
                break
        else:
            capability_hint = f"Apply {capability_lower} skills where needed."

    # Step 4: Build final system prompt
    prompt = f"""<s>[INST] <<SYS>>
You are a highly accurate assistant answering questions about short video clips (e.g., YouTube Shorts).

The context includes:
- Frame-level descriptions (may be missing or incomplete)
- On-screen text (OCR)
- Audio transcript in time order

{capability_hint}
{question_format}

Important rules:
- Infer missing or unclear parts based on the overall flow
- Do NOT just rely on surface words—**reason from sequence and implication**
- Do NOT repeat or paraphrase the question
- Do NOT explain your reasoning unless asked
- Do NOT say “I think”, “It seems”, or “Based on the video”
- Answer directly and clearly
<</SYS>>

Context:
{truncated_context}

Question: {question}
Answer: [/INST]
"""

    # Step 5: API Call
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "mistral",
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.4,
                    # Ollama's generation-length option is `num_predict`;
                    # `max_tokens` is not one of its options.
                    "num_predict": 300,
                    "stop": ["</s>", "[INST]"]
                }
            },
            timeout=180
        )
        if response.status_code != 200:
            return f"API Error: {response.status_code}"
        return response.json().get("response", "").strip()
    except Exception as e:
        return f"Error: {str(e)}"


## Running inference on the test videos (10 videos)

In [162]:
# Filtering the df for the test videos and its relevant questions.
# NOTE(review): test_video_ids is built from df["video_id"].unique(), so as
# written this filter keeps every row of df — confirm whether a smaller
# subset of videos was intended.
test_df = df[df["video_id"].isin(test_video_ids)]

In [32]:
# Preview the filtered question rows.
test_df.head()

Unnamed: 0,qid,video_id,question_type,capability,question,duration,question_prompt,answer,youtube_url
40,0080-0,_MXxJT8Mk4k,Primary Open-ended Question,Professional Knowledge,What is the purpose of beating the balloon?,13.18,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/_MXxJT8Mk4k
41,0080-1,_MXxJT8Mk4k,Paraphrased Open-ended Question,Professional Knowledge,Why do you need to hit the balloon in this act...,13.18,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/_MXxJT8Mk4k
42,0080-2,_MXxJT8Mk4k,Correctly-led Open-ended Question,Professional Knowledge,Is the purpose of hitting the balloon to creat...,13.18,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/_MXxJT8Mk4k
43,0080-3,_MXxJT8Mk4k,Wrongly-led Open-ended Question,Professional Knowledge,Is the purpose of hitting the balloon to make ...,13.18,Please state your answer with a brief explanat...,,https://www.youtube.com/shorts/_MXxJT8Mk4k
44,0080-7,_MXxJT8Mk4k,Multiple-choice Question with a Single Correct...,Professional Knowledge,What is the purpose of beating the balloon?\nA...,13.18,E. None of the above\nSelect one best answer t...,,https://www.youtube.com/shorts/_MXxJT8Mk4k


In [33]:
# List the distinct question_type labels present in the filtered rows.
test_df["question_type"].unique()

array(['Primary Open-ended Question', 'Paraphrased Open-ended Question',
       'Correctly-led Open-ended Question',
       'Wrongly-led Open-ended Question',
       'Multiple-choice Question with a Single Correct Answer'],
      dtype=object)

In [163]:
# Note: get_combined_context reads the module-level `video_data`; the
# `test_video_data` loaded here is not referenced by the loop below.
with open("/Users/ryan/Documents/GitHub/AISG_Challenge/video_context_cleaned.json", "r", encoding="utf-8") as file:
    test_video_data = json.load(file)

# Ask the model every question and collect one record per question.
results = []
for _, row in test_df.iterrows():
    qid, video_id = row["qid"], row["video_id"]
    question, question_type, capability = row["question"], row["question_type"], row["capability"]

    context = get_combined_context(video_id)
    answer = ask_mistral2(question, context, question_type, capability)

    print(f"{qid}: {answer}, Question: {question}, Question_type: {question_type}")
    results.append(dict(
        video_id=video_id,
        qid=qid,
        pred=answer,
        question=question,
        question_type=question_type,
    ))

0008-0: The last person in the video is not shown performing any action. The first two people are depicted as sitting at a table, one with a bottle of beer and the other with a knife and a bottle of coke., Question: What is the difference between the action of the last person in the video and the actions of the first two people?, Question_type: Primary Open-ended Question
0008-1: The last person in the video is drinking beer, while the others are not shown consuming alcohol. Additionally, the last person is holding a knife and a bottle of Coke, which suggests they may be preparing food or drinks for someone else, as neither the other individuals nor any food or drink items are shown in their hands., Question: Can you describe how the actions of the last person in the video differ from other individuals?, Question_type: Paraphrased Open-ended Question
0008-2: Yes, the man opened the beer bottle without using a knife., Question: Did the last person open the bottle without using a knife?,

In [164]:
# One row per answered question, with the columns stored in each `results` record.
submission_test_df = pd.DataFrame(results)

In [None]:
# Keep an unmodified copy of all predictions (written relative to the working directory).
pd.DataFrame(results).to_csv("Raw_QA.csv", index=False)

In [167]:
# Clean multiple-choice answers to keep only the letter (A, B, C, D, E)
def extract_choice_letter(answer):
    """Reduce a multiple-choice answer to its leading option letter.

    Args:
        answer: Model output; it is converted with str() before matching.

    Returns:
        The upper-cased letter when the text starts (after optional
        whitespace) with a standalone A–E, otherwise `answer` unchanged.
        E is included because the dataset's options include
        'E. None of the above'.

    Intended for multiple-choice answers only: free-text answers that begin
    with the word "A" would also be reduced to a letter.
    """
    match = re.match(r"^\s*([A-Ea-e])\b", str(answer))
    return match.group(1).upper() if match else answer

# Reduce predictions to a letter only on multiple-choice rows. Applying the
# cleaner to every row would turn open-ended answers that start with the word
# "A" into the letter "A".
# The column check keeps the cell safe to re-run: after the first run only
# qid/pred remain and the predictions are already cleaned.
if "question_type" in submission_test_df.columns:
    is_multiple_choice = (
        submission_test_df["question_type"].astype(str).str.startswith("Multiple-choice")
    )
    submission_test_df.loc[is_multiple_choice, "pred"] = (
        submission_test_df.loc[is_multiple_choice, "pred"].apply(extract_choice_letter)
    )

submission_test_df = submission_test_df[["qid", "pred"]]

In [168]:
# Save the cleaned version: the submission frame as it stands at this point, without the index column.
submission_test_df.to_csv("/Users/ryan/Documents/GitHub/AISG_Challenge/submission_cleaned.csv", index=False)