In [2]:
from skimage.metrics import structural_similarity as ssim
import os
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt
import PIL.Image
import csv
import json
from pytube import YouTube
from moviepy.editor import VideoFileClip
import google.generativeai as genai

def extract_clip(video_path, output_path, start, end):
    """Write the ``[start, end]`` sub-clip of a video file to ``output_path``.

    This is a best-effort operation: any failure is reported on stdout and
    swallowed, so the function always returns ``None`` and never raises for
    extraction errors.

    Args:
        video_path: Path of the source video file.
        output_path: Path the extracted sub-clip is written to.
        start: Start of the sub-clip, forwarded to ``subclip``.
        end: End of the sub-clip, forwarded to ``subclip``.
    """
    clip = None
    try:
        clip = VideoFileClip(video_path)
        extracted_clip = clip.subclip(start, end)
        extracted_clip.write_videofile(output_path)
        print(f"Clip [{start}, {end}] of {video_path} extracted successfully")
    except Exception as e:
        print(f"Error extracting clip [{start}, {end}] of {video_path}: {e}")
    finally:
        # Release the reader held by the source clip so the file handles do
        # not stay open (which would also block deleting the source file on
        # some platforms).
        if clip is not None:
            clip.close()
def _select_key_frames(clip_path, output_directory, threshold=0.5):
    """Save frames that differ strongly from their predecessor; return their paths.

    Each frame is compared with the frame immediately before it using the
    mean of the per-channel SSIM scores. A frame is written to
    ``output_directory`` when that mean falls below ``threshold``. The first
    frame has no predecessor and is therefore never selected.

    Args:
        clip_path: Path of the video file to scan.
        output_directory: Existing directory the selected frames are saved to.
        threshold: Similarity below which a frame counts as distinct.

    Returns:
        Paths of the saved frames, in frame order.
    """
    frame_paths = []
    cap = cv2.VideoCapture(clip_path)
    try:
        n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        previous_channels = None
        for frame_idx in tqdm(range(n_frames), desc="Processing Frames"):
            ret, img = cap.read()
            if not ret:
                break
            # Split once per frame; the result is reused as the "previous"
            # frame of the next iteration.
            channels = cv2.split(img)
            if previous_channels is not None:
                # Only the scalar score is needed, so the full SSIM map is
                # not requested.
                similarity_index = sum(
                    ssim(prev, cur)
                    for prev, cur in zip(previous_channels, channels)
                ) / len(channels)
                if similarity_index < threshold:
                    frame_filename = os.path.join(output_directory, f"frame_{frame_idx:04d}.png")
                    cv2.imwrite(frame_filename, img)
                    frame_paths.append(frame_filename)
            previous_channels = channels
    finally:
        # Always release the capture object, even if frame processing fails.
        cap.release()
    return frame_paths


def process_youtube_video(video_id, start, end, prompt):
    """Download a YouTube video, pick key frames from a sub-clip and query Gemini.

    The video is downloaded as ``{video_id}.mp4``, the ``[start, end]``
    sub-clip is written to ``clip/{video_id}.mp4``, and frames that differ
    enough from the preceding frame are saved under
    ``{video_id}/selected_frames``. At most the first four selected frames
    (in frame order) are sent to the ``gemini-pro-vision`` model together
    with ``prompt``.

    Args:
        video_id: YouTube video id.
        start: Start of the sub-clip, forwarded to ``extract_clip``.
        end: End of the sub-clip, forwarded to ``extract_clip``.
        prompt: Text prompt sent ahead of the images.

    Returns:
        The text of the model response, or ``None`` when no frame was
        selected.

    Raises:
        ValueError: If the video offers no progressive mp4 stream.
    """
    max_images = 4  # upper bound on the number of frames sent to the model

    # Download the YouTube video
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    video = YouTube(video_url)
    video_stream = video.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    if video_stream is None:
        raise ValueError(f"No progressive mp4 stream available for video {video_id}")
    video_filename = f"{video_id}.mp4"
    video_stream.download(filename=video_filename)

    try:
        # The clip directory must exist before the sub-clip can be written.
        os.makedirs('clip', exist_ok=True)
        clip_path = f'clip/{video_filename}'
        extract_clip(video_filename, clip_path, start, end)

        # Creating a directory to save the selected frames
        output_directory = f'{video_id}/selected_frames'
        os.makedirs(output_directory, exist_ok=True)
        frame_paths = _select_key_frames(clip_path, output_directory, threshold=0.5)
    finally:
        # Removing the downloaded video file, even when processing failed
        os.remove(video_filename)

    print(f'Total key frames based on the threshold chosen: {len(frame_paths)}')
    if not frame_paths:
        return None

    model = genai.GenerativeModel('gemini-pro-vision')
    # Use the frames saved by this run (not a directory listing) so that the
    # order is deterministic and stale frames from earlier runs are ignored.
    images = [PIL.Image.open(path) for path in frame_paths[:max_images]]
    response = model.generate_content([prompt, *images])
    return response.text
def read_times(file_path):
    """Load per-video clip boundaries from a CSV file.

    The CSV must have a header row with the columns ``# YTID``,
    ``start_seconds`` and ``end_seconds``.

    Args:
        file_path: Path of the CSV file.

    Returns:
        A dict mapping each ``# YTID`` value to a ``(start, end)`` tuple of
        floats. When an id occurs more than once, the last row wins.
    """
    with open(file_path, 'r') as handle:
        return {
            record['# YTID']: (
                float(record['start_seconds']),
                float(record['end_seconds']),
            )
            for record in csv.DictReader(handle)
        }
def generate_output(json_file_path, csv_file_path, output_path='answer.json', limit=20):
    """Answer multiple-choice questions about videos and dump the results as JSON.

    Each question item must provide the keys ``video``, ``question``,
    ``multi_choice`` and ``id``. The YouTube id is the part of ``video``
    after the last ``#``; its clip boundaries are looked up in the CSV file.

    Args:
        json_file_path: Path of the JSON file holding the list of questions.
        csv_file_path: Path of the CSV file read by ``read_times``.
        output_path: File the list of answers is written to.
        limit: Number of leading questions to process; ``None`` processes
            all of them.

    Raises:
        KeyError: If a video id has no entry in the CSV file or a question
            item lacks a required key.
    """
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    ivals = read_times(csv_file_path)
    output = []
    for item in data[:limit]:
        video = item['video']
        vid_id = video.split('#')[-1]
        question_text = item['question']
        # Label the choices (A), (B), ... and separate them with ", ".
        options = ", ".join(
            f"({chr(65 + idx)}){option}"
            for idx, option in enumerate(item['multi_choice'])
        )
        prompt = f"For the video and audio pair, choose the correct option. Do not provide any explanation or extra sentences. {question_text} {options}"

        print(prompt)
        start, end = ivals[vid_id]
        ans = process_youtube_video(vid_id, start, end, prompt)
        output_item = {
            "vname": video,
            "raw_ans": f"Based on the video and audio pair, the correct answer is {ans}.",
            "extracted_ans": ans,
            "id": item['id'],
            "prompt": prompt
        }
        output.append(output_item)
    with open(output_path, 'w') as outfile:
        json.dump(output, outfile, indent=4)
# Run the evaluation: questions come from the JSON file, clip boundaries from
# the CSV file.
QUESTIONS_JSON = '/home/mila/s/subhrajyoti.dasgupta/scratch/videollama/data/gt/iasd/audioset/instruct_test_IASD.json'
CLIP_TIMES_CSV = '/home/mila/s/subhrajyoti.dasgupta/scratch/videollama/data/audioset/audioset_videos_of_interest_with_label_names.csv'
generate_output(QUESTIONS_JSON, CLIP_TIMES_CSV)

For the video and audio pair, choose the correct option. Do not provide any explanation or extra sentences. What object is present in the video? (A)Bottle, (B)Hob, (C)Calculator, (D)Sponge
For the video and audio pair, choose the correct option. Do not provide any explanation or extra sentences. What object is present in the video? (A)Rag, (B)Tile, (C)Sweater, (D)Recliner
For the video and audio pair, choose the correct option. Do not provide any explanation or extra sentences. What object is present in the video? (A)Tray, (B)floor, (C)Beanie, (D)Tape
For the video and audio pair, choose the correct option. Do not provide any explanation or extra sentences. What object is present in the video? (A)Rock, (B)Connector, (C)Grill, (D)Rug
For the video and audio pair, choose the correct option. Do not provide any explanation or extra sentences. What object is present in the video? (A)Luggage, (B)Cloth, (C)Socks, (D)Condenser
For the video and audio pair, choose the correct option. Do not pro