In [2]:
import os
import subprocess
import csv
from pytube import YouTube
from pytube.exceptions import VideoUnavailable
from itertools import islice

def create_first_5(input_file, output_folder, ytid):
    """Extract the first 5 seconds of ``input_file`` into ``output_folder``.

    The clip is written to ``<output_folder>/<ytid>.mp4`` using ffmpeg with
    ``-c copy`` (streams are copied, not re-encoded). If that file already
    exists, a notice is printed and ffmpeg is not invoked.

    Args:
        input_file: Path of the source video.
        output_folder: Directory that receives the clip; created if missing.
        ytid: Identifier used as the clip's file name (without extension).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    output_file = os.path.join(output_folder, f"{ytid}.mp4")

    # Nothing to do when the clip was already produced by an earlier run.
    if os.path.exists(output_file):
        print(f"The first 5 seconds file {output_file} already exists, skipping extraction.")
        return

    os.makedirs(output_folder, exist_ok=True)

    # Pass the arguments as a list and skip the shell entirely, so quotes,
    # spaces, `$` or backticks inside the paths are never shell-interpreted.
    command = [
        "ffmpeg",
        "-i", input_file,
        "-t", "00:00:05",
        "-c", "copy",
        output_file,
    ]
    subprocess.run(command, check=True)

    print(f'The first 5 seconds of the video have been extracted and saved to {output_file}')

def download_video(ytid, output_folder):
    """Download the YouTube video ``ytid`` as ``<output_folder>/<ytid>.mp4``.

    The mp4 stream returned by ``get_highest_resolution()`` is downloaded.
    Failures are reported with a printed message instead of being raised, so
    a batch loop can continue with the next video.

    Args:
        ytid: YouTube video id (the ``v=`` query parameter).
        output_folder: Directory that receives the file; created if missing.

    Returns:
        None. Check for the output file to find out whether it succeeded.
    """
    os.makedirs(output_folder, exist_ok=True)

    output_file = os.path.join(output_folder, f"{ytid}.mp4")

    try:
        yt = YouTube(f"https://www.youtube.com/watch?v={ytid}")
        video = yt.streams.filter(file_extension='mp4').get_highest_resolution()

        # The stream query can come back empty; without this guard the
        # `.download` call below would fail with AttributeError on None.
        if video is None:
            print(f'Video {ytid} has no downloadable mp4 stream and will be skipped.')
            return

        video.download(output_folder, f"{ytid}.mp4")

        print(f'The video {ytid} has been downloaded and saved to {output_file}')
    except VideoUnavailable:
        print(f'Video {ytid} is unavailable and will be skipped.')
    except KeyError:
        print(f'Video {ytid} is age-restricted or has access restrictions and will be skipped.')

# Read the label CSV. Every row contributes to the ytid -> label lookup, but
# 5-second clips are only generated for the first MAX_CLIP_ROWS rows.
csv_file = '/home/mila/s/subhrajyoti.dasgupta/scratch/videollama/data/audioset/audioset_videos_of_interest_with_label_names.csv'
VIDEO_DIR = 'vids'        # where full-length downloads are expected
CLIP_DIR = 'first_5'      # where the 5-second clips are written
MAX_CLIP_ROWS = 50        # only this many leading rows get a clip
DOWNLOAD_VIDEOS = False   # set True to fetch videos before clipping

label_names = {}
label_set = set()

# newline='' lets the csv module do its own newline handling, which is
# required for quoted fields that contain embedded line breaks.
with open(csv_file, 'r', newline='') as file:
    reader = csv.DictReader(file)
    for row_index, row in enumerate(reader):
        ytid = row['# YTID']
        label_name = row['label_name']
        label_names[ytid] = label_name
        label_set.add(label_name)

        # Rows past the limit only feed the label lookup.
        if row_index >= MAX_CLIP_ROWS:
            continue

        if DOWNLOAD_VIDEOS:
            download_video(ytid, VIDEO_DIR)

        # Clip only the videos that are actually present on disk.
        input_file = f'{VIDEO_DIR}/{ytid}.mp4'
        if os.path.exists(input_file):
            create_first_5(input_file, CLIP_DIR, ytid)

print('Label names dictionary:', label_names)
print('Label names set:', label_set)

The first 5 seconds file first_5/--i-y1v8Hy8.mp4 already exists, skipping extraction.
The first 5 seconds file first_5/-0CamVQdP_Y.mp4 already exists, skipping extraction.
The first 5 seconds file first_5/-0Gj8-vB1q4.mp4 already exists, skipping extraction.
Label names dictionary: {'--i-y1v8Hy8': 'Music', '-0CamVQdP_Y': 'Music', '-0Gj8-vB1q4': 'Music', '-0YUDn-1yII': 'Music', '-0jeONf82dE': 'Music', '-0p7hKXZ1ww': 'Speech', '-0vPFx-wRRI': 'Music', '-0xzrMun0Rs': 'Music', '-0yRK50zyTI': 'Speech', '-1Hub6Ps_cc': 'Speech', '-1II0Di9Hkc': 'Music', '-1LQP2wemiQ': 'Speech', '-1OlgJWehn8': 'Music', '-1UWSisR2zo': 'Music', '-1hDIl9Udkw': 'Music', '-1nilez17Dg': 'Speech', '-1pPw9zZopA': 'Toilet flush', '-21_SXelVNo': 'Speech', '-2RPPODqLy4': 'Music', '-2xiZDEuHd8': 'Music', '-3-4qmWSJXU': 'Thunderstorm', '-3-JjN3BXjA': 'Speech', '-36qTeAdDMI': 'Speech', '-3Kv4fdm7Uk': 'Music', '-3z5mFRgbxc': 'Speech', '-4SYC2YgzL8': 'Music', '-4pmCrSdMhg': 'Music', '-53zl3bPmpM': 'Music', '-5FoeegAgvU': 'Music'

In [9]:
import json
import random

# Build the multiple-choice 'cod' test set from the IAVQD test split.
# Relies on `label_names` and `label_set` built by the CSV-loading cell,
# so that cell must have been run first.
IAVQD_JSON = '/home/mila/s/subhrajyoti.dasgupta/scratch/videollama/data/gt/iavqd/instruct_test_IAVQD_44K.json'
COD_JSON = "/home/mila/s/subhrajyoti.dasgupta/scratch/videollama/data/gt/cod/instruct_test_COD_44K.json"
RANDOM_SEED = 0

with open(IAVQD_JSON, 'r') as f:
    data = json.load(f)

# A dedicated seeded generator plus a *sorted* label list make the generated
# file reproducible: iteration order of a set of strings changes between
# Python processes, so sampling from list(set) is not stable even when seeded.
rng = random.Random(RANDOM_SEED)
all_labels = sorted(label_set)

new_data = []
for entry_id, d in enumerate(data):
    # d['video'] holds clip ids joined by '#'; the first two are used.
    video_ids = d['video'].split('#')
    k1, k2 = video_ids[0], video_ids[1]
    l1, l2 = label_names[k1], label_names[k2]

    # Labels usable for the distractor option. Filtering (rather than calling
    # set.remove twice) also works when both clips share the same label.
    distractor_pool = [label for label in all_labels if label != l1 and label != l2]

    # Prefer two different labels so the distractor never reads
    # "X is followed by X"; fall back to independent draws for tiny pools.
    if len(distractor_pool) >= 2:
        wrong_first, wrong_second = rng.sample(distractor_pool, 2)
    else:
        wrong_first = rng.choice(distractor_pool)
        wrong_second = rng.choice(distractor_pool)

    correct = f'{l1} is followed by {l2}.'
    # NOTE(review): the correct answer is always the first option, as in the
    # original code -- confirm that downstream evaluation shuffles the choices.
    new_data.append({
        'video': d['video'],
        'question': 'What is the sequence of events in the video?',
        'multiple_choice': [
            correct,
            f'{l2} is followed by {l1}.',
            'Both of them occur at the same time.',
            f'{wrong_first} is followed by {wrong_second}.',
        ],
        'answer': correct,
        'task': 'cod',
        'id': entry_id,
    })

with open(COD_JSON, "w") as final:
    json.dump(new_data, final)

In [6]:
# Sanity check: number of generated entries (one is appended per item in `data`).
len(new_data)

51728