In [1]:
import os
import csv

dataset_dir = '/beegfs/work/AudioSet'
data_dir = os.path.join(dataset_dir, 'data')
audio_ext = '.flac'
video_ext = '.mp4'

In [2]:
# Find out which files have not been downloaded from AudioSet

missing_files = {}
missing_audio_files = {}
missing_video_files = {}

for subset_name in os.listdir(data_dir):
    if not os.path.isdir(os.path.join(data_dir, subset_name)):
        continue
        
    subset_path = os.path.join(dataset_dir, "{}.csv".format(subset_name))
    subset_dir = os.path.join(data_dir, subset_name)
    
    missing_files[subset_name] = []
    missing_audio_files[subset_name] = []
    missing_video_files[subset_name] = []
    
    # Get the files that have been downloaded
    local_subset_audio_files = set([os.path.splitext(fname)[0] for fname in os.listdir(os.path.join(subset_dir, 'audio'))])
    local_subset_video_files = set([os.path.splitext(fname)[0] for fname in os.listdir(os.path.join(subset_dir, 'video'))])

    # Get all files from the the subset csv files
    with open(subset_path, 'r') as f:
        subset_data = csv.reader(f)

        for row_idx, row in enumerate(subset_data):
            # Skip commented lines
            if row[0][0] == '#':
                continue
            ytid, ts_start, ts_end = row[0], float(row[1]), float(row[2])
            tms_start, tms_end = int(ts_start * 1000), int(ts_end * 1000)
            media_filename = '{}_{}_{}'.format(ytid, tms_start, tms_end)
            
            missing_audio = media_filename not in local_subset_audio_files
            missing_video = media_filename not in local_subset_video_files
            
            # Keep track of missing audio or video files
            if missing_audio or missing_video:
                missing_files[subset_name].append(row)
                
                # Keep track of the audio and videos separately for comparison
                if missing_audio:
                    missing_audio_files[subset_name].append(row)
                if missing_video:
                    missing_video_files[subset_name].append(row)
                   
    # Write a new csv containing only the YouTube video segments with missing files
    missing_subset_path = os.path.join(dataset_dir, "{}-missing.csv".format(subset_name))
    with open(missing_subset_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(missing_files[subset_name])

In [3]:
# Get the number of YouTube videos with missing files for each subset
for subset, missing_file_list in missing_files.items():
    print("{}: Missing files for {} YouTube videos".format(subset, len(missing_file_list)))

eval_segments: Missing files for 578 YouTube videos
unbalanced_train_segments: Missing files for 375165 YouTube videos
balanced_train_segments: Missing files for 657 YouTube videos


In [4]:
# Get the number of YouTube videos with audio but no video for each subset
for subset in missing_files.keys():
    audio_no_video = list(set(map(tuple, missing_audio_files[subset])) - set(map(tuple, missing_video_files[subset])))
    print("{}: {} YouTube videos with audio but no video".format(subset, len(audio_no_video)))
 

eval_segments: 2 YouTube videos with audio but no video
unbalanced_train_segments: 562 YouTube videos with audio but no video
balanced_train_segments: 2 YouTube videos with audio but no video


In [5]:
# Get the number of YouTube videos with video but no audio for each subset

for subset in missing_files.keys():
    video_no_audio = list(set(map(tuple, missing_video_files[subset])) - set(map(tuple, missing_audio_files[subset])))
    print("{}: {} YouTube videos with video but no audio".format(subset, len(video_no_audio)))

eval_segments: 2 YouTube videos with video but no audio
unbalanced_train_segments: 532 YouTube videos with video but no audio
balanced_train_segments: 4 YouTube videos with video but no audio
