# Setup

In [1]:
import os
import torchaudio

In [2]:
# data path (set the AUDIOSET_MUSIC_MOOD_ROOT environment variable to run this notebook
# where the dataset lives somewhere else; the default is the original location):
DATA_ROOT = os.environ.get(
    "AUDIOSET_MUSIC_MOOD_ROOT",
    "/proj/systewar/datasets/audioset_music_mood/audio_files",
)
# audio constants:
SAMPLE_RATE = 16000     # in Hz
CLIP_LENGTH_EXPECT = 10.0      # in seconds

In [3]:
# script options -- names of the dataset subsets (sub-directories of DATA_ROOT) to process:
data_subsets = [
    "train",
    "eval",
]

# Dataset Verification

In [4]:
# extract data file names (maps each subset name to the regular files in its directory):
data_file_names = {}
for subset in data_subsets:
    subset_root = os.path.join(DATA_ROOT, subset)
    # Directory listings come back in arbitrary order, so sort the names to make the
    # listing (and anything later derived from its order) reproducible across runs.
    # os.scandir() is used instead of os.listdir() + os.path.isfile() to avoid building
    # a path and issuing a separate file-type query for every entry; the context manager
    # closes the directory iterator as soon as the listing is done.
    with os.scandir(subset_root) as entries:
        data_file_names[subset] = sorted(entry.name for entry in entries if entry.is_file())
    print("Original size of {} set: {}".format(subset, len(data_file_names[subset])))

Original size of train set: 13268
Original size of eval set: 344


In [5]:
# verify dataset (every listed file must be named "*.wav" and be sampled at SAMPLE_RATE):
for subset in data_subsets:
    print("Veryifing {} subset...".format(subset))
    for file_name in data_file_names[subset]:
        file_path = os.path.join(DATA_ROOT, subset, file_name)

        # verify file name (the message names the offending file so a failure is actionable):
        assert file_path.endswith(".wav"), "File path does not end in '.wav': {}".format(file_path)
        # verify sampling rate, as reported by torchaudio.info() for this file:
        metadata = torchaudio.info(file_path)
        assert metadata.sample_rate == SAMPLE_RATE, (
            "Incorrect sampling rate ({} Hz, expected {} Hz): {}".format(
                metadata.sample_rate, SAMPLE_RATE, file_path
            )
        )

Veryifing train subset...
Veryifing eval subset...


# Dataset Exploration

In [6]:
# count number of audio files with unexpected clip lengths:
for subset in data_subsets:
    n_unexpect_files = 0
    for file_name in data_file_names[subset]:
        file_path = os.path.join(DATA_ROOT, subset, file_name)

        # Compare lengths in whole frames instead of seconds: this avoids an exact
        # floating-point equality test on a computed duration, and uses the sampling rate
        # reported for this file rather than assuming it equals SAMPLE_RATE, so the count
        # does not depend on the verification cell having been run first.
        metadata = torchaudio.info(file_path)
        n_frames_expect = round(CLIP_LENGTH_EXPECT * metadata.sample_rate)
        if metadata.num_frames != n_frames_expect:
            n_unexpect_files += 1

    print("Number of files in {} set with unexpected lengths: {}".format(subset, n_unexpect_files))

Number of files in train set with unexpected lengths: 397
Number of files in eval set with unexpected lengths: 4
