# Setup

In [1]:
import os
import torchaudio
import numpy as np
import json
import tqdm

In [2]:
# data paths:
# Root folder that holds the "nsynth-<subset>" directories. The default is kept as-is,
# but the NSYNTH_ROOT environment variable overrides it so the notebook can be run on
# another machine without editing this cell.
DATA_ROOT = os.environ.get("NSYNTH_ROOT", "/proj/systewar/datasets/NSynth")
# audio constants:
SAMPLE_RATE = 16000     # in Hz
CLIP_LENGTH_EXPECT = 4.0      # in seconds

In [3]:
# script options:
# dataset subsets to process; each name is used to build the "nsynth-<subset>" folder name.
data_subsets = [
    "train",
    "valid",
    "test",
]

# Dataset Verification

In [4]:
# extract data file names and sampling rates from metadata json files:
data_file_names = {}
for subset in data_subsets:
    # load metadata (read explicitly as UTF-8 instead of relying on the
    # platform/locale default encoding of open()):
    metadata_file = os.path.join(DATA_ROOT, f"nsynth-{subset}", "examples.json")
    with open(metadata_file, "r", encoding="utf-8") as json_file:
        metadata = json.load(json_file)
    # extract file names (the top-level keys of the metadata dict):
    data_file_names[subset] = list(metadata.keys())
    print("Size of {} set: {}".format(subset, len(data_file_names[subset])))
    # verify sampling rates: every metadata entry must report exactly SAMPLE_RATE
    # (an empty metadata file also fails this check, since the set is then empty):
    sample_rates = {item["sample_rate"] for item in metadata.values()}
    assert sample_rates == {SAMPLE_RATE}, "Found unexpected sampling rate."

Size of train set: 289205
Size of valid set: 12678
Size of test set: 4096


In [5]:
# extract data file names from folders:
data_file_names_check = {}
for subset in data_subsets:
    audio_dir = os.path.join(DATA_ROOT, f"nsynth-{subset}", "audio")
    # keep only regular files; anything else in the folder (e.g. sub-directories) is skipped:
    audio_files = []
    for entry in os.listdir(audio_dir):
        if os.path.isfile(os.path.join(audio_dir, entry)):
            audio_files.append(entry)
    data_file_names_check[subset] = audio_files
    print("Size of {} set: {}".format(subset, len(audio_files)))

Size of train set: 289205
Size of valid set: 12678
Size of test set: 4096


In [6]:
# verify dataset:
for subset in data_subsets:
    file_names = data_file_names_check[subset]

    # check for duplicate file names:
    assert len(set(file_names)) == len(file_names), "Duplicate file names found."

    # cross-check the folder contents against the metadata, so that a missing or extra
    # audio file is detected even when the two counts happen to agree.
    # NOTE: assumes the metadata keys are the audio file names without their extension
    # (the NSynth layout: "<note_str>.wav" on disk, "<note_str>" as the examples.json key).
    file_stems = {os.path.splitext(file_name)[0] for file_name in file_names}
    assert file_stems == set(data_file_names[subset]), "Audio files do not match metadata entries."

    # tqdm infers the total from the list length, so no explicit total is needed:
    for file_name in tqdm.tqdm(file_names, desc="Verifying {} subset...".format(subset)):
        file_path = os.path.join(DATA_ROOT, f"nsynth-{subset}", "audio", file_name)

        # verify file name:
        assert file_name.endswith(".wav"), "File name does not end in '.wav'"

        # verify sampling rate (read from the audio file header, not from the metadata json):
        metadata = torchaudio.info(file_path)
        assert metadata.sample_rate == SAMPLE_RATE, "Incorrect sampling rate."

Verifying train subset...: 100%|██████████| 289205/289205 [00:10<00:00, 28095.66it/s]
Verifying valid subset...: 100%|██████████| 12678/12678 [00:00<00:00, 25303.57it/s]
Verifying test subset...: 100%|██████████| 4096/4096 [00:00<00:00, 29290.74it/s]


# Dataset Exploration

In [7]:
# count number of audio files with unexpected clip lengths:
for subset in data_subsets:
    n_unexpect_files = 0
    # running extremes of the clip length in seconds; they keep these initial values
    # if the subset contains no files:
    min_clip_length = np.inf
    max_clip_length = 0.0
    print()
    # tqdm infers the total from the list length, so no explicit total is needed:
    for file_name in tqdm.tqdm(data_file_names_check[subset], desc="Examining {} subset...".format(subset)):
        file_path = os.path.join(DATA_ROOT, f"nsynth-{subset}", "audio", file_name)

        # get clip length (in seconds), computed with the nominal SAMPLE_RATE:
        metadata = torchaudio.info(file_path)
        length = metadata.num_frames / SAMPLE_RATE

        # check things:
        # exact comparison is intended here: any deviation in the frame count,
        # however small, counts as an unexpected length.
        if length != CLIP_LENGTH_EXPECT:
            n_unexpect_files += 1
        min_clip_length = min(min_clip_length, length)
        max_clip_length = max(max_clip_length, length)

    print("Number of files with unexpected lengths: {}".format(n_unexpect_files))
    print("Minimum clip length: {}s".format(min_clip_length))
    print("Maximum clip length: {}s".format(max_clip_length))




Examining train subset...: 100%|██████████| 289205/289205 [00:10<00:00, 28529.45it/s]


Number of files with unexpected lengths: 0
Minimim clip length: 4.0s
Maximum clip length: 4.0s



Examining valid subset...: 100%|██████████| 12678/12678 [00:00<00:00, 25796.30it/s]


Number of files with unexpected lengths: 0
Minimim clip length: 4.0s
Maximum clip length: 4.0s



Examining test subset...: 100%|██████████| 4096/4096 [00:00<00:00, 16424.74it/s]

Number of files with unexpected lengths: 0
Minimim clip length: 4.0s
Maximum clip length: 4.0s



