In [13]:
from collections import Counter, defaultdict
from pathlib import Path
from statistics import mean
from typing import Tuple

import torchaudio

from src.data.dataset import AudioClassificationDataset
from src.utils import load_config

In [14]:
# Load the configuration once; the same object is passed to every
# AudioClassificationDataset constructed in the cells below.
CONFIG = load_config()


def calculate_wav_duration(filepath: Path) -> Tuple[float, int]:
    """Load an audio file and measure its length.

    Args:
        filepath: Location of the audio file; handed directly to
            ``torchaudio.load``.

    Returns:
        A ``(duration_seconds, num_samples)`` pair. ``num_samples`` is the
        size of the loaded tensor's second dimension, and
        ``duration_seconds`` is that count divided by the sample rate
        returned by ``torchaudio.load``.
    """
    audio, rate = torchaudio.load(filepath)
    # Index 1 of the shape is used as the per-file sample count.
    frame_count = audio.shape[1]
    return frame_count / rate, frame_count


def summarize(dataset: AudioClassificationDataset) -> None:
    """Print size, duration and class-balance statistics for a dataset.

    Every entry of ``dataset.samples`` is measured with
    ``calculate_wav_duration`` (which loads the file), so the cost of a call
    grows with the number of files.

    Args:
        dataset: Object exposing ``samples`` (a sized iterable of mappings
            with ``"label"``, ``"speaker"`` and ``"filepath"`` keys),
            ``sample_rate`` and ``class_labels``.

    Returns:
        None. The report is written to stdout.
    """
    samples = dataset.samples
    num_samples = len(samples)
    if num_samples == 0:
        # Without this guard an empty dataset fails with ZeroDivisionError on
        # the mean duration (and max()/min() would reject the empty list).
        print(f"Number of utterances: {num_samples}")
        return

    class_counter = Counter(sample["label"] for sample in samples)
    # Only the number of distinct speakers is reported, so a set is enough.
    num_speakers = len({sample["speaker"] for sample in samples})

    class_durations = defaultdict(float)
    samples_counts = []

    # All files are measured before anything is printed, so a load failure
    # does not leave a half-written report.
    for sample in samples:
        duration_seconds, num_samples_in_waveform = calculate_wav_duration(sample["filepath"])
        class_durations[sample["label"]] += duration_seconds
        samples_counts.append(num_samples_in_waveform)

    total_seconds = sum(class_durations.values())
    mean_duration = total_seconds / num_samples

    print(f"Number of utterances: {num_samples}")
    print(f"Sample rate: {dataset.sample_rate} Hz")
    print(f"Total duration: {total_seconds:.2f}s")
    print(f"Mean utterance duration: {mean_duration:.2f}s")
    print(f"Max samples in utterance: {max(samples_counts)}")
    print(f"Min samples in utterance: {min(samples_counts)}")
    print(f"Mean samples in utterance: {mean(samples_counts):.2f}")
    print(f"Number of classes: {len(class_counter)}")
    print(f"Number of speakers: {num_speakers}")
    print("Number of utterances per class:")
    print("\tLabel\tNum.\tNum. %\tSec.\tSec. %")
    for label in dataset.class_labels:
        # Counter indexing yields 0 for a label with no utterances in this
        # dataset; .get(label) would yield None and break the division below.
        count = class_counter[label]
        label_seconds = class_durations.get(label, 0.0)
        # Guard against a zero total (e.g. every file has zero samples).
        seconds_share = label_seconds / total_seconds * 100 if total_seconds else 0.0
        class_info_string = f"\t{label} \t{count} \t"
        class_info_string += f"{count / num_samples * 100:.2f}%\t"
        class_info_string += f"{label_seconds:.0f}s\t"
        class_info_string += f"{seconds_share:.2f}%"
        print(class_info_string)

In [15]:
# Summarize the dataset built without a `subset` argument. The utterance count
# printed below equals the sum of the training, validation and testing counts,
# so this appears to cover all three subsets -- TODO confirm against the
# AudioClassificationDataset default.
summarize(AudioClassificationDataset(CONFIG))

Number of utterances: 38908
Sample rate: 16000 Hz
Total duration: 38274.61s
Mean utterance duration: 0.98s
Max samples in utterance: 16000
Min samples in utterance: 4096
Mean samples in utterance: 15739.53
Number of classes: 10
Number of speakers: 2519
Number of utterances per class:
	Label	Num.	Num. %	Sec.	Sec. %
	zero 	4052 	10.41%	3999s	10.45%
	one 	3890 	10.00%	3808s	9.95%
	two 	3880 	9.97%	3809s	9.95%
	three 	3727 	9.58%	3667s	9.58%
	four 	3728 	9.58%	3665s	9.57%
	five 	4052 	10.41%	3988s	10.42%
	six 	3860 	9.92%	3812s	9.96%
	seven 	3998 	10.28%	3938s	10.29%
	eight 	3787 	9.73%	3714s	9.70%
	nine 	3934 	10.11%	3874s	10.12%


In [16]:
# Same report, restricted to the dataset built with subset="training".
summarize(AudioClassificationDataset(CONFIG, subset="training"))

Number of utterances: 31158
Sample rate: 16000 Hz
Total duration: 30646.26s
Mean utterance duration: 0.98s
Max samples in utterance: 16000
Min samples in utterance: 4096
Mean samples in utterance: 15737.21
Number of classes: 10
Number of speakers: 2032
Number of utterances per class:
	Label	Num.	Num. %	Sec.	Sec. %
	zero 	3250 	10.43%	3208s	10.47%
	one 	3140 	10.08%	3073s	10.03%
	two 	3111 	9.98%	3053s	9.96%
	three 	2966 	9.52%	2918s	9.52%
	four 	2955 	9.48%	2904s	9.48%
	five 	3240 	10.40%	3188s	10.40%
	six 	3088 	9.91%	3050s	9.95%
	seven 	3205 	10.29%	3158s	10.30%
	eight 	3033 	9.73%	2974s	9.71%
	nine 	3170 	10.17%	3121s	10.19%


In [17]:
# Same report, restricted to the dataset built with subset="validation".
summarize(AudioClassificationDataset(CONFIG, subset="validation"))

Number of utterances: 3643
Sample rate: 16000 Hz
Total duration: 3585.54s
Mean utterance duration: 0.98s
Max samples in utterance: 16000
Min samples in utterance: 6144
Mean samples in utterance: 15747.62
Number of classes: 10
Number of speakers: 246
Number of utterances per class:
	Label	Num.	Num. %	Sec.	Sec. %
	zero 	384 	10.54%	379s	10.56%
	one 	351 	9.63%	342s	9.55%
	two 	345 	9.47%	340s	9.48%
	three 	356 	9.77%	349s	9.73%
	four 	373 	10.24%	368s	10.28%
	five 	367 	10.07%	362s	10.10%
	six 	378 	10.38%	373s	10.41%
	seven 	387 	10.62%	380s	10.61%
	eight 	346 	9.50%	339s	9.46%
	nine 	356 	9.77%	353s	9.84%


In [18]:
# Same report, restricted to the dataset built with subset="testing".
summarize(AudioClassificationDataset(CONFIG, subset="testing"))

Number of utterances: 4107
Sample rate: 16000 Hz
Total duration: 4042.82s
Mean utterance duration: 0.98s
Max samples in utterance: 16000
Min samples in utterance: 5944
Mean samples in utterance: 15749.96
Number of classes: 10
Number of speakers: 241
Number of utterances per class:
	Label	Num.	Num. %	Sec.	Sec. %
	zero 	418 	10.18%	412s	10.20%
	one 	399 	9.72%	393s	9.72%
	two 	424 	10.32%	415s	10.28%
	three 	405 	9.86%	400s	9.90%
	four 	400 	9.74%	392s	9.70%
	five 	445 	10.84%	439s	10.86%
	six 	394 	9.59%	389s	9.62%
	seven 	406 	9.89%	400s	9.89%
	eight 	408 	9.93%	401s	9.92%
	nine 	408 	9.93%	400s	9.90%
