In [1]:
import audiocap
import transformers

from audiocap.train_whisper_supervised import get_whisper_model

In [2]:
import lovely_numpy
import lovely_tensors
import torch
import numpy as np

def lovely(x):
    """Return a compact, human-readable summary of ``x`` for display.

    Dispatch order:
      * ``np.ndarray``   -> ``lovely_numpy.lovely(x)``
      * ``torch.Tensor`` -> ``lovely_tensors.lovely(x)``
      * ``str``          -> the string itself
      * anything sized   -> ``"len: <n>"``

    Returns:
        The summary, or ``None`` when ``x`` has no usable length
        (e.g. ints, ``None``) and none of the branches above applies.
    """
    if isinstance(x, np.ndarray):
        return lovely_numpy.lovely(x)
    if isinstance(x, torch.Tensor):
        return lovely_tensors.lovely(x)
    if isinstance(x, str):
        return x
    try:
        return "len: " + str(len(x))
    except Exception:
        # Best effort: unsized objects simply have no summary. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate instead of being silently swallowed.
        return None

def inspect(ds, samples=1):
    """Print a short per-field summary of the first ``samples`` items of ``ds``.

    For every key/value pair of a sample this prints the key, the value's
    type and, when ``lovely`` returns a summary for the value, that summary.
    If the sample has an ``"audio"`` key, ``sample["audio"]["array"]`` is
    summarised on an additional line. A blank line follows each sample.

    Args:
        ds: Iterable yielding dict-like samples (objects with ``.items()``).
        samples: Maximum number of samples to print; values <= 0 print nothing.
    """
    if samples <= 0:
        return

    for count, sample in enumerate(ds, start=1):
        for key, value in sample.items():
            print("  ", key, type(value), end="")
            # Summarise once and reuse the result; computing the summary
            # twice doubles the work for large arrays.
            summary = lovely(value)
            if summary is not None:
                print(":", summary, end="")
            print()
        if "audio" in sample:
            print("  ", "audio->array", lovely(sample["audio"]["array"]))

        print()

        # Stop right after printing the last requested sample so that no
        # additional item is pulled from the underlying iterable.
        if count >= samples:
            break


In [3]:
# Pretrained checkpoint identifier; the cell below passes it to every
# `from_pretrained` call and to `get_whisper_model`.
architecture_name = "openai/whisper-tiny"

In [4]:
# Load the pretrained configuration and check its type before it is used
# to build the model.
config = transformers.WhisperConfig.from_pretrained(architecture_name)
assert isinstance(config, transformers.WhisperConfig)

# Tokenizer configured for English transcription, plus the matching feature extractor.
tokenizer = transformers.WhisperTokenizer.from_pretrained(architecture_name, language="en", task="transcribe")
feature_extractor = transformers.WhisperFeatureExtractor.from_pretrained(architecture_name)

# `model` is bound exactly once, to the object returned by get_whisper_model.
# (Previously a second WhisperConfig was loaded into `model` first and then
# immediately overwritten.)
# NOTE(review): the meaning of the two positional `False` flags is defined by
# audiocap.train_whisper_supervised.get_whisper_model — confirm there.
model = get_whisper_model(architecture_name, config, False, False)

In [5]:
import pathlib
import datasets

# AudioSet (small dummy copy): the loader receives the audio folder and the
# ontology JSON file located under the same root directory.
audioset_dir = pathlib.Path("../data/audioset_small_dummy/")
ds_audioset = audiocap.data.load_audioset_small(
    audioset_dir / "audiofolder", audioset_dir / "annotations/ontology.json"
)

# AudioCaps (dummy copy): only the audio folder is passed to the loader.
audiocaps_dir = pathlib.Path("../data/audiocaps_dummy/")
ds_audiocaps = audiocap.data.load_audiocaps(audiocaps_dir / "audiofolder")

Resolving data files:   0%|          | 0/128434 [00:00<?, ?it/s]

Found cached dataset audiofolder (/var/tmp/xkadlci2/.cache/huggingface/datasets/audiofolder/default-20a7d1f57c344d53/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2727 [00:00<?, ?it/s]

Found cached dataset audiofolder (/var/tmp/xkadlci2/.cache/huggingface/datasets/audiofolder/default-cb14e1e33eee7836/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2844 [00:00<?, ?it/s]

Found cached dataset audiofolder (/var/tmp/xkadlci2/.cache/huggingface/datasets/audiofolder/default-858ce7cd649041a3/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/45395 [00:00<?, ?it/s]

Found cached dataset audiofolder (/var/tmp/xkadlci2/.cache/huggingface/datasets/audiofolder/default-5e5b67b602b44fdb/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/449 [00:00<?, ?it/s]

Found cached dataset audiofolder (/var/tmp/xkadlci2/.cache/huggingface/datasets/audiofolder/default-b53c2008566841d9/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/887 [00:00<?, ?it/s]

Found cached dataset audiofolder (/var/tmp/xkadlci2/.cache/huggingface/datasets/audiofolder/default-1af03cfbd7a86dba/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
import numpy as np

def gen_1():
    """Yield two toy samples with audio arrays of length 150 and 180.

    Each sample holds an all-zeros ``audio["array"]``, an all-ones ``b``
    (length 200, then 300) and an all-ones ``input_features`` of shape
    ``(80, 3000)``.
    """
    for audio_len, b_len in ((150, 200), (180, 300)):
        yield {
            "audio": {"array": np.zeros(audio_len)},
            "b": np.ones(b_len),
            "input_features": np.ones((80, 3000)),
        }

def gen_2():
    """Yield two toy samples whose audio arrays both have length 100.

    Each sample holds an all-zeros ``audio["array"]``, an all-ones ``b``
    (length 200, then 300) and an all-ones ``input_features`` of shape
    ``(80, 3000)``.
    """
    for b_len in (200, 300):
        yield {
            "audio": {"array": np.zeros(100)},
            "b": np.ones(b_len),
            "input_features": np.ones((80, 3000)),
        }

# Wrap each toy generator in an iterable dataset (gen_1 -> foo_1, gen_2 -> foo_2).
foo_1, foo_2 = (
    datasets.IterableDataset.from_generator(generator)
    for generator in (gen_1, gen_2)
)

In [8]:
# Interleave the two toy datasets as a small check of audiocap's interleaving helper.
# NOTE(review): stop_on_first_end=False presumably keeps yielding until every
# source is exhausted — confirm against audiocap.data.interleave_datasets.
# NOTE: the name `interleaved` is rebound to a dict of splits in a later cell.
interleaved = audiocap.data.interleave_datasets([foo_1, foo_2], stop_on_first_end=False)

In [9]:
# Print a summary of the first interleaved toy sample (inspect defaults to samples=1).
inspect(interleaved)

   audio <class 'dict'>: len: 1
   b <class 'numpy.ndarray'>: array[200] x∈[1.000, 1.000] μ=1.000 σ=0.
   input_features <class 'numpy.ndarray'>: array[80, 3000] n=240000 x∈[1.000, 1.000] μ=1.000 σ=0.
   audio->array array[100] [38;2;127;127;127mall_zeros[0m



In [10]:
# For each split, interleave the AudioSet and AudioCaps datasets of that split
# (AudioSet first in the list) and collect the results in a dict keyed by split name.
interleaved = {
    split: audiocap.data.interleave_datasets(
        [ds_audioset[split], ds_audiocaps[split]],
        stop_on_first_end=False,
    )
    for split in ("train", "val", "test")
}

In [11]:
# Callable built from the tokenizer and the feature extractor; applied to batches below.
preprocessing = audiocap.preprocess.Preprocess(tokenizer, feature_extractor)

# Rebind every split to its mapped version: batches of 16, with the
# "audio" and "prefix" columns removed from the result.
for split in ("train", "val", "test"):
    split_ds = interleaved[split]
    interleaved[split] = split_ds.map(
        preprocessing,
        batched=True,
        batch_size=16,
        remove_columns=["audio", "prefix"],
    )

In [12]:
# Print a summary of one preprocessed training sample to check which fields it carries.
inspect(interleaved["train"])

   caption <class 'str'>: arrow, onomatopoeia, whoosh - swoosh - swish
   caption_colname <class 'str'>: labels
   source_ds <class 'str'>: audioset
   task <class 'str'>: keywords
   path <class 'NoneType'>
   audio_array <class 'numpy.ndarray'>: array[160497] x∈[-1.003, 1.014] μ=0.000 σ=0.113
   sampling_rate <class 'int'>
   forced_ac_decoder_ids <class 'list'>: len: 7
   filename <class 'NoneType'>
   input_features <class 'numpy.ndarray'>: array[80, 3000] f32 n=240000 x∈[-0.549, 1.451] μ=-0.404 σ=0.356
   labels <class 'list'>: len: 29

