In [1]:
from transformers import pipeline
from datasets import load_dataset



In [2]:
# Build a Whisper ASR pipeline from the tiny checkpoint
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Pin the decoder prompt to Spanish ("es") transcription
spanish_prompt_ids = pipe.tokenizer.get_decoder_prompt_ids(language="es", task="transcribe")
pipe.model.config.forced_decoder_ids = spanish_prompt_ids

# Stream the Spanish Common Voice training split rather than downloading it in full
dataset = load_dataset("common_voice", "es", split="train", streaming=True)

In [3]:
# List the dataset's columns and their types; 'audio' is the column we feed to ASR inference
dataset_features = dataset.features
print(dataset_features)

{'client_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None), 'up_votes': Value(dtype='int64', id=None), 'down_votes': Value(dtype='int64', id=None), 'age': Value(dtype='string', id=None), 'gender': Value(dtype='string', id=None), 'accent': Value(dtype='string', id=None), 'locale': Value(dtype='string', id=None), 'segment': Value(dtype='string', id=None)}


In [4]:
# Grab the first streamed example and keep its entire "audio" entry:
# stop at ["audio"] rather than drilling down to the raw array
first_example = next(iter(dataset))
sample = first_example["audio"]
print(sample)

{'path': 'cv-corpus-6.1-2020-12-11/es/clips/common_voice_es_18306544.mp3', 'array': array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 8.9168549e-05,
       2.5069714e-04, 2.3895502e-04], dtype=float32), 'sampling_rate': 48000}


Note how this audio sample has three entries:
1. path: path to the audio file (this field is deprecated and you won't find it in newer datasets). It is an absolute path in non-streaming mode (where the file is saved to disk) and a relative path in streaming mode (where the audio is loaded into memory)
2. array: 1-dimensional audio array
3. sampling_rate: this is the key one! Note how we only have this when we slice to `["audio"]`, but we'd lose it if we sliced to `["audio"]["array"]`

If we slice further and set:
```python
sample = next(iter(dataset))["audio"]["array"]
```
We'd lose the sampling rate information. Here, the `pipeline` would only receive the raw 1-d audio array, and would **assume** that its sampling rate is equal to the sampling rate of the model. To prevent silent errors, we should always try to pass the dictionary `["audio"]` to our model (rather than just `["audio"]["array"]`).

In [5]:
# Transcribe the sample: passing the whole audio dict gives the pipeline the sampling rate as well as the array
out = pipe(sample)
print(out)

{'text': ' Pero acá su tiene dispuesta a su gente.', 'path': ['cv-corpus-6.1-2020-12-11/es/clips/common_voice_es_18306544.mp3']}




In [6]:
# Check the prediction against the reference sentence stored with the same first example
reference_text = next(iter(dataset))["sentence"]
print(reference_text)

¿ pero acaso tiene dispuesta su gente ?


In [7]:
from datasets import Audio

# Resample on the dataset side this time: cast the "audio" column to the rate the
# pipeline's feature extractor expects (16000 Hz for this Whisper checkpoint, the value
# previously hard-coded here), so the cell stays correct if the checkpoint is swapped.
target_sampling_rate = pipe.feature_extractor.sampling_rate
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_sampling_rate))

# Re-read the first example; its audio is now decoded at the target rate
sample = next(iter(dataset))["audio"]

# Transcribe again to compare with the result obtained from the original 48 kHz audio
out = pipe(sample)
print(out)

{'text': ' Pero acá su tiene dispuesta a su gente.', 'path': ['cv-corpus-6.1-2020-12-11/es/clips/common_voice_es_18306544.mp3']}


Same result as before, when we passed the original 48 kHz audio directly => the `pipeline` resamples for us, provided we specify the sampling rate in the input dict