In [None]:
!pip install soundfile
!pip install librosa

In [None]:
!pip install matplotlib

In [None]:
!pip install datasets
!pip install transformers

In [None]:
!pip install matplotlib

# Audio classification with a pipeline

In [None]:
from datasets import Audio, load_dataset

# Load the en-AU training split of PolyAI/minds14 and cast its audio column to
# a 16 kHz sampling rate in a single chained expression.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-AU",
    split="train",
).cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
from transformers import pipeline

# Audio-classification pipeline backed by a checkpoint whose name indicates it
# was trained on MINDS-14.
classifier = pipeline(task="audio-classification", model="anton-l/xtreme_s_xlsr_300m_minds14")

In [None]:
# First record of the en-AU training split; the next cells read `example`.
# NOTE: `example` is rebound further down the notebook, so re-run this cell
# before re-running the classification cells out of order.
example = minds[0]

In [None]:
# Classify the raw waveform array of the selected record; the audio column was
# cast to 16 kHz when the dataset was loaded. The result is the cell output.
classifier(example["audio"]["array"])

In [None]:
# Look up the ground-truth intent of the record: `int2str` on the
# `intent_class` feature converts the stored integer id to its label name.
intent_feature = minds.features["intent_class"]
id2label = intent_feature.int2str
id2label(example["intent_class"])

# Automatic speech recognition with a pipeline

## English

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default.
# facebook/wav2vec2-base-960h is the checkpoint transformers falls back to for
# this task at the time of writing; stating it avoids the "No model was
# supplied" warning and keeps results stable across library upgrades.
asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

In [None]:
# Re-select the first en-AU record so this section does not depend on whatever
# `example` was last bound to.
example = minds[0]

In [None]:
# Transcribe the raw waveform array of the record; the result is the cell output.
asr(example["audio"]["array"])

In [None]:
# Transcription stored with the record, shown for comparison with the
# pipeline output of the previous cell.
example["english_transcription"]

## German

In [None]:
# German (de-DE) training split of the same dataset, with its audio column
# cast to 16 kHz. Uses `load_dataset` and `Audio` imported in an earlier cell.
minds_de = load_dataset(
    "PolyAI/minds14",
    name="de-DE",
    split="train",
).cast_column("audio", Audio(sampling_rate=16_000))
minds_de

In [None]:
# First record of the de-DE training split; echoed as the cell output so its
# fields can be inspected.
example_de = minds_de[0]
example_de

In [None]:
# Speech-recognition pipeline for the German sample, using a checkpoint whose
# name indicates a German wav2vec2 model. `pipeline` comes from an earlier cell.
asr_de = pipeline(
    task="automatic-speech-recognition",
    model="maxidl/wav2vec2-large-xlsr-german",
)
asr_de(example_de["audio"]["array"])

# Hands-on exercise

## Find a dataset and a model for Afrikaans and run ASR on one example from the dataset

In [None]:
from datasets import load_dataset

# Training split of the SLR32 configuration of the `openslr` dataset; the
# loaded object is echoed as the cell output.
afr = load_dataset(path="openslr", name="SLR32", split="train")
afr

In [None]:
from datasets import Audio

# Cast the audio column to 16 kHz, the same rate used for the MINDS-14 splits.
afr = afr.cast_column(column="audio", feature=Audio(sampling_rate=16_000))

In [None]:
# Pick one record (index 8) for the exercise and echo it as the cell output.
# NOTE: this rebinds `example`, which earlier sections bound to a MINDS-14
# record; re-run those sections' own `example = ...` cells before re-running them.
# NOTE(review): OpenSLR SLR32 is documented as covering several South African
# languages -- confirm that this index is an Afrikaans utterance.
example = afr[8]
example

In [None]:
# Import the submodule explicitly rather than relying on `IPython.display`
# having been loaded as a side effect of `import IPython`.
# Keep the fully qualified name: `from IPython.display import Audio` would
# shadow the `Audio` feature imported from `datasets`.
import IPython.display

# Inline audio player for the selected record, at the record's sampling rate.
IPython.display.Audio(
    data=example["audio"]["array"],
    rate=example["audio"]["sampling_rate"],
)

In [None]:
import librosa.display
import matplotlib.pyplot as plt

# Open a new figure, widen it to 12 inches, then draw the waveform of the
# selected record into it.
waveform_fig = plt.figure()
waveform_fig.set_figwidth(12)
librosa.display.waveshow(
    example["audio"]["array"],
    sr=example["audio"]["sampling_rate"],
)

In [None]:
import numpy as np

# Mel spectrogram of the selected record: 40 mel bands, frequencies capped at
# 8000 Hz. Uses `librosa` and `plt` imported in the waveform cell.
S = librosa.feature.melspectrogram(
    y=example["audio"]["array"],
    sr=example["audio"]["sampling_rate"],
    n_mels=40,
    fmax=8000,
)

# Convert power to decibels, using the spectrogram's maximum as the reference.
S_db = librosa.power_to_db(S, ref=np.max)

spectrogram_fig = plt.figure()
spectrogram_fig.set_figwidth(12)
librosa.display.specshow(
    S_db,
    x_axis="time",
    y_axis="mel",
    sr=example["audio"]["sampling_rate"],
    fmax=8000,
)
plt.colorbar()

In [None]:
from transformers import pipeline

# Checkpoint for the exercise; presumably a Whisper model fine-tuned for
# Afrikaans, judging by its name.
model_spec = "Ari/whisper-base-af-za"

# Use a dedicated name (mirroring `asr_de`) so that the English `asr` pipeline
# is not silently replaced; otherwise re-running the English section after this
# cell would transcribe with the Afrikaans model.
asr_af = pipeline("automatic-speech-recognition", model=model_spec)

# Keep only the transcribed text of the pipeline result for the comparison below.
test_transcription = asr_af(example["audio"]["array"])['text']

In [None]:
# Show the reference sentence and the model output, separated by a blank line
# (the first string already ends in a newline).
print(
    f"Expected: {example['sentence']}\n",
    f"Transcribed: {test_transcription}",
    sep="\n",
)