# Audio classification


In [None]:
!pip install datasets

In [None]:
# Audio classification involves assigning one or more labels to an audio
# recording based on its content.

# `Audio` is imported under an alias because a later cell runs
# `from IPython.display import Audio`, which rebinds the bare name `Audio`.
# With the alias, this cell can be re-run at any point without picking up
# the IPython player class by mistake.
from datasets import Audio as DatasetsAudio, load_dataset

# Load the "en-AU" configuration (train split) of the MINDS-14 dataset.
minds = load_dataset("PolyAI/minds14", name="en-AU", split="train")

# Decode the "audio" column at a 16 kHz sampling rate
# (presumably the rate the models used below expect; confirm on their model cards).
minds = minds.cast_column("audio", DatasetsAudio(sampling_rate=16_000))

In [None]:
# To classify an audio recording into a set of classes, we can use the
# `audio-classification` pipeline from Transformers.

from transformers import pipeline

# Build the pipeline from a checkpoint whose name indicates it was trained on MINDS-14.
classifier = pipeline(task="audio-classification", model="anton-l/xtreme_s_xlsr_300m_minds14")

In [None]:
# Now we can directly give audio data as a NumPy array to the classifier
# and inspect its output.

example = minds[0]
print(f"example {example}\n")

# Pull the decoded waveform out of the example and classify it.
waveform = example["audio"]["array"]
classifier(waveform)

In [None]:
# Let's see what the actual label is for the example above.

# `int2str` on the "intent_class" feature maps an integer class id to its label name.
intent_feature = minds.features["intent_class"]
id2label = intent_feature.int2str
id2label(example["intent_class"])

# Automatic speech recognition

In [None]:
# Automatic speech recognition involves transcribing a speech recording into text.

from transformers import pipeline

# Speech-to-text pipeline backed by the `openai/whisper-small` checkpoint.
asr = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")

In [None]:
# Run one example through the ASR pipeline and compare its output with the
# reference transcription stored in the dataset.

example = minds[0]
print(f"example: {example['transcription']}\n\n")

# Transcribe the decoded waveform of the same example.
speech_array = example["audio"]["array"]
asr(speech_array)

The `pipeline()` function takes care of all the pre- and post-processing, so there is no need to worry about getting the data into the right format for a model.

# Audio generation

In [None]:
!pip install --upgrade transformers

In [None]:
# Create a text-to-speech pipeline backed by the `suno/bark-small` checkpoint.

from transformers import pipeline

pipe = pipeline(task="text-to-speech", model="suno/bark-small")

In [None]:
# Pass some text through the text-to-speech pipeline.

text = (
    "Banks facilitate capital flow, credit creation, financial stability, "
    "and economic growth by mobilizing savings and financing businesses and consumers"
)
output = pipe(text)

In [None]:
# Listen to the result.

# Note: from here on the bare name `Audio` refers to IPython's inline audio
# player, not to the `datasets` feature type of the same name.
from IPython.display import Audio

# The pipeline output carries both the waveform and the sampling rate needed for playback.
Audio(data=output["audio"], rate=output["sampling_rate"])

In [None]:
# Synthesize a short lyric with the same text-to-speech pipeline and play it back.
# (The lyric is wrapped in ♪ symbols, presumably to cue the model to sing; confirm in the model docs.)
song = "♪ In the jungle, the mighty jungle, the ladybug was seen. ♪ "
output = pipe(song)
Audio(data=output["audio"], rate=output["sampling_rate"])

### Generating music

In [None]:
# Text-to-audio pipeline backed by the `facebook/musicgen-small` checkpoint.
music_pipe = pipeline(task="text-to-audio", model="facebook/musicgen-small")

# Text description of the music we want to generate.
text = "90s rock song with electric guitar and heavy drums"

# Control the length of the generated output by passing an additional
# `max_new_tokens` parameter through to the model.
forward_params = dict(max_new_tokens=512)

output = music_pipe(text, forward_params=forward_params)

# Play the first entry of the generated audio at the sampling rate reported by the pipeline.
Audio(output["audio"][0], rate=output["sampling_rate"])

In [None]:
# Text description of the music we want to generate.
text_classic = "Classic sitar with flute and tabla in background"

# Control the length of the generated output by passing an additional
# `max_new_tokens` parameter through to the model.
forward_params = dict(max_new_tokens=512)

output = music_pipe(text_classic, forward_params=forward_params)

# Play the first entry of the generated audio at the sampling rate reported by the pipeline.
Audio(output["audio"][0], rate=output["sampling_rate"])