<a href="https://colab.research.google.com/github/rahiakela/audio-processing-research-and-practice/blob/main/huggingface-audio-course/unit-02-audio-application/01_audio_classification_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

**Reference**:

[Audio classification with a pipeline](https://huggingface.co/learn/audio-course/chapter2/audio_classification_pipeline)

In [None]:
!pip install transformers
!pip install datasets[audio]

In [8]:
from datasets import load_dataset
from datasets import Audio

from transformers import pipeline

from transformers import WhisperFeatureExtractor
from transformers import AutoProcessor

import librosa
import numpy as np
import matplotlib.pyplot as plt
import librosa.display

## Dataset

In [5]:
# Load the "train" split of the "en-AU" configuration of PolyAI/minds14.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-AU",
    split="train",
)

# Bare last expression: show the dataset summary (feature names and row count).
minds

Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 654
})

In [6]:
# Take a closer look at one example: fetch the first record and display it.
# The displayed record shows the fields to expect: 'path', 'audio' (with its
# own 'path', 'array' and 'sampling_rate'), both transcriptions,
# 'intent_class' and 'lang_id'.
example = minds[0]
example

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-AU~PAY_BILL/response_4.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-AU~PAY_BILL/response_4.wav',
  'array': array([ 0.        ,  0.00024414, -0.00024414, ..., -0.00024414,
          0.00024414,  0.0012207 ]),
  'sampling_rate': 8000},
 'transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'english_transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'intent_class': 13,
 'lang_id': 2}

In [7]:
# Resample the audio: describe the target audio feature (16 kHz) first,
# then re-cast the "audio" column to it.
audio_16k = Audio(sampling_rate=16_000)
minds = minds.cast_column("audio", audio_16k)

# Display the first record again to check the 'sampling_rate' it now reports.
minds[0]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-AU~PAY_BILL/response_4.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-AU~PAY_BILL/response_4.wav',
  'array': array([2.36119668e-05, 1.92324660e-04, 2.19284790e-04, ...,
         9.40907281e-04, 1.16613181e-03, 7.20883254e-04]),
  'sampling_rate': 16000},
 'transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'english_transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'intent_class': 13,
 'lang_id': 2}

## Classification pipeline

In [None]:
# Build an audio-classification pipeline backed by the
# "anton-l/xtreme_s_xlsr_300m_minds14" checkpoint.
classifier = pipeline(
    task="audio-classification",
    model="anton-l/xtreme_s_xlsr_300m_minds14",
)

In [11]:
# Re-fetch the first record so `example` reflects the current `minds` dataset.
example = minds[0]

# The classifier expects the audio data as a NumPy array, so pull the raw
# samples out of the record's "audio" entry.
waveform = example["audio"]["array"]

# Pass the array straight to the classifier; the cell output is its list of
# {'score', 'label'} predictions.
classifier(waveform)

[{'score': 0.962530791759491, 'label': 'pay_bill'},
 {'score': 0.028672993183135986, 'label': 'freeze'},
 {'score': 0.0033498124685138464, 'label': 'card_issues'},
 {'score': 0.0020058127120137215, 'label': 'abroad'},
 {'score': 0.0008484353311359882, 'label': 'high_value_payment'}]

In [12]:
# Look up the actual (ground-truth) label for this example so it can be
# compared with the classifier's prediction.
intent_feature = minds.features["intent_class"]

# `int2str` turns the integer stored in "intent_class" into its label name.
id2label = intent_feature.int2str
id2label(example["intent_class"])

'pay_bill'