Cell 1: Install the necessary libraries

In [18]:
!pip install transformers datasets soundfile librosa


Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/72/b3/33c4ad44fa020e3757e9b2fad8a5de53d9079b501e6bbc45bdd18f82f893/datasets-2.21.0-py3-none-any.whl.metadata
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting soundfile
  Obtaining dependency information for soundfile from https://files.pythonhosted.org/packages/71/87/31d2b9ed58975cec081858c01afaa3c43718eb0f62b5698a876d94739ad0/soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl.metadata
  Downloading soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (14 kB)
Collecting librosa
  Obtaining dependency information for librosa from https://files.pythonhosted.org/packages/8c/8a/2d231b35456506b7c98b3ab9bbf07917b205fed8615d2e59e976ab497fff/librosa-0.10.2.post1-py3-none-any.whl.metadata
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Obtaining dependency information for pyarrow>=15

In [24]:
!pip install --upgrade pyarrow


[0m

In [36]:
!pip uninstall pyarrow -y
!pip install pyarrow
!pip install pyarrow==15.0.0
!pip install --upgrade pip


Found existing installation: pyarrow 15.0.0
Uninstalling pyarrow-15.0.0:
  Successfully uninstalled pyarrow-15.0.0
Collecting pyarrow
  Obtaining dependency information for pyarrow from https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Using cached pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.3 kB)
Using cached pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl (27.2 MB)
Installing collected packages: pyarrow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlflow 2.15.1 requires pyarrow<16,>=4.0.0, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-17.0.0
[0mCollecting pyarrow==15.0.0
  Obtaining dependency information for pyarrow==15.0.0 from https://files.pythonhosted.org/packages/

Cell 2: Suppress warning messages from the transformers library


In [39]:
# transformers ships its own logging helper; this is not the stdlib `logging` module
from transformers.utils import logging

# Raise the library's log threshold to ERROR so warnings and info messages
# stay out of the cell outputs
logging.set_verbosity(logging.ERROR)


Cell 3: Prepare the dataset of audio recordings


In [47]:
# Dataset loaders from the Hugging Face datasets library
from datasets import load_dataset
from datasets import load_from_disk

# Open the copy of the dataset stored under ./models/ashraq/esc50/train.
# Nothing here limits the number of rows: whatever was saved at that path is loaded.
dataset = load_from_disk(dataset_path="./models/ashraq/esc50/train")

# Keep the first record around; the following cells play and classify it
audio_sample = dataset[0]

# Leave the record as the cell's last expression so the notebook displays it
audio_sample


AttributeError: module 'pyarrow.lib' has no attribute 'ListViewType'

Cell 4: Play the audio sample

In [44]:
# IPython's audio player, aliased so it is not confused with datasets.Audio
from IPython.display import Audio as IPythonAudio

# The "audio" entry of the sample holds both the waveform and its sampling rate
first_clip = audio_sample["audio"]

# Render an inline player for the waveform at the sample's own sampling rate
IPythonAudio(data=first_clip["array"], rate=first_clip["sampling_rate"])


NameError: name 'audio_sample' is not defined

Cell 5: Build the zero-shot audio classification pipeline

In [None]:
# transformers' factory function for ready-to-use inference pipelines
from transformers import pipeline

# Build the zero-shot audio classifier from the model stored in the local
# directory ./models/laion/clap-htsat-unfused (the task name is the first argument)
zero_shot_classifier = pipeline(
    "zero-shot-audio-classification",
    model="./models/laion/clap-htsat-unfused",
)


Cell 6: Sampling rate adjustment explanation

In [None]:
# Illustration of why a recording's sampling rate must match what a model expects.
# The example pairs a model that assumes 16,000 Hz input (Whisper) with a
# recording made at 192,000 Hz; no model is actually run in this cell.
high_res_sample_rate = 192_000  # samples per second in the recording
whisper_sample_rate = 16_000    # samples per second the model assumes

# Every recorded second contains this many "model seconds" worth of samples
rate_ratio = high_res_sample_rate / whisper_sample_rate

# One second of the recording, as the model would interpret it
high_res_duration = 1 * rate_ratio
print(f"1 second of high-resolution audio appears as {high_res_duration} seconds to the Whisper model.")

# Five seconds of the recording, as the model would interpret it
high_res_duration_5s = 5 * rate_ratio
print(f"5 seconds of high-resolution audio appears as {high_res_duration_5s} seconds to the Whisper model.")


Cell 7: Check the model and audio sample's sampling rates

In [None]:
# Sampling rate the pipeline's feature extractor is configured for
feature_extractor = zero_shot_classifier.feature_extractor
model_sampling_rate = feature_extractor.sampling_rate
print(f"Model's expected sampling rate: {model_sampling_rate}")

# Sampling rate stored alongside the current audio sample, for comparison
sample_audio = audio_sample["audio"]
audio_sample_rate = sample_audio["sampling_rate"]
print(f"Audio sample's sampling rate: {audio_sample_rate}")


Cell 8: Adjust the dataset to match the model's sampling rate


In [None]:
# Import the Audio feature type from the datasets library
from datasets import Audio

# Take the target rate from the classifier's feature extractor instead of
# hard-coding 48_000, so this cell stays correct if the model is swapped.
target_sampling_rate = zero_shot_classifier.feature_extractor.sampling_rate

# Re-declare the "audio" column with the target rate so samples read from the
# dataset come back at the rate the model expects.
dataset = dataset.cast_column(
    "audio",
    Audio(sampling_rate=target_sampling_rate)
)

# Fetch the first audio sample again; the earlier copy still has the old rate
audio_sample = dataset[0]

# Display the updated audio sample to verify the new sampling rate
audio_sample


Cell 9: Perform zero-shot classification with candidate labels


In [None]:
# Two candidate descriptions for the classifier to score against the audio
candidate_labels = ["Sound of a dog", "Sound of vacuum cleaner"]

# Waveform of the current sample, passed to the classifier as a raw array
waveform = audio_sample["audio"]["array"]

# Score the waveform against each candidate label
classification_result = zero_shot_classifier(waveform, candidate_labels=candidate_labels)

# Show the result returned by the classifier
print(classification_result)


Cell 10: Perform zero-shot classification with more candidate labels

In [49]:
# A wider set of candidate descriptions; this replaces the earlier two-label list
candidate_labels = [
    "Sound of a child crying",
    "Sound of vacuum cleaner",
    "Sound of a bird singing",
    "Sound of an airplane",
]

# Same audio sample as before, scored against the new labels
classification_result = zero_shot_classifier(
    audio_sample["audio"]["array"],
    candidate_labels=candidate_labels,
)

# Show the result returned by the classifier
print(classification_result)


NameError: name 'zero_shot_classifier' is not defined