In [1]:
import torch
import torchaudio
import sys
import os
import random
import IPython.display as ipd
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[setup]: determining CUDA support...")
print("PyTorch version:", torch.__version__)
print("Torchaudio version:", torchaudio.__version__)
print("CUDA is available:", torch.cuda.is_available())


[setup]: determining CUDA support...
PyTorch version: 2.0.1+cu118
Torchaudio version: 2.0.2+cu118
CUDA is available: True


In [3]:
libs = [
    "pytorchvideo@git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d",
    "timm",
    "ftfy",
    "regex",
    "einops",
    "fvcore",
    "decord"
]

for lib in libs:
    command = f"pip install {lib}"
    os.system(command)

model_path = "./models/imagebind_huge.pth"
if (not os.path.exists(model_path)):
    os.system("wget -nc -P models https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth")

imagebindmodels_path = os.path.abspath('./models')
if imagebindmodels_path not in sys.path:
    sys.path.append(imagebindmodels_path)
imagebindbpe_path = os.path.abspath('./bpe')
if imagebindbpe_path not in sys.path:
    sys.path.append(imagebindbpe_path)

import data
from models import imagebind_model
from models.imagebind_model import ModalityType

Collecting pytorchvideo@ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Using cached pytorchvideo-0.1.5-py3-none-any.whl




In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Instantiate ImageBind model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model = model.to(device)

In [5]:
from torchaudio.datasets import SPEECHCOMMANDS
import os

class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("./data", download=True)

        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]

print("loading training data...")
train_set = SubsetSC("training")
print("loading testing data...")
test_set = SubsetSC("testing")
print("loading validation data...")
val_set = SubsetSC("validation")

labels = sorted(list(set(datapoint[2] for datapoint in train_set)))
labels.append("unknown")
labels.extend("silence")

loading training data...
loading testing data...
loading validation data...


In order to streamline the program, helper functions were created

In [6]:
def classify_audio(filepath):
    """
    classify an audio recording from the keyword dataset
    
    inputs:
        filepath(string): file path to the audio file
        
    returns:
        label(string): classified label for the audio file
        prob(float): probability for generated label
    """
    text_list=labels
    audio_paths=[filepath]

    # Load data
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
        ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    probs = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)
    index = np.argmax(probs.cpu().numpy())
    prob = np.max(probs.cpu().numpy())
    label = labels[index]
    return label, prob

def classify_only_audio(filepath):
    """
    classify an audio recording from the keyword dataset
    
    inputs:
        filepath(string): file path to the audio file
        
    returns:
        label(string): classified label for the audio file
        prob(float): probability for generated label
    """
    text_list=labels
    audio_paths=[filepath]

    # Load data
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
        ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    probs = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)
    index = np.argmax(probs.cpu().numpy())
    label = labels[index]
    return label

def float_to_percent(value):
    percent = value * 100
    percent_with_two_decimals = "{:.2f}%".format(percent)
    return percent_with_two_decimals

def calculate_accuracy(list_a, list_b):
    if len(list_a) != len(list_b):
        raise ValueError("Lists must have the same length.")

    total_elements = len(list_a)
    score = 0
    for i in range(total_elements):
        if list_a[i] == list_b[i]:
                score+=1
    accuracy = score/total_elements
    return accuracy

def evaluate_audio(filepaths, ground_truth_labels):
    """
    classify an audio recording from the keyword dataset
    
    inputs:
        filepath(string[]): file paths to the audio file
        
    returns:
        labels(string[]): classified label for the audio file
        probs(float[]): probabilities for each generated label
    """
    text_list=labels
    audio_paths=filepaths

    # Load data
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
        ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    label_probs = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)
    generated_labels = []
    generated_probs = []
    for prob in label_probs:
        index = np.argmax(prob.cpu().numpy())
        prob = np.max(prob.cpu().numpy())
        label = labels[index]
        generated_labels.append(label)
        generated_probs.append(prob)
    accuracy = calculate_accuracy(ground_truth_labels, generated_labels)
    accuracy = float_to_percent(accuracy)
    return accuracy

In [7]:
# get one random sample in the validation set and classify it
sample = random.choice(test_set)
waveform, sample_rate, label, speaker_id, utterance_number = sample
print("ground truth label:", label)
sample_path = f"data/SpeechCommands/speech_commands_v0.02/{label}/{speaker_id}_nohash_{utterance_number}.wav"

generated_label, prob = classify_audio(sample_path)
print("generated label:", generated_label, "confidence:", float_to_percent(prob))

if (label == generated_label):
    print("accurate prediction!")
else:
    print("wrong prediction!")
    
# display audio player
ipd.Audio(waveform.numpy(), rate=sample_rate)

ground truth label: one
generated label: up confidence: 86.94%
wrong prediction!


# Evaluation of ImageBind (pretrained) Zero-Shot Performance

Loading a number of random samples from the validation dataset and evaluating the generated label from the ground truth labels gives an idea as to how performant ImageBind is

*it might also be recommended to test ImageBind in the training and test dataset as well, but it is not expecte to show different results as can be explained below*

### Number of Samples
- a higher count will utilize a greater sample size, improving the statistical performance of the evaluation
- However, a greater number of samples will use more VRAM which can be a bottleneck. In my testing, evaluating 500 samples already uses 21 GB (Nvidia L4). This evaluation roughly takes 12s

## Lackluster Performance
- in the sample below, an accuracy of only 3.8% is achieved. This poor performance is reflected in many runs (roughly 2%-4%), which leads to the impression that ImageBind is not a good zero-shot speech classification model, at least in the KWS Google Speech Commands 35-word dataset. Why is this?

### ImageBind wasn't specifically designed for KWS
ImageBind, as the creators have noted, envisions to create multimodal AI systems. This entails emergent zero-shot recognition tasks across modalities — even better than prior specialist models trained specifically for those modalities. Thus, it is not unreasonable to observe that ImageBind will not perform well at a highly specialized task

### Use of pretrained model
- Since this program is using a pretrained model for a multi-modal system that incorporates text, image, video, audio, depth, thermal, and IMU, on a training dataset that is not publicized, it's poor performance compared to SOTA modals that were trained on the actual Google Speech Commands v2 training data.

### Audio Sample Length
- It is also important to note that the audio sample lengths (usually 1-2 seconds) is considered short compared to the demos and audio lengths that ImageBind showcases in their blog. This indicates that rather short audio lengths is not what ImageBind is trained on. This can be seen in the error logs of ImageBind as well, which indicates to the user the following:

```
WARNING:root:Large gap between audio n_frames(48) and target_length (204). Is the audio_target_length setting correct
```

In [8]:
# get n random samples to evaluate imagebind performance
count = 500 
print(f"evaluating {count} random samples...")
samples = random.choices(test_set, k=count)
sample_paths = []
ground_truth_labels = []
for sample in samples:
    waveform, sample_rate, label, speaker_id, utterance_number = sample
    sample_path = f"data/SpeechCommands/speech_commands_v0.02/{label}/{speaker_id}_nohash_{utterance_number}.wav"
    sample_paths.append(sample_path)
    ground_truth_labels.append(label)

accuracy = evaluate_audio(sample_paths, ground_truth_labels)
print(f"ImageBind Zero-Shot Performance:\n{accuracy} with {count} samples")


evaluating 500 random samples...


ImageBind Zero-Shot Performance:
3.80% with 500 samples


# Gradio

I used gradio to quickly create an easy-to-use UI for the app with a microphone as input and the generated label as output. This is good because it utilizes the `classify` function from earlier. Additionally, setting `share=True` allows easy sharing of the app through the Internet.



In [9]:
import gradio as gr

demo = gr.Interface(
    fn=classify_only_audio,     
    inputs=gr.Audio(source="microphone", type="filepath"), 
    outputs="text").launch(share=True)

demo

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://2f19b0868dffdd60f1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




# IPyWebRTC as Audio Recorder

Utilize ipywebrtc with ipywidgets in order to record audio from user's microphone and classify audio file using KWS label dataset

*Note: might experience problems running in SSH environments*

In [10]:
from ipywebrtc import AudioRecorder, CameraStream

camera = CameraStream(constraints={'audio': True,'video':False})
recorder = AudioRecorder(stream=camera, filename="input", format="wav")

recorder

AudioRecorder(audio=Audio(value=b'', format='wav'), filename='input', format='wav', stream=CameraStream(constr…

In [11]:

recorder.save("input.wav")
generated_label, prob = classify_audio("input.wav")

print("generated label:", generated_label, "confidence:", float_to_percent(prob))


ValueError: No data, did you record anything?

