<a href="https://colab.research.google.com/github/shreeya-la/audio-ml/blob/main/vggish_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and Import

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [2]:
!pip install tf_keras



In [3]:
import tf_keras as tfk

In [4]:
import pandas as pd

In [66]:
!pip install soundfile
!pip install librosa



In [67]:
import librosa

# VGGish

In [18]:
# Load VGGish through TensorFlow Hub from its model handle
vggish_handle = 'https://www.kaggle.com/models/google/vggish/TensorFlow2/vggish/1'
vgg_model = hub.load(vggish_handle)

In [30]:
# Dummy input for a smoke test: 3 s of silence as a mono waveform sampled at 16 kHz
n_seconds, sample_rate = 3, 16000
waveform = np.zeros(n_seconds * sample_rate, dtype=np.float32)
waveform.shape

(48000,)

In [21]:
# Feed the silent waveform through VGGish
embeddings = vgg_model(waveform)
# Any number of rows is acceptable, but each row must be a 128-dimensional embedding
embeddings.shape.assert_is_compatible_with(tf.TensorShape([None, 128]))

In [22]:
# Show the shape of the embeddings produced for the 3-second silent input
embeddings.shape

TensorShape([3, 128])

# Download ESC50

In [10]:
!test ! -f "master.zip" && wget "https://github.com/karoldvl/ESC-50/archive/master.zip"

--2024-09-18 18:29:05--  https://github.com/karoldvl/ESC-50/archive/master.zip
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/karolpiczak/ESC-50/archive/master.zip [following]
--2024-09-18 18:29:05--  https://github.com/karolpiczak/ESC-50/archive/master.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master [following]
--2024-09-18 18:29:05--  https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.114.10
Connecting to codeload.github.com (codeload.github.com)|140.82.114.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [<=>

In [11]:
!unzip -qq master.zip

In [12]:
# Work from inside the extracted dataset folder: later cells open 'meta/esc50.csv' and 'audio/<file>' relative to it.
# NOTE(review): re-running this cell without restarting the kernel would attempt the cd from inside ESC-50-master — confirm that is acceptable.
%cd ESC-50-master

/content/ESC-50-master


In [40]:
# Read the ESC-50 metadata table into a DataFrame and preview its first rows
metadata_csv = 'meta/esc50.csv'
esc50 = pd.read_csv(metadata_csv)
esc50.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [51]:
# The 10 ESC-50 categories that make up the animals group
animals = ['dog', 'rooster', 'pig', 'cow', 'frog', 'cat', 'hen', 'insects', 'sheep', 'crow']

# Keep only the clips labelled with one of those categories
esc50_animals = esc50[esc50['category'].isin(animals)]

# Expected size is 40 clips per category, i.e. 40 * 10 = 400 rows; fail fast if the filter drifts
expected_rows = 40 * len(animals)
assert len(esc50_animals) == expected_rows, f"expected {expected_rows} animal clips, got {len(esc50_animals)}"

# A bare last expression gives the notebook's rich table rendering instead of plain printed text
esc50_animals

              filename  fold  target category  esc10  src_file take
0     1-100032-A-0.wav     1       0      dog   True    100032    A
8     1-103298-A-9.wav     1       9     crow  False    103298    A
14    1-110389-A-0.wav     1       0      dog   True    110389    A
29    1-121951-A-8.wav     1       8    sheep  False    121951    A
45     1-15689-A-4.wav     1       4     frog  False     15689    A
...                ...   ...     ...      ...    ...       ...  ...
1983  5-261325-A-9.wav     5       9     crow  False    261325    A
1994  5-263831-A-6.wav     5       6      hen  False    263831    A
1995  5-263831-B-6.wav     5       6      hen  False    263831    B
1998   5-61635-A-8.wav     5       8    sheep  False     61635    A
1999    5-9032-A-0.wav     5       0      dog   True      9032    A

[400 rows x 7 columns]


# Sample of ESC50 with VGGish

In [31]:
# Helper that reads one audio file and returns it as a waveform tensor for the model

def preprocess_wav(filename, target_sr=16000):
    """Load an audio file as a mono float32 waveform tensor sampled at `target_sr`.

    Args:
        filename: Path of the audio file to read.
        target_sr: Sample rate in Hz of the returned waveform (default 16000).

    Returns:
        A 1-D `tf.float32` tensor holding the mono samples at `target_sr`.
    """
    # mono=True (librosa's default, spelled out here) downmixes multi-channel
    # files, and sr=target_sr resamples while loading, so no separate
    # resample / to_mono step is needed afterwards.
    audio, _ = librosa.load(filename, sr=target_sr, mono=True)

    return tf.convert_to_tensor(audio, dtype=tf.float32)

In [63]:
# Take the first clip of the animal subset and preprocess it
sample_file = esc50_animals['filename'].iloc[0]
esc_single_waveform = preprocess_wav(f'audio/{sample_file}')

# The clip is 5 seconds long, so the expected shape is (80000,)
esc_single_waveform.shape

TensorShape([80000])

In [36]:
# Embed the sample clip
embeddings = vgg_model(esc_single_waveform)
# Any number of rows is acceptable, but each row must be a 128-dimensional embedding
embeddings.shape.assert_is_compatible_with(tf.TensorShape([None, 128]))

In [65]:
# Shape of the sample clip's embeddings; should be [5, 128]
embeddings.shape

TensorShape([5, 128])

# Test ESC50 with VGGish

In [70]:
# Collect one VGGish output per clip in the animal subset, in DataFrame order.
# Iterating the 'filename' column directly avoids building a full row Series per clip
# (as iterrows() does), and the comprehension does not overwrite the single-clip
# `embeddings` variable computed in the previous section.
embeddings_list = [
    vgg_model(preprocess_wav('audio/' + filename))
    for filename in esc50_animals['filename']
]

In [71]:
# Number of clips embedded; one entry is collected per row of esc50_animals
len(embeddings_list)

400

In [74]:
# Shape of the embeddings collected for the first clip
embeddings_list[0].shape

TensorShape([5, 128])