<a href="https://colab.research.google.com/github/shreeya-la/audio-ml/blob/main/yamnet_(original_and_simple).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import and Download ESC50

In [None]:
#!pip install tensorflow --upgrade

In [None]:
# !pip uninstall tensorflow
# !pip uninstall tensorflow-io

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.layers import Layer

In [None]:
!test ! -f "master.zip" && wget "https://github.com/karoldvl/ESC-50/archive/master.zip"

--2024-11-06 19:11:12--  https://github.com/karoldvl/ESC-50/archive/master.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/karolpiczak/ESC-50/archive/master.zip [following]
--2024-11-06 19:11:13--  https://github.com/karolpiczak/ESC-50/archive/master.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master [following]
--2024-11-06 19:11:13--  https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.114.10
Connecting to codeload.github.com (codeload.github.com)|140.82.114.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [   

In [None]:
!unzip -qq master.zip
%cd ESC-50-master

/content/ESC-50-master


In [None]:
# Read the ESC-50 metadata table into a DataFrame and preview its first rows
esc50_data = pd.read_csv(filepath_or_buffer='meta/esc50.csv')
esc50_data.head(n=5)

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [None]:
# The ten animal categories kept from ESC-50; a category's position in this list is its class id
animals = ['dog', 'rooster', 'pig', 'cow', 'frog', 'cat', 'hen', 'insects', 'sheep', 'crow']
map_class_to_id = {name: class_idx for class_idx, name in enumerate(animals)}

# Keep only the rows whose category is one of the animals above,
# then overwrite `target` with the new 0-9 class ids
is_animal = esc50_data['category'].isin(animals)
esc50_animals = esc50_data[is_animal]
class_id = esc50_animals['category'].map(map_class_to_id)
esc50_animals = esc50_animals.assign(target=class_id)

esc50_animals.head(10)

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
8,1-103298-A-9.wav,1,9,crow,False,103298,A
14,1-110389-A-0.wav,1,0,dog,True,110389,A
29,1-121951-A-8.wav,1,8,sheep,False,121951,A
45,1-15689-A-4.wav,1,4,frog,False,15689,A
46,1-15689-B-4.wav,1,4,frog,False,15689,B
49,1-16568-A-3.wav,1,3,cow,False,16568,A
64,1-17585-A-7.wav,1,7,insects,False,17585,A
69,1-17970-A-4.wav,1,4,frog,False,17970,A
70,1-18074-A-6.wav,1,6,hen,False,18074,A


In [None]:
def preprocess_wav(filename, target_sr=16000):
    """Load an audio file as a mono float32 tensor sampled at ``target_sr``.

    Args:
        filename: Path of the audio file to read.
        target_sr: Sample rate in Hz that the audio is resampled to
            (default 16000).

    Returns:
        A 1-D ``tf.float32`` tensor holding the mono, resampled waveform.
    """
    # librosa.load down-mixes to mono (mono=True) and resamples to `sr` in a
    # single call, so no separate resample step or channel check is needed.
    # Asking for mono explicitly keeps that intent visible at the call site.
    audio, _ = librosa.load(filename, sr=target_sr, mono=True)

    # Convert to float32 TensorFlow tensor
    return tf.convert_to_tensor(audio, dtype=tf.float32)

# YAMNet

In [None]:
# TF-Hub handle of the pretrained YAMNet model; the same handle is reused
# further down when the serving model is assembled
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'

# Load the model object that is called directly on waveform tensors below
yamnet = hub.load(handle=yamnet_model_handle)

In [None]:
# Per-clip YAMNet embeddings (the classifier inputs) and the matching category
# names (the labels), gathered in the row order of esc50_animals
embeddings_list = []
labels_list = []

for filename, label in zip(esc50_animals['filename'], esc50_animals['category']):
    # load the clip from the dataset's audio/ folder as a preprocessed waveform tensor
    esc_waveform = preprocess_wav('audio/' + filename)

    # YAMNet returns (scores, embeddings, spectrogram); only the embeddings are kept
    scores, embeddings, spectrogram = yamnet(esc_waveform)

    embeddings_list.append(embeddings)
    labels_list.append(label)

In [None]:
# %cd /content/ESC-50-master/

/content/ESC-50-master


In [None]:
# Sanity check: push a single clip through preprocessing and YAMNet and report the shapes
sample_file = esc50_animals['filename'].iloc[0]
esc_single_waveform = preprocess_wav('audio/' + sample_file)

scores, embeddings, spectrogram = yamnet(esc_single_waveform)

print("processed waveform:", esc_single_waveform.shape)
print("yamnet embeddings:", embeddings.shape)

processed waveform: (80000,)
yamnet embeddings: (10, 1024)


In [None]:
# Gather the per-clip embeddings into one feature array and the category names into a label array
X = np.asarray(embeddings_list)
y = np.asarray(labels_list)

In [None]:
# One-hot encode the category names. The fitted encoder is kept because
# lb.classes_ records which category each one-hot column stands for; that
# order is the sorted label order, not the order of the `animals` list.
lb = LabelBinarizer()
lb.fit(y)
y_encoded = lb.transform(y)

In [None]:
# Hold out 20% of the clips for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One output unit per animal category:
# ['dog', 'rooster', 'pig', 'cow', 'frog', 'cat', 'hen', 'insects', 'sheep', 'crow']
num_classes = 10

# Classifier head fed with YAMNet embeddings: a (10, 1024) block of embedding
# frames is averaged over its first axis and passed to a softmax layer.
classifier_layers = [
    tf.keras.layers.InputLayer(shape=[10, 1024]),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
]
model = tf.keras.Sequential(classifier_layers)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop when the training loss has not improved for 3 epochs and restore the best weights seen
callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Train the classifier head; 20% of the training data is set aside for validation each run
history = model.fit(
    X_train,
    y_train,
    epochs=40,
    batch_size=32,
    validation_split=0.2,
    callbacks=[callback],
)

Epoch 1/40
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.1653 - loss: 2.3180 - val_accuracy: 0.3438 - val_loss: 2.0562
Epoch 2/40
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4644 - loss: 1.9266 - val_accuracy: 0.5000 - val_loss: 1.7366
Epoch 3/40
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5928 - loss: 1.6118 - val_accuracy: 0.6562 - val_loss: 1.4490
Epoch 4/40
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7379 - loss: 1.3816 - val_accuracy: 0.7500 - val_loss: 1.2818
Epoch 5/40
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8055 - loss: 1.2369 - val_accuracy: 0.8438 - val_loss: 1.1436
Epoch 6/40
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8542 - loss: 1.0994 - val_accuracy: 0.8750 - val_loss: 1.0013
Epoch 7/40
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━

In [None]:
# Score the trained classifier on the held-out test split
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Test accuracy: {accuracy:.4f}')

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9492 - loss: 0.3591  
Test accuracy: 0.9375


# Save Model

In [None]:
# Leave the dataset folder: the model exports below use paths relative to the
# working directory ('./yamnet_and_classifier', './yamnet.tflite'), so they land in /content.
%cd /content

/content


In [None]:
# Destination (relative to the current working directory) passed to tf.saved_model.save below
saved_model_path = './yamnet_and_classifier'

In [None]:
class ReduceMeanLayer(tf.keras.layers.Layer):
    """Keras layer that averages its input tensor along one fixed axis.

    Args:
        axis: Axis handed to ``tf.math.reduce_mean``; defaults to 0.
        **kwargs: Forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, axis=0, **kwargs):
        super().__init__(**kwargs)
        self.axis = axis

    def call(self, input):
        # `input` shadows the builtin; the name is kept so the call signature is unchanged
        averaged = tf.math.reduce_mean(input, axis=self.axis)
        return averaged

class YAMNetLayer(Layer):
    """Keras layer that wraps a frozen TF-Hub YAMNet model and returns its embeddings.

    Args:
        yamnet_model_handle: TF-Hub handle given to ``hub.KerasLayer``.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, yamnet_model_handle, **kwargs):
        super().__init__(**kwargs)
        # trainable=False keeps the pretrained weights fixed
        self.yamnet_layer = hub.KerasLayer(
            yamnet_model_handle,
            trainable=False,
            name='yamnet',
        )

    def call(self, inputs):
        # Flatten everything, batch axis included, into a single 1-D waveform.
        # NOTE(review): a batch with several clips would be concatenated into
        # one signal here -- presumably only batch size 1 is intended; confirm.
        waveform = tf.reshape(inputs, [-1])
        # the wrapped model yields three outputs; the middle one is the embeddings
        yamnet_outputs = self.yamnet_layer(waveform)
        _, embeddings_output, _ = yamnet_outputs
        return embeddings_output

class ExpandDimensions(tf.keras.layers.Layer):
    """Insert a new axis at position 1 and repeat the input 10 times along it.

    A rank-2 input of shape (a, b) comes out as (a, 10, b), with every slice
    along the new axis being a copy of the original row.
    """

    def call(self, inputs):
        repeated = tf.tile(tf.expand_dims(inputs, axis=1), [1, 10, 1])
        return repeated

# End-to-end serving model: raw audio samples in, one averaged score vector out.
# 80000 samples matches the length of one preprocessed clip.
input_segment = tf.keras.layers.Input(name='audio', shape=(80000,), dtype=tf.float32)

# waveform -> YAMNet embeddings
embedding_extraction_layer = YAMNetLayer(yamnet_model_handle)
embeddings_output = embedding_extraction_layer(input_segment)

# give the embeddings the (batch, 10, 1024) layout the trained classifier was built for
expanded_embeddings_output = ExpandDimensions()(embeddings_output)

# run the trained classifier, then average its outputs over axis 0
classifier_head = ReduceMeanLayer(axis=0, name='classifier')
serving_outputs = classifier_head(model(expanded_embeddings_output))

# wrap the graph as a Keras model and export it with tf.saved_model.save
serving_model = tf.keras.Model(inputs=input_segment, outputs=serving_outputs)
tf.saved_model.save(serving_model, saved_model_path)

In [None]:
#tf.keras.utils.plot_model(serving_model)

In [None]:
# Convert the Keras serving model to TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(serving_model)
tflite_model = converter.convert()

# Write the converted model to disk in binary mode
with open('./yamnet.tflite', mode='wb') as tflite_file:
    tflite_file.write(tflite_model)

Saved artifact at '/tmp/tmppye6y96i'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 80000), dtype=tf.float32, name='audio')
Output Type:
  TensorSpec(shape=(10,), dtype=tf.float32, name=None)
Captures:
  135973122956048: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122949536: TensorSpec(shape=(32,), dtype=tf.float32, name=None)
  135973122962208: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122952176: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122949712: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122950416: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122951648: TensorSpec(shape=(32,), dtype=tf.float32, name=None)
  135973122950768: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122952000: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122952528: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135973122953584: TensorSpec

In [None]:
# Relative path of the first clip in esc50_animals.
# NOTE(review): no later cell shown in this notebook reads wav_file_path -- confirm it is still needed.
wav_file_path = 'audio/' + esc50_animals.iloc[0]['filename']

# Reload and test

In [None]:
# Load the exported TFLite file into an interpreter.
# The path is relative, so this must run from the directory the .tflite file was written to.
tflite_model_path = './yamnet.tflite'
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)

In [None]:
# Return to the dataset root so the relative 'audio/...' paths in the test cells below resolve
%cd /content/ESC-50-master

/content/ESC-50-master


In [None]:
# Choose one clip by its row position in esc50_animals and shape it as a batch of one
test_num = 9
sample_file = esc50_animals['filename'].iloc[test_num]
waveform = preprocess_wav('audio/' + sample_file)
input_data = tf.expand_dims(waveform, axis=0)

In [None]:
# Tensor buffers have to be allocated before any tensor is written
interpreter.allocate_tensors()

# Look up the first input tensor and copy the batched waveform into it
input_details = interpreter.get_input_details()
input_index = input_details[0]['index']
interpreter.set_tensor(input_index, input_data)

# run inference
interpreter.invoke()

In [None]:
# Read back the model's output scores and pick the highest-scoring class
output_details = interpreter.get_output_details()
output_data = interpreter.get_tensor(output_details[0]['index'])

predicted_index = np.argmax(output_data)

print("True Class Label:", esc50_animals.iloc[test_num]['category'])

# The classifier was trained on the one-hot columns produced by `lb`, so output
# index i stands for lb.classes_[i]. The hand-written `animals` list is in a
# different order (index 5 is 'hen' in lb.classes_ but 'cat' in animals), so
# decoding with it mislabels the prediction.
predicted_label = lb.classes_[predicted_index]
print("Predicted Class Label:", predicted_label)

True Class Label: hen
Predicted Class Label: cat


In [None]:
results = []
count = 0

# Buffer allocation and the input/output tensor descriptions do not change
# between clips, so they are set up once instead of on every iteration.
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
input_index = input_details[0]['index']
output_index = output_details[0]['index']

# for each file in the esc50_animals
for filename, true_label in zip(esc50_animals['filename'], esc50_animals['category']):

    # preprocess and add the leading batch axis the model input expects
    wav = preprocess_wav('audio/' + filename)
    input_data = tf.expand_dims(wav, axis=0)

    interpreter.set_tensor(input_index, input_data)
    interpreter.invoke()
    output_data = interpreter.get_tensor(output_index)

    predicted_index = np.argmax(output_data)

    # Output index i corresponds to lb.classes_[i] (the one-hot column order the
    # classifier was trained on), not to animals[i]; decoding with `animals`
    # scores correct predictions as wrong.
    predicted_label = str(lb.classes_[predicted_index])

    if predicted_label == true_label:
        count += 1

    # Store the result
    results.append({
        'predicted class': predicted_label,
        'true class': true_label
    })

#results_df = pd.DataFrame(results)
# Divide by the number of clips actually evaluated rather than a hard-coded 400
print("Accuracy: ", count / len(esc50_animals))

Accuracy:  0.0925
