In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import warnings
warnings.filterwarnings("ignore")

import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt


## Install VGGish Model

In [2]:
!pip install --upgrade numpy==1.21.6 resampy==0.2.2 tensorflow==2.8.2 tf_slim==1.1.0 six soundfile
!git clone https://github.com/tensorflow/models.git
# Grab the VGGish model
!curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt
!curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz   
# Copy the source files to the current directory.
!cp models/research/audioset/vggish/* /kaggle/working/
!rm -rf models

[0mCloning into 'models'...
remote: Enumerating objects: 86202, done.[K
remote: Counting objects: 100% (1801/1801), done.[K
remote: Compressing objects: 100% (759/759), done.[K
remote: Total 86202 (delta 1156), reused 1634 (delta 1023), pack-reused 84401[K
Receiving objects: 100% (86202/86202), 598.86 MiB | 26.45 MiB/s, done.
Resolving deltas: 100% (61649/61649), done.
curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  277M  100  277M    0     0   214M      0  0:00:01  0:00:01 --:--:--  214M
curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 73020  100 73020    0     0  1249k      0 --:--:--

In [3]:
# Run the test, which also loads all the necessary functions.
# NOTE(review): the star import runs the smoke test as an import side effect
# (its report is printed below this cell) and pulls every public name of
# vggish_smoke_test into the notebook namespace, so the origin of those names
# is not visible from this notebook.
# NOTE(review): this presumably also initializes TensorFlow inside the kernel
# process; if that reserves GPU memory it may be related to the
# RESOURCE_EXHAUSTED failure of the later `!python main.py` run -- TODO confirm.
from vggish_smoke_test import *


Testing your install of VGGish

Log Mel Spectrogram example:  [[-4.47303259 -4.29463765 -4.14939193 ... -3.97474254 -3.94778045
  -3.78685566]
 [-4.48592983 -4.28831745 -4.13994942 ... -3.98374974 -3.94981089
  -3.79512755]
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]
 ...
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]
 [-4.46165595 -4.29335712 -4.14907932 ... -3.96438562 -3.9489109
  -3.78621325]]
VGGish embedding:  [-0.43252096 -0.25330514 -0.03891924 -0.16376    -0.34991813 -0.5993693
 -0.05658102  0.16280255 -0.75551754 -0.08260241 -0.03138599 -0.8314715
 -0.10581703 -0.01420267 -0.11077996 -0.06599088 -0.22666308  0.8060125
 -0.56459844 -0.07349288 -0.06056745 -0.11864144 -0.2629044  -0.4155161
 -0.02423218  0.36676204  0.03564969 -0.5499773  -0.00279096 -0.28981668
 -0.5713452   0.381078    0.13668716  0.91885793  0.8064256  -0.05767

## Generate audio embeddings using the pretrained VGGish model

In [4]:
%%writefile feature_extractor.py
import os
import numpy as np
import soundfile as sf
import tensorflow.compat.v1 as tf
import vggish_input
import vggish_params
import vggish_postprocess
import vggish_slim

class FeatureExtractor:
    """Wraps a pretrained VGGish model to turn audio waveforms into embeddings.

    Typical use: construct, call ``load_model()`` once, then call
    ``extract_features()`` or iterate ``audio_embeddings_generator()``.
    Call ``close()`` (or use the instance as a context manager) when done so
    the TensorFlow session is released.
    """

    def __init__(self, checkpoint_path, pca_params_path):
        """Create the session; the model itself is loaded by ``load_model()``.

        Args:
            checkpoint_path: Path to the VGGish checkpoint file.
            pca_params_path: Path to the parameters file handed to
                ``vggish_postprocess.Postprocessor``.
        """
        self.checkpoint_path = checkpoint_path
        self.pca_params_path = pca_params_path
        # A dedicated graph keeps the VGGish ops out of the default graph.
        self.sess = tf.Session(graph=tf.Graph())
        self.pproc = None
        self.features_tensor = None
        self.embedding_tensor = None

    def __enter__(self):
        """Allow ``with FeatureExtractor(...) as fx:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the session when leaving the ``with`` block."""
        self.close()

    def close(self):
        """Release the TensorFlow session held by this extractor."""
        self.sess.close()

    def load_model(self):
        """Build the VGGish graph, restore the checkpoint and set up post-processing."""
        with self.sess.graph.as_default():
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(self.sess, self.checkpoint_path)
            self.features_tensor = self.sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
            self.embedding_tensor = self.sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
            self.pproc = vggish_postprocess.Postprocessor(self.pca_params_path)

    def extract_features(self, waveform, sample_rate=None):
        """Compute post-processed VGGish embeddings for one waveform.

        Args:
            waveform: Audio samples as accepted by
                ``vggish_input.waveform_to_examples``.
            sample_rate: Sample rate of ``waveform`` in Hz. Defaults to
                ``vggish_params.SAMPLE_RATE`` (the previous, implicit
                behaviour) when omitted.

        Returns:
            The output of ``Postprocessor.postprocess`` for the embeddings of
            all examples cut from the waveform.

        Raises:
            RuntimeError: If ``load_model()`` has not been called yet.
        """
        if self.features_tensor is None or self.embedding_tensor is None or self.pproc is None:
            raise RuntimeError('load_model() must be called before extract_features()')
        if sample_rate is None:
            sample_rate = vggish_params.SAMPLE_RATE
        # Passing the real sample rate lets waveform_to_examples account for
        # audio that was not recorded at the model's expected rate.
        examples = vggish_input.waveform_to_examples(waveform, sample_rate)
        [embedding_batch] = self.sess.run([self.embedding_tensor],
                                          feed_dict={self.features_tensor: examples})
        return self.pproc.postprocess(embedding_batch)

    def audio_embeddings_generator(self, audio_folder, samples_per_folder=100):
        """Yield ``(embeddings, labels)`` batches for the .mp3 files under a folder.

        Every immediate sub-folder of ``audio_folder`` is treated as one class;
        at most ``samples_per_folder`` of its ``.mp3`` files are used. Folder
        and file names are sorted so the selected subset is the same on every
        run (``os.listdir`` order is arbitrary).

        Args:
            audio_folder: Directory whose sub-folders contain the audio files.
            samples_per_folder: Maximum files taken per sub-folder; also the
                number of files per yielded batch.

        Yields:
            Tuples ``(batch_embeddings, batch_labels)`` of NumPy arrays with one
            entry per audio file. The last batch may be smaller.
        """
        audio_files = []
        for folder_name in sorted(os.listdir(audio_folder)):
            folder_path = os.path.join(audio_folder, folder_name)
            if not os.path.isdir(folder_path):
                continue
            files_in_folder = sorted(
                os.path.join(folder_path, file_name)
                for file_name in os.listdir(folder_path)
                if file_name.endswith('.mp3')
            )
            audio_files.extend(files_in_folder[:samples_per_folder])

        batch_embeddings = []
        batch_labels = []
        for audio_file in audio_files:
            # Keep the file's own sample rate instead of assuming the model's.
            waveform, sample_rate = sf.read(audio_file)
            embedding = self.extract_features(np.asarray(waveform), sample_rate)
            batch_embeddings.append(embedding)
            batch_labels.append(self.extract_label_from_audio_file(audio_file))
            if len(batch_embeddings) == samples_per_folder:
                yield self._stack_batch(batch_embeddings), np.array(batch_labels)
                batch_embeddings = []
                batch_labels = []

        if batch_embeddings:
            yield self._stack_batch(batch_embeddings), np.array(batch_labels)

    @staticmethod
    def _stack_batch(embeddings):
        """Combine per-file embeddings into a single array.

        Same-shaped embeddings are stacked along a new first axis. Embeddings
        of differing shapes (e.g. clips of different lengths) are stored in a
        1-D object array, built explicitly rather than through NumPy's
        deprecated implicit ragged-array creation.
        """
        if len({np.shape(embedding) for embedding in embeddings}) == 1:
            return np.stack(embeddings)
        ragged = np.empty(len(embeddings), dtype=object)
        for index, embedding in enumerate(embeddings):
            ragged[index] = embedding
        return ragged

    @staticmethod
    def extract_label_from_audio_file(audio_file):
        """Return the name of the directory that directly contains ``audio_file``."""
        label = os.path.basename(os.path.dirname(audio_file))
        return label


Overwriting feature_extractor.py


In [5]:
%%writefile main.py
import csv
import numpy as np
import soundfile as sf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import tqdm
import tensorflow.compat.v1 as tf
from feature_extractor import FeatureExtractor

# Command-line flags for this script: name, default value, help text.
tf.compat.v1.flags.DEFINE_string(
    'audio_folder',
    '/kaggle/input/audio-dataset-with-10-indian-languages/Language Detection Dataset',
    'Path to the folder containing audio samples',
)
tf.compat.v1.flags.DEFINE_string(
    'checkpoint',
    '/kaggle/working/vggish_model.ckpt',
    'Path to the VGGish checkpoint file',
)
tf.compat.v1.flags.DEFINE_string(
    'pca_params',
    '/kaggle/working/vggish_pca_params.npz',
    'Path to the VGGish PCA parameters file',
)
tf.compat.v1.flags.DEFINE_integer(
    'samples_per_folder',
    100,
    'Number of samples to include per folder',
)

# Shared handle through which main() reads the flag values.
FLAGS = tf.compat.v1.flags.FLAGS


def main(_):
    """Extract VGGish embeddings for the configured audio folder and report their shapes.

    Reads ``FLAGS.checkpoint``, ``FLAGS.pca_params``, ``FLAGS.audio_folder`` and
    ``FLAGS.samples_per_folder``. The positional argument (argv supplied by the
    app runner) is unused.

    Raises:
        ValueError: If the generator yields no batches, i.e. no audio was found.
    """
    feature_extractor = FeatureExtractor(FLAGS.checkpoint, FLAGS.pca_params)

    embeddings = []
    labels = []
    try:
        feature_extractor.load_model()
        progress_bar = tqdm.tqdm(
            feature_extractor.audio_embeddings_generator(FLAGS.audio_folder, FLAGS.samples_per_folder),
            desc='Extracting embeddings')
        for batch_embeddings, batch_labels in progress_bar:
            embeddings.append(batch_embeddings)
            labels.append(batch_labels)
    finally:
        # Release the TensorFlow session even when loading or extraction fails.
        feature_extractor.sess.close()

    if not embeddings:
        # np.concatenate([]) would fail with an unhelpful "need at least one
        # array" message; say what is actually wrong instead.
        raise ValueError('No .mp3 files found in the sub-folders of %r' % FLAGS.audio_folder)

    # NOTE(review): concatenation assumes every batch has the same trailing
    # dimensions, i.e. every clip yields the same number of embedding frames
    # -- TODO confirm for this dataset.
    embeddings = np.concatenate(embeddings, axis=0)
    labels = np.concatenate(labels, axis=0)

    print('Embeddings shape:', embeddings.shape)
    print('Labels shape:', labels.shape)

    # Train models


if __name__ == '__main__':
    # Hand control to TensorFlow's app runner, which invokes main().
    tf.compat.v1.app.run(main=main)


Overwriting main.py


In [6]:

%timeit
!python main.py

I0626 13:35:41.128499 134144635717440 saver.py:1395] Restoring parameters from /kaggle/working/vggish_model.ckpt
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/client/session.py", line 1377, in _do_call
    return fn(*args)
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/client/session.py", line 1360, in _run_fn
    return self._call_tf_sessionrun(options, feed_dict, fetch_list,
  File "/opt/conda/lib/python3.10/site-packages/tensorflow/python/client/session.py", line 1453, in _call_tf_sessionrun
    return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.ResourceExhaustedError: 2 root error(s) found.
  (0) RESOURCE_EXHAUSTED: SameWorkerRecvDone unable to allocate output tensor. Key: /job:localhost/replica:0/task:0/device:CPU:0;0000000000000001;/job:localhost/replica:0/task:0/device:GPU:0;edge_26_vggish_load_pretrained/RestoreV2;0:0
	 [[{{node vggish_load_p