In [None]:
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import os
import glob
import pandas as pd
import pickle
import time
import warnings
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc,roc_auc_score,precision_recall_curve,average_precision_score,f1_score,precision_score,recall_score
import scipy
warnings.filterwarnings("ignore")
import transformers as ppb
import huggingface_hub as hf_hub
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup, AutoModel
import imageio
from time import sleep
import timeit
import cv2
import shutil
from joblib import Parallel, delayed

In [None]:
#download audio dataset
if not os.path.exists('./ESC-50-master'):
    if not os.path.exists('./master'):
        !wget https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master
    !unzip master
    if os.path.exists('./master'):
        !rm master

# collect a list of audio files
audio_files = glob.glob('./ESC-50-master/audio/*.wav')

# divide audiofiles into train and test
train_files, test_files = train_test_split(audio_files, test_size=0.2, random_state=42)

# view folder structure and filenames
#!ls -l ESC-50-master


In [None]:

# view metadata 
#!cat ESC-50-master/meta/esc50.csv

In [None]:
class audio_search:
    """Load ESC-50 clips and convert them between audio representations.

    The instance keeps the list of audio files, a train/test split of that
    list, the ESC-50 metadata table and the current sample rate (``self.sr``).
    Conversion helpers cover mel spectrograms, MFCCs, FFTs and wavelet
    transforms; ``visualize_*`` helpers plot them with matplotlib.

    Note: ``to_spectrogram`` overwrites ``self.sr`` with the rate returned by
    ``librosa.load`` whenever it is given a file path.
    """

    def __init__(self):
        # Sorted so the file order (and therefore the seeded split) does not
        # depend on the filesystem's directory-listing order.
        self.audio_files = sorted(glob.glob('./ESC-50-master/audio/*.wav'))
        if not self.audio_files:
            raise FileNotFoundError("no .wav files found under ./ESC-50-master/audio/")
        # Split this instance's own file list rather than a notebook-level global.
        self.train_files, self.test_files = train_test_split(self.audio_files, test_size=0.2, random_state=42)
        self.df = pd.read_csv('ESC-50-master/meta/esc50.csv')
        # find sample rate of files
        self.sr = librosa.get_samplerate(self.audio_files[0])

    @staticmethod
    def _is_path(obj):
        """Return True when ``obj`` is a filesystem path (str or os.PathLike)."""
        return isinstance(obj, (str, os.PathLike))

    @staticmethod
    def _plot_lines(values, title):
        """Draw ``values`` with ``plt.plot`` on a 12x4 figure titled ``title``."""
        plt.figure(figsize=(12, 4))
        plt.plot(values)
        plt.title(title)
        plt.show()

    # convert audio to spectrogram
    def to_spectrogram(self, audio_file, win=50, hop_length=100, n_fft=2000, log=True, RGB=False):
        """Compute a mel spectrogram from a file path or an in-memory waveform.

        Parameters
        ----------
        audio_file : str, os.PathLike or array
            A path is loaded with ``librosa.load`` (which also updates
            ``self.sr``); anything else is treated as a waveform sampled at
            ``self.sr``.
        win : window specification or int
            For a path this is forwarded as librosa's ``window`` argument; for
            a waveform it is forwarded as ``win_length``.
            NOTE(review): the two branches interpret ``win`` differently —
            confirm which one is intended.
        hop_length, n_fft : int
            STFT parameters forwarded to ``librosa.feature.melspectrogram``.
        log : bool
            Convert power to dB (relative to the maximum) when True. Ignored
            when ``RGB`` is set.
        RGB : bool
            Only used for paths. When set, returns a 3-channel stack of
            (power mel, dB mel, power mel computed with ``window=200``).

        Returns
        -------
        numpy.ndarray
            The mel spectrogram, or the 3-channel stack when ``RGB`` is set.
        """
        if self._is_path(audio_file):
            y, self.sr = librosa.load(audio_file)
            mel = librosa.feature.melspectrogram(y=y, sr=self.sr, window=win, n_fft=n_fft, hop_length=hop_length)
            if RGB:
                mel_db = librosa.power_to_db(mel, ref=np.max)
                mel_alt = librosa.feature.melspectrogram(y=y, sr=self.sr, window=200, n_fft=n_fft, hop_length=hop_length)
                return np.dstack((mel, mel_db, mel_alt))
            if log:
                mel = librosa.power_to_db(mel, ref=np.max)
            return mel

        mel = librosa.feature.melspectrogram(y=audio_file, sr=self.sr, hop_length=hop_length, n_fft=n_fft, win_length=win)
        if log:
            mel = librosa.power_to_db(mel, ref=np.max)
        return mel

    # The default window is looked up in scipy.signal.windows: the alias that
    # used to live directly on scipy.signal is not available in newer SciPy,
    # and this default is evaluated when the class is defined.
    def from_spectrogram(self, spectrogram,  hop_length=100, n_fft=2048, win=scipy.signal.windows.hann):
        """Invert a power mel spectrogram back to audio at ``self.sr``.

        The spectrogram is passed to ``mel_to_audio`` as-is; a dB-scaled
        spectrogram is not converted back to power here.
        """
        return librosa.feature.inverse.mel_to_audio(spectrogram, sr=self.sr, n_fft=n_fft, hop_length=hop_length, window=win)

    def to_mfcc(self, audio_file):
        """Compute 40 MFCCs.

        ``audio_file`` may be a path (loaded with ``librosa.load``), a 1-D
        waveform (assumed to be sampled at ``self.sr``) or a 2-D array, which
        is handed to librosa as a log-power mel spectrogram (``S=``).
        """
        if self._is_path(audio_file):
            y, sr = librosa.load(audio_file)
            return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        data = np.asarray(audio_file)
        if data.ndim == 1:
            return librosa.feature.mfcc(y=data, sr=self.sr, n_mfcc=40)
        return librosa.feature.mfcc(S=data, n_mfcc=40)

    def from_mfcc(self, mfccs):
        """Approximate audio from MFCCs using librosa's defaults."""
        return librosa.feature.inverse.mfcc_to_audio(mfccs)

    def to_fourier(self, audio_file):
        """Return the complex FFT of the waveform loaded from ``audio_file``."""
        y, _ = librosa.load(audio_file)
        return np.fft.fft(y)

    def from_fourier(self, fourier):
        """Inverse FFT; the result is complex."""
        return np.fft.ifft(fourier)

    def spectrogam2d_to_fourier(self, spectrogram):
        """FFT of ``spectrogram`` along its last axis (``np.fft.fft`` default)."""
        return np.fft.fft(spectrogram)

    def spectrogram2d_from_fourier(self, fourier):
        """Inverse of ``spectrogam2d_to_fourier``; the result is complex."""
        return np.fft.ifft(fourier)

    def _cwt(self, data):
        """Ricker continuous wavelet transform of ``data`` for widths 1..100.

        NOTE(review): ``scipy.signal.cwt`` / ``scipy.signal.ricker`` are
        deprecated in recent SciPy releases — confirm the installed version
        still provides them.
        """
        return scipy.signal.cwt(data, scipy.signal.ricker, np.arange(1, 101))

    def _inverse_cwt(self, wavelet):
        """Invert a Ricker CWT if the installed SciPy offers ``icwt``.

        Raises
        ------
        NotImplementedError
            When ``scipy.signal`` has no ``icwt`` function, instead of the
            opaque ``AttributeError`` the bare attribute access would produce.
        """
        icwt = getattr(scipy.signal, 'icwt', None)
        if icwt is None:
            raise NotImplementedError(
                "scipy.signal does not provide an inverse continuous wavelet transform (icwt)"
            )
        return icwt(wavelet, scipy.signal.ricker, np.arange(1, 101))

    def to_wavelet(self, audio_file):
        """Ricker CWT (widths 1..100) of the waveform loaded from ``audio_file``."""
        y, _ = librosa.load(audio_file)
        return self._cwt(y)

    def from_wavelet(self, wavelet):
        """Inverse of ``to_wavelet``; see ``_inverse_cwt`` for availability."""
        return self._inverse_cwt(wavelet)

    def to_wavelet_2d(self, spectrogram):
        """Ricker CWT (widths 1..100) applied directly to ``spectrogram``.

        NOTE(review): ``scipy.signal.cwt`` is documented for 1-D data —
        confirm it accepts the 2-D arrays passed here.
        """
        return self._cwt(spectrogram)

    def from_wavelet_2d(self, wavelet):
        """Inverse of ``to_wavelet_2d``; see ``_inverse_cwt`` for availability."""
        return self._inverse_cwt(wavelet)

    def play_audio(self, audio_file):
        """Return an IPython audio widget for a path or a waveform at ``self.sr``."""
        # Both paths and arrays are handed to ipd.Audio the same way.
        return ipd.Audio(audio_file, rate=self.sr)

    def visualize_spectrogram(self, audio_file, hop_length=100):
        """Plot a mel spectrogram given a file path or a precomputed spectrogram.

        ``hop_length`` is used both to compute the spectrogram (for paths) and
        to scale the time axis, so the axis matches the frames actually drawn.
        For a precomputed spectrogram pass the hop length it was built with.
        """
        if self._is_path(audio_file):
            log_S = self.to_spectrogram(audio_file, hop_length=hop_length)
        else:
            log_S = audio_file
        plt.figure(figsize=(12, 4))
        librosa.display.specshow(log_S, sr=self.sr, hop_length=hop_length, x_axis='time', y_axis='mel')
        plt.title('mel power spectrogram ')
        plt.colorbar(format='%+02.0f dB')
        plt.tight_layout()
        plt.show()

    def visualize_mfcc(self, audio_file):
        """Plot the MFCCs of ``audio_file`` against time."""
        mfccs = self.to_mfcc(audio_file)
        plt.figure(figsize=(12, 4))
        librosa.display.specshow(mfccs, sr=self.sr, x_axis='time')
        plt.colorbar()
        plt.title('MFCC')
        plt.tight_layout()
        plt.show()

    def visualize_fourier(self, audio_file):
        """Line-plot the FFT of the waveform loaded from ``audio_file``."""
        self._plot_lines(self.to_fourier(audio_file), 'Fourier')

    def visualize_wavelet(self, audio_file):
        """Line-plot the Ricker CWT of the waveform loaded from ``audio_file``."""
        self._plot_lines(self.to_wavelet(audio_file), 'Wavelet')

    def visualize_wavelet_2d(self, audio_file):
        """Line-plot ``to_wavelet_2d`` of the given array.

        Despite the parameter name, the value is passed straight to
        ``to_wavelet_2d`` without being loaded from disk.
        """
        self._plot_lines(self.to_wavelet_2d(audio_file), 'Wavelet 2D')

    def visualize_spectrogram2d(self, audio_file):
        """Line-plot the mel spectrogram of ``audio_file``."""
        self._plot_lines(self.to_spectrogram(audio_file), 'Spectrogram 2D')

    def visualize_spectrogram2d_fourier(self, audio_file):
        """Line-plot the FFT of the mel spectrogram of ``audio_file``."""
        spectrogram = self.to_spectrogram(audio_file)
        self._plot_lines(self.spectrogam2d_to_fourier(spectrogram), 'Spectrogram 2D Fourier')

    def visualize_spectrogram2d_wavelet(self, audio_file):
        """Line-plot the wavelet transform of the mel spectrogram of ``audio_file``."""
        spectrogram = self.to_spectrogram(audio_file)
        self._plot_lines(self.to_wavelet_2d(spectrogram), 'Spectrogram 2D Wavelet')

    def visualize_spectrogram2d_mfcc(self, audio_file):
        """Line-plot MFCCs derived from the mel spectrogram of ``audio_file``."""
        spectrogram = self.to_spectrogram(audio_file)
        self._plot_lines(self.to_mfcc(spectrogram), 'Spectrogram 2D MFCC')

    def visualize_spectrogram2d_mfcc_fourier(self, audio_file):
        """Line-plot the FFT of MFCCs derived from the mel spectrogram."""
        spectrogram = self.to_spectrogram(audio_file)
        mfcc = self.to_mfcc(spectrogram)
        self._plot_lines(self.spectrogam2d_to_fourier(mfcc), 'Spectrogram 2D MFCC Fourier')

    def return_shape_of_audio(self, audio_file):
        """Shape of the waveform returned by ``librosa.load``."""
        y, _ = librosa.load(audio_file)
        return y.shape

    def return_shape_of_spectrogram(self, audio_file):
        """Shape of ``to_spectrogram(audio_file)`` with default settings."""
        return self.to_spectrogram(audio_file).shape

    def return_shape_of_mfcc(self, audio_file):
        """Shape of ``to_mfcc(audio_file)``."""
        return self.to_mfcc(audio_file).shape

    def return_shape_of_fourier(self, audio_file):
        """Shape of ``to_fourier(audio_file)``."""
        return self.to_fourier(audio_file).shape

In [None]:
# Smoke test: build the helper and render the mel spectrogram of one clip.
search = audio_search()
search.visualize_spectrogram(audio_file='./ESC-50-master/audio/1-137-A-32.wav')


In [None]:
# STFT settings shared by the forward and inverse transforms below.
n_fft, hop_length = 2000, 150
# `win` is forwarded as librosa's `window` argument in both directions.
# NOTE(review): librosa documents a bare number there as a Kaiser beta, not a
# window length — confirm 50 is the intended value.
win = 50

# Round trip: audio -> power mel spectrogram (no dB scaling) -> audio.
spec = search.to_spectrogram('./ESC-50-master/audio/1-137-A-32.wav', n_fft=n_fft, hop_length=hop_length, win=win, log=False)
aud = search.from_spectrogram(spec, n_fft=n_fft, hop_length=hop_length, win=win)

# Play the reconstructed audio (the widget is the cell's output).
search.play_audio(aud)

In [None]:
# Dimensions of the spectrogram computed in the previous cell.
np.shape(spec)

In [None]:
# To regenerate every image from scratch, delete the output folder first:
#   shutil.rmtree('./ESC-50-master/mel_spectrograms')

# Create the output folder; exist_ok avoids a separate existence check.
os.makedirs('./ESC-50-master/mel_spectrograms', exist_ok=True)

# Number of entries in the audio folder (counts every entry, not only .wav files).
num_files = len(os.listdir('./ESC-50-master/audio'))
# Sorted so the processing order is the same on every machine.
file_names = sorted(glob.glob('./ESC-50-master/audio/*.wav'))

# Hann window of length n_fft (n_fft is defined in an earlier cell).
win = np.hanning(n_fft)

# Timer for the export; read back after the parallel run finishes.
start = timeit.default_timer()

def process_files(wav_path, n_fft, hop_length, win, log, RGB = False):
    """Render one audio file as a mel-spectrogram PNG.

    The spectrogram comes from the notebook-level ``search`` helper, is
    min-max scaled to 0..255, converted to ``uint8`` and written to
    ``./ESC-50-master/mel_spectrograms/<file stem>.png``.

    Parameters
    ----------
    wav_path : str
        Path of the audio file to convert.
    n_fft, hop_length, win
        Forwarded to ``search.to_spectrogram``.
    log : bool
        Forwarded to ``search.to_spectrogram`` (previously ignored and always
        treated as True).
    RGB : bool
        Forwarded to ``search.to_spectrogram``; when False the 2-D result is
        given a trailing channel axis of size 1.

    Raises
    ------
    OSError
        If OpenCV reports that the image could not be written.
    """
    img = search.to_spectrogram(wav_path, n_fft=n_fft, hop_length=hop_length, win=win, log=log, RGB=RGB)
    if not RGB:
        # Single-channel image: add an explicit channel axis.
        img = img.reshape(img.shape[0], img.shape[1], 1)

    # Min-max scale to 0..255. A constant image has zero range; map it to
    # black instead of dividing by zero and writing NaNs.
    lowest = np.min(img)
    value_range = np.max(img) - lowest
    if value_range > 0:
        img = (img - lowest) / value_range * 255
    else:
        img = np.zeros(img.shape, dtype=np.float64)
    # Convert explicitly to 8-bit rather than relying on the writer to do it.
    img = np.rint(img).astype(np.uint8)

    # File stem without directory or extension (keeps dots inside the name).
    img_name = os.path.splitext(os.path.basename(wav_path))[0]
    img_path = './ESC-50-master/mel_spectrograms/' + img_name + '.png'
    # imwrite signals failure through its return value instead of raising.
    if not cv2.imwrite(img_path, img, [cv2.IMWRITE_PNG_COMPRESSION, 9]):
        raise OSError("could not write spectrogram image: " + img_path)

# Convert every file in parallel with joblib (4 worker processes).
spectrogram_jobs = (
    delayed(process_files)(wav_path, n_fft=n_fft, hop_length=hop_length, win=win, log=True, RGB=False)
    for wav_path in file_names
)
Parallel(n_jobs=4, backend='multiprocessing')(spectrogram_jobs)

# Report the elapsed wall-clock time in seconds.
end = timeit.default_timer()
print(end - start)
    

In [None]:
# hyperparameters for fourier transform

In [None]:


# VAE from huggingface transformers https://huggingface.co/Fraser/transformer-vae
class VAE(nn.Module):
    """Variational autoencoder over a pretrained transformer's first-token state.

    ``forward`` runs the input through the pretrained model, takes the hidden
    state at sequence position 0, encodes it to a mean and log-variance of
    size ``latent_dim``, samples a latent vector and decodes it back to a
    vector of the transformer's hidden size.

    Parameters
    ----------
    model_name : str
        Identifier passed to ``AutoModel.from_pretrained``.
    device : torch.device
        Device that batches and sampled latents are placed on.
    latent_dim : int
        Size of the latent space.
    max_seq_length : int
        Stored on the instance; not used by the methods in this class.
    """

    def __init__(self, model_name, device, latent_dim=128, max_seq_length=512):
        super().__init__()
        self.model_name = model_name
        self.device = device
        self.latent_dim = latent_dim
        self.max_seq_length = max_seq_length
        self.model = AutoModel.from_pretrained(model_name)
        hidden_size = self.model.config.hidden_size
        # The encoder emits 2 * latent_dim values: mean and log-variance.
        self.encoder = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 2 * latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, hidden_size),
        )

    def encode(self, x):
        """Map a hidden-state vector to concatenated (mean, log-variance)."""
        return self.encoder(x)

    def decode(self, z):
        """Map a latent vector back to a hidden-size vector."""
        return self.decoder(z)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` with ``eps`` drawn from a standard normal."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def _embed(self, x):
        """Run the pretrained model and return its hidden state at position 0."""
        return self.model(x)[0][:, 0, :]

    def _reconstruct(self, hidden):
        """Encode, sample and decode; returns ``(reconstruction, mu, logvar)``."""
        mu_logvar = self.encode(hidden).view(-1, 2, self.latent_dim)
        mu = mu_logvar[:, 0, :]
        logvar = mu_logvar[:, 1, :]
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for a batch of model inputs."""
        return self._reconstruct(self._embed(x))

    def loss_function(self, recon_x, x, mu, logvar):
        """Summed squared reconstruction error plus the KL divergence term.

        ``recon_x`` and ``x`` must have the same shape.
        """
        recon_loss = F.mse_loss(recon_x, x, reduction='sum')
        kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return recon_loss + kld

    def train(self, train_loader=True, optimizer=None, epoch=None, *, mode=None):
        """Switch training mode, or run one training epoch.

        This name is also ``nn.Module.train``, which ``eval()`` and parent
        modules call with a boolean. Both uses are supported:

        * ``train()``, ``train(True/False)`` or ``train(mode=...)`` behave like
          ``nn.Module.train`` and return ``self``.
        * ``train(train_loader, optimizer, epoch)`` runs one epoch through
          ``train_epoch`` and returns its average loss.

        Raises
        ------
        TypeError
            If a data loader is given without an optimizer.
        """
        if mode is not None:
            return super().train(mode)
        if isinstance(train_loader, bool):
            return super().train(train_loader)
        if optimizer is None:
            raise TypeError("an optimizer is required to run a training epoch")
        return self.train_epoch(train_loader, optimizer, epoch)

    def train_epoch(self, train_loader, optimizer, epoch):
        """Run one optimisation pass over ``train_loader``.

        Progress is printed every 100 batches and a summary at the end.
        Returns the summed loss divided by the dataset size.
        """
        super().train(True)
        train_loss = 0.0
        for batch_idx, data in enumerate(train_loader):
            data = data.to(self.device)
            optimizer.zero_grad()
            hidden = self._embed(data)
            recon_batch, mu, logvar = self._reconstruct(hidden)
            # The decoder outputs hidden-size vectors, so the reconstruction
            # target is the embedding it was encoded from, not the raw batch.
            # Detached so the target is treated as a constant by the loss.
            loss = self.loss_function(recon_batch, hidden.detach(), mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
            if batch_idx % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    loss.item() / len(data)))
        # Guard the average against an empty dataset.
        average_loss = train_loss / max(len(train_loader.dataset), 1)
        print('====> Epoch: {} Average loss: {:.4f}'.format(
            epoch, average_loss))
        return average_loss

    def generate(self, n=1):
        """Decode ``n`` latent vectors drawn from a standard normal.

        Leaves the module in evaluation mode.
        """
        self.eval()
        with torch.no_grad():
            z = torch.randn(n, self.latent_dim, device=self.device)
            samples = self.decode(z)
        return samples

# Dataset over the mel-spectrogram PNGs written by the export cell. A
# DataLoader needs an indexable dataset; handing it the folder path as a
# string would make it iterate over the characters of that string.
class _SpectrogramImageDataset(Dataset):
    """Grayscale spectrogram PNGs from a folder, as float tensors in [0, 1]."""

    def __init__(self, image_dir):
        # Sorted so indices map to the same files on every machine.
        self.image_paths = sorted(glob.glob(os.path.join(image_dir, '*.png')))

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = cv2.imread(self.image_paths[idx], cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imread returns None instead of raising when a file is unreadable.
            raise FileNotFoundError("could not read image: " + self.image_paths[idx])
        # Shape (1, height, width), scaled from 0..255 to 0..1.
        return torch.from_numpy(img).float().div(255.0).unsqueeze(0)

# create dataloader for VAE from the folder the spectrograms were written to
train_loader = torch.utils.data.DataLoader(_SpectrogramImageDataset('./ESC-50-master/mel_spectrograms/'), batch_size=1, shuffle=True)

# hyperparameters for VAE
model_name = 'Fraser/transformer-vae'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
latent_dim = 128
max_seq_length = 512

# create VAE model
# NOTE(review): VAE.forward passes each batch straight to the pretrained
# transformer, which presumably expects token ids rather than image tensors —
# confirm the intended backbone/input before relying on this training loop.
model = VAE(model_name, device, latent_dim, max_seq_length)
model = model.to(device)

# optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# train VAE for epochs 1..9
for epoch in range(1, 10):
    model.train(train_loader, optimizer, epoch)
    