In [1]:
import pandas as pd
from pathlib import Path

# Root directory of the input dataset.
base_path = Path('/kaggle/input/audioset-speech-music-noise')

# Read the annotation table, index it by the `ytid` column and rewrite the
# stored relative paths so that back-slashes become forward slashes.
df_annot = (
    pd.read_csv(base_path / 'annotations.csv')
    .set_index('ytid')
    .assign(file_path=lambda frame: frame.file_path.str.replace('\\', '/', regex=False))
)
df_annot

In [None]:
# !pip install python_speech_features

In [2]:
import librosa as rosa
# import python_speech_features as psf
import numpy as np

def load_audio(file_path):
    """Decode an audio file with ``librosa.load`` using its default settings.

    Args:
        file_path: Location of the audio file to read.

    Returns:
        The ``(audio, sampling_rate)`` pair produced by ``librosa.load``.
    """
    return rosa.load(file_path)

def remove_silence(audio, sampling_rate=22050, top_db=20, win_len=.025, win_step=.010, return_silence=False):
    """Strip the silent stretches out of a signal.

    Non-silent intervals are located with ``librosa.effects.split`` and the
    samples inside them are kept, in their original order.

    Args:
        audio: Array of samples, indexed along its first axis.
        sampling_rate: Samples per second; used to convert ``win_len`` and
            ``win_step`` from seconds to sample counts.
        top_db: Forwarded to ``librosa.effects.split`` as ``top_db``.
        win_len: Analysis window length in seconds (``frame_length``).
        win_step: Analysis hop in seconds (``hop_length``).
        return_silence: When true, also return the samples that were removed.

    Returns:
        ``signal`` when ``return_silence`` is false, otherwise the tuple
        ``(signal, silence)`` where ``silence`` holds every sample that does
        not belong to a non-silent interval.

    Raises:
        ValueError: If no non-silent interval is found. The exception type is
            the one the previous implementation raised (from
            ``np.concatenate`` on an empty list), so callers that skip such
            files by catching ``ValueError`` keep working.
    """
    frame_length = int(win_len * sampling_rate)
    hop_length = int(win_step * sampling_rate)
    intervals = rosa.effects.split(audio, top_db=top_db,
                                   frame_length=frame_length,
                                   hop_length=hop_length)

    if len(intervals) == 0:
        raise ValueError('remove_silence: no non-silent interval found in the signal')

    # A boolean mask gives both the kept and the removed samples without
    # materialising (and set-differencing) one index per sample.
    keep = np.zeros(len(audio), dtype=bool)
    for start, stop in intervals:
        keep[start:stop] = True

    signal = audio[keep]
    if not return_silence:
        return signal
    return signal, audio[~keep]

# def normalize_volume(sig, sampling_rate=22050, win_len=.200, win_step=.010):
#     frms = psf.sigproc.framesig(sig, win_len*sampling_rate, win_step*sampling_rate)
#     frms = frms/(frms.std(axis=-1, keepdims=True)+1e-12)
#     sig = psf.sigproc.deframesig(frms, siglen=len(sig), 
#                                   frame_len=win_len*sampling_rate, frame_step= win_step*sampling_rate)
#     return sig

def load_file(file_path, dtype=np.float32):
    """Load an audio file and return it with its silent parts removed.

    Args:
        file_path: Location of the audio file.
        dtype: Numpy dtype the returned samples are cast to.

    Returns:
        ``(signal, sampling_rate)``: the silence-stripped samples cast to
        ``dtype`` and the sampling rate reported by ``load_audio``.
    """
    raw_audio, sampling_rate = load_audio(file_path)
    voiced = remove_silence(raw_audio, sampling_rate)
    # Volume normalisation (normalize_volume) is currently disabled.
    return voiced.astype(dtype), sampling_rate

In [3]:
import IPython.display as ipd

# Pick one annotated clip by its index label (another example id: -0Gj8-vB1q4),
# load it with silence removed and render an inline audio player for it.
example_file = base_path / df_annot.loc['zzfcNds_9_g', 'file_path']
loaded_example, sampling_rate = load_file(example_file)

ipd.display(
    ipd.Audio(loaded_example, rate=sampling_rate)
)

In [4]:
import tensorflow as tf
import tensorflow.signal as tfs
import numpy as np

def preemphasize(sig, coeff):
    """Apply a first-order pre-emphasis filter along the last axis.

    The first sample is kept unchanged; every later sample becomes
    ``sig[n] - coeff * sig[n - 1]``.

    Args:
        sig: Signal whose last axis is filtered.
        coeff: Pre-emphasis coefficient.

    Returns:
        The filtered signal, concatenated along the last axis.
    """
    head = sig[..., :1]
    differenced = sig[..., 1:] - coeff * sig[..., :-1]
    return tf.concat([head, differenced], axis=-1)

def spectrogram(sig, fs=16000, winlen=.025, winstep=.010, 
                nfft=512, wfunc=None, preemph=0.97):
    """Compute a power spectrogram, ``|STFT|**2 / nfft``, of a signal.

    The signal is pre-emphasized, zero-padded at its end and then framed and
    transformed by ``tf.signal.stft``.

    Args:
        sig: Input signal; the last axis is treated as time.
        fs: Sampling rate, used to convert ``winlen``/``winstep`` to samples.
        winlen: Frame length in seconds.
        winstep: Frame step in seconds.
        nfft: FFT length passed to the STFT and used to scale the power.
        wfunc: Passed to ``tf.signal.stft`` in its window-function position
            (NOTE(review): ``None`` presumably means no windowing - confirm
            against the TensorFlow documentation).
        preemph: Pre-emphasis coefficient handed to ``preemphasize``.

    Returns:
        The power spectrogram tensor.
    """
    emphasized = preemphasize(sig, preemph)
    frame_len = round(winlen * fs)
    frame_step = round(winstep * fs)

    n_samples = tf.shape(emphasized)[-1]
    # Samples needed so that the span after the first frame is a whole number
    # of steps ...
    leftover = (n_samples - frame_len) % frame_step
    step_pad = (frame_step - leftover) % frame_step
    # ... plus the shortfall when the signal is shorter than a single frame.
    short_pad = tf.maximum(frame_len - n_samples, 0)
    missing = step_pad + short_pad

    # Pad only the end of the last (time) axis.
    leading_axes = [(0, 0)] * (emphasized.shape.rank - 1)
    padded = tf.pad(emphasized, leading_axes + [(0, missing)], 'CONSTANT')

    stft = tfs.stft(padded, frame_len, frame_step, nfft, wfunc)
    return (tf.abs(stft) ** 2) / nfft


def log_spectrogram(sig=None, fs=16000, winlen=.025, winstep=.010, 
                    nfft=512, wfunc=None, preemph=0.97, spec=None):
    """Return the natural log of a power spectrogram.

    Args:
        sig: Input signal; only used when ``spec`` is not supplied.
        fs, winlen, winstep, nfft, wfunc, preemph: Forwarded to
            ``spectrogram`` when the spectrogram has to be computed.
        spec: Precomputed power spectrogram; when given, ``sig`` and the
            framing arguments are ignored.

    Returns:
        ``log(spec + 1e-12)``; the small offset keeps the log finite for
        zero-valued bins.
    """
    if spec is None:
        spec = spectrogram(sig, fs, winlen, winstep, nfft, wfunc, preemph)
    return tf.math.log(spec + 1e-12)


def mfb(sig=None, fs=16000, nfilt=26, winlen=.025, winstep=.010,
         lowfreq=0, highfreq=None, nfft=512, wfunc=None, preemph=0.97, spec=None):
    """Compute mel filter-bank energies and the total energy per frame.

    Args:
        sig: Input signal; only used when ``spec`` is not supplied.
        fs: Sampling rate.
        nfilt: Number of mel filters.
        winlen, winstep, nfft, wfunc, preemph: Forwarded to ``spectrogram``
            when the spectrogram has to be computed.
        lowfreq: Lower edge of the mel filter bank.
        highfreq: Upper edge of the mel filter bank; defaults to ``fs // 2``.
        spec: Precomputed power spectrogram (a tensor; its static shape is
            used to size the mel matrix and the result).

    Returns:
        ``(mel_spec, energy)`` where ``mel_spec`` is the spectrogram projected
        onto the mel filters and ``energy`` is the sum of the spectrogram over
        its last axis.
    """
    if highfreq is None:
        highfreq = fs // 2
    if spec is None:
        spec = spectrogram(sig, fs, winlen, winstep, nfft, wfunc, preemph)

    n_bins = spec.shape[-1]
    mel_weights = tfs.linear_to_mel_weight_matrix(nfilt, n_bins, fs,
                                                  lowfreq, highfreq)
    mel_spec = tf.tensordot(spec, mel_weights, 1)
    # tensordot can lose the static shape; restore it from the operands.
    mel_spec.set_shape(spec.shape[:-1].concatenate(mel_weights.shape[-1:]))
    return mel_spec, tf.reduce_sum(spec, axis=-1)


def log_mfb(sig=None, fs=16000, nfilt=26, winlen=.025, winstep=.010,
         lowfreq=0, highfreq=None, nfft=512, wfunc=None, preemph=0.97, 
            spec=None):
    """Compute log mel filter-bank energies and the per-frame energy.

    Thin wrapper around ``mfb`` that takes the natural log of the mel
    energies; the frame energy is passed through unchanged (not logged).

    Args:
        sig: Input signal; only used when ``spec`` is not supplied.
        fs: Sampling rate.
        nfilt: Number of mel filters.
        winlen, winstep, nfft, wfunc, preemph: Framing/STFT arguments
            forwarded to ``mfb``.
        lowfreq: Lower edge of the mel filter bank.
        highfreq: Upper edge of the mel filter bank; defaults to ``fs // 2``.
        spec: Optional precomputed power spectrogram.

    Returns:
        ``(log_mel_spec, energy)``.
    """
    if highfreq is None:
        highfreq = fs // 2
    mel_spec, energy = mfb(sig, fs, nfilt, winlen, winstep, lowfreq, highfreq,
                           nfft, wfunc, preemph, spec)
    return tf.math.log(mel_spec + 1e-12), energy


def lifter(cepstra, L=22):
    """Apply a sinusoidal cepstral lifter along the last axis.

    Each coefficient ``n`` is scaled by ``1 + (L / 2) * sin(pi * n / L)``.

    Args:
        cepstra: Tensor of cepstral coefficients; the last axis indexes the
            coefficients.
        L: Lifter parameter. Values ``<= 0`` disable liftering.

    Returns:
        The liftered cepstra, or ``cepstra`` itself when ``L <= 0``.
    """
    if L <= 0:
        return cepstra

    # TF2 shapes index to plain ints (or None); TF1 returned Dimension objects
    # exposing `.value`. Accept both instead of unconditionally reading
    # `.value`, which raises AttributeError on TF2.
    ncoeff = cepstra.shape[-1]
    ncoeff = getattr(ncoeff, 'value', ncoeff)
    if ncoeff is None:
        # Static size unknown: fall back to the dynamic shape.
        ncoeff = tf.shape(cepstra)[-1]

    n = tf.range(ncoeff, dtype=cepstra.dtype)
    lift = 1 + (L/2.)*tf.sin(3.14159265358979*n/L)
    return lift*cepstra


def mfcc(sig=None, fs=16000, numcep=13, nfilt=26, winlen=.025, winstep=.010,
         lowfreq=0, highfreq=None, nfft=512, wfunc=None, preemph=0.97, ceplifter=22,include_energy=True,
         log_mel_spec=None, energy=None):
    """Compute mel-frequency cepstral coefficients.

    Args:
        sig: Input signal; only used when ``log_mel_spec`` is not supplied.
        fs: Sampling rate.
        numcep: Number of cepstral coefficients to keep.
        nfilt: Number of mel filters.
        winlen, winstep, nfft, wfunc, preemph: Framing/STFT arguments
            forwarded to ``log_mfb``.
        lowfreq: Lower edge of the mel filter bank.
        highfreq: Upper edge of the mel filter bank; defaults to ``fs // 2``.
        ceplifter: Lifter parameter handed to ``lifter``.
        include_energy: When true, the first coefficient is replaced by the
            log of the frame energy.
        log_mel_spec: Precomputed log mel spectrogram. When given, ``sig`` is
            ignored and ``energy`` is taken from the ``energy`` argument.
        energy: Per-frame energy matching ``log_mel_spec``.

    Returns:
        Tensor of liftered cepstra with ``numcep`` coefficients on the last
        axis.

    Raises:
        ValueError: If ``include_energy`` is true and ``log_mel_spec`` is
            supplied without ``energy``.
    """
    if highfreq is None:
        highfreq = fs // 2

    if log_mel_spec is None:
        lmfb, energy = log_mfb(sig, fs, nfilt, winlen, winstep,
                               lowfreq, highfreq, nfft, wfunc, preemph)
    else:
        lmfb = log_mel_spec

    if include_energy and energy is None:
        raise ValueError('mfcc: `energy` is required when `log_mel_spec` is '
                         'given and include_energy=True')

    mfc = tfs.mfccs_from_log_mel_spectrograms(lmfb)[..., :numcep]
    mfc = lifter(mfc, ceplifter)
    if include_energy:
        # `tf.log` was removed in TF2; `tf.math.log` is what the rest of this
        # notebook already uses.
        log_energy = tf.math.log(energy + 1e-12)[..., None]
        mfc = tf.concat([log_energy, mfc[..., 1:]], axis=-1)
    return mfc



In [5]:
# Log mel filter-bank features (64 filters, 1024-point FFT, 25 ms frames with a
# 10 ms step) and per-frame energy for the example clip.
lmfb, energy = log_mfb(
    loaded_example,
    sampling_rate,
    winlen=.025,
    winstep=.010,
    nfilt=64,
    nfft=1024,
)
print(lmfb)
print(energy)

# lmfb2 = psf.logfbank(loaded_example, sampling_rate, winlen=.025, winstep=.010, nfilt=64, nfft=1024)
# print(lmfb2)


In [6]:
def process_file(file_path, win_len=.025, win_step=.010, nfilt=64, nfft=1024, dtype=np.float32):
    """Turn an audio file into log mel filter-bank features.

    Args:
        file_path: Location of the audio file.
        win_len: Frame length in seconds.
        win_step: Frame step in seconds.
        nfilt: Number of mel filters.
        nfft: FFT length.
        dtype: Numpy dtype the loaded samples are cast to.

    Returns:
        ``(log_mel, energy)`` as numpy arrays: the log mel filter-bank
        features and the per-frame energy returned by ``log_mfb``.
    """
    samples, sampling_rate = load_file(file_path, dtype)
    log_mel, frame_energy = log_mfb(
        samples,
        sampling_rate,
        winlen=win_len,
        winstep=win_step,
        nfilt=nfilt,
        nfft=nfft,
    )
    return log_mel.numpy(), frame_energy.numpy()

In [7]:
# Run the full load -> features pipeline on the example clip and display the
# resulting (log mel features, energy) pair.
example_file = base_path / df_annot.loc['zzfcNds_9_g', 'file_path']
processed_example = process_file(example_file)
processed_example

In [8]:
from tqdm import tqdm

# Work on an independent copy of the annotations so df_annot stays untouched.
df_selected = df_annot.copy(deep=True)
# Output location; the export below appends '.zip' to this path.
feat_path = Path('/kaggle/working') / 'audioset_features'

# for ytid, row in tqdm(df_selected.iterrows(), total=len(df_selected)):
#     file_path = base_path/row.file_path
    
#     dest_path = feat_path/row.check_status/row.dataset_split/row.plausible_superclass/ytid
#     lmfb_path = dest_path/'log_mfb.npy'
#     energy_path = dest_path/'energy.npy'
    
#     if not energy_path.exists():
#         try:
#             dest_path.mkdir(parents=True, exist_ok=True)
#             lmfb, energy = process_file(file_path)

#             np.save(str(lmfb_path), lmfb)
#             np.save(str(energy_path), energy)
#         except ValueError:
#             pass

import zipfile

# Export the features of every selected clip into a single zip archive.
# Each clip contributes two members,
#   <check_status>/<dataset_split>/<plausible_superclass>/<ytid>/log_mfb.npy
#   <check_status>/<dataset_split>/<plausible_superclass>/<ytid>/energy.npy
# Clips whose processing raises ValueError are skipped, but - unlike a bare
# `pass` - they are recorded so the number and identity of the missing clips
# can be inspected afterwards.
skipped_files = []  # (ytid, error message) for every clip that was skipped

with zipfile.ZipFile(str(feat_path)+'.zip', 'w', zipfile.ZIP_DEFLATED, compresslevel=2) as zf:
    for ytid, row in tqdm(df_selected.iterrows(), total=len(df_selected)):
        file_path = base_path/row.file_path

        dest_path = Path(row.check_status)/row.dataset_split/row.plausible_superclass/ytid
        lmfb_path = dest_path/'log_mfb.npy'
        energy_path = dest_path/'energy.npy'

        try:
            lmfb, energy = process_file(file_path)

            # Archive member names always use '/', independent of the host OS.
            with zf.open(lmfb_path.as_posix(), 'w') as fl:
                np.save(fl, lmfb)
            with zf.open(energy_path.as_posix(), 'w') as fl:
                np.save(fl, energy)
        except ValueError as err:
            skipped_files.append((ytid, str(err)))

print(f'Skipped {len(skipped_files)} of {len(df_selected)} clips because of ValueError')

In [9]:
# Invoke the Kaggle command-line tool without arguments
# (NOTE(review): presumably a check that the CLI is installed - confirm).
!kaggle

In [11]:
# Stage the archive in its own folder: create ./dataset, move the zip produced
# by the export into it, change into that folder and list its contents.
# NOTE(review): %mkdir fails if ./dataset already exists and %cd changes the
# working directory for all later cells, so this cell is not re-runnable as is.
%mkdir dataset
%mv audioset_features.zip ./dataset
%cd dataset
%ls

In [13]:
# SECURITY: this cell used to write a hard-coded Kaggle username and API key
# into kaggle.json. Any key that has ever appeared in this notebook must be
# treated as leaked and revoked from the Kaggle account settings.
# The credentials are now read from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables (a KeyError is raised if they are not set), so no
# secret is stored in the notebook or its outputs.
import json
import os

kaggle_dir = Path('/root/.kaggle')
# exist_ok makes the cell re-runnable, unlike a plain `mkdir`.
kaggle_dir.mkdir(parents=True, exist_ok=True)

kaggle_json = kaggle_dir / 'kaggle.json'
kaggle_json.write_text(json.dumps({
    'username': os.environ['KAGGLE_USERNAME'],
    'key': os.environ['KAGGLE_KEY'],
}))
# Restrict the credentials file to the owner.
kaggle_json.chmod(0o600)