In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import traceback
import sys
import random
import csv
import json
import glob
import numpy as np
import resampy
import tensorflow as tf
import soundfile as sf
import librosa
from metrics import evaluate, micro_averaged_auprc, macro_averaged_auprc
import oyaml as yaml

In [2]:
def get_output_path(filepath, suffix, output_dir=None):
    """
    Build the path of an output file derived from an input audio file.

    The extension of `filepath` is dropped. A suffix that starts with ``'.'``
    is treated as a plain extension and appended directly; any other suffix
    is joined to the base name with an underscore.

    Parameters
    ----------
    filepath : str
        Path to audio file to be processed
    suffix : str
        String to append to filename (including extension)
    output_dir : str or None
        Directory where the file will be saved. If None (or empty), the
        directory of `filepath` is used.

    Returns
    -------
    output_path : str
        Path to output file
    """
    stem, _ = os.path.splitext(os.path.basename(filepath))
    target_dir = output_dir or os.path.dirname(filepath)

    if suffix[0] == '.':
        # Bare extension, e.g. ".npz" -> "<stem>.npz"
        name = stem + suffix
    else:
        # Tag plus extension, e.g. "emb.npz" -> "<stem>_emb.npz"
        name = "{}_{}".format(stem, suffix)

    return os.path.join(target_dir, name)

In [3]:
def get_tflite(model_path):
    """
    Load a TFLite model and return what is needed to run inference with it.

    Parameters
    ----------
    model_path : str
        Path to the ``.tflite`` model file.

    Returns
    -------
    interpreter : tf.lite.Interpreter
        Interpreter for the model, with tensors already allocated.
    input_index : int
        Index of the model's first input tensor (for ``set_tensor``).
    output_index : int
        Index of the model's first output tensor (for ``get_tensor``).
    output_shape : array-like
        Shape of the first output tensor without its leading dimension
        (presumably the batch dimension).
    emb_len : int
        Size of the last dimension of the first output tensor.
    """
    interpreter = tf.lite.Interpreter(model_path=model_path)
    # Allocate tensors before querying their details, which is the order the
    # TFLite documentation uses.
    interpreter.allocate_tensors()

    first_input = interpreter.get_input_details()[0]
    first_output = interpreter.get_output_details()[0]

    input_index = first_input['index']
    output_index = first_output['index']
    output_shape = first_output['shape'][1:]
    emb_len = output_shape[-1]

    return interpreter, input_index, output_index, output_shape, emb_len

In [4]:
def _center_audio(audio, frame_len):
    """Center audio so that first sample will occur in the middle of the first frame"""
    return np.pad(audio, (int(frame_len / 2.0), 0), mode='constant', constant_values=0)


def _pad_audio(audio, frame_len, hop_len):
    """Pad audio if necessary so that all samples are processed"""
    audio_len = audio.size
    if audio_len < frame_len:
        pad_length = frame_len - audio_len
    else:
        pad_length = int(np.ceil((audio_len - frame_len)/float(hop_len))) * hop_len \
                     - (audio_len - frame_len)

    if pad_length > 0:
        audio = np.pad(audio, (0, pad_length), mode='constant', constant_values=0)

    return audio

def _amplitude_to_db(S, amin=1e-10, dynamic_range=80.0):
    magnitude = np.abs(S)
    power = np.square(magnitude, out=magnitude)
    ref_value = power.max()

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= log_spec.max()

    log_spec = np.maximum(log_spec, -dynamic_range)
    return log_spec

In [5]:
def get_embeddings_from_upstream(audio, sr, 
                                 interpreter,
                                 input_index=None,
                                 output_index=None,
                                 output_shape=None,
                                 emb_len=256,
                                 hop_size=0.1, 
                                 center=True,
                                 n_fft=None, 
                                 n_mels=None, 
                                 mel_hop_len=None, 
                                 fmax=None):
    """
    Computes and returns L3 embedding for given audio data

    The audio is resampled to the module-level ``TARGET_SR``, split into
    frames of ``TARGET_SR`` samples (one second), each frame is turned into a
    log-mel spectrogram, and each spectrogram is run through `interpreter`.

    Parameters
    ----------
    audio : np.ndarray
        Audio samples. 2-D input is assumed to be laid out as
        (samples, channels), as `soundfile.read` returns it, and is averaged
        down to mono.
    sr : int
        Sampling rate of `audio`.
    interpreter : tf.lite.Interpreter
        Upstream model with tensors already allocated.
    input_index : int
        Index of the interpreter's input tensor.
    output_index : int
        Index of the interpreter's output tensor.
    output_shape : any
        Unused; kept so existing callers keep working.
    emb_len : int
        Size of one embedding; the result is reshaped to ``(-1, emb_len)``.
    hop_size : float
        Hop between consecutive frames, in seconds.
    center : bool
        If True, pad the start so the first sample is centred in the first frame.
    n_fft : int
        FFT size for the per-frame STFT.
    n_mels : int
        Number of mel bands.
    mel_hop_len : int
        STFT hop length in samples.
    fmax : float or None
        Upper frequency of the mel filterbank (passed through to librosa).

    Returns
    -------
    predictions : np.ndarray
        Array of shape ``(number_of_frames, emb_len)``.

    Raises
    ------
    ValueError
        If `interpreter` is missing or `audio` is empty.

    Notes
    -----
    ``TARGET_SR`` is read as a global and must be defined before this
    function is called.
    """
    
    if not interpreter:
        raise ValueError('Tflite Model is missing')
    
    if audio.size == 0:
        raise ValueError('Got empty audio')

    # Padding, resampling and framing below all assume a 1-D signal, so
    # multi-channel audio is downmixed first.
    if audio.ndim == 2:
        audio = np.mean(audio, axis=1)

    # Resample if necessary
    if sr != TARGET_SR:
        audio = resampy.resample(audio, sr_orig=sr, sr_new=TARGET_SR, filter='kaiser_best')

    audio_len = audio.size
    frame_len = TARGET_SR
    hop_len = int(hop_size * TARGET_SR)

    if audio_len < frame_len:
        # A built-in category is used because no custom warning class is
        # defined or imported in this file.
        warnings.warn('Duration of provided audio is shorter than window size (1 second). Audio will be padded.',
                      UserWarning)

    if center:
        # Center audio
        audio = _center_audio(audio, frame_len)

    # Pad if necessary to ensure that we process all samples
    audio = _pad_audio(audio, frame_len, hop_len)

    # Split audio into frames; transpose so that each row is one frame
    frames = librosa.util.frame(audio, frame_length=frame_len, hop_length=hop_len).T

    predictions = []

    # One log-mel spectrogram and one embedding per frame
    for frame in frames:
        S = np.abs(librosa.core.stft(frame, n_fft=n_fft, hop_length=mel_hop_len,
                                     window='hann', center=True, pad_mode='constant'))
        # The signal is at TARGET_SR by now, so the mel filterbank must be
        # built for TARGET_SR rather than the file's original rate.
        S = librosa.feature.melspectrogram(sr=TARGET_SR, S=S, n_mels=n_mels, fmax=fmax,
                                           power=1.0, htk=True)
        S = _amplitude_to_db(S)

        # Add leading and trailing singleton axes around the 2-D spectrogram
        x = np.array(S)[np.newaxis, :, :, np.newaxis].astype(np.float32)
        interpreter.set_tensor(input_index, x)
        interpreter.invoke()
        predictions.append(interpreter.get_tensor(output_index))

    predictions = np.array(predictions).reshape(-1, emb_len)
    return predictions

In [6]:
def get_output_from_classifier(X,
                               classifier,
                               input_index=None,
                               output_index=None,
                               output_shape=None,
                               n_classes=8):
    """
    Runs the classifier on every embedding extracted from the upstream model

    Each entry of `X` is cast to float32, given a leading axis of length one,
    and passed through `classifier` on its own. The collected outputs are
    returned as an array of shape ``(-1, n_classes)``. `output_shape` is
    accepted but not used.
    """
    def _classify(embedding):
        # Feed one embedding and read back the classifier's output for it
        batch = np.array(embedding)[np.newaxis, :].astype(np.float32)
        classifier.set_tensor(input_index, batch)
        classifier.invoke()
        return classifier.get_tensor(output_index)

    per_frame = [_classify(X[row]) for row in range(len(X))]
    return np.array(per_frame).reshape(-1, n_classes)

In [12]:
def write_to_output(output_path, test_file_list, y_pred, taxonomy):
    """
    Write per-file coarse-level predictions to a CSV file.

    The header is ``audio_filename`` followed by one column per fine label
    (``"<coarse_id>-<fine_id>_<fine_label>"``) and one per coarse label
    (``"<coarse_id>_<coarse_label>"``), in taxonomy order. Fine-level columns
    are filled with ``0.0`` placeholders; the values of each entry of
    `y_pred` go into the coarse columns.

    Parameters
    ----------
    output_path : str
        Path of the CSV file to create (overwritten if it exists).
    test_file_list : list of str
        File names, one per row.
    y_pred : iterable of iterables
        One sequence of coarse-level scores per file, aligned with
        `test_file_list`.
    taxonomy : dict
        Must contain ``'fine'`` (coarse id -> {fine id -> label}) and
        ``'coarse'`` (coarse id -> label).
    """
    full_fine_target_labels = ["{}-{}_{}".format(coarse_id, fine_id, fine_label)
                               for coarse_id, fine_dict in taxonomy['fine'].items()
                               for fine_id, fine_label in fine_dict.items()]

    coarse_target_labels = ["_".join([str(k), v])
                            for k, v in taxonomy['coarse'].items()]

    # Placeholder values for the fine level, identical for every row
    fine_placeholders = [0.0] * len(full_fine_target_labels)

    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate '\n' write an extra '\r' on every row.
    with open(output_path, 'w', newline='') as f:
        csvwriter = csv.writer(f)

        # Write fields
        csvwriter.writerow(["audio_filename"] + full_fine_target_labels + coarse_target_labels)

        # Write results for each file to CSV
        for filename, y in zip(test_file_list, y_pred):
            csvwriter.writerow([filename] + fine_placeholders + list(y))

In [13]:
def process_files(file_list, 
                  taxonomy, 
                  output_path, 
                  upstream_path=None, 
                  classifier_path=None,
                  hop_size=0.1,
                  n_fft=None, 
                  n_mels=None, 
                  mel_hop_len=None, 
                  fmax=None):
    """
    Classifies each audio file and writes the per-file mean prediction to a CSV

    For every file: read the audio, compute per-frame embeddings with the
    upstream TFLite model, run the classifier on each embedding, and average
    the classifier outputs over frames. The averaged rows are then written to
    `output_path` via `write_to_output`.

    Parameters
    ----------
    file_list : list of str
        Paths of the audio files to process.
    taxonomy : dict
        Label taxonomy forwarded to `write_to_output`.
    output_path : str
        Path of the CSV file to write.
    upstream_path : str
        Path to the upstream (embedding) ``.tflite`` model.
    classifier_path : str
        Path to the classifier ``.tflite`` model.
    hop_size : float
        Hop between frames in seconds.
    n_fft, n_mels, mel_hop_len, fmax
        Spectrogram settings forwarded to `get_embeddings_from_upstream`.

    Raises
    ------
    ValueError
        If an audio file cannot be read.
    """
    y_pred_mean = []
    interpreter, input_index, output_index, output_shape, emb_len = get_tflite(upstream_path)
    # The last value returned by get_tflite is the size of the model's final
    # output dimension; for the classifier that is the number of classes.
    classifier, cls_input_index, cls_output_index, cls_output_shape, n_classes = get_tflite(classifier_path)
        
    for filepath in file_list:
        try:
            audio, sr = sf.read(filepath)
        except Exception:
            raise ValueError('Could not open file "{}":\n{}'.format(filepath, traceback.format_exc()))
        
        # Embeddings output per frame 
        # Shape: (number_of_frames, emb_len)
        embeddings = get_embeddings_from_upstream(audio, sr,
                                                  interpreter,
                                                  input_index=input_index,
                                                  output_index=output_index,
                                                  output_shape=output_shape,
                                                  emb_len=emb_len,
                                                  hop_size=hop_size,
                                                  n_fft=n_fft,
                                                  n_mels=n_mels,
                                                  mel_hop_len=mel_hop_len,
                                                  fmax=fmax)

        if embeddings is None:
            # No logger is defined in this file, so report through `warnings`.
            # Processing stops here and no output file is written.
            warnings.warn('Could not generate embedding for {}'.format(filepath))
            return
        
        # Classifier output per frame 
        # Shape: (number_of_frames, n_classes)
        output = get_output_from_classifier(embeddings,
                                            classifier,
                                            input_index=cls_input_index, 
                                            output_index=cls_output_index,
                                            output_shape=cls_output_shape,
                                            n_classes=n_classes)
        
        # Mean over the classifier outputs of all frames of this file
        y_pred_mean.append(output.mean(axis=0).tolist())
    
    write_to_output(output_path, file_list, y_pred_mean, taxonomy)

In [15]:
if __name__=='__main__':
    
    TARGET_SR = 8000 #sys.argv[1]
    n_mels = 64 #sys.argv[2]
    mel_hop_len = 160 #sys.argv[3]
    n_fft = 1024 #sys.argv[4]
    hop_size = 0.1 
    fmax = None
    # '__file__' is a string literal here, so this resolves to the current
    # working directory rather than to the location of a script.
    TEST_DIR = os.path.dirname(os.path.realpath('__file__'))
    
    # Get the Data path (path to audio files or just one .wav file)
    if len(sys.argv) > 5:
        data_path = sys.argv[5]
        if os.path.isdir(data_path):
            data_path = glob.glob(data_path + '/*.wav')
        else:
            data_path = [data_path]
    else:
        # No path given: pick one random file from the bundled test audio
        TEST_AUDIO_DIR = os.path.join(TEST_DIR, 'data/ust_test')
        wav_candidates = glob.glob(TEST_AUDIO_DIR + '/*.wav')
        if not wav_candidates:
            sys.exit('No .wav files found in {}'.format(TEST_AUDIO_DIR))
        data_path = [random.choice(wav_candidates)]
        
    # Get the upstream and classifier path from argv or hardcode the path in else
    if len(sys.argv) > 6:
        upstream_path = sys.argv[6]
        if len(sys.argv) <= 7:
            # Usage error: exit with a non-zero status
            sys.exit('Input the path to the classifier also!')
        classifier_path = sys.argv[7]
    else:
        TFLITE_MODELS_DIR = os.path.join(TEST_DIR, 'tflite_models')
        upstream_path = os.path.join(TFLITE_MODELS_DIR, 'quantized_default_int8.tflite')
        classifier_path = os.path.join(TFLITE_MODELS_DIR, 'mlp_ust.tflite')
        
    output_dir = os.path.join(TEST_DIR, 'output/sonyc_ust/rpi_test')
    output_path = os.path.join(output_dir, 'output_mean.csv')
        
    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(output_dir, exist_ok=True)
        
    
    yaml_path = os.path.join(TEST_DIR, 'data/dcase-ust-taxonomy.yaml')

    # NOTE(review): FullLoader is used on a project-local file; switch to a
    # safe loader if this path can ever point at untrusted input.
    with open(yaml_path) as f:
        taxonomy = yaml.load(f, Loader=yaml.FullLoader)

    # This function does the following 3 tasks:
    # 1. Get the embeddings from the upstream model
    # 2. Get the softmax output from the classifier
    # 3. Get the mean of softmax for all the frames of an audio sample and write to output_path
    process_files(data_path, 
                  taxonomy, 
                  output_path, 
                  upstream_path=upstream_path,
                  classifier_path=classifier_path,
                  hop_size=hop_size, 
                  n_mels=n_mels, 
                  n_fft=n_fft, 
                  mel_hop_len=mel_hop_len,
                  fmax=fmax)