In [1]:
# python common
import os
# python universe
import numpy as np
import matplotlib.pyplot as plt
# nn
import torch
import torch.nn as nn

In [41]:
# util dashboard imports
import csv
import soundfile as sf

In [2]:
import logging
# getLogger() with no name returns the root logger, so the DEBUG level set below
# also applies to every library logger that does not set a level of its own.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [3]:
import sys
# Make the project's ../lib/ folder importable for the imports that follow.
# The entry is relative, so it is resolved against the kernel's working directory.
sys.path.append("../lib/")

from mids_pytorch_model import Model, get_wav_for_path_pipeline

from util import active_BALD, detect_timestamps_BNN

In [19]:
# tasks
def _get_wav_for_path_pipeline(path, sr=8000):
    """Forward ``path`` and ``sr`` unchanged to ``get_wav_for_path_pipeline``.

    Args:
        path: Passed straight through to the imported loader. The processing
            loop in this notebook supplies a one-element list holding the
            audio file path.
        sr: Sample rate handed to the loader (defaults to 8000).

    Returns:
        Whatever the imported loader returns; the notebook unpacks it as
        ``(signal, signal_length)``.
    """
    loaded = get_wav_for_path_pipeline(path, sr)
    return loaded

def _predict_on_frames(signal, model, device, step_size, n_hop, batch_size):
    # padding is the difference between the win size and step size
    # The first window is silence prepended to the step size to fill the WindowsError
    # then the window slides by the step amount until the last frame is the step followed by
    # silence to fill the window
    pad_amt = (win_size - step_size) * n_hop
    pad_l = torch.zeros(1, pad_amt) + (0.1**0.5) * torch.randn(1, pad_amt)
    pad_r = torch.zeros(1, pad_amt) + (0.1**0.5) * torch.randn(1, pad_amt)
    padded_stepped_signal = torch.cat([pad_l, signal, pad_r], dim=1).unfold(
        1, win_size * n_hop, step_size * n_hop).transpose(0, 1).to(device)  # b, 1, s

    softmax = nn.Softmax(dim=1)

    prediction_list = []
    spectrogram_list = []
    with torch.no_grad():
        for batch_signals in torch.split(padded_stepped_signal, batch_size, 0):
            predictions = model(batch_signals)
            predction_probabilities = softmax(
                predictions['prediction']).cpu().detach()
            prediction_list.append(predction_probabilities)
            spectrogram_list.append(
                predictions['spectrogram'].cpu().detach().numpy())

    prediction_tensor = torch.cat(prediction_list)

    # align the predictions according to the sliding window so that
    # each step has all of it's prediciton windows stacked
    stacked_predictions = []
    prediction_length = signal.unfold(
        1, win_size * n_hop, step_size * n_hop).shape[1]
    for i in range(win_size // step_size):
        stacked_predictions.append(prediction_tensor[i:i + prediction_length])
    
    #get spectrograms w/o padding
    list_offset = (len(spectrogram_list)-prediction_length) // 2
    spectrograms = np.concatenate(spectrogram_list[list_offset:list_offset + prediction_length])

    return torch.stack(stacked_predictions, dim=-3).numpy(), spectrograms

In [24]:
# internal funcs
def _iterate_audiofiles(rootFolderPath, audio_format):
    for root, dirs, files in os.walk(rootFolderPath):
        for filename in files:
            if filename.endswith(audio_format):
                yield root, filename

In [62]:
def _build_timestmap_list(mean_predictions, G_X, U_X, time_to_sample, det_threshold):
    """Use the predictions to build an array of contiguous timestamps where the
    probability of detection is above threshold"""
    
    # find where the average 2nd element (positive score) is > threshold
    condition = mean_predictions[:, 1] > det_threshold
    preds_list = []
    for start, stop in _contiguous_regions(condition):
        # start and stop are frame indexes
        # so multiply by n_hop and step_size samples
        # then div by sample rate to get seconds
        preds_list.append([str(start * time_to_sample), str(stop * time_to_sample),
                           "{:.4f}".format(
                               np.mean(mean_predictions[start:stop][:, 1]))
                           + " PE: " +
                           "{:.4f}".format(np.mean(G_X[start:stop]))
                           + " MI: " + "{:.4f}".format(np.mean(U_X[start:stop]))])


In [6]:
# Create the output folder up front (-p: no error if it already exists).
# NOTE(review): this is the relative path ../data/output, while `dir_out` in the
# configuration cell is the absolute /data/output -- confirm they are the same place.
!mkdir -p ../data/output

In [6]:
# cmd args
# cmd args
rootFolderPath = '/data'  # folder walked recursively for input audio
audio_format = '.aac'  # suffix a file name must end with to be processed
dir_out= '/data/output'  # root for generated files; a falsy value writes next to the input
norm_per_sample = True  # NOTE(review): not referenced by the cells visible here
win_size = 360  # model window length in hops (x n_hop = 46080 samples, 5.76 s at sr)
step_size = 120  # slide between windows in hops (x n_hop = 15360 samples, 1.92 s at sr)
to_dash = True  # NOTE(review): not referenced by the cells visible here
n_samples = 10  # NOTE(review): not referenced by the cells visible here
det_threshold = 0.5  # a frame is a detection when its mean positive score exceeds this

# default args
n_fft = 1024  # passed to Model as NFFT
n_hop = n_fft // 8  # hop length in samples (128)
sr = 8000  # sample rate in Hz used for loading and for frame -> seconds conversion
batch_size = 16  # windows per forward pass

In [8]:
# Prefer the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = Model('convnext_base_384_in22ft1k', image_size=384, NFFT=n_fft, n_hop=n_hop)
# map_location lets a checkpoint saved from a GPU load on a CPU-only host; without
# it torch.load tries to restore the tensors onto the device they were saved from.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
checkpoint = torch.load('/models/model_e1_2022_04_07_11_52_08.pth', map_location=device)

sampling rate = 8000. Please make sure the sampling rate is correct in order toget a valid freq range
STFT kernels created, time used = 0.0204 seconds


In [9]:
# Copy the checkpoint's parameters into the freshly constructed network.
model.load_state_dict(checkpoint)
model = model.to(device)
# Switch to evaluation mode before running inference.
model.eval()
# Tag that is embedded in the output file names.
model_name = 'mids_v4'


In [25]:
# Run the model on every matching audio file below rootFolderPath.
# NOTE: root, filename, signal, predictions and spectrograms are overwritten on each
# iteration, so the cells further down operate on whatever the last iteration left
# behind (a skipped file still replaces `signal` but not `predictions`).
for root, filename in _iterate_audiofiles(rootFolderPath, audio_format):
    audio_path = os.path.join(root, filename)
    print([audio_path])
    signal, signal_length = _get_wav_for_path_pipeline([audio_path], sr=sr)
    # Samples in one model window divided by the sample rate.
    # Presumably signal_length is expressed in seconds -- confirm against the loader.
    min_length = (n_hop * win_size) / sr
    if signal_length < min_length:
        logger.info(f"{filename} too short. {signal_length} < {min_length}")
        # Skip only this file; the former `break` abandoned every remaining file.
        continue
    logger.info(f"Read {filename}.  Signal Length: {signal_length}")

    predictions, spectrograms = _predict_on_frames(
            signal, model, device, step_size, n_hop, batch_size)

# TODO: timestamps are built inline in a later cell; the commented-out call to
# _build_timestmap_list was dropped because its arguments did not match that
# function's signature.

['/data/R4_cleaned recording_17-10-17.aac']


  return f(*args, **kwargs)


In [None]:
# Number of full windows in the unpadded signal (same unfold as in _predict_on_frames).
frame_count = signal.unfold(1, win_size * n_hop, step_size * n_hop).shape[1]
# active_BALD is imported from the project's util module; its first two results are
# later reported under the labels "PE" (G_X) and "MI" (U_X).
# NOTE(review): presumably predictive entropy and mutual information -- confirm in util.
G_X, U_X, _ = active_BALD(np.log(predictions), frame_count, 2)
# Average over the stacked overlapping-window axis -> one [neg, pos] row per frame.
mean_predictions = np.mean(predictions, axis=0)
# NOTE(review): the following cell repeats these three statements; one copy can go.

In [66]:
# Windows that fit into the unpadded recording.
frame_count = signal.unfold(1, win_size * n_hop, step_size * n_hop).shape[1]

# Uncertainty terms from the stacked predictions, plus their per-frame average.
G_X, U_X, _ = active_BALD(np.log(predictions), frame_count, 2)
mean_predictions = np.mean(predictions, axis=0)

# Frames whose averaged positive score (column 1) is above the threshold.
condition = mean_predictions[:, 1] > det_threshold

preds_list = []
for start, stop in contiguous_regions(condition):
    # frame index * n_hop * step_size is a sample offset; / sr turns it into seconds
    start_s = str(start * n_hop * step_size / sr)
    stop_s = str(stop * n_hop * step_size / sr)
    mean_p = np.mean(mean_predictions[start:stop, 1])
    mean_pe = np.mean(G_X[start:stop])
    mean_mi = np.mean(U_X[start:stop])
    preds_list.append(
        [start_s, stop_s, f"{mean_p:.4f} PE: {mean_pe:.4f} MI: {mean_mi:.4f}"])



In [67]:
# Show the detections: one [start_s, stop_s, "<mean p> PE: .. MI: .."] row per region.
preds_list

[['0.0', '1.92', '0.5481 PE: 0.9933 MI: 0.3878'],
 ['15.36', '32.64', '0.7459 PE: 0.7242 MI: 0.2905'],
 ['36.48', '44.16', '0.8418 PE: 0.5803 MI: 0.1224'],
 ['48.0', '495.36', '0.9101 PE: 0.3966 MI: 0.0287'],
 ['497.28', '650.88', '0.8908 PE: 0.4369 MI: 0.0503']]

In [65]:
def contiguous_regions(condition):
    """Find the contiguous True runs of the 1-D boolean sequence ``condition``.

    Args:
        condition: 1-D boolean array or any sequence convertible to one
            (values are interpreted by truthiness).

    Returns:
        An integer array of shape (n_regions, 2). Column 0 is the index of the
        first element of each run, column 1 is one past its last element, so
        ``condition[start:stop]`` is the run. The array has shape (0, 2) when
        there is no True element, including for empty input.
    """
    # Accept lists and non-boolean arrays; comparing booleans keeps the change
    # detection independent of the input dtype.
    condition = np.asarray(condition, dtype=bool)
    if condition.size == 0:
        # Nothing to index: avoid the IndexError from condition[0] below.
        return np.empty((0, 2), dtype=np.intp)

    # Positions where the value differs from its predecessor, shifted by one so
    # each index points at the first element after the change.
    idx = np.flatnonzero(condition[1:] != condition[:-1]) + 1

    if condition[0]:
        # A run that starts at the very beginning has no preceding change.
        idx = np.r_[0, idx]

    if condition[-1]:
        # A run that reaches the end is closed with the array length.
        idx = np.r_[idx, condition.size]

    # Consecutive pairs are (start, stop).
    return idx.reshape(-1, 2)

In [61]:
# Inspect the first value returned by active_BALD (the one reported as "PE").
# NOTE(review): this prints the whole array; a slice such as G_X[:10] keeps the output short.
G_X

array([0.98878771, 0.88365421, 0.34414648, 0.35089149, 0.35172807,
       0.34690555, 0.33684758, 0.85760864, 0.98811393, 0.63076574,
       0.95452906, 0.95686625, 0.94572555, 0.37507287, 0.33257134,
       0.38695776, 0.94721222, 0.98441313, 0.9846773 , 0.85937661,
       0.37115957, 0.33896366, 0.7517881 , 0.99979106, 0.99568621,
       0.96426813, 0.47100309, 0.61637149, 0.99084098, 0.99738637,
       0.96063076, 0.43300074, 0.32375946, 0.30491626, 0.321109  ,
       0.31851106, 0.33717914, 0.32729601, 0.34460322, 0.63797813,
       0.69143386, 0.71130158, 0.44774413, 0.35364813, 0.28115922,
       0.27119917, 0.27312852, 0.27977629, 0.28411984, 0.28487782,
       0.29444797, 0.30568511, 0.31794627, 0.31521198, 0.32473654,
       0.42613478, 0.44278118, 0.43019263, 0.33028798, 0.30627674,
       0.30336519, 0.28512884, 0.28750058, 0.3054964 , 0.39980418,
       0.52345543, 0.52631507, 0.45589214, 0.30978409, 0.28910371,
       0.2840834 , 0.28296902, 0.28264872, 0.28207278, 0.30047

In [60]:
# Per-frame average over the stacked windows; column 1 is the score that is
# compared with det_threshold elsewhere in this notebook.
np.mean(predictions, axis=0)

array([[0.4519141 , 0.54808587],
       [0.7053557 , 0.29464427],
       [0.9357353 , 0.06426468],
       [0.9339802 , 0.06601977],
       [0.9337612 , 0.06623877],
       [0.93501973, 0.06498031],
       [0.93761325, 0.06238679],
       [0.7184098 , 0.2815903 ],
       [0.43590578, 0.56409425],
       [0.15852357, 0.84147644],
       [0.37512907, 0.62487096],
       [0.37834707, 0.6216529 ],
       [0.36371717, 0.6362828 ],
       [0.0724696 , 0.92753047],
       [0.0612968 , 0.93870324],
       [0.07573163, 0.9242684 ],
       [0.36557305, 0.6344269 ],
       [0.5733656 , 0.4266344 ],
       [0.5727435 , 0.4272565 ],
       [0.28290328, 0.71709675],
       [0.07140891, 0.9285911 ],
       [0.06292895, 0.9370711 ],
       [0.2154587 , 0.7845413 ],
       [0.50850934, 0.4914907 ],
       [0.5386465 , 0.4613535 ],
       [0.3891797 , 0.6108203 ],
       [0.10063431, 0.8993657 ],
       [0.15262537, 0.8473746 ],
       [0.44371894, 0.55628103],
       [0.4699123 , 0.5300877 ],
       [0.

In [22]:
# Shape of the loaded audio tensor left over from the processing loop.
signal.shape

torch.Size([1, 28797952])

In [16]:
# Debug leftover: split the windowed signal into batches along dim 0.
# NOTE(review): `padded_stepped_signal` is only assigned inside _predict_on_frames in
# the cells visible here, so this cell depends on kernel state from an earlier session.
batch_signals = torch.split(padded_stepped_signal, batch_size, 0)

In [19]:
# Shape of the first batch produced by the split in the previous cell.
batch_signals[0].shape

torch.Size([16, 1, 15360])

In [31]:
# file names and output directories
file_suffix = f'_MIDS_win_{win_size}_step_{step_size}_{model_name}_{det_threshold}.txt'
if dir_out:
    root_out = root.replace(rootFolderPath, dir_out)
else:
    root_out = root
if not os.path.exists(root_out):
    os.makedirs(root_out)
output_filename = os.path.splitext(filename)[0]

text_output_filename = os.path.join(root_out, output_filename) + file_suffix

In [32]:
# Check the assembled path of the tab-separated detections file.
text_output_filename

'/data/output/R4_cleaned recording_17-10-17_MIDS_win_360_step_120_mids_v4_0.5.txt'

In [30]:
# `filename` comes from os.walk's file list, so it carries no directory part;
# the recorded output is the file name unchanged.
os.path.basename(filename)

'R4_cleaned recording_17-10-17.aac'

In [33]:
#--- write audio

In [57]:
'''Create output audio based on input. Returns wave format. Potential for speedup for video creation by returning
    the same filetype as was input, but not implemented due to downstream processing which utilises wav files for 
    compatibility.'''
# NOTE(review): no cell visible in this notebook writes `text_output_filename`;
# it must already exist (tab-separated: start, stop, "<p> PE: <pe> MI: <mi>").
mozz_audio_list = []  # one audio slice per detection row
mozz_meta = []  # [new_start, new_end, label] on the concatenated clip's timeline
has_mosquito = False
start_time = 0
with open(text_output_filename) as f:
    reader = csv.reader(f, delimiter='\t')
    for line in reader:
        # Third column looks like "0.5481 PE: 0.9933 MI: 0.3878".
        score_fields = line[2].split()
        p = float(score_fields[0])
        PE = float(score_fields[2])
        MI = float(score_fields[4])

        begin_s = float(line[0])
        end_s = float(line[1])
        duration = end_s - begin_s

        # Position of this detection inside the concatenated output audio.
        new_end = start_time + duration
        label = line[0] + '-' + line[1] + '   P: ' + line[2]
        mozz_meta.append([str(start_time), str(new_end), label])

        # Seconds -> sample indexes into the first row of the signal.
        first_sample = int(begin_s * sr)
        last_sample = int(end_s * sr)
        mozz_audio_list.append(signal[0][first_sample:last_sample])

        # The next detection starts where this one ends in the output clip.
        start_time += duration


In [None]:
# Peek at the first extracted detection slice.
mozz_audio_list[0]

tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0199, -0.0184, -0.0071])

In [None]:
# Total duration of the concatenated detections (start_time was accumulated above).
audio_length = start_time
# Write next to the text output: root_out already handles an unset dir_out and the
# mirrored sub-folder, whereas joining on dir_out directly raises for None and
# ignores the sub-folder of the input file.
audio_output_filename = os.path.join(root_out, output_filename) + '_mozz_pred'

if mozz_audio_list:
    # Flatten each slice to 1-D before joining so soundfile receives a single
    # channel regardless of whether the slices are 1-D or (1, n).
    mozz_audio = np.concatenate(
        [np.asarray(clip).reshape(-1) for clip in mozz_audio_list])
    sf.write(f'{audio_output_filename}.wav', mozz_audio, sr)


In [51]:
# Check what the concatenated detection audio looks like before writing it.
# NOTE(review): the recorded output below is 2-D, i.e. from a run where the
# slices were not 1-D.
np.hstack(mozz_audio_list)

array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         1.1789799e-04,  1.5181303e-04, -9.7513199e-05]], dtype=float32)

In [54]:
# Same inspection as earlier; the recorded output below shows a 2-D slice.
mozz_audio_list[0]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  1.1790e-04,
          1.5181e-04, -9.7513e-05]])

In [45]:
# Scratch check of a soundfile write/read round trip with samples at and above 1.0:
# in the printed result, the inputs 1 and 2 both come back as 0.99996948.
x = np.array([0,0.5,0.75, 1, 2]) # x.dtype is 'float64'
sf.write("x.wav", x, 1) # a wav at sampling rate 1 Hz

y, fs = sf.read("x.wav")

print(y)

[0.         0.5        0.75       0.99996948 0.99996948]
