In [16]:
%pip install matplotlib numpy pandas tensorflow tensorflow_hub tensorflow_io scipy

Note: you may need to restart the kernel to use updated packages.


In [17]:
# YAMNet feature extraction: https://www.tensorflow.org/tutorials/audio/transfer_learning_audio
# Data from https://github.com/orcasound/orcadata/wiki/Pod.Cast-data-archive

import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [18]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import scipy
from scipy.io import wavfile
from IPython.display import Audio

In [45]:
# Folder holding the WAV clips that the annotation rows refer to. Swap the
# commented and active assignments to switch between the two data archives.
# wav_folder_path = './TestDataLatest_PodCastAllRounds_123/wav/'
wav_folder_path = './TrainDataLatest_PodCastAllRounds_123567910/wav/'

# YAMNet model, loaded through tensorflow_hub from the handle below.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

In [46]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
  """Return the waveform at `desired_sample_rate`, resampling only if needed.

  Args:
    original_sample_rate: sample rate of `waveform`.
    waveform: sequence of audio samples.
    desired_sample_rate: target sample rate (defaults to 16000).

  Returns:
    A `(desired_sample_rate, waveform)` tuple. The waveform is handed back
    untouched when the two rates already match.
  """
  if original_sample_rate == desired_sample_rate:
    return desired_sample_rate, waveform

  # Scale the sample count by the ratio of the two rates.
  seconds = float(len(waveform)) / original_sample_rate
  target_length = int(round(seconds * desired_sample_rate))
  resampled = scipy.signal.resample(waveform, target_length)
  return desired_sample_rate, resampled

In [47]:
def trim_wav(sample_rate, original_wav, start, duration):
    '''
    Cut a time window out of an in-memory waveform.

    :param sample_rate: number of samples per second in ``original_wav``
    :param original_wav: sliceable sequence of audio samples
    :param start: window start, in seconds from the beginning of the audio
    :param duration: window length, in seconds
    :return: the samples of ``original_wav`` that fall inside the window;
        shorter than requested (possibly empty) when the window reaches
        past either end of the audio
    '''
    # Clamp at zero: a negative index would make the slice count from the end
    # of the waveform and silently return the wrong samples.
    start_sample = max(0, int(start * sample_rate))
    end_sample = max(0, int((start + duration) * sample_rate))
    return original_wav[start_sample:end_sample]

In [48]:
def normalize_wav(wav_data):
    """Scale the samples by dividing them by `tf.int16.max`."""
    int16_peak = tf.int16.max
    return wav_data / int16_peak

In [49]:
def get_wav_for_segment(wav_filename, start_time, duration):
    """Load one time window of a WAV file and prepare it for the model.

    The file is read from `wav_folder_path`, averaged across channels when it
    has more than one, passed through `ensure_sample_rate`, cut to the
    requested window with `trim_wav` and scaled with `normalize_wav`.

    Args:
        wav_filename: file name inside `wav_folder_path`.
        start_time: window start in seconds.
        duration: window length in seconds.

    Returns:
        A `(sample_rate, wav_data)` tuple: the sample rate returned by
        `ensure_sample_rate` and the trimmed, normalized samples.
    """
    full_path = os.path.join(wav_folder_path, wav_filename)
    # The second positional parameter of wavfile.read is `mmap`, not a file
    # mode, so the former 'rb' argument silently switched memory-mapping on.
    sample_rate, wav_data = wavfile.read(full_path)
    if len(wav_data.shape) > 1:    # for multi channel audio
        # NOTE(review): the mean is taken in the file's own sample dtype;
        # confirm this is precise enough for integer-encoded files.
        wav_data = tf.math.reduce_mean(wav_data, axis=1)
    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)
    wav_data = trim_wav(sample_rate, wav_data, start_time, duration)
    wav_data = normalize_wav(wav_data)
    return sample_rate, wav_data

In [50]:
def get_embedding(wav_filename, start, duration):
    """Return one averaged YAMNet embedding for a segment of a WAV file.

    The segment is loaded with `get_wav_for_segment` and fed to
    `yamnet_model`; the model's embeddings output is averaged over its first
    axis and returned as a NumPy array.
    """
    _, segment = get_wav_for_segment(wav_filename, start, duration)
    # The model returns (scores, embeddings, spectrogram); only the
    # embeddings are needed here.
    _, segment_embeddings, _ = yamnet_model(segment)
    return tf.math.reduce_mean(segment_embeddings, axis=0).numpy()

In [51]:
def run_example(wav_file_name='1562337136_0004.wav',
                start=49.76562500000002,
                duration=2.450000000000003):
    """Load one clip segment, print its basic properties and play it.

    Args:
        wav_file_name: file name inside `wav_folder_path`.
        start: segment start in seconds.
        duration: requested segment length in seconds.

    The defaults reproduce the clip this example was originally hard-coded
    to, so calling `run_example()` with no arguments behaves as before.
    """
    sample_rate, wav_data = get_wav_for_segment(wav_file_name, start, duration)

    # Show some basic information about the audio. The measured length is
    # kept separate from the requested `duration`.
    actual_duration = len(wav_data) / sample_rate
    print(f'Sample rate: {sample_rate} Hz')
    print(f'Total duration: {actual_duration:.2f}s')
    print(f'Size of the input: {len(wav_data)}')

    # Listening to the wav file. A bare `Audio(...)` expression is only
    # rendered when it is the last expression of a cell; inside a function it
    # has to be displayed explicitly.
    display.display(Audio(wav_data, rate=sample_rate))

In [52]:
# Load the tab-separated annotation table; the embedding loop further down
# reads its `wav_filename`, `start_time_s` and `duration_s` columns.
# NOTE(review): this reads the *test* annotations while `wav_folder_path`
# currently points at the *train* wav folder - confirm the two are meant to
# be used together.
pd_annotations = pd.read_csv('./TestDataLatest_PodCastAllRounds_123/test.tsv', sep = '\t')
pd_annotations.head()

Unnamed: 0,dataset,wav_filename,start_time_s,duration_s,location,date,pst_or_master_tape_identifier
0,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,52.172,1.118,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
1,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,54.877,1.104,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
2,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,69.701,2.691,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
3,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,72.765,0.795,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav
4,podcast_test_round1,OS_7_05_2019_08_24_00_.wav,73.51,0.925,orcasound_lab,1562340736,OS_7_05_2019_08_24_00_.wav


In [70]:
# Spot-check a single clip: load it, report its basic properties, play it.
wav_file_name = 'rpi-bush-point_2020_09_27_23_45_00.wav'
start = 6.5
duration = 1.5
sample_rate, wav_data = get_wav_for_segment(wav_file_name, start, duration)

# `duration` is overwritten with the length actually loaded.
duration = len(wav_data) / sample_rate
print(
    f'Sample rate: {sample_rate} Hz',
    f'Total duration: {duration:.2f}s',
    f'Size of the input: {len(wav_data)}',
    sep='\n',
)

# Kept as the last expression of the cell so the audio player is rendered.
Audio(wav_data, rate=sample_rate)

Sample rate: 16000 Hz
Total duration: 1.50s
Size of the input: 24000


In [None]:
# One output row per usable annotation: [annotation index, *embedding values].
embeddings_arr = []
for row in pd_annotations.itertuples():
    index = row.Index
    # To resume an interrupted run, skip the rows already processed here,
    # e.g. `if index < 4034: continue`.
    start, duration = row.start_time_s, row.duration_s
    wav_filename = row.wav_filename
    if not duration >= 0.96:    # min duration for yamnet
        continue
    embedding = get_embedding(wav_filename, start, duration)
    embeddings_arr.append([index, *embedding.tolist()])
    print(index)

# Spot-check another clip after the extraction loop.
wav_file_name = 'rpi-orcasound-lab_2020_09_01_16_34_00.wav'
start, duration = 23.75, 1.5
sample_rate, wav_data = get_wav_for_segment(wav_file_name, start, duration)

# Report what was loaded; `duration` now holds the measured length.
n_samples = len(wav_data)
duration = n_samples / sample_rate
for line in (
    f'Sample rate: {sample_rate} Hz',
    f'Total duration: {duration:.2f}s',
    f'Size of the input: {n_samples}',
):
    print(line)

# Kept as the last expression of the cell so the audio player is rendered.
Audio(wav_data, rate=sample_rate)

In [43]:
# Turn the collected rows into a DataFrame (first column is the annotation
# index, the rest are the embedding values) and write it to disk as CSV.
# NOTE(review): this cell's execution count is lower than that of the cells
# before it, so it was last run against an earlier `embeddings_arr`; re-run
# after the loop. The output name also has no file extension - confirm that
# downstream readers expect 'test_embeddings' as-is.
df = pd.DataFrame(embeddings_arr)
df.to_csv('test_embeddings', index=False)

In [14]:
# # Utility functions for loading audio files and making sure the sample rate is correct.

# @tf.function
# def load_wav_16k_mono(filename):
#     """ Load a WAV file, convert it to a float tensor, resample to 16 kHz single-channel audio. """
#     file_contents = tf.io.read_file(filename)
#     wav, sample_rate = tf.audio.decode_wav(
#           file_contents,
#           desired_channels=1)
#     wav = tf.squeeze(wav, axis=-1)
#     sample_rate = tf.cast(sample_rate, dtype=tf.int64)
#     wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
#     return wav

In [15]:
# Step-by-step version of `get_embedding` for one clip, keeping the raw model
# outputs around as globals for inspection.
wav_file_name = '1562337136_0004.wav'
start = 49.76562500000002
duration = 2.450000000000003
sample_rate, wav_data = get_wav_for_segment(wav_file_name, start, duration)
scores, embeddings, spectrogram = yamnet_model(wav_data)
# Average the embeddings over their first axis.
embedded_wav = tf.math.reduce_mean(embeddings, axis=0)

In [None]:
# Run the demo defined earlier: loads its hard-coded clip and prints the
# sample rate, duration and sample count.
run_example()

In [None]:
#