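Download the dataset from https://www.kaggle.com/datasets/kenjee/z-by-hp-unlocked-challenge-3-signal-processing
Expected folder structure: extract the archive so that its contents sit directly inside a `data/` folder in the working directory,
e.g. `data/Parsed_Capuchinbird_Clips/`, `data/Parsed_Not_Capuchinbird_Clips/` and `data/Forest Recordings/`.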

In [None]:
import os
from matplotlib import pyplot as plt
import tensorflow as tf 
import tensorflow_io as tfio

In [None]:
# Paths to one example clip from each of the two parsed-clip folders.
CAPUCHIN_FILE = os.path.join(
    "data",
    "Parsed_Capuchinbird_Clips",
    "XC3776-3.wav",
)
NOT_CAPUCHIN_FILE = os.path.join(
    "data",
    "Parsed_Not_Capuchinbird_Clips",
    "afternoon-birds-song-in-forest-0.wav",
)

In [None]:
def load_wav_16k_mono(filename):
    """Read a WAV file and return a single-channel waveform resampled to 16 kHz.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        The audio decoded with one channel, with the channel axis squeezed
        away, resampled from the file's own sample rate to 16000 Hz.
    """
    raw_bytes = tf.io.read_file(filename)
    # Request one channel so the trailing axis has size 1 and can be squeezed.
    audio, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono = tf.squeeze(audio, axis=-1)
    native_rate = tf.cast(native_rate, dtype=tf.int64)
    return tfio.audio.resample(mono, rate_in=native_rate, rate_out=16000)

In [None]:
# Load both example clips as 16 kHz mono waveforms.
wave, nwave = (
    load_wav_16k_mono(clip_path)
    for clip_path in (CAPUCHIN_FILE, NOT_CAPUCHIN_FILE)
)

In [None]:
# Overlay the two example waveforms. Labels, a legend and axis titles are
# needed so the two curves can be told apart when the figure is read alone.
plt.plot(wave, label="Capuchinbird clip")
plt.plot(nwave, label="Not-capuchinbird clip")
plt.xlabel("Sample index (16 kHz)")
plt.ylabel("Amplitude")
plt.legend()
plt.show()

In [None]:
# Directories holding the capuchinbird clips (POS) and the non-capuchinbird clips (NEG).
POS = os.path.join(
    "data",
    "Parsed_Capuchinbird_Clips",
)
NEG = os.path.join(
    "data",
    "Parsed_Not_Capuchinbird_Clips",
)

In [None]:
# Datasets of file paths matching every .wav file in each clip directory.
pos = tf.data.Dataset.list_files(POS + "/*.wav")
neg = tf.data.Dataset.list_files(NEG + "/*.wav")

In [None]:
# Show one file path from the positive dataset as a sanity check.
next(pos.as_numpy_iterator())

In [None]:
# Pair every file path with its label: 1.0 for files under POS, 0.0 for files under NEG.
positives = tf.data.Dataset.zip((
    pos,
    tf.data.Dataset.from_tensor_slices(tf.ones(len(pos))),
))
negatives = tf.data.Dataset.zip((
    neg,
    tf.data.Dataset.from_tensor_slices(tf.zeros(len(neg))),
))
# One combined dataset: all positives followed by all negatives.
data = positives.concatenate(negatives)

In [None]:
# Collect the length (in samples, after resampling to 16 kHz) of every clip in POS.
lengths = [
    len(load_wav_16k_mono(os.path.join(POS, clip_name)))
    for clip_name in os.listdir(POS)
]

In [None]:
# Print the mean, minimum and maximum clip length, in that order.
for reduce_fn in (tf.math.reduce_mean, tf.math.reduce_min, tf.math.reduce_max):
    print(reduce_fn(lengths))

In [None]:
def preprocess(file_path, label):
    """Turn a WAV file into a fixed-size magnitude spectrogram.

    Args:
        file_path: Path of the WAV clip to load.
        label: Label associated with the clip; returned unchanged.

    Returns:
        A ``(spectrogram, label)`` pair. The spectrogram is the STFT magnitude
        of a 48000-sample window with a trailing channel axis added.
    """
    # Keep at most the first 48000 samples (3 s at 16 kHz).
    clip = load_wav_16k_mono(file_path)[:48000]
    # Shorter clips are padded with zeros in front of the audio.
    pad_shape = [48000] - tf.shape(clip)
    padded = tf.concat([tf.zeros(pad_shape, dtype=tf.float32), clip], 0)
    magnitude = tf.abs(tf.signal.stft(padded, frame_length=320, frame_step=32))
    return tf.expand_dims(magnitude, axis=2), label

In [None]:
# Draw one (file path, label) pair from the shuffled positive examples.
filepath, label = next(positives.shuffle(10000).as_numpy_iterator())

In [None]:
# Run the preprocessing on the sampled file to get its spectrogram for inspection.
spectrogram, label = preprocess(filepath, label)

In [None]:
# Show the spectrogram as an image; transposing and taking index 0 drops the
# channel axis and puts the frequency bins on the vertical axis.
fig, ax = plt.subplots(figsize=(30,20))
ax.imshow(tf.transpose(spectrogram)[0], origin='lower')
plt.show()

CREATE TRAINING AND TESTING PARTITIONS

In [None]:
# Build the input pipeline: spectrogram conversion -> cache -> shuffle -> batch -> prefetch.
#
# reshuffle_each_iteration=False is essential here: by default shuffle() draws a
# new order every time the dataset is iterated, so a downstream take()/skip()
# split would select different examples on every epoch and training examples
# would leak into the validation set. With a fixed order the split is stable.
# The trade-off is that batches are presented in the same order every epoch.
data = (
    data
    .map(preprocess)
    .cache()
    .shuffle(buffer_size=1000, reshuffle_each_iteration=False)
    .batch(16)
    .prefetch(8)
)

In [None]:
# Split by batch count: the first 36 batches train, the following 15 batches test.
TRAIN_BATCHES = 36
TEST_BATCHES = 15
train = data.take(TRAIN_BATCHES)
test = data.skip(TRAIN_BATCHES).take(TEST_BATCHES)

In [None]:
# Pull a single training batch and display the shape of its spectrogram tensor.
samples, labels = next(train.as_numpy_iterator())
samples.shape

BUILD DEEP LEARNING MODEL

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout

In [None]:
# Expose only CUDA device 0, then ask TensorFlow to grow GPU memory on demand.
# NOTE(review): earlier cells have already executed TensorFlow ops; confirm
# these settings still take effect when applied this late.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus, 'GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Report the problem and continue with the remaining devices.
        print(e)

In [None]:
# Binary classifier: two 3x3 conv layers, flattened into a 128-unit dense
# layer and a single sigmoid output.
# The input shape is expected to match the spectrograms produced by preprocess().
with tf.device("/gpu:0"):
    model = Sequential([
        Conv2D(16, (3,3), activation='relu', input_shape=(1491, 257,1)),
        Conv2D(16, (3,3), activation='relu'),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

In [None]:
# Compile with Adam and binary cross-entropy, tracking recall, precision and accuracy.
#
# BinaryAccuracy is used instead of Accuracy: Accuracy counts exact equality
# between prediction and label, which a sigmoid probability almost never
# satisfies, so it reports ~0. BinaryAccuracy thresholds the prediction first.
# The metric keeps the name "accuracy" so history keys are unchanged.
with tf.device("/gpu:0"):
    model.compile(
        "Adam",
        loss="BinaryCrossentropy",
        metrics=[
            tf.keras.metrics.Recall(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        ],
    )

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Train for 4 epochs on `train`, evaluating on `test` after each epoch; keep the History object.
hist = model.fit(train, epochs=4, validation_data=test)

In [None]:
def load_mp3_16k_mono(filename):
    """Load an audio file with tensorflow-io and return a mono waveform at 16 kHz.

    Args:
        filename: Path of the audio file to load (the notebook passes MP3
            forest recordings).

    Returns:
        A tensor of samples resampled to 16000 Hz, obtained by averaging the
        channels of the decoded audio.
    """
    res = tfio.audio.AudioIOTensor(filename)
    # Convert to a tensor and collapse the channel axis (axis 1) by averaging.
    # Averaging equals the previous "sum / 2" for two-channel audio but no
    # longer halves the amplitude of single-channel input.
    tensor = res.to_tensor()
    tensor = tf.math.reduce_mean(tensor, axis=1)
    # Extract the source sample rate and cast it for the resampler.
    sample_rate = tf.cast(res.rate, dtype=tf.int64)
    # Resample to 16 kHz.
    wav = tfio.audio.resample(tensor, rate_in=sample_rate, rate_out=16000)
    return wav

def preprocess_mp3(sample, index):
    """Convert one batched audio window into a magnitude spectrogram.

    Args:
        sample: Batch of audio windows; only its first element is used.
        index: Second element yielded by the windowed dataset; unused.

    Returns:
        The STFT magnitude of the window, zero-padded in front to 48000
        samples, with a trailing channel axis added.
    """
    window = sample[0]
    # Pad in front of the audio so the signal is 48000 samples long.
    pad_shape = [48000] - tf.shape(window)
    padded = tf.concat([tf.zeros(pad_shape, dtype=tf.float32), window],0)
    magnitude = tf.abs(tf.signal.stft(padded, frame_length=320, frame_step=32))
    return tf.expand_dims(magnitude, axis=2)


In [None]:
from itertools import groupby

In [None]:
# Load one forest recording and cut it into non-overlapping 48000-sample
# windows, each converted to a spectrogram and grouped into batches of 64.
mp3 = os.path.join("data", "Forest Recordings", "recording_00.mp3")
wav = load_mp3_16k_mono(mp3)
audio_slices = (
    tf.keras.utils.timeseries_dataset_from_array(
        wav,
        wav,
        sequence_length=48000,
        sequence_stride=48000,
        batch_size=1,
    )
    .map(preprocess_mp3)
    .batch(64)
)


In [None]:
# Score every window, then mark a window as 1 only when its score exceeds 0.99.
yhat = model.predict(audio_slices)
yhat = list(map(lambda prediction: 1 if prediction > 0.99 else 0, yhat))
yhat


In [118]:
# Collapse each run of identical consecutive labels into one entry, so that
# summing the result counts the runs of 1s.
yhat = [run_value for run_value, _ in groupby(yhat)]
calls = tf.math.reduce_sum(yhat).numpy()

calls

5

In [119]:
# Score every file in the forest-recordings folder: slice into 48000-sample
# windows, convert to spectrograms, and store the raw model outputs per file.
recordings_dir = os.path.join('data', 'Forest Recordings')
results = {}
for file in os.listdir(recordings_dir):
    FILEPATH = os.path.join('data','Forest Recordings', file)
    wav = load_mp3_16k_mono(FILEPATH)
    audio_slices = (
        tf.keras.utils.timeseries_dataset_from_array(
            wav,
            wav,
            sequence_length=48000,
            sequence_stride=48000,
            batch_size=1,
        )
        .map(preprocess_mp3)
        .batch(64)
    )
    yhat = model.predict(audio_slices)
    results[file] = yhat



In [120]:
# Display the raw per-window model outputs stored for every recording.
results

{'recording_38.mp3': array([[9.7767538e-01],
        [9.8636359e-01],
        [9.9260074e-01],
        [9.6637028e-01],
        [9.7718185e-01],
        [9.5241040e-01],
        [9.9224699e-01],
        [9.8277885e-01],
        [9.7075045e-01],
        [9.8420274e-01],
        [9.8366910e-01],
        [9.6963352e-01],
        [9.9406791e-01],
        [9.7536421e-01],
        [9.8794580e-01],
        [9.9602014e-01],
        [9.9366176e-01],
        [9.8045176e-01],
        [9.8285896e-01],
        [9.7655743e-01],
        [9.5090199e-01],
        [9.7941911e-01],
        [9.9013829e-01],
        [9.8530906e-01],
        [9.9299699e-01],
        [9.7376370e-01],
        [9.6379483e-01],
        [9.8571682e-01],
        [9.6570212e-01],
        [1.0000000e+00],
        [3.4470297e-04],
        [9.8573619e-01],
        [9.9216944e-01],
        [9.6237993e-01],
        [9.7482228e-01],
        [9.4461977e-01],
        [9.9274749e-01],
        [9.8166174e-01],
        [9.6849793e-01],
     

In [122]:
# Threshold the model outputs per recording: 1 when a window scores above 0.99, else 0.
class_preds = {
    file: [1 if prediction > 0.99 else 0 for prediction in scores]
    for file, scores in results.items()
}
class_preds

{'recording_38.mp3': [0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0],
 'recording_43.mp3': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'recording_27.mp3': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'recording_29.mp3': [0,
  0,
  0,
  

In [123]:
# Per recording, collapse runs of identical consecutive labels and sum the
# result, which counts the runs of 1s.
postprocessed = {
    file: tf.math.reduce_sum([run_value for run_value, _ in groupby(scores)]).numpy()
    for file, scores in class_preds.items()
}
postprocessed

{'recording_38.mp3': 12,
 'recording_43.mp3': 5,
 'recording_27.mp3': 0,
 'recording_29.mp3': 0,
 'recording_11.mp3': 4,
 'recording_44.mp3': 1,
 'recording_22.mp3': 2,
 'recording_36.mp3': 0,
 'recording_37.mp3': 3,
 'recording_33.mp3': 0,
 'recording_10.mp3': 5,
 'recording_28.mp3': 6,
 'recording_04.mp3': 4,
 'recording_42.mp3': 0,
 'recording_21.mp3': 1,
 'recording_01.mp3': 0,
 'recording_39.mp3': 4,
 'recording_07.mp3': 2,
 'recording_30.mp3': 3,
 'recording_00.mp3': 5,
 'recording_45.mp3': 3,
 'recording_26.mp3': 2,
 'recording_19.mp3': 0,
 'recording_41.mp3': 0,
 'recording_40.mp3': 1,
 'recording_24.mp3': 0,
 'recording_05.mp3': 0,
 'recording_09.mp3': 0,
 'recording_18.mp3': 6,
 'recording_23.mp3': 6,
 'recording_31.mp3': 1,
 'recording_35.mp3': 0,
 'recording_02.mp3': 0,
 'recording_32.mp3': 2,
 'recording_34.mp3': 4,
 'recording_03.mp3': 0,
 'recording_25.mp3': 2,
 'recording_20.mp3': 0,
 'recording_08.mp3': 25,
 'recording_06.mp3': 5}

In [124]:
import csv

In [125]:
# Write one row per recording with its call count, preceded by a header row.
header = ['recording', 'capuchin_calls']
with open('results.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(header)
    writer.writerows(postprocessed.items())