In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

We'll use librosa to read audio and perform some analysis

In [None]:
import librosa
import soundfile as sf
import librosa.display as lrd

import os

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

Let's read both tables to identify true and false positives on the spectrum

In [None]:
# Load the two annotation tables shipped with the competition data:
# train_tp.csv (true positives) and train_fp.csv (false positives).
tpdf = pd.read_csv('/kaggle/input/rfcx-species-audio-detection/train_tp.csv')
fpdf = pd.read_csv('/kaggle/input/rfcx-species-audio-detection/train_fp.csv')

In [None]:
tpdf

In [None]:
fpdf

## Number of samples per call type and species

In [None]:
# Derive the extent of every annotation in time and in frequency from its box corners.
tpdf['duration'] = tpdf['t_max'] - tpdf['t_min']
tpdf['bandwidth'] = tpdf['f_max'] - tpdf['f_min']

# Count the annotations for each (species, song type) pair; missing pairs are shown as 0.
annotation_counts = pd.pivot_table(
    tpdf,
    index='species_id',
    columns='songtype_id',
    values='duration',
    aggfunc='count',
)
annotation_counts.fillna(0)

## Metadata per species

Notice how the duration and frequency bands are highly characteristic of each species. 

The trouble will be to establish the event boundaries in new sound files

### Call duration


In [None]:
# Distribution of annotated call durations per species, split by song type.
# The axes object is passed explicitly so the plot is tied to this figure
# rather than to whatever happens to be matplotlib's "current" axes.
fig, ax = plt.subplots(1, figsize=(10, 4))
sns.boxplot(data=tpdf, y='duration', x='species_id', hue='songtype_id', ax=ax)
ax.set_title('Call duration');

### Lower frequency of call band

In [None]:
# Distribution of the lower edge of the annotated frequency band per species,
# split by song type. The axes object is passed explicitly so the plot is tied
# to this figure rather than to matplotlib's "current" axes.
fig, ax = plt.subplots(1, figsize=(10, 4))
sns.boxplot(data=tpdf, y='f_min', x='species_id', hue='songtype_id', ax=ax)
ax.set_title('Lower frequency of call band');

### Upper frequency of call band

In [None]:
# Distribution of the upper edge of the annotated frequency band per species,
# split by song type. The axes object is passed explicitly so the plot is tied
# to this figure rather than to matplotlib's "current" axes.
fig, ax = plt.subplots(1, figsize=(10, 4))
sns.boxplot(data=tpdf, y='f_max', x='species_id', hue='songtype_id', ax=ax)
ax.set_title('Upper frequency of call band');

### Bandwidth

In [None]:
# Distribution of the annotated bandwidth (f_max - f_min) per species, split by
# song type. The axes object is passed explicitly so the plot is tied to this
# figure rather than to matplotlib's "current" axes.
fig, ax = plt.subplots(1, figsize=(10, 4))
sns.boxplot(data=tpdf, y='bandwidth', x='species_id', hue='songtype_id', ax=ax)
ax.set_title('Bandwidth');

## Investigate Species 0

In [None]:
tpdf

In [None]:
# Keep only the true-positive annotations that belong to species 0.
is_species_0 = tpdf['species_id'].eq(0)
s0 = tpdf[is_species_0]
s0

In [None]:
# Narrow the species-0 annotations down to one specific recording.
in_recording = s0['recording_id'] == '00d442df7'
r0 = s0.loc[in_recording]
r0

In [None]:
# Time bounds of the first annotation found for this recording.
t_min = r0['t_min'].iloc[0]
t_max = r0['t_max'].iloc[0]

In [None]:
!ls /kaggle/input/rfcx-species-audio-detection/train

In [None]:
import IPython
# Play the complete recording straight from the file on disk.
IPython.display.Audio("/kaggle/input/rfcx-species-audio-detection/train/00d442df7.flac")

In [None]:
# Decode the recording into a mono waveform plus its sample rate, then play it from memory.
# NOTE(review): no `sr` argument is given, so librosa presumably resamples to its own
# default rate instead of keeping the file's native one. Confirm this is intended,
# since later cells read the same files with soundfile instead.
data, samplerate = librosa.load('/kaggle/input/rfcx-species-audio-detection/train/00d442df7.flac', mono=True)
IPython.display.Audio(data, rate=samplerate)

In [None]:
samplerate

In [None]:
data.shape

In [None]:
data.shape[0] / samplerate

In [None]:
# Sample index where the annotated call starts: t_min scaled by the sample rate, truncated to int.
start_point = int(t_min * samplerate)
start_point

In [None]:
# Sample index where the annotated call ends: t_max scaled by the sample rate, truncated to int.
end_point = int(t_max * samplerate)
end_point

In [None]:
# Slice the annotated call out of the full waveform and listen to just that excerpt.
call_window = slice(start_point, end_point)
r0_cut = data[call_window]

IPython.display.Audio(r0_cut, rate=samplerate)

In [None]:
r0_cut.shape

In [None]:
# Short-time Fourier transform of the excerpt; np.abs keeps the magnitude and discards the phase.
stft_complex = librosa.stft(r0_cut)
S0_cut = np.abs(stft_complex)

In [None]:
S0_cut.shape

In [None]:
# Show the excerpt's magnitude spectrogram in dB relative to its peak value.
# The actual sample rate is passed to specshow so the time and frequency axes are
# computed from this recording's rate instead of the function's built-in default.
fig, ax = plt.subplots()
spectrogram_db = librosa.amplitude_to_db(S0_cut, ref=np.max)
img = librosa.display.specshow(spectrogram_db, sr=samplerate, y_axis='log', x_axis='time', ax=ax)
ax.set_title('Power spectrogram')
fig.colorbar(img, ax=ax, format="%+2.0f dB");

In [None]:
# S0_cut holds magnitudes only (the phase was dropped by np.abs), so a plain inverse
# STFT would treat every bin as having zero phase. Griffin-Lim estimates a consistent
# phase from the magnitudes before inverting, which is the appropriate way to turn a
# magnitude spectrogram back into audio.
IS0_cut = librosa.griffinlim(S0_cut)

IPython.display.Audio(IS0_cut, rate=samplerate)

In [None]:
tpdf['species_id'].unique()

In [None]:
# Build a dict {recording_id: magnitude spectrogram} for the annotated species-0 calls.
# Species ids run from 0 to 23; only species 0 is processed here
# (use sorted(tpdf['species_id'].unique()) to walk over every species).
species_unique_list = [0]

# Only clips whose STFT has exactly this many frames are kept, so that all retained
# spectrograms share one shape and can later be stacked into a single array.
n_frames_required = 75

species_0 = {}
for species in species_unique_list:
    species_df = tpdf[tpdf['species_id'] == species]

    # Fields are read by column name: integer keys on a label-indexed row only work
    # through a deprecated positional fallback and silently depend on column order.
    for _, row in species_df.iterrows():
        record_id = row['recording_id']
        t_min = row['t_min']
        t_max = row['t_max']

        # `samplerate` is intentionally left as a notebook-level name: later playback
        # cells reuse it.
        audio, samplerate = sf.read('/kaggle/input/rfcx-species-audio-detection/train/' + record_id + '.flac')

        # Convert the annotation's time bounds to sample indices.
        start_point = int(samplerate * t_min)
        end_point = int(samplerate * t_max)

        cut = audio[start_point:end_point]

        stft_cut = np.abs(librosa.stft(cut))

        # Currently only same-sized clips are used.
        if stft_cut.shape[1] != n_frames_required:
            continue

        # Keyed by recording id, so a recording with several matching annotations
        # keeps only the last one.
        species_0[record_id] = stft_cut

# Show how many clips were kept instead of dumping every array.
len(species_0)

## VAE Keras

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Stack the retained species-0 spectrograms into one array, one entry per recording.
# (Per the original note: species 0 has 40 records, and the 10 longer ones are not used.)
spectrograms = list(species_0.values())
data = np.array(spectrograms)
data.shape

In [None]:
# Append a trailing channel axis of size 1 for the convolutional encoder.
# -1 lets NumPy infer the number of clips, so the cell keeps working when the
# number of retained recordings is not exactly 40.
data = data.reshape(-1, 1025, 75, 1)
data.shape

In [None]:
class Sampling(layers.Layer):
    """Reparameterisation step of the VAE.

    `call` receives the pair (z_mean, z_log_var) and returns
    z_mean + exp(0.5 * z_log_var) * epsilon, where epsilon is random normal
    noise with the same (batch, dim) shape as z_mean.
    """

    def call(self, inputs):
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        # Noise is drawn per sample and per latent dimension.
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

In [None]:
# ENCODER
# Two strided convolutions shrink the (1025, 75, 1) spectrogram, a small dense layer
# summarises it, and two parallel dense heads give the latent mean and log-variance.

latent_dim = 2

encoder_inputs = keras.Input(shape=(1025, 75, 1))
# With "same" padding the strides should reduce the spatial size to (205, 15) and then
# (41, 5), which is the shape the decoder reshapes back to.
conv_1 = layers.Conv2D(32, 3, activation="relu", strides=(5, 5), padding="same")(encoder_inputs)
conv_2 = layers.Conv2D(64, 3, activation="relu", strides=(5, 3), padding="same")(conv_1)
flat = layers.Flatten()(conv_2)
hidden = layers.Dense(16, activation="relu")(flat)
z_mean = layers.Dense(latent_dim, name="z_mean")(hidden)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(hidden)
# Sample a latent vector from the predicted distribution.
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

In [None]:
# DECODER
# Mirrors the encoder: a dense layer expands the latent vector to a (41, 5, 64) feature
# map, two transposed convolutions undo the encoder's strides, and a final transposed
# convolution produces a single-channel output.
# NOTE(review): the sigmoid bounds the output to (0, 1), while no cell shown here rescales
# the training spectrograms to that range. Confirm whether normalisation is missing.

latent_inputs = keras.Input(shape=(latent_dim,))
expanded = layers.Dense(41 * 5 * 64, activation="relu")(latent_inputs)
feature_map = layers.Reshape((41, 5, 64))(expanded)
upsampled_1 = layers.Conv2DTranspose(64, 3, activation="relu", strides=(5, 3), padding="same")(feature_map)
upsampled_2 = layers.Conv2DTranspose(32, 3, activation="relu", strides=(5, 5), padding="same")(upsampled_1)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(upsampled_2)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

In [None]:
class VAE(keras.Model):
    """Variational autoencoder built from a separate encoder and decoder model.

    Training is driven by a custom `train_step`; the model is compiled with an
    optimizer only, since the loss is computed inside `train_step`.

    Args:
        encoder: model mapping an input batch to (z_mean, z_log_var, z).
        decoder: model mapping a latent batch z to a reconstruction.
        **kwargs: forwarded to `keras.Model`.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def train_step(self, data):
        """Run one optimisation step and return the loss values for logging.

        Args:
            data: the batch handed over by `fit`; when it is a tuple only its
                first element (the inputs) is used.

        Returns:
            dict with "loss", "reconstruction_loss" and "kl_loss".
        """
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            # Use the models owned by this instance, not whatever module-level
            # `encoder` / `decoder` names happen to exist in the notebook.
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(
                keras.losses.mse(data, reconstruction)
            )
            # Scale the mean squared error by the number of spectrogram bins.
            reconstruction_loss *= 1025 * 75
            # KL term: -0.5 * mean(1 + log_var - mean^2 - exp(log_var)).
            kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }

In [None]:
# Early stopping: give up after 50 epochs without a lower reconstruction loss.
es = EarlyStopping(monitor='reconstruction_loss', mode='min', patience=50)
# Model checkpoint: keep the weights of the best epoch so far.
# The model being trained is a subclassed keras.Model, which cannot be written as a
# full model in HDF5 format, so only the weights are saved to the .h5 file.
mc = ModelCheckpoint('best_model.h5', monitor='reconstruction_loss', mode='min',
                     save_best_only=True, save_weights_only=True)

In [None]:
# TRAIN
# Wrap the encoder/decoder pair, compile with Adam (the loss lives in VAE.train_step),
# and fit on the species-0 spectrograms with early stopping and checkpointing.

vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
training_callbacks = [es, mc]
vae.fit(data, epochs=30000, batch_size=128, callbacks=training_callbacks)

In [None]:
# Take the first training spectrogram (dropping the channel axis) and turn it back
# into audio as a listening reference.
example_stft = data[0].reshape(1025, 75)
# The spectrogram holds magnitudes only, so a plain inverse STFT would assume zero
# phase everywhere; Griffin-Lim estimates a phase before inverting.
example_sound = librosa.griffinlim(example_stft)

IPython.display.Audio(example_sound, rate=samplerate)

In [None]:
# Add the batch and channel axes expected by the encoder, then encode the single example.
example_stft = np.reshape(example_stft, (1, 1025, 75, 1))
encoder_outputs = encoder.predict(example_stft)

In [None]:
# The encoder returns the list [z_mean, z_log_var, z], but the decoder takes a single
# latent batch. Decode from z_mean so the reconstruction is deterministic, instead of
# passing the whole three-element list.
latent_mean = encoder_outputs[0]
decoded_stft = decoder.predict(latent_mean)
decoded_stft = decoded_stft.reshape(1025, 75)
# The decoder emits magnitudes only, so estimate a phase with Griffin-Lim rather than
# inverting with an implicit zero phase.
decoded_sound = librosa.griffinlim(decoded_stft)

IPython.display.Audio(decoded_sound, rate=samplerate)

## Classification

We will use 25(5val+20train) generated and 25(5val+20train) real records for species_0

40(10val+30train) real records for species_1

40(10val+30train) real records for species_3 (class label 2)

Total train size 100 and validation size will be 30

#### Species 0

In [None]:
# Planned split:
#   species_0: 25 generated (20 train + 5 val) and 25 real (20 train + 5 val) records
#   species_1: 40 real records (30 train + 10 val)
#   species_3: 40 real records (30 train + 10 val), used as class label 2
# Total train size is 100 and validation size is 30.
#
# NOTE(review): `species_0` was filled with STFT magnitude arrays in the earlier
# species-0 cell, while later cells reshape these records to (n, 64000, 1).
# Confirm `species_0` holds raw 64000-sample clips at this point.
species_0_list = np.array(list(species_0.values()))
# The first 20 real records go to the training set.
species_0_real_train = species_0_list[:20]
species_0_real_train.shape

In [None]:
species_0_real_validation = species_0_list[20:25]
species_0_real_validation.shape

In [None]:
# Draw 20 synthetic species-0 clips for training: each one feeds a fresh (1, 100)
# normal noise vector through the generator and flattens the output to rows of 64000.
# NOTE(review): `generator` is not defined in any cell shown here. Presumably it is a
# model trained elsewhere; confirm it exists before running this cell.
species_0_generated_train = np.array([
    generator(tf.random.normal([1, 100]), training=False).numpy().reshape(-1, 64000)
    for _ in range(20)
]).squeeze()
species_0_generated_train.shape

In [None]:
# Draw 5 synthetic species-0 clips for validation: each one feeds a fresh (1, 100)
# normal noise vector through the generator and flattens the output to rows of 64000.
# NOTE(review): `generator` is not defined in any cell shown here. Presumably it is a
# model trained elsewhere; confirm it exists before running this cell.
species_0_generated_validation = np.array([
    generator(tf.random.normal([1, 100]), training=False).numpy().reshape(-1, 64000)
    for _ in range(5)
]).squeeze()
species_0_generated_validation.shape

In [None]:
# 40 species_0 record will use as train(20 generated and 20 real)
train_species_0 = np.concatenate((species_0_generated_train, species_0_real_train))
train_species_0.shape

In [None]:
# Create labels species_0 is 0 and species_1 is 1 and species_3 is 2
train_species_0_label = np.zeros(len(train_species_0))
train_species_0_label.shape

In [None]:
# 10 species_0 records are used for validation (5 generated and 5 real)
validation_species_0 = np.concatenate((species_0_generated_validation, species_0_real_validation))
validation_species_0.shape

In [None]:
validation_species_0_label = np.zeros(len(validation_species_0))
validation_species_0_label.shape

#### Species 1

In [None]:
# Build a dict {recording_id: waveform excerpt} for the annotated species-1 calls.
# Species ids run from 0 to 23; only species 1 is processed here
# (use sorted(tpdf['species_id'].unique()) to walk over every species).
species_unique_list = [1]
species_1 = {}
for species in species_unique_list:
    species_df = tpdf[tpdf['species_id'] == species]

    # Fields are read by column name: integer keys on a label-indexed row only work
    # through a deprecated positional fallback and silently depend on column order.
    for _, row in species_df.iterrows():
        record_id = row['recording_id']
        t_min = row['t_min']
        t_max = row['t_max']

        # `samplerate` is intentionally left as a notebook-level name: a later playback
        # cell reuses it. The waveform gets its own name so it does not overwrite `data`.
        audio, samplerate = sf.read('/kaggle/input/rfcx-species-audio-detection/train/' + record_id + '.flac')

        # Convert the annotation's time bounds to sample indices.
        start_point = int(samplerate * t_min)
        end_point = int(samplerate * t_max)

        # Keyed by recording id, so a recording with several matching annotations
        # keeps only the last one.
        species_1[record_id] = audio[start_point:end_point]

# Show how many clips were collected instead of dumping every array.
len(species_1)

In [None]:
# Bring every species-1 clip to exactly `max_len` samples so the clips can be stacked
# into one rectangular array: shorter clips are zero-padded at the end, and clips that
# are longer are cut off at `max_len` (previously they were left at their own length,
# which makes the stacked array ragged).
max_len = 64000

# Values of existing keys are replaced while iterating; the key set is never changed.
for record_id, clip in species_1.items():
    n_samples = len(clip)
    if n_samples < max_len:
        species_1[record_id] = np.pad(clip, (0, max_len - n_samples), 'constant', constant_values=0)
    elif n_samples > max_len:
        species_1[record_id] = clip[:max_len]

# Show how many clips there are instead of dumping every array.
len(species_1)

In [None]:
IPython.display.Audio(species_1['0151b7d20'], rate=samplerate)

In [None]:
species_1_list = np.array(list(species_1.values()))
train_species_1 = species_1_list[:30]
train_species_1.shape

In [None]:
# Create labels species_0 is 0 and species_1 is 1 and species_3 is 2
train_species_1_label = np.ones(len(train_species_1))
train_species_1_label.shape

In [None]:
validation_species_1 = species_1_list[30:40]
validation_species_1.shape

In [None]:
# Create labels species_0 is 0 and species_1 is 1 and species_3 is 2
validation_species_1_label = np.ones(len(validation_species_1))
validation_species_1_label.shape

#### Species 3

In [None]:
# Build a dict {recording_id: waveform excerpt} for the annotated species-3 calls.
# Species ids run from 0 to 23; only species 3 is processed here
# (use sorted(tpdf['species_id'].unique()) to walk over every species).
species_unique_list = [3]
species_3 = {}
for species in species_unique_list:
    species_df = tpdf[tpdf['species_id'] == species]

    # Fields are read by column name: integer keys on a label-indexed row only work
    # through a deprecated positional fallback and silently depend on column order.
    for _, row in species_df.iterrows():
        record_id = row['recording_id']
        t_min = row['t_min']
        t_max = row['t_max']

        # The waveform gets its own name so it does not overwrite `data`.
        audio, samplerate = sf.read('/kaggle/input/rfcx-species-audio-detection/train/' + record_id + '.flac')

        # Convert the annotation's time bounds to sample indices.
        start_point = int(samplerate * t_min)
        end_point = int(samplerate * t_max)

        # Keyed by recording id, so a recording with several matching annotations
        # keeps only the last one.
        species_3[record_id] = audio[start_point:end_point]

# Show how many clips were collected instead of dumping every array.
len(species_3)

In [None]:
# Bring every species-3 clip to exactly `max_len` samples so the clips can be stacked
# into one rectangular array: shorter clips are zero-padded at the end, and clips that
# are longer are cut off at `max_len` (previously they were left at their own length,
# which makes the stacked array ragged).
max_len = 64000

# Values of existing keys are replaced while iterating; the key set is never changed.
for record_id, clip in species_3.items():
    n_samples = len(clip)
    if n_samples < max_len:
        species_3[record_id] = np.pad(clip, (0, max_len - n_samples), 'constant', constant_values=0)
    elif n_samples > max_len:
        species_3[record_id] = clip[:max_len]

# Show how many clips there are instead of dumping every array.
len(species_3)

In [None]:
species_3_list = np.array(list(species_3.values()))
train_species_3 = species_3_list[:30]
train_species_3.shape

In [None]:
# Class labels: species_0 -> 0, species_1 -> 1, species_3 -> 2.
# Every species-3 training clip therefore gets the label 2.
train_species_3_label = np.full(len(train_species_3), 2.0)
train_species_3_label.shape

In [None]:
validation_species_3 = species_3_list[30:40]
validation_species_3.shape

In [None]:
# Class labels: species_0 -> 0, species_1 -> 1, species_3 -> 2.
# Every species-3 validation clip therefore gets the label 2.
validation_species_3_label = np.full(len(validation_species_3), 2.0)
validation_species_3_label.shape

#### Concatenate data

In [None]:
# Stack the three classes into the train / validation arrays.
# Inputs get a trailing channel axis, labels become a column vector. -1 lets NumPy
# infer the number of clips so the cell does not break when the split sizes change.
X_train = np.concatenate((train_species_0, train_species_1, train_species_3)).reshape(-1, 64000, 1)
y_train = np.concatenate((train_species_0_label, train_species_1_label, train_species_3_label)).reshape(-1, 1)
X_validation = np.concatenate((validation_species_0, validation_species_1, validation_species_3)).reshape(-1, 64000, 1)
y_validation = np.concatenate((validation_species_0_label, validation_species_1_label, validation_species_3_label)).reshape(-1, 1)

# Inputs and labels must stay aligned one-to-one.
assert len(X_train) == len(y_train), "train inputs and labels differ in length"
assert len(X_validation) == len(y_validation), "validation inputs and labels differ in length"

In [None]:
# Report the shape of each split.
for caption, split in (
    ('X_train shape,', X_train),
    ('y_train shape,', y_train),
    ('X_validation shape,', X_validation),
    ('y_validation shape,', y_validation),
):
    print(caption, split.shape)

In [None]:
y_validation

#### Preprocess

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the validation labels.
# The category list is fixed so the columns always mean class 0, 1, 2 in that order,
# independent of which labels happen to occur in this particular split (train and
# validation are encoded by separately fitted encoders).
enc = OneHotEncoder(categories=[[0.0, 1.0, 2.0]])
y_validation = enc.fit_transform(y_validation).toarray()
y_validation

In [None]:
# One-hot encode the training labels.
# The category list is fixed so the columns always mean class 0, 1, 2 in that order,
# independent of which labels happen to occur in this particular split (train and
# validation are encoded by separately fitted encoders).
enc = OneHotEncoder(categories=[[0.0, 1.0, 2.0]])
y_train = enc.fit_transform(y_train).toarray()

#### Create model

In [None]:
def make_classification_model(input_length=64000, num_classes=3):
    """Build the 1-D convolutional classifier for raw audio clips.

    The network is five Conv1D blocks (kernel size 25, stride 4, 'same' padding),
    each followed by LeakyReLU and Dropout(0.3), then Flatten, Dense(32, relu)
    and a softmax output layer.

    Args:
        input_length: number of samples per clip; the model expects inputs of
            shape (input_length, 1). Defaults to 64000.
        num_classes: width of the softmax output. Defaults to 3.

    Returns:
        An uncompiled `tf.keras.Sequential` model.
    """
    # Number of filters in each of the five convolutional blocks, in order.
    conv_filters = (32, 32, 64, 32, 32)

    model = tf.keras.Sequential()
    for block_index, filters in enumerate(conv_filters):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {'input_shape': [input_length, 1]} if block_index == 0 else {}
        model.add(layers.Conv1D(filters, 25, strides=4, padding='same', **first_layer_kwargs))
        model.add(layers.LeakyReLU())
        model.add(layers.Dropout(0.3))

    model.add(layers.Flatten())

    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))

    return model

In [None]:
classification_model = make_classification_model()

In [None]:
classification_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
classification_model.fit(X_train, y_train, validation_data=(X_validation, y_validation), epochs=150, batch_size=8)

#### Test model

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# We never use species_0 records from 25 to 50
species_0_test = species_0_list[25:50]
species_0_test.shape

In [None]:
# Add the channel axis expected by the classifier. -1 infers the number of clips,
# so the cell still works when fewer than 25 held-out records exist.
species_0_test = species_0_test.reshape(-1, 64000, 1)
species_0_test.shape

In [None]:
# `predict_classes` has been removed from Keras; the predicted class is the index of
# the largest softmax output.
np.argmax(classification_model.predict(species_0_test), axis=-1)

In [None]:
# Accuracy on the held-out species-0 clips, whose true class is 0.
# `predict_classes` has been removed from Keras, so the class is taken as the argmax
# of the softmax output; arguments follow the (y_true, y_pred) convention.
accuracy_score(np.zeros(len(species_0_test)),
               np.argmax(classification_model.predict(species_0_test), axis=-1))

In [None]:
#We never use species_1 records from 40 to 50
species_1_test = species_1_list[40:50]
species_1_test.shape

In [None]:
# Add the channel axis expected by the classifier. -1 infers the number of clips,
# so the cell still works when fewer than 10 held-out records exist.
species_1_test = species_1_test.reshape(-1, 64000, 1)
species_1_test.shape

In [None]:
# `predict_classes` has been removed from Keras; the predicted class is the index of
# the largest softmax output.
np.argmax(classification_model.predict(species_1_test), axis=-1)

In [None]:
# Accuracy on the held-out species-1 clips, whose true class is 1.
# `predict_classes` has been removed from Keras, so the class is taken as the argmax
# of the softmax output; arguments follow the (y_true, y_pred) convention.
accuracy_score(np.ones(len(species_1_test)),
               np.argmax(classification_model.predict(species_1_test), axis=-1))

In [None]:
#We never use species_3 records from 40 to 50
species_3_test = species_3_list[40:50]
species_3_test.shape

In [None]:
# Add the channel axis expected by the classifier. -1 infers the number of clips,
# so the cell still works when fewer than 10 held-out records exist.
species_3_test = species_3_test.reshape(-1, 64000, 1)
species_3_test.shape

In [None]:
# `predict_classes` has been removed from Keras; the predicted class is the index of
# the largest softmax output.
np.argmax(classification_model.predict(species_3_test), axis=-1)

In [None]:
# Accuracy on the held-out species-3 clips, whose true class label is 2.
# `predict_classes` has been removed from Keras, so the class is taken as the argmax
# of the softmax output; arguments follow the (y_true, y_pred) convention.
accuracy_score(np.ones(len(species_3_test)) * 2,
               np.argmax(classification_model.predict(species_3_test), axis=-1))

## Classification 2

We never use real records for species_0 this time

#### Species 0

In [None]:
# We will use 40(10val+30train) generated records for species_0
# 40(10val+30train) real records for species_1
# 40(10val+30train) real records for species_3 (class label 2)

# Total train size 90 and validation size will be 30

In [None]:
# Draw 40 synthetic species-0 clips: each one feeds a fresh (1, 100) normal noise
# vector through the generator and flattens the output to rows of 64000.
# NOTE(review): `generator` is not defined in any cell shown here. Presumably it is a
# model trained elsewhere; confirm it exists before running this cell.
species_0_generated = np.array([
    generator(tf.random.normal([1, 100]), training=False).numpy().reshape(-1, 64000)
    for _ in range(40)
]).squeeze()
species_0_generated.shape

In [None]:
train_species_0 = species_0_generated[:30]
train_species_0.shape

In [None]:
validation_species_0 = species_0_generated[30:40]
validation_species_0.shape

In [None]:
# Class labels: species_0 -> 0, species_1 -> 1, species_3 -> 2.
# The label count is taken from the species-0 training clips themselves (it was
# previously read from the species-3 array, which only matched by coincidence).
train_species_0_label = np.zeros(len(train_species_0))
train_species_0_label.shape

In [None]:
# Class labels: species_0 -> 0, species_1 -> 1, species_3 -> 2.
# The label count is taken from the species-0 validation clips themselves (it was
# previously read from the species-3 array, which only matched by coincidence).
validation_species_0_label = np.zeros(len(validation_species_0))
validation_species_0_label.shape

#### Concatenate

In [None]:
# Stack the three classes into the train / validation arrays for the second experiment.
# Inputs get a trailing channel axis, labels become a column vector. -1 lets NumPy
# infer the number of clips so the cell does not break when the split sizes change.
X_train = np.concatenate((train_species_0, train_species_1, train_species_3)).reshape(-1, 64000, 1)
y_train = np.concatenate((train_species_0_label, train_species_1_label, train_species_3_label)).reshape(-1, 1)
X_validation = np.concatenate((validation_species_0, validation_species_1, validation_species_3)).reshape(-1, 64000, 1)
y_validation = np.concatenate((validation_species_0_label, validation_species_1_label, validation_species_3_label)).reshape(-1, 1)

# Inputs and labels must stay aligned one-to-one.
assert len(X_train) == len(y_train), "train inputs and labels differ in length"
assert len(X_validation) == len(y_validation), "validation inputs and labels differ in length"

In [None]:
# Report the shape of each split.
for caption, split in (
    ('X_train shape,', X_train),
    ('y_train shape,', y_train),
    ('X_validation shape,', X_validation),
    ('y_validation shape,', y_validation),
):
    print(caption, split.shape)

#### Preprocess

In [None]:
# One-hot encode the validation labels.
# The category list is fixed so the columns always mean class 0, 1, 2 in that order,
# independent of which labels happen to occur in this particular split (train and
# validation are encoded by separately fitted encoders).
enc = OneHotEncoder(categories=[[0.0, 1.0, 2.0]])
y_validation = enc.fit_transform(y_validation).toarray()
y_validation

In [None]:
# One-hot encode the training labels.
# The category list is fixed so the columns always mean class 0, 1, 2 in that order,
# independent of which labels happen to occur in this particular split (train and
# validation are encoded by separately fitted encoders).
enc = OneHotEncoder(categories=[[0.0, 1.0, 2.0]])
y_train = enc.fit_transform(y_train).toarray()

#### Create second model

In [None]:
classification_model_2 = make_classification_model()

In [None]:
classification_model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the SECOND model on the generated-only species-0 data. Fitting
# `classification_model` here would keep training the first model and leave
# `classification_model_2`, which the following cells evaluate, untrained.
classification_model_2.fit(X_train, y_train, validation_data=(X_validation, y_validation), epochs=150, batch_size=8)

In [None]:
# We never use real species_0 records
species_0_test = species_0_list[0:50]
species_0_test.shape

In [None]:
# Add the channel axis expected by the classifier. -1 infers the number of clips,
# so the cell still works when fewer than 50 real records exist.
species_0_test = species_0_test.reshape(-1, 64000, 1)
species_0_test.shape

In [None]:
# `predict_classes` has been removed from Keras; the predicted class is the index of
# the largest softmax output.
np.argmax(classification_model_2.predict(species_0_test), axis=-1)

In [None]:
# Accuracy of the second model on the real species-0 clips, whose true class is 0.
# `predict_classes` has been removed from Keras, so the class is taken as the argmax
# of the softmax output; arguments follow the (y_true, y_pred) convention.
accuracy_score(np.zeros(len(species_0_test)),
               np.argmax(classification_model_2.predict(species_0_test), axis=-1))

In [None]:
# We never use last 10 species_1 records
species_1_test = species_1_list[40:50]
species_1_test.shape

In [None]:
# Add the channel axis expected by the classifier. -1 infers the number of clips,
# so the cell still works when fewer than 10 held-out records exist.
species_1_test = species_1_test.reshape(-1, 64000, 1)
species_1_test.shape

In [None]:
# `predict_classes` has been removed from Keras; the predicted class is the index of
# the largest softmax output.
np.argmax(classification_model_2.predict(species_1_test), axis=-1)

In [None]:
# Accuracy of the second model on the held-out species-1 clips, whose true class is 1.
# `predict_classes` has been removed from Keras, so the class is taken as the argmax
# of the softmax output; arguments follow the (y_true, y_pred) convention.
accuracy_score(np.ones(len(species_1_test)),
               np.argmax(classification_model_2.predict(species_1_test), axis=-1))

In [None]:
# We never use last 10 species_3 records
species_3_test = species_3_list[40:50]
species_3_test.shape

In [None]:
# Add the channel axis expected by the classifier. -1 infers the number of clips,
# so the cell still works when fewer than 10 held-out records exist.
species_3_test = species_3_test.reshape(-1, 64000, 1)
species_3_test.shape

In [None]:
# `predict_classes` has been removed from Keras; the predicted class is the index of
# the largest softmax output.
np.argmax(classification_model_2.predict(species_3_test), axis=-1)

In [None]:
# Accuracy of the second model on the held-out species-3 clips, whose true class label is 2.
# `predict_classes` has been removed from Keras, so the class is taken as the argmax
# of the softmax output; arguments follow the (y_true, y_pred) convention.
accuracy_score(np.ones(len(species_3_test)) * 2,
               np.argmax(classification_model_2.predict(species_3_test), axis=-1))