In [None]:
# !pip install kaggle==1.5.12

In [None]:
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

In [None]:
# !kaggle datasets download -d ronanhansel/librisevoc-extracted-spectrogram -p /content/LibriSeVoc --unzip

# DATA PROCESSING

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import Input, Model
from sklearn.model_selection import train_test_split
import csv
import torch
from tqdm import tqdm
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [None]:
# --- Run configuration ------------------------------------------------------
RUNNER = "Duc"          # selects which dataset location the loading cell uses
SEGMENT_LENGTH = 1      # length of one segment (presumably seconds — TODO confirm)
NUM_SEGMENT = 30        # number of segments fed to the model per sample
SR = 24_000             # sample rate (presumably Hz — TODO confirm)

# --- Training hyper-parameters ----------------------------------------------
BATCH_SIZE = 32
LR = 1e-4
EPOCHS = 30
SAVE_INTERVAL = 100

# --- Dataset location -------------------------------------------------------
# Default path prefix (without the '.pt' suffix); the loading cell overrides it
# for the known RUNNER values.
SAVE_PATH = './output/train_data_checkpoint/train_data_partial'
TRAINING_FILE = 'train_data_partial' # or DEMO


In [None]:
# Pick the torch backend in priority order: Apple MPS, then CUDA, then CPU.
# CUDA availability is only queried when MPS is not available.
if torch.backends.mps.is_available():
    _backend_name = 'mps'
elif torch.cuda.is_available():
    _backend_name = 'cuda'
else:
    _backend_name = 'cpu'
device = torch.device(_backend_name)

print(f"Using device: {device}")

Using device: cuda


In [None]:
# Resolve the dataset path prefix for the current runner. Any other RUNNER
# value keeps the SAVE_PATH already defined in the configuration cell.
if RUNNER == "Nam":
  SAVE_PATH = "/content/LibriSeVoc/" + TRAINING_FILE
elif RUNNER == "Duc":
  # google.colab is only importable inside Colab, so import it lazily here.
  from google.colab import drive
  drive.mount('/content/drive')
  SAVE_PATH = "/content/drive/MyDrive/Datasets/" + TRAINING_FILE

# Load train and validation data
# - map_location='cpu': the tensors are converted to NumPy right below, so
#   load them on the CPU regardless of the device they were saved from.
# - weights_only=True: restrict unpickling to tensors and plain containers
#   instead of executing arbitrary pickled code (also silences the torch.load
#   FutureWarning). If the file holds other object types this raises an
#   explicit error rather than loading them silently.
data = torch.load(SAVE_PATH + '.pt', map_location='cpu', weights_only=True)
train_segments = data['segments']
train_labels = data['labels']

# np.stack fails fast if the per-sample arrays do not all share one shape.
train_segments = np.stack([segment.cpu().numpy() for segment in train_segments])
# Move axis 2 to the last position; the recorded run printed
# (21120, 30, 128, 47, 1) for the result.
train_segments = np.transpose(train_segments, (0, 1, 3, 4, 2))
train_labels = train_labels.cpu().numpy().astype(int)
print(f"Train segments shape: {train_segments.shape}, Train labels shape: {train_labels.shape}")

Mounted at /content/drive


  data = torch.load(SAVE_PATH + '.pt')


Train segments shape: (21120, 30, 128, 47, 1), Train labels shape: (21120,)


In [None]:
# Hold out 20% of the loaded samples for validation. The split is stratified
# on the labels so both subsets keep the same class ratio, and the fixed
# random_state makes it reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_segments,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

print(f"Train segments shape: {X_train.shape}, Train labels shape: {y_train.shape}")
print(f"Validation segments shape: {X_val.shape}, Validation labels shape: {y_val.shape}")

Train segments shape: (16896, 30, 128, 47, 1), Train labels shape: (16896,)
Validation segments shape: (4224, 30, 128, 47, 1), Validation labels shape: (4224,)


In [None]:
# Drop the references to the full dataset and the raw loaded dict so their
# memory can be reclaimed; only the train/validation splits are needed from
# here on. Re-running this cell without re-running the loading cell raises
# NameError because the names are already gone.
import gc

del train_segments, train_labels, data

gc.collect()

30

# MODEL

In [None]:
initializer = tf.keras.initializers.GlorotNormal()

def create_cnn_model(input_shape):
    """Build the per-segment convolutional feature extractor.

    The network is four identical stages of
    Conv2D(5x5, padding='same') -> BatchNormalization -> LeakyReLU(0.01)
    -> MaxPooling2D(2x2), using 32, 64, 128 and 256 filters in that order.

    Args:
        input_shape: Shape of one segment without the batch axis.

    Returns:
        An uncompiled `Sequential` model whose output is the feature map of
        the last pooling layer.
    """
    # Declare the input with an explicit Input object rather than passing
    # `input_shape=` to the first Conv2D: Keras warns about that argument on
    # layers inside a Sequential model.
    stack = [Input(shape=input_shape)]
    for filters in (32, 64, 128, 256):
        stack += [
            layers.Conv2D(filters, (5, 5), padding='same', kernel_initializer=initializer),
            layers.BatchNormalization(),
            # NOTE(review): newer Keras releases call this argument
            # `negative_slope`; confirm the target version before renaming.
            layers.LeakyReLU(alpha=0.01),
            layers.MaxPooling2D((2, 2)),
        ]
    return models.Sequential(stack)

# Shape of one spectrogram segment: (num_freq_bins, time_steps, channels).
# It has to match the trailing three axes of the training array.
input_shape = (128, 47, 1)
cnn_model = create_cnn_model(input_shape)

# Input for multiple segments: (num_segments, num_freq_bins, time_steps, channels)
num_segments = NUM_SEGMENT  # segments per sample
segment_input = Input(shape=(num_segments, *input_shape))

# Apply the same CNN to every segment.
cnn_features = layers.TimeDistributed(cnn_model)(segment_input)  # Shape: (batch, num_segments, height, width, channels)
flatten_layer = layers.TimeDistributed(layers.Flatten())  # Flatten spatial dimensions per segment
cnn_features_flattened = flatten_layer(cnn_features)  # Shape: (batch, num_segments, 8*2*256)

# Stacked bidirectional LSTMs over the segment axis (batch, timesteps, features).
# Only the last one collapses the sequence into a single vector.
lstm_layer = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(cnn_features_flattened)
lstm_layer = layers.Dropout(0.5)(lstm_layer)
lstm_layer = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(lstm_layer)
lstm_layer = layers.Dropout(0.5)(lstm_layer)
lstm_layer = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(lstm_layer)

# Classification head: Dense(512) -> Dropout -> Dense(64) -> Dropout -> sigmoid.
x = layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.05))(lstm_layer)
x = layers.Dropout(0.5)(x)
# The 64-unit layer must consume `x`, not `lstm_layer`; otherwise the 512-unit
# layer and its dropout are not on the path to the output at all.
x = layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.05))(x)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation='sigmoid')(x)

# Final model
model = Model(inputs=segment_input, outputs=output)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Print the layer-by-layer architecture of the assembled model.
model.summary()

In [None]:
# Take the starting rate from the LR constant in the configuration cell so the
# value is defined in exactly one place.
initial_learning_rate = LR
decay_steps = 10000  # Number of optimizer steps between decays
decay_rate = 0.9  # Multiplicative factor applied at each decay

# NOTE(review): the recorded run shows 132 steps per epoch, so EPOCHS = 30
# gives 3,960 steps in total — fewer than decay_steps. With staircase=True the
# rate would therefore never drop during such a run; confirm that an
# effectively constant learning rate is intended.
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=decay_steps,
    decay_rate=decay_rate,
    staircase=True  # Decay in discrete jumps every decay_steps steps
)

# clipvalue bounds every gradient element to [-0.5, 0.5] before the update.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipvalue=0.5)
# optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)  # SGD with momentum

In [None]:
# Binary cross-entropy matches the single sigmoid output unit of the model;
# accuracy is reported as the training/validation metric.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# TRAINING

In [None]:
# Where the training artefacts are written.
MODEL_PATH = './output/best_model.keras'  # best checkpoint
HISTORY_PATH = f'./output/train_history_{TRAINING_FILE}.csv'  # per-epoch metrics
LOG_PATH = './output/logs'  # TensorBoard event files

# Create the output tree up front (this also creates './output') so that no
# file-writing callback depends on another callback having created it first.
os.makedirs(LOG_PATH, exist_ok=True)

In [None]:
import json
from keras.callbacks import Callback, TensorBoard, CSVLogger

# Checkpoint: keep only the best model seen so far, judged by lowest val_loss.
checkpoint_callback = ModelCheckpoint(
    filepath=MODEL_PATH,  # fixed path with no epoch placeholder, so each save overwrites the previous one
    monitor='val_loss',
    save_best_only=True,  # save only when val_loss improves, not on every epoch
    mode='min',
    verbose=1
)

# Stop after 7 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor="val_loss", patience=7, restore_best_weights=True)
# Append per-epoch metrics to HISTORY_PATH (append=True keeps rows from earlier runs).
save_history_callback = CSVLogger(HISTORY_PATH, separator=',', append=True)
# TensorBoard logs under LOG_PATH, with weight histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_PATH, histogram_freq=1)

# Callbacks passed to model.fit, in this order.
callbacks = [checkpoint_callback, early_stopping, save_history_callback, tensorboard_callback]

In [None]:
# Sanity check: display the shape of the training array before fitting.
X_train.shape

(16896, 30, 128, 47, 1)

In [None]:
# Epoch to start counting from; 0 means a run from the beginning.
initial_epoch = 0
# Placeholder that this cell itself does not use.
old_history = None

# NOTE(review): batch_size is the literal 128 here, not the BATCH_SIZE
# constant (32) from the configuration cell — confirm which one is intended.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    initial_epoch=initial_epoch,
    batch_size=128,
    shuffle=True,
    callbacks=callbacks,
)

Epoch 1/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 473ms/step - accuracy: 0.7198 - loss: 5.3311
Epoch 1: val_loss improved from inf to 4.08593, saving model to ./output/best_model.keras
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 564ms/step - accuracy: 0.7204 - loss: 5.3279 - val_accuracy: 0.9162 - val_loss: 4.0859
Epoch 2/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step - accuracy: 0.9334 - loss: 3.7793
Epoch 2: val_loss improved from 4.08593 to 2.93279, saving model to ./output/best_model.keras
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 504ms/step - accuracy: 0.9336 - loss: 3.7771 - val_accuracy: 0.9818 - val_loss: 2.9328
Epoch 3/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step - accuracy: 0.9750 - loss: 2.7629
Epoch 3: val_loss improved from 2.93279 to 2.15446, saving model to ./output/best_model.keras
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[