<a href="https://colab.research.google.com/github/shaja-asm/cry-detection/blob/main/tf_lite_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.utils import Sequence
import datetime
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.optimizers import Adam


# gpus = tf.config.experimental.list_physical_devices('GPU')
# if gpus:
#     try:
#         for gpu in gpus:
#             tf.config.experimental.set_memory_growth(gpu, True)
#     except RuntimeError as e:
#         print(e)
# print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [None]:
AUDIO_PATH = 'CryCorpusFinal'
CRY_FOLDER = os.path.join(AUDIO_PATH, 'cry')
NOTCRY_FOLDER = os.path.join(AUDIO_PATH, 'notcry')
IMG_SIZE = (64, 64)
BATCH_SIZE = 32
EPOCHS = 25

In [None]:
def load_audio_files(folder):
    files = []
    for filename in os.listdir(folder):
        if filename.endswith('.wav'):
            files.append(os.path.join(folder, filename))
    return files

def compute_spectrogram(y, sr, n_fft=2048, hop_length=512):
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    D_dB = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    return D_dB

In [None]:
def save_spectrogram_to_disk(D_dB, save_path):
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    np.save(save_path, D_dB)

cry_files = load_audio_files(CRY_FOLDER)
notcry_files = load_audio_files(NOTCRY_FOLDER)

data = []
labels = []

for idx, file in enumerate(cry_files):
    y, sr = librosa.load(file, sr=None)
    y = librosa.util.normalize(y)
    D_dB = compute_spectrogram(y, sr)
    save_path = os.path.join(f'{0}/spectrograms'.format(AUDIO_PATH), f'cry_{idx}.npy')
    save_spectrogram_to_disk(D_dB, save_path)
    data.append(save_path)
    labels.append(1)

for idx, file in enumerate(notcry_files):
    y, sr = librosa.load(file, sr=None)
    y = librosa.util.normalize(y)
    D_dB = compute_spectrogram(y, sr)
    save_path = os.path.join(f'{0}/spectrograms'.format(AUDIO_PATH), f'notcry_{idx}.npy')
    save_spectrogram_to_disk(D_dB, save_path)
    data.append(save_path)
    labels.append(0)

data = np.array(data)
labels = np.array(labels)



In [None]:
# Split the datasets
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

class OnTheFlyDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, file_paths, labels, batch_size, img_size, shuffle=True, augment=False):
        self.file_paths = file_paths
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.augment = augment
        self.indices = np.arange(len(self.file_paths))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.file_paths) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_file_paths = [self.file_paths[i] for i in batch_indices]
        batch_labels = [self.labels[i] for i in batch_indices]

        X, y = self.__data_generation(batch_file_paths, batch_labels)
        return X, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, batch_file_paths, batch_labels):
        X = np.empty((len(batch_file_paths), *self.img_size, 1), dtype=np.float32)
        y = np.empty((len(batch_file_paths),), dtype=int)

        for i, file_path in enumerate(batch_file_paths):
            D_dB = np.load(file_path)
            D_dB = D_dB[..., np.newaxis]  # Add channel dimension
            D_dB = tf.image.resize(D_dB, self.img_size).numpy()
            if self.augment:
                D_dB = tf.image.random_flip_left_right(D_dB)
                D_dB = tf.image.random_flip_up_down(D_dB)
                D_dB = tf.image.random_brightness(D_dB, max_delta=0.2)
            X[i,] = D_dB
            y[i] = batch_labels[i]

        return X, y

# Augment training data
train_generator = OnTheFlyDataGenerator(X_train, y_train, BATCH_SIZE, IMG_SIZE, shuffle=True, augment=True)
val_generator = OnTheFlyDataGenerator(X_val, y_val, BATCH_SIZE, IMG_SIZE, shuffle=False, augment=False)

# Define the model with batch normalization and l2 regularization
l2_regularizer = tf.keras.regularizers.l2(0.001)

model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE[0], IMG_SIZE[1], 1), kernel_regularizer=l2_regularizer),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2_regularizer),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        Conv2D(128, (3, 3), activation='relu', kernel_regularizer=l2_regularizer),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='relu', kernel_regularizer=l2_regularizer),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

# Compile the model with the specified learning rate
optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Set up callbacks
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch='500,520')

# Save the best model during training
checkpoint_callback = ModelCheckpoint('cry_detection_model.keras', monitor='val_loss', save_best_only=True, mode='min')

# Learning rate scheduler
lr_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# Train the model with the added callbacks
history = model.fit(
        train_generator,
        epochs=EPOCHS,
        validation_data=val_generator,
        callbacks=[tensorboard_callback, checkpoint_callback, lr_callback]
    )

print("Training complete. Model saved as 'cry_detection_model.keras'")


In [22]:
y_pred = model.predict(val_generator)
y_pred = (y_pred > 0.5).astype(int)
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f'Accuracy: {acc}')
print(f'F1 Score: {f1}')

model.save('cry_detection_model.keras')

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step
Accuracy: 0.9215686274509803
F1 Score: 0.9111111111111111


In [24]:
import pathlib
tflite_models_dir = pathlib.Path("tflite_models")
tflite_models_dir.mkdir(exist_ok=True, parents=True)


converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

tflite_model_file = tflite_models_dir/"cry_detection_model.tflite"
tflite_model_file.write_bytes(tflite_model)

converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]

tflite_fp16_model = converter.convert()
tflite_model_fp16_file = tflite_models_dir/"cry_detection_model_quant_f16.tflite"
tflite_model_fp16_file.write_bytes(tflite_fp16_model)

INFO:tensorflow:Assets written to: /tmp/tmp460j4yag/assets


INFO:tensorflow:Assets written to: /tmp/tmp460j4yag/assets


Saved artifact at '/tmp/tmp460j4yag'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 64, 64, 1), dtype=tf.float32, name='keras_tensor_17')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  140275275206848: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275211072: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275207728: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275213360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275209840: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275211600: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275215472: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275217760: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275207024: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275218288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1402752752124

W0000 00:00:1723022124.594840   79590 tf_tfl_flatbuffer_helpers.cc:392] Ignored output_format.
W0000 00:00:1723022124.594905   79590 tf_tfl_flatbuffer_helpers.cc:395] Ignored drop_control_dependency.
2024-08-07 14:45:24.595165: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmp460j4yag
2024-08-07 14:45:24.596989: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2024-08-07 14:45:24.597017: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmp460j4yag
2024-08-07 14:45:24.614182: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2024-08-07 14:45:24.714847: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmp460j4yag
2024-08-07 14:45:24.742380: I tensorflow/cc/saved_model/loader.cc:462] SavedModel load for tags { serve }; Status: success: OK. Took 147219 microseconds.


INFO:tensorflow:Assets written to: /tmp/tmpkbv26fua/assets


INFO:tensorflow:Assets written to: /tmp/tmpkbv26fua/assets


Saved artifact at '/tmp/tmpkbv26fua'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 64, 64, 1), dtype=tf.float32, name='keras_tensor_17')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  140275275206848: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275211072: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275207728: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275213360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275209840: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275211600: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275215472: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275217760: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275207024: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140275275218288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1402752752124

W0000 00:00:1723022126.716757   79590 tf_tfl_flatbuffer_helpers.cc:392] Ignored output_format.
W0000 00:00:1723022126.716838   79590 tf_tfl_flatbuffer_helpers.cc:395] Ignored drop_control_dependency.
2024-08-07 14:45:26.717112: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpkbv26fua
2024-08-07 14:45:26.721117: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2024-08-07 14:45:26.721158: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpkbv26fua
2024-08-07 14:45:26.745270: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2024-08-07 14:45:26.886137: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpkbv26fua
2024-08-07 14:45:26.927222: I tensorflow/cc/saved_model/loader.cc:462] SavedModel load for tags { serve }; Status: success: OK. Took 210114 microseconds.


1374136

In [26]:
# Load the TFLite model and allocate tensors
interpreter = tf.lite.Interpreter(model_path="tflite_models/cry_detection_model.tflite")
interpreter.allocate_tensors()

# Get input and output tensors
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Function to preprocess the audio file and convert it to a spectrogram
def preprocess_audio(file_path, img_size):
    y, sr = librosa.load(file_path, sr=None)
    y = librosa.util.normalize(y)
    D = librosa.stft(y, n_fft=2048, hop_length=512)
    D_dB = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    D_dB_resized = tf.image.resize(D_dB[..., np.newaxis], img_size).numpy()
    return D_dB_resized

def predict(file_path, img_size=(32, 32)):
    # Preprocess the audio file
    input_data = preprocess_audio(file_path, img_size)
    
    # Add batch dimension
    input_data = np.expand_dims(input_data, axis=0).astype(np.float32)
    
    # Set the tensor to point to the input data to be inferred
    interpreter.set_tensor(input_details[0]['index'], input_data)
    
    # Run inference
    interpreter.invoke()
    
    # Extract output data
    output_data = interpreter.get_tensor(output_details[0]['index'])
    
    return output_data

# Function to process all .wav files in a folder and calculate prediction accuracy
def process_folder(folder_path, img_size=(64, 64)):
    correct_predictions = 0
    total_files = 0
    results = []
    
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.wav'):
            file_path = os.path.join(folder_path, file_name)
            prediction = predict(file_path, img_size)
            prediction_label = 'Cry' if prediction > 0.5 else 'Not Cry'
            results.append((file_name, prediction_label))
            
            # Determine ground truth from the file name
            ground_truth = 'Cry' if '_cry.wav' in file_name else 'Not Cry'
            
            if prediction_label == ground_truth:
                correct_predictions += 1
            
            total_files += 1

    # Calculate prediction accuracy
    accuracy = (correct_predictions / total_files) * 100 if total_files > 0 else 0
    
    return results, accuracy

# Example usage
folder_path = 'CryCorpusFinal/Test'  # Replace with the path to your folder containing .wav files
predictions, accuracy = process_folder(folder_path)

for file_name, prediction_label in predictions:
    print(f"File: {file_name}, Prediction: {prediction_label}")

print(f"Prediction Accuracy: {accuracy:.2f}%")


File: P19_195_cry.wav, Prediction: Not Cry
File: P19_607_notcry.wav, Prediction: Not Cry
File: P19_630_cry.wav, Prediction: Not Cry
File: P19_365_cry.wav, Prediction: Cry
File: P26_824_cry.wav, Prediction: Cry
File: P36_411_notcry.wav, Prediction: Not Cry
File: P19_516_cry.wav, Prediction: Cry
File: P19_45_notcry.wav, Prediction: Not Cry
File: P19_476_cry.wav, Prediction: Cry
File: P19_801_cry.wav, Prediction: Not Cry
File: P19_426_cry.wav, Prediction: Cry
File: P19_291_cry.wav, Prediction: Cry
File: P15_3463_cry.wav, Prediction: Cry
File: P15_3571_cry.wav, Prediction: Cry
File: P19_215_cry.wav, Prediction: Cry
File: P19_849_cry.wav, Prediction: Not Cry
File: P19_465_cry.wav, Prediction: Cry
File: P17_46_cry.wav, Prediction: Cry
File: P19_915_cry.wav, Prediction: Not Cry
File: P19_24_cry.wav, Prediction: Cry
File: P36_54_notcry.wav, Prediction: Not Cry
File: P15_3513_cry.wav, Prediction: Cry
File: P17_744_cry.wav, Prediction: Cry
File: P19_546_cry.wav, Prediction: Cry
File: P19_196_cry