<a href="https://colab.research.google.com/github/shaja-asm/cry-detection/blob/main/tf_lite_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.utils import Sequence
import datetime
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard, EarlyStopping
from tensorflow.keras.optimizers import Adam
from scipy.ndimage import zoom


gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [4]:
AUDIO_PATH = '/content/drive/MyDrive/CryCorpusFinal'
CRY_FOLDER = os.path.join(AUDIO_PATH, 'cry')
NOTCRY_FOLDER = os.path.join(AUDIO_PATH, 'notcry')
IMG_SIZE = (64, 64)
BATCH_SIZE = 32
EPOCHS = 25

In [5]:
def load_audio_files(folder):
    files = []
    for filename in os.listdir(folder):
        if filename.endswith('.wav'):
            files.append(os.path.join(folder, filename))
    return files

def compute_spectrogram(y, sr, n_fft=2048, hop_length=512):
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    D_dB = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    return D_dB

In [6]:
def save_spectrogram_to_disk(D_dB, save_path):
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    np.save(save_path, D_dB)

cry_files = load_audio_files(CRY_FOLDER)
notcry_files = load_audio_files(NOTCRY_FOLDER)

data = []
labels = []

for idx, file in enumerate(cry_files):
    y, sr = librosa.load(file, sr=None)
    y = librosa.util.normalize(y)
    D_dB = compute_spectrogram(y, sr)
    save_path = os.path.join(f'{0}/spectrograms'.format(AUDIO_PATH), f'cry_{idx}.npy')
    save_spectrogram_to_disk(D_dB, save_path)
    data.append(save_path)
    labels.append(1)

for idx, file in enumerate(notcry_files):
    y, sr = librosa.load(file, sr=None)
    y = librosa.util.normalize(y)
    D_dB = compute_spectrogram(y, sr)
    save_path = os.path.join(f'{0}/spectrograms'.format(AUDIO_PATH), f'notcry_{idx}.npy')
    save_spectrogram_to_disk(D_dB, save_path)
    data.append(save_path)
    labels.append(0)

data = np.array(data)
labels = np.array(labels)



In [7]:
# Split the datasets
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

class OnTheFlyDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, file_paths, labels, batch_size, img_size, shuffle=True, augment=False):
        self.file_paths = file_paths
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.augment = augment
        self.indices = np.arange(len(self.file_paths))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.file_paths) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_file_paths = [self.file_paths[i] for i in batch_indices]
        batch_labels = [self.labels[i] for i in batch_indices]

        X, y = self.__data_generation(batch_file_paths, batch_labels)
        return X, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, batch_file_paths, batch_labels):
        X = np.empty((len(batch_file_paths), *self.img_size, 1), dtype=np.float32)
        y = np.empty((len(batch_file_paths),), dtype=int)

        for i, file_path in enumerate(batch_file_paths):
            D_dB = np.load(file_path)
            D_dB = D_dB[..., np.newaxis]  # Add channel dimension

            # Calculate zoom factors for resizing
            zoom_factors = [self.img_size[0] / D_dB.shape[0], self.img_size[1] / D_dB.shape[1], 1]
            D_dB = zoom(D_dB, zoom_factors, order=3)  # order=3 for cubic interpolation

            if self.augment:
                if np.random.rand() > 0.5:
                    D_dB = np.flip(D_dB, axis=1)  # Flip left-right
                if np.random.rand() > 0.5:
                    D_dB = np.flip(D_dB, axis=0)  # Flip up-down
                if np.random.rand() > 0.5:
                    D_dB = D_dB + np.random.uniform(-0.2, 0.2, size=D_dB.shape)  # Random brightness

            X[i,] = D_dB
            y[i] = batch_labels[i]

        return X, y

# Augment training data
train_generator = OnTheFlyDataGenerator(X_train, y_train, BATCH_SIZE, IMG_SIZE, shuffle=True, augment=True)
val_generator = OnTheFlyDataGenerator(X_val, y_val, BATCH_SIZE, IMG_SIZE, shuffle=False, augment=False)

# Define the model with batch normalization and l2 regularization
l2_regularizer = tf.keras.regularizers.l2(0.001)

model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE[0], IMG_SIZE[1], 1), kernel_regularizer=l2_regularizer),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2_regularizer),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        Conv2D(128, (3, 3), activation='relu', kernel_regularizer=l2_regularizer),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='relu', kernel_regularizer=l2_regularizer),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

# Compile the model with the specified learning rate
optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Set up callbacks
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch='500,520')

# Save the best model during training
checkpoint_callback = ModelCheckpoint('cry_detection_model.keras', monitor='val_loss', save_best_only=True, mode='min')

# Learning rate scheduler
lr_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# EarlyStopping Callback
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

class_weights = {0: 1., 1: 1.}

# Train the model with the added callbacks
history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=val_generator,
    class_weight=class_weights,
    callbacks=[tensorboard_callback, checkpoint_callback, lr_callback, early_stopping_callback]
)

print("Training complete. Model saved as 'cry_detection_model.keras'")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25


  self._warn_if_super_not_called()


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 537ms/step - accuracy: 0.6131 - loss: 1.3753 - val_accuracy: 0.4755 - val_loss: 3.5631 - learning_rate: 1.0000e-04
Epoch 2/25
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 365ms/step - accuracy: 0.8593 - loss: 0.7391 - val_accuracy: 0.4755 - val_loss: 2.6608 - learning_rate: 1.0000e-04
Epoch 3/25
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 385ms/step - accuracy: 0.8595 - loss: 0.6982 - val_accuracy: 0.5000 - val_loss: 1.3082 - learning_rate: 1.0000e-04
Epoch 4/25
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 387ms/step - accuracy: 0.8853 - loss: 0.6447 - val_accuracy: 0.7819 - val_loss: 0.7788 - learning_rate: 1.0000e-04
Epoch 5/25
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 382ms/step - accuracy: 0.8572 - loss: 0.6867 - val_accuracy: 0.9093 - val_loss: 0.5939 - learning_rate: 1.0000e-04
Epoch 6/25
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [8]:
y_pred = model.predict(val_generator)
y_pred = (y_pred > 0.5).astype(int)
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f'Accuracy: {acc}')
print(f'F1 Score: {f1}')

model.save('cry_detection_model.keras')

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 281ms/step
Accuracy: 0.9240196078431373
F1 Score: 0.9168900804289545


In [9]:
import pathlib
tflite_models_dir = pathlib.Path("tflite_models")
tflite_models_dir.mkdir(exist_ok=True, parents=True)


converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

tflite_model_file = tflite_models_dir/"cry_detection_model.tflite"
tflite_model_file.write_bytes(tflite_model)

converter.optimizations = [tf.lite.Optimize.DEFAULT]

tflite_fp16_model = converter.convert()
tflite_model_fp16_file = tflite_models_dir/"cry_detection_model_quant.tflite"
tflite_model_fp16_file.write_bytes(tflite_fp16_model)

# converter.target_spec.supported_types = [tf.float16]
# tflite_quant_model = converter.convert()
# tflite_model_quant_file = tflite_models_dir/"cry_detection_model_quant_f16.tflite"
# tflite_model_fp16_file.write_bytes(tflite_fp16_model)


Saved artifact at '/tmp/tmp4roknkdo'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 64, 64, 1), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  135461469057824: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461469060816: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461469047792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461469058352: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461469060640: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461469058176: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461464812432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461466644976: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461472355424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461467823568: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135461472355600:

696328

In [12]:
interpreter = tf.lite.Interpreter(model_path="tflite_models/cry_detection_model_quant.tflite")
interpreter.allocate_tensors()

# Get input and output tensors
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Function to preprocess the audio file and convert it to a spectrogram
def preprocess_audio(file_path, img_size):
    y, sr = librosa.load(file_path, sr=None)
    y = librosa.util.normalize(y)
    D = librosa.stft(y, n_fft=2048, hop_length=512)
    D_dB = librosa.amplitude_to_db(np.abs(D), ref=np.max)

    # Calculate zoom factors for resizing
    zoom_factors = [img_size[0] / D_dB.shape[0], img_size[1] / D_dB.shape[1]]
    D_dB_resized = zoom(D_dB, zoom_factors, order=3)  # order=3 for cubic interpolation

    # Add channel dimension to match the original function's output
    D_dB_resized = D_dB_resized[..., np.newaxis]

    return D_dB_resized

def predict(file_path, img_size=(64, 64)):
    # Preprocess the audio file
    input_data = preprocess_audio(file_path, img_size)

    # Add batch dimension
    input_data = np.expand_dims(input_data, axis=0).astype(np.float32)

    # Set the tensor to point to the input data to be inferred
    interpreter.set_tensor(input_details[0]['index'], input_data)

    # Run inference
    interpreter.invoke()

    # Extract output data
    output_data = interpreter.get_tensor(output_details[0]['index'])

    return output_data

# Function to process all .wav files in a folder and calculate prediction accuracy
def process_folder(folder_path, img_size=(64, 64)):
    correct_predictions = 0
    total_files = 0
    results = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.wav'):
            file_path = os.path.join(folder_path, file_name)
            prediction = predict(file_path, img_size)
            prediction_label = 'Cry' if prediction > 0.5 else 'Not Cry'
            results.append((file_name, prediction_label))

            # Determine ground truth from the file name
            ground_truth = 'Cry' if '_cry.wav' in file_name else 'Not Cry'

            if prediction_label == ground_truth:
                correct_predictions += 1

            total_files += 1

    # Calculate prediction accuracy
    accuracy = (correct_predictions / total_files) * 100 if total_files > 0 else 0

    return results, accuracy

# Example usage
folder_path = '{0}/Test'.format(AUDIO_PATH)
predictions, accuracy = process_folder(folder_path)

for file_name, prediction_label in predictions:
    print(f"File: {file_name}, Prediction: {prediction_label}")

print(f"Prediction Accuracy: {accuracy:.2f}%")


File: P15_3484_cry.wav, Prediction: Cry
File: P15_3686_cry.wav, Prediction: Cry
File: P15_3886_cry.wav, Prediction: Cry
File: P15_3476_cry.wav, Prediction: Cry
File: P15_3463_cry.wav, Prediction: Cry
File: P15_3548_cry.wav, Prediction: Cry
File: P15_3743_cry.wav, Prediction: Cry
File: P15_3571_cry.wav, Prediction: Cry
File: P15_3513_cry.wav, Prediction: Cry
File: P15_3062_cry.wav, Prediction: Cry
File: P15_3134_cry.wav, Prediction: Cry
File: P17_975_cry.wav, Prediction: Cry
File: P17_744_cry.wav, Prediction: Not Cry
File: P17_5_notcry.wav, Prediction: Not Cry
File: P17_2_notcry.wav, Prediction: Not Cry
File: P17_20_notcry.wav, Prediction: Cry
File: P17_43_cry.wav, Prediction: Cry
File: P17_41_cry.wav, Prediction: Cry
File: P17_46_cry.wav, Prediction: Cry
File: P26_221_cry.wav, Prediction: Cry
File: P26_11_cry.wav, Prediction: Cry
File: P26_823_cry.wav, Prediction: Cry
File: P26_826_cry.wav, Prediction: Cry
File: P26_820_cry.wav, Prediction: Cry
File: P26_829_cry.wav, Prediction: Cry
Fi

In [13]:
# Load TensorFlow Lite C library
lib = ctypes.cdll.LoadLibrary('{0}/libtensorflowlite_c.so'.format(AUDIO_PATH))

# Define types for the C API functions
lib.TfLiteModelCreate.restype = ctypes.POINTER(ctypes.c_void_p)
lib.TfLiteInterpreterCreate.restype = ctypes.POINTER(ctypes.c_void_p)
lib.TfLiteInterpreterOptionsCreate.restype = ctypes.POINTER(ctypes.c_void_p)
lib.TfLiteInterpreterOptionsSetNumThreads.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_int]
lib.TfLiteInterpreterOptionsDelete.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
lib.TfLiteInterpreterDelete.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
lib.TfLiteModelDelete.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
lib.TfLiteInterpreterGetInputTensor.restype = ctypes.POINTER(ctypes.c_void_p)
lib.TfLiteInterpreterGetOutputTensor.restype = ctypes.POINTER(ctypes.c_void_p)

# Load the TFLite model
model_path = b"tflite_models/cry_detection_model_quant.tflite"
with open(model_path, 'rb') as f:
    model_data = f.read()

model = lib.TfLiteModelCreate(ctypes.c_char_p(model_data), ctypes.c_size_t(len(model_data)))

# Create interpreter options and set number of threads
options = lib.TfLiteInterpreterOptionsCreate()

# Set number of threads (e.g., 2 threads)
lib.TfLiteInterpreterOptionsSetNumThreads(options, 2)

# Create the interpreter with the custom options
interpreter = lib.TfLiteInterpreterCreate(model, options)

# Allocate tensors
status = lib.TfLiteInterpreterAllocateTensors(interpreter)

# Get input and output tensor details
input_tensor = lib.TfLiteInterpreterGetInputTensor(interpreter, 0)
output_tensor = lib.TfLiteInterpreterGetOutputTensor(interpreter, 0)

# Function to preprocess the audio file and convert it to a spectrogram
# def preprocess_audio(file_path, img_size):
#     y, sr = librosa.load(file_path, sr=None)
#     y = librosa.util.normalize(y)
#     D = librosa.stft(y, n_fft=2048, hop_length=512)
#     D_dB = librosa.amplitude_to_db(np.abs(D), ref=np.max)

#     # Rescale the spectrogram to the target img_size
#     # zoom_factors = [img_size[0] / D_dB.shape[0], img_size[1] / D_dB.shape[1]]
#     # D_dB_resized = zoom(D_dB, zoom_factors).astype(np.float32)

#     # Resize using TensorFlow
#     # D_dB_resized = tf.image.resize(D_dB[..., np.newaxis], img_size).numpy()
#     # D_dB_resized = np.squeeze(D_dB_resized, axis=-1).astype(np.float32)

#     # Convert the spectrogram to an image
#     D_dB_img = Image.fromarray(D_dB)

#     # Resize the image using PIL with LANCZOS resampling
#     D_dB_resized = D_dB_img.resize(img_size, Image.Resampling.LANCZOS)

#     # Convert back to NumPy array
#     D_dB_resized = np.array(D_dB_resized).astype(np.float32)

#     return D_dB_resized

def preprocess_audio(file_path, img_size):
    y, sr = librosa.load(file_path, sr=None)
    y = librosa.util.normalize(y)
    D = librosa.stft(y, n_fft=2048, hop_length=512)
    D_dB = librosa.amplitude_to_db(np.abs(D), ref=np.max)

    # Calculate zoom factors for resizing
    zoom_factors = [img_size[0] / D_dB.shape[0], img_size[1] / D_dB.shape[1]]
    D_dB_resized = zoom(D_dB, zoom_factors, order=3)  # order=3 for cubic interpolation

    # Add channel dimension to match the original function's output
    D_dB_resized = D_dB_resized[..., np.newaxis]

    return D_dB_resized

def predict(file_path, img_size=(64, 64)):
    # Preprocess the audio file
    input_data = preprocess_audio(file_path, img_size)

    # Add batch dimension
    input_data = np.expand_dims(input_data, axis=0).astype(np.float32)

    # Set the tensor to point to the input data to be inferred
    lib.TfLiteTensorCopyFromBuffer(input_tensor, input_data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), ctypes.c_size_t(input_data.nbytes))

    # Run inference
    lib.TfLiteInterpreterInvoke(interpreter)

    # Extract output data
    output_size = 1  # Update this based on your model's output tensor size
    output_data = np.empty(output_size, dtype=np.float32)
    lib.TfLiteTensorCopyToBuffer(output_tensor, output_data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), ctypes.c_size_t(output_data.nbytes))

    return output_data

# Function to process all .wav files in a folder and calculate prediction accuracy
def process_folder(folder_path, img_size=(64, 64)):
    correct_predictions = 0
    total_files = 0
    results = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.wav'):
            file_path = os.path.join(folder_path, file_name)
            prediction = predict(file_path, img_size)
            prediction_label = 'Cry' if prediction > 0.5 else 'Not Cry'
            results.append((file_name, prediction_label))

            # Determine ground truth from the file name
            ground_truth = 'Cry' if '_cry.wav' in file_name else 'Not Cry'

            if prediction_label == ground_truth:
                correct_predictions += 1

            total_files += 1

    # Calculate prediction accuracy
    accuracy = (correct_predictions / total_files) * 100 if total_files > 0 else 0

    return results, accuracy

# Example usage
folder_path = '{0}/Test'.format(AUDIO_PATH)
predictions, accuracy = process_folder(folder_path)

for file_name, prediction_label in predictions:
    print(f"File: {file_name}, Prediction: {prediction_label}")

print(f"Prediction Accuracy: {accuracy:.2f}%")

# Clean up
lib.TfLiteInterpreterDelete(interpreter)
lib.TfLiteInterpreterOptionsDelete(options)
lib.TfLiteModelDelete(model)

print("All operations completed successfully.")


File: P15_3484_cry.wav, Prediction: Cry
File: P15_3686_cry.wav, Prediction: Cry
File: P15_3886_cry.wav, Prediction: Cry
File: P15_3476_cry.wav, Prediction: Cry
File: P15_3463_cry.wav, Prediction: Cry
File: P15_3548_cry.wav, Prediction: Cry
File: P15_3743_cry.wav, Prediction: Cry
File: P15_3571_cry.wav, Prediction: Cry
File: P15_3513_cry.wav, Prediction: Cry
File: P15_3062_cry.wav, Prediction: Cry
File: P15_3134_cry.wav, Prediction: Cry
File: P17_975_cry.wav, Prediction: Cry
File: P17_744_cry.wav, Prediction: Not Cry
File: P17_5_notcry.wav, Prediction: Not Cry
File: P17_2_notcry.wav, Prediction: Not Cry
File: P17_20_notcry.wav, Prediction: Cry
File: P17_43_cry.wav, Prediction: Cry
File: P17_41_cry.wav, Prediction: Cry
File: P17_46_cry.wav, Prediction: Cry
File: P26_221_cry.wav, Prediction: Cry
File: P26_11_cry.wav, Prediction: Cry
File: P26_823_cry.wav, Prediction: Cry
File: P26_826_cry.wav, Prediction: Cry
File: P26_820_cry.wav, Prediction: Cry
File: P26_829_cry.wav, Prediction: Cry
Fi