<a href="https://colab.research.google.com/github/shaja-asm/cry-detection/blob/main/cry_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import librosa
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from sklearn.decomposition import PCA

# Query the GPU list once and reuse it for both the report and the
# memory-growth configuration.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if gpus:
    try:
        # Ask TensorFlow to grow GPU memory on demand for every visible GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Report the failure and continue with TensorFlow's default allocation.
        print(e)


Num GPUs Available:  1


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folders on the mounted Drive holding the two classes of recordings.
CRY_PATH = '/content/drive/MyDrive/CryCorpusFinal/cry'
NOCRY_PATH = '/content/drive/MyDrive/CryCorpusFinal/notcry'
# Sampling rate (Hz) passed to librosa.load as `sr` throughout the notebook.
Fs = 22050

def load_audio_files(path):
    """Return the paths of the ``.wav`` files directly inside ``path``.

    The result is sorted so that slices such as ``files[:50]`` select the
    same files on every run; ``os.listdir`` itself returns entries in
    arbitrary order.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        Sorted list of ``os.path.join(path, name)`` strings for every entry
        whose name ends with ``'.wav'``.
    """
    return sorted(
        os.path.join(path, file_name)
        for file_name in os.listdir(path)
        if file_name.endswith('.wav')
    )

# Collect the .wav paths of each class.
cry_files = load_audio_files(CRY_PATH)
nocry_files = load_audio_files(NOCRY_PATH)

# Report how many recordings were found per class.
print(f'Loaded {len(cry_files)} cry files and {len(nocry_files)} nocry files.')


Loaded 972 cry files and 1066 nocry files.


In [None]:
def segment_audio(y, segment_length=0.093, fs=22050):
    """Cut a signal into consecutive, non-overlapping fixed-length segments.

    Args:
        y: Sequence of audio samples.
        segment_length: Segment duration in seconds.
        fs: Sampling rate in Hz.

    Returns:
        ``np.array`` built from the list of segments, each holding
        ``int(segment_length * fs)`` samples. Samples left over after the
        last full segment are dropped; when no full segment fits the result
        is an empty array.
    """
    window = int(segment_length * fs)
    last_start = len(y) - window
    chunks = [y[offset:offset + window] for offset in range(0, last_start + 1, window)]
    return np.array(chunks)

def extract_mfccs(file_paths, fs=22050, n_mfcc=38):
    """Stack the per-frame MFCC vectors of every segment of every file.

    Each file is loaded at ``fs``, cut with ``segment_audio`` and each
    segment's MFCC matrix is transposed so that rows are frames.

    Args:
        file_paths: Iterable of audio file paths.
        fs: Sampling rate used for loading and for the MFCC computation.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        The ``np.vstack`` of all transposed MFCC matrices (one row per
        frame, ``n_mfcc`` columns).

    Raises:
        ValueError: From ``np.vstack`` when no segment was produced.
    """
    frames_per_segment = []
    for path in file_paths:
        audio, _sr = librosa.load(path, sr=fs)
        frames_per_segment.extend(
            librosa.feature.mfcc(y=chunk, sr=fs, n_mfcc=n_mfcc).T
            for chunk in segment_audio(audio, fs=fs)
        )
    return np.vstack(frames_per_segment)

# Take a sample of cry and nocry files to fit the PCA
sample_cry_files = cry_files[:50]
sample_nocry_files = nocry_files[:50]

# Extract MFCCs from the sample files, at the same rate used everywhere else
mfcc_sample = extract_mfccs(sample_cry_files + sample_nocry_files, fs=Fs)

# Fit PCA on the extracted MFCCs.
# random_state is fixed because, depending on the scikit-learn version and the
# data shape, the default svd_solver='auto' may pick a randomized solver; the
# projection feeds every downstream feature, so it must be reproducible.
pca = PCA(n_components=8, random_state=42)
pca.fit(mfcc_sample)

print("PCA model fitted on sample MFCC data")

# Smoke test: segment the first cry recording at the notebook's sampling rate
test_file = cry_files[0]
y, _ = librosa.load(test_file, sr=Fs)
segments = segment_audio(y, fs=Fs)
print(f'Segments: {segments.shape[0]}')


PCA model fitted on sample MFCC data
Segments: 53


In [None]:
def extract_features(segment, fs=22050):
    """Compute the feature vector of one audio segment.

    The returned vector is the concatenation, in this order, of:
      * the segment's 38 MFCCs per frame, projected through the module-level
        ``pca`` (when it is not ``None``) and flattened;
      * short-time energy, zero-crossing rate, median pitch, pitch run-length
        count, first-formant estimate, low/high band ratio (dB) and spectral
        roll-off.

    Args:
        segment: 1-D array of audio samples.
        fs: Sampling rate of ``segment`` in Hz.

    Returns:
        1-D ``numpy`` array of features.
    """
    # MFCCs: (n_mfcc, n_frames) -> PCA over the coefficient axis -> flatten.
    mfcc = librosa.feature.mfcc(y=segment, sr=fs, n_mfcc=38)
    if pca is not None:
        mfcc = pca.transform(mfcc.T).T  # Transform the MFCCs using PCA
    mfcc_flat = mfcc.flatten()

    # Short-time energy (STE)
    ste = np.sum(segment ** 2)

    # Zero-crossing rate (ZCR), averaged over frames
    zcr = np.mean(librosa.feature.zero_crossing_rate(segment))

    # Median of the non-zero pitch estimates within the segment
    pitches, _ = librosa.core.piptrack(y=segment, sr=fs)
    voiced = pitches > 0
    pitch_median = np.median(pitches[voiced]) if np.any(voiced) else 0

    # Count of voiced -> unvoiced transitions along the last axis of the
    # piptrack output, summed over all of its rows.
    # NOTE(review): this is not a per-frame run length of voicing; it is kept
    # unchanged so the feature stays comparable with existing results.
    run_length = np.sum(np.diff(voiced.astype(int)) == -1)

    # Spectral rolloff point, averaged over frames
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=segment, sr=fs))

    # First formant (approximated using linear predictive coding):
    # lowest angle among the LPC polynomial roots with non-negative imaginary
    # part, converted to Hz.
    lpc = librosa.lpc(segment, order=2)
    roots = np.roots(lpc)
    roots = roots[np.imag(roots) >= 0]
    angles = np.arctan2(np.imag(roots), np.real(roots))
    frequencies = np.sort(angles * (fs / (2 * np.pi)))
    first_formant = frequencies[0] if len(frequencies) > 0 else 0

    # Ratio (in dB) between the summed STFT magnitudes in the bands
    # [0, 3.5] kHz and (3.5, 22.5] kHz. The bin frequencies are computed once.
    magnitude = np.abs(librosa.stft(segment))
    bin_freqs = librosa.fft_frequencies(sr=fs)
    low_band = (0 <= bin_freqs) & (bin_freqs <= 3500)
    high_band = (3500 < bin_freqs) & (bin_freqs <= 22500)
    energy_low = np.sum(magnitude[low_band])
    energy_high = np.sum(magnitude[high_band])
    # Guard both bands: an empty low band would otherwise yield log10(0) = -inf,
    # which a NaN-only filter downstream does not remove.
    if energy_low > 0 and energy_high > 0:
        energy_ratio = 10 * np.log10(energy_low / energy_high)
    else:
        energy_ratio = 0

    return np.hstack([mfcc_flat, ste, zcr, pitch_median, run_length, first_formant, energy_ratio, spectral_rolloff])


# Smoke test: extract features from the first segment and show the vector length.
features = extract_features(segments[0])
print(f'Extracted features shape: {features.shape}')

Extracted features shape: (47,)


In [None]:
def process_files(file_paths, label):
    """Build labelled feature rows for every segment of the given files.

    Each file is loaded at the module-level ``Fs``, cut with
    ``segment_audio`` and every segment is turned into one row made of its
    ``extract_features`` vector followed by ``label``.

    Args:
        file_paths: Iterable of audio file paths.
        label: Value appended as the last column of every row.

    Returns:
        ``np.array`` of the collected rows (empty when there are none).
    """
    rows = []
    for path in file_paths:
        audio, _sr = librosa.load(path, sr=Fs)
        rows.extend(
            np.hstack([extract_features(chunk), label])
            for chunk in segment_audio(audio)
        )
    return np.array(rows)

# Build the labelled rows: label 1 for cry recordings, 0 for non-cry ones.
cry_data = process_files(cry_files, 1)
nocry_data = process_files(nocry_files, 0)
# Stack both classes; the last column of every row is the label.
data = np.vstack([cry_data, nocry_data])
print(f'Total dataset size: {data.shape}')

Total dataset size: (108014, 48)


In [None]:
# Features are every column but the last; the last column is the label.
X = data[:, :-1]
y = data[:, -1]

# NOTE(review): rows are individual segments, so segments of one recording can
# end up in both the train and the test split — consider a file-level split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Train set: {X_train.shape}, Test set: {X_test.shape}')

# Keep only rows whose features are all finite. np.isfinite rejects NaN and
# also +/-inf, which a NaN-only mask would let through to the scaler.
train_mask = np.isfinite(X_train).all(axis=1)
test_mask = np.isfinite(X_test).all(axis=1)

# Apply mask to both X and y
X_train = X_train[train_mask]
y_train = y_train[train_mask]

X_test = X_test[test_mask]
y_test = y_test[test_mask]

print(f'Train set after NaN removal: {X_train.shape}, Test set after NaN removal: {X_test.shape}')

# Normalize: fit the scaler on the training rows only, then reuse it for test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape the data for the model input: (samples, features) -> (samples, features, 1)
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]



Train set: (86411, 47), Test set: (21603, 47)
Train set after NaN removal: (86411, 47), Test set after NaN removal: (21603, 47)


In [None]:
def build_transformer_model(input_shape, learning_rate=0.0001, embed_dim=64,
                            num_heads=4, key_dim=64, ff_dim=64):
    """Build and compile a single-block Transformer encoder binary classifier.

    The previous version applied ``LayerNormalization`` directly to tensors
    whose last axis had size 1. Normalizing a single value always gives 0, so
    the layer output was just its learned offset and the prediction did not
    depend on the input. Here every input step is first projected to
    ``embed_dim`` channels, and both sub-layers use residual connections
    before normalization.

    Args:
        input_shape: ``(sequence_length, channels)`` of one sample.
        learning_rate: Adam learning rate.
        embed_dim: Channel width used inside the encoder block.
        num_heads: Number of attention heads.
        key_dim: Per-head size of the attention query/key projections.
        ff_dim: Hidden width of the position-wise feed-forward network.

    Returns:
        A compiled ``Model`` mapping ``(batch, *input_shape)`` to a
        ``(batch, 1)`` sigmoid probability.
    """
    inputs = Input(shape=input_shape)

    # Lift each step to embed_dim channels so normalization has something to
    # normalize over.
    # NOTE(review): no positional encoding is added, so the block cannot tell
    # sequence positions apart — consider adding one.
    x = Dense(embed_dim)(inputs)

    # Self-attention sub-layer with residual connection
    attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(x, x)
    x = LayerNormalization(epsilon=1e-6)(tf.keras.layers.Add()([x, attention_output]))

    # Position-wise feed-forward sub-layer with residual connection
    ff_output = Dense(ff_dim, activation='relu')(x)
    ff_output = Dense(embed_dim)(ff_output)
    x = LayerNormalization(epsilon=1e-6)(tf.keras.layers.Add()([x, ff_output]))

    # Pooling over the sequence axis
    pooled = GlobalAveragePooling1D()(x)

    # Output Layer
    outputs = Dense(1, activation='sigmoid')(pooled)

    optimizer = Adam(learning_rate=learning_rate, clipnorm=1.0)
    model = Model(inputs, outputs)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

# One sample is a sequence of X_train.shape[1] feature values with a single channel.
input_shape = (X_train.shape[1], 1)
model = build_transformer_model(input_shape, learning_rate=0.0001)
# Print the layer-by-layer summary of the compiled model.
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 47, 1)]              0         []                            
                                                                                                  
 multi_head_attention_1 (Mu  (None, 47, 1)                1793      ['input_2[0][0]',             
 ltiHeadAttention)                                                   'input_2[0][0]']             
                                                                                                  
 layer_normalization_2 (Lay  (None, 47, 1)                2         ['multi_head_attention_1[0][0]
 erNormalization)                                                   ']                            
                                                                                            

In [None]:
# X_train / X_test already received their trailing channel axis when the data
# was reshaped for the model, so they are passed as-is; adding another axis
# here would hand the model a rank-4 input.
# NOTE(review): the test split doubles as validation data, so the reported
# validation metrics are not independent of the final test score.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32
)


In [None]:
# X_test already has the (samples, features, 1) layout the model expects, so
# no extra axis is added here.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')


In [None]:
# Save the trained model to 'cry_detection_model.h5' (relative to the working directory).
model.save('cry_detection_model.h5')

In [None]:
# Load the trained model from the file written by the save cell.
model = tf.keras.models.load_model('cry_detection_model.h5')

# NOTE(review): relative path, unlike the absolute Drive paths used for the
# training data — confirm it resolves from the notebook's working directory.
TEST_PATH = 'CryCorpusFinal/Test'
# Sampling rate (Hz); same value as the one defined for training.
Fs = 22050

def load_audio_files(path):
    """Return the sorted paths of the ``.wav`` files directly inside ``path``.

    This redefines the helper of the same name from the data-loading cell.
    The result is sorted because ``os.listdir`` returns entries in arbitrary
    order, which would make the processing and printing order vary between
    runs.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        Sorted list of ``os.path.join(path, name)`` strings for every entry
        whose name ends with ``'.wav'``.
    """
    wav_names = [name for name in os.listdir(path) if name.endswith('.wav')]
    return [os.path.join(path, name) for name in sorted(wav_names)]

# Collect the .wav files to run inference on.
test_files = load_audio_files(TEST_PATH)
print(f'Loaded {len(test_files)} test files.')

def segment_audio(y, segment_length=0.093, overlap=0.5, fs=22050):
    """Cut a signal into fixed-length, overlapping segments.

    Args:
        y: Sequence of audio samples.
        segment_length: Segment duration in seconds.
        overlap: Fraction of a segment shared with the next one, in ``[0, 1)``.
        fs: Sampling rate in Hz.

    Returns:
        ``np.array`` built from the list of segments, each holding
        ``int(segment_length * fs)`` samples. Samples after the last full
        segment are dropped; when no full segment fits the result is empty.

    Raises:
        ValueError: If ``overlap`` is outside ``[0, 1)`` or a segment would
            contain no samples.
    """
    if not 0 <= overlap < 1:
        raise ValueError(f'overlap must be in [0, 1), got {overlap}')
    segment_samples = int(segment_length * fs)
    if segment_samples < 1:
        raise ValueError('segment_length * fs must be at least one sample')
    # The hop is truncated to an integer; clamp it so it can never reach zero
    # (which would make range() fail) for very short segments.
    step_samples = max(1, int(segment_samples * (1 - overlap)))
    segments = [
        y[start:start + segment_samples]
        for start in range(0, len(y) - segment_samples + 1, step_samples)
    ]
    return np.array(segments)

def extract_features(segment, fs=22050):
    """Compute the feature vector of one audio segment for inference.

    This redefinition mirrors the feature layout used to build the training
    set (PCA-projected MFCCs followed by seven scalar descriptors). The
    previous inference version produced a different, longer vector that did
    not match the model's input, and it yielded NaN for unvoiced or silent
    segments.

    Args:
        segment: 1-D array of audio samples.
        fs: Sampling rate of ``segment`` in Hz.

    Returns:
        1-D ``numpy`` array: flattened MFCCs (projected through the
        module-level ``pca`` when it is not ``None``), then short-time
        energy, zero-crossing rate, median pitch, pitch run-length count,
        first-formant estimate, low/high band ratio (dB), spectral roll-off.
    """
    mfcc = librosa.feature.mfcc(y=segment, sr=fs, n_mfcc=38)
    if pca is not None:
        mfcc = pca.transform(mfcc.T).T
    mfcc_flat = mfcc.flatten()

    ste = np.sum(segment ** 2)
    zcr = np.mean(librosa.feature.zero_crossing_rate(segment))

    pitches, _ = librosa.core.piptrack(y=segment, sr=fs)
    voiced = pitches > 0
    # Fall back to 0 when no pitch was detected instead of the NaN that
    # np.median returns for an empty selection.
    pitch_median = np.median(pitches[voiced]) if np.any(voiced) else 0
    run_length = np.sum(np.diff(voiced.astype(int)) == -1)

    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=segment, sr=fs))

    # First-formant estimate from the roots of an order-2 LPC polynomial
    lpc = librosa.lpc(segment, order=2)
    roots = np.roots(lpc)
    roots = roots[np.imag(roots) >= 0]
    angles = np.arctan2(np.imag(roots), np.real(roots))
    frequencies = np.sort(angles * (fs / (2 * np.pi)))
    first_formant = frequencies[0] if len(frequencies) > 0 else 0

    # Low/high band ratio in dB, guarded against empty bands
    magnitude = np.abs(librosa.stft(segment))
    bin_freqs = librosa.fft_frequencies(sr=fs)
    energy_low = np.sum(magnitude[(0 <= bin_freqs) & (bin_freqs <= 3500)])
    energy_high = np.sum(magnitude[(3500 < bin_freqs) & (bin_freqs <= 22500)])
    if energy_low > 0 and energy_high > 0:
        energy_ratio = 10 * np.log10(energy_low / energy_high)
    else:
        energy_ratio = 0

    return np.hstack([mfcc_flat, ste, zcr, pitch_median, run_length, first_formant, energy_ratio, spectral_rolloff])

def predict_cry(file_path, model, fs=22050, scaler=None):
    """Return the mean predicted cry probability over a file's segments.

    Args:
        file_path: Path of the audio file to classify.
        model: Trained Keras model taking ``(batch, n_features, 1)`` input.
        fs: Sampling rate used for loading, segmenting and feature extraction.
        scaler: Optional fitted scaler with a ``transform`` method; pass the
            one fitted on the training features so inference inputs are
            normalized the same way. ``None`` skips scaling.

    Returns:
        Mean of the per-segment predictions as a ``float``, or ``nan`` when
        the file yields no usable segment.
    """
    y, _ = librosa.load(file_path, sr=fs)
    segments = segment_audio(y, fs=fs)
    if len(segments) == 0:
        return float('nan')

    features = np.array([extract_features(segment, fs=fs) for segment in segments])
    # Drop segments with non-finite features, as done for the training rows.
    features = features[np.isfinite(features).all(axis=1)]
    if len(features) == 0:
        return float('nan')

    if scaler is not None:
        features = scaler.transform(features)

    # One batched call instead of one model.predict per segment.
    predictions = model.predict(features[..., np.newaxis], verbose=0)
    return float(np.mean(predictions))  # Return the average prediction

# Predict on the test files
results = []
for file_path in test_files:
    # `scaler` is the StandardScaler fitted on the training features.
    prediction = predict_cry(file_path, model, fs=Fs, scaler=scaler)
    if np.isnan(prediction):
        # No usable segment (e.g. file shorter than one segment).
        label = 'unknown'
    else:
        label = 'cry' if prediction > 0.5 else 'nocry'
    results.append((file_path, prediction, label))
    print(f'File: {file_path}, Prediction: {prediction:.4f}, Label: {label}')
