## Importing the modules

In [9]:
import librosa.display
import numpy as np
import random
import audiomentations as A
import soundfile as sf
import os
import librosa
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

## Define the data folder path

In [10]:
# Define constants for data preprocessing
num_mfcc_coefficients = 13  # Number of MFCC coefficients (adjust as needed)
desired_shape = (408, num_mfcc_coefficients, 1)  # Replace with your model's input shape
main_folder = ['belly_pain', 'burping', 'discomfort', 'hungry', 'tired']  # Replace with your class names
folder_name = 'donateacry'  # Replace with your dataset folder path
num_classes = len(main_folder)  # Number of classes

## Data augmentation, Data pre-processing & Data Normalization

In [11]:
# Function to preprocess an audio file
def preprocess_audio(audio_folder):
    preprocessed_data = []
    preprocessed_labels = []
    for index, cls in enumerate(audio_folder):
        class_folder = os.path.join(folder_name, cls)
        for file in os.listdir(class_folder)[:15]:
            audio_file = os.path.join(class_folder, file)
            original_audio, sr = sf.read(audio_file)
            # Define the augmentation pipeline
            augment1 = A.Compose([A.AddGaussianNoise(p=0.2)])
            augment2 = A.Compose([A.TimeStretch(p=0.2)])
            augment3 = A.Compose([A.PitchShift(p=0.2)])
            augment4 = A.Compose([A.Shift(p=0.2)])
            augment5 = A.Compose([A.TimeMask(p=0.2)])

            # Apply augmentation to create augmented audio
            augmented_audio1 = augment1(samples=original_audio, sample_rate=sr)
            augmented_audio2 = augment2(samples=original_audio, sample_rate=sr)
            augmented_audio3 = augment3(samples=original_audio, sample_rate=sr)
            augmented_audio4 = augment4(samples=original_audio, sample_rate=sr)
            augmented_audio5 = augment5(samples=original_audio, sample_rate=sr)
            

            # Perform feature extraction (e.g., MFCCs)
            for audio in [original_audio, augmented_audio1, augmented_audio2, augmented_audio3, augmented_audio4, augmented_audio5]:
                mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc_coefficients)

                # Normalize the MFCCs (optional but recommended)
                mfccs = (mfccs - np.mean(mfccs)) / np.std(mfccs)

                # Reshape or pad the MFCCs to match the desired input shape
                num_frames = mfccs.shape[1]
                if num_frames < desired_shape[0]:
                    mfccs = np.pad(mfccs, ((0, 0), (0, desired_shape[0] - num_frames)), mode='constant')
                elif num_frames > desired_shape[0]:
                    mfccs = mfccs[:, :desired_shape[0]]

                # Append the preprocessed data and label
                preprocessed_data.append(mfccs.T[:, :, np.newaxis])  # Transpose the data
                preprocessed_labels.append(index)

    return np.array(preprocessed_data),np.array(preprocessed_labels)

In [12]:
data,label = preprocess_audio(main_folder)
print("Total audio files:",len(data))
print("Total labels:",len(label))

Total audio files: 408
Total labels: 408


In [13]:
data.shape

(408, 408, 13, 1)

## Model training

In [14]:
# Define the CNN model
model = keras.Sequential([
    layers.Input(shape=desired_shape),  # Specify the input shape (e.g., (num_frames, num_features, num_channels))
    
    # Convolutional layers
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    
    # Flatten the output
    layers.Flatten(),
    
    # Fully connected layers
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),  # Dropout layer to reduce overfitting
    layers.Dense(num_classes, activation='softmax')  # Output layer with the number of classes
])

In [15]:
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use 'categorical_crossentropy' if one-hot encoding
              metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 408, 13, 64)       640       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 204, 6, 64)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 204, 6, 128)       73856     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 102, 3, 128)      0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 102, 3, 128)       147584    
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 51, 1, 128)       0

## Data sampling

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    data, label, test_size=0.1, random_state=42
)

In [17]:
# Early stopping to avoid overfitting of model
early_stop=EarlyStopping(monitor='val_accuracy',mode='max', verbose=1, patience=15, restore_best_weights=True)

In [18]:
# Fit the model
model.fit(X_train, y_train, epochs=30, batch_size=36,callbacks=[early_stop], validation_data=(X_test,y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x241c266ea60>

## Loss & Accuracy of Training data

In [19]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Training loss: {loss:.4f}")
print(f"Training accuracy: {accuracy:.4f}")

Training loss: 0.0193
Training accuracy: 1.0000


## Loss & Accuracy of Testing data

In [20]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Testing loss: {loss:.4f}")
print(f"Testing accuracy: {accuracy:.4f}")

Testing loss: 0.1365
Testing accuracy: 0.9756


## Preprocessing of testing audio file

In [21]:
def Test_preprocess_audio(audio_file):
        
        Test_preprocess_data = []
        original_audio, sr = sf.read(audio_file)
        # Define the augmentation pipeline
        augment1 = A.Compose([A.AddGaussianNoise(p=0.2)])
        augment2 = A.Compose([A.TimeStretch(p=0.2)])
        augment3 = A.Compose([A.PitchShift(p=0.2)])
        augment4 = A.Compose([A.Shift(p=0.2)])
        augment5 = A.Compose([A.TimeMask(p=0.2)])

        # Apply augmentation to create augmented audio
        augmented_audio1 = augment1(samples=original_audio, sample_rate=sr)
        augmented_audio2 = augment2(samples=original_audio, sample_rate=sr)
        augmented_audio3 = augment3(samples=original_audio, sample_rate=sr)
        augmented_audio4 = augment4(samples=original_audio, sample_rate=sr)
        augmented_audio5 = augment5(samples=original_audio, sample_rate=sr)
        

        # Perform feature extraction (e.g., MFCCs)
        for audio in [original_audio, augmented_audio1, augmented_audio2, augmented_audio3, augmented_audio4, augmented_audio5]:
                mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc_coefficients)

                # Normalize the MFCCs (optional but recommended)
                mfccs = (mfccs - np.mean(mfccs)) / np.std(mfccs)

                # Reshape or pad the MFCCs to match the desired input shape
                num_frames = mfccs.shape[1]
                if num_frames < desired_shape[0]:
                    mfccs = np.pad(mfccs, ((0, 0), (0, desired_shape[0] - num_frames)), mode='constant')
                elif num_frames > desired_shape[0]:
                    mfccs = mfccs[:, :desired_shape[0]]

                # Append the preprocessed data and label
                Test_preprocess_data.append(mfccs.T[:, :, np.newaxis])  # Transpose the data

        return np.array(Test_preprocess_data)

## Predicting the audio label

In [22]:
##['belly_pain', 'burping', 'discomfort', 'hungry', 'tired']
def Predict_Label(audio_file):
    processed_data = (Test_preprocess_audio(audio_file))
    y_pred=model.predict(processed_data)
    y_pred=np.argmax(y_pred,axis=1)
    y_pred = int(np.median(y_pred))
    print(y_pred)
    if y_pred == 0:
        print('belly_pain')
    if y_pred == 1:
        print('burping')
    if y_pred == 2:
        print('discomfort')
    if y_pred == 3:
        print('hungry')
    if y_pred == 4:
        print('tired')

In [36]:
Predict_Label("donateacry/burping/F24DE44B-762C-4149-AC92-96A5E57ED118-1430816949-1.0-m-04-bu.wav")

1
burping


In [24]:
Predict_Label("donateacry/belly_pain/549a46d8-9c84-430e-ade8-97eae2bef787-1430130772174-1.7-m-48-bp.wav")

0
belly_pain


In [25]:
Predict_Label("donateacry/hungry/0f257dac-7d6f-4575-9192-e3b4dcd3d4ef-1430185441581-1.7-f-26-hu.wav")

3
hungry


In [26]:
Predict_Label("donateacry/discomfort/64acb345-a61e-4ef3-a5a6-cf83c04b83f1-1430058990597-1.7-m-72-dc.wav")

2
discomfort


In [27]:
Predict_Label("donateacry/tired/7A22229D-06C2-4AAA-9674-DE5DF1906B3A-1436891957-1.1-m-72-ti.wav")

4
tired


## Saving the trained model

In [28]:
model.save("Neonatal_cry_model.h5")

## Loading the saved model

In [29]:
model = keras.models.load_model("Neonatal_cry_model.h5")

In [30]:
Predict_Label("donateacry/tired/7A22229D-06C2-4AAA-9674-DE5DF1906B3A-1436891957-1.1-m-72-ti.wav")

4
tired
