In [1]:
import os
import numpy as np
import librosa
import cv2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, UpSampling2D, GRU, Dense, 
                                     Flatten, TimeDistributed, Dropout, BatchNormalization)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Preprocessing functions
def preprocess_audio(file_path, sr=22050, n_mels=128, fixed_length=128):
    signal, _ = librosa.load(file_path, sr=sr, mono=True)
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

    if mel_spectrogram_db.shape[1] < fixed_length:
        mel_spectrogram_db = np.pad(mel_spectrogram_db, ((0, 0), (0, fixed_length - mel_spectrogram_db.shape[1])), mode='constant')
    else:
        mel_spectrogram_db = mel_spectrogram_db[:, :fixed_length]

    return mel_spectrogram_db

In [3]:
def load_audio_files(data_path, fixed_length=128):
    dataset = []
    for label_dir in os.listdir(data_path):
        label_path = os.path.join(data_path, label_dir)
        if os.path.isdir(label_path):
            for file_name in os.listdir(label_path):
                file_path = os.path.join(label_path, file_name)
                if file_path.endswith('.wav'):
                    spectrogram = preprocess_audio(file_path, fixed_length=fixed_length)
                    dataset.append(spectrogram)
    return np.array(dataset)

In [4]:
def resize_data(data, target_shape):
    resized_data = np.array([cv2.resize(sample, target_shape[:2]) for sample in data])
    return resized_data[..., np.newaxis]

In [5]:
def safe_normalize(data):
    max_val = np.max(data)
    if max_val == 0:
        return data
    return data / max_val

In [6]:
# Define DAE model
def build_dae(input_shape):
    input_img = Input(shape=input_shape)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(input_img)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)

    x = Conv2D(32, (3, 3), activation='relu', padding='same')(encoded)
    x = BatchNormalization()(x)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = UpSampling2D((2, 2))(x)
    decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

    return Model(input_img, decoded, name="DenoisingAutoencoder")

In [7]:
# Define GCRNN model
def build_gcrnn(input_shape):
    inputs = Input(shape=input_shape)
    x = TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same'))(inputs)
    x = TimeDistributed(BatchNormalization())(x)
    x = TimeDistributed(MaxPooling2D((2, 2)))(x)
    x = TimeDistributed(Flatten())(x)
    x = GRU(64, activation='relu', return_sequences=True, dropout=0.2)(x)
    x = GRU(32, activation='relu', dropout=0.2)(x)
    outputs = Dense(1, activation='sigmoid')(x)
    return Model(inputs, outputs, name="GCRNN")

In [8]:
# Paths
train_data_path = "C:/Users/HP/Downloads/archive/Raw Audio/"
test_data_path = "C:/Users/HP/Desktop/Test Data/"

In [9]:
# Load and preprocess data
train_data = load_audio_files(train_data_path)
train_data = resize_data(train_data, target_shape=(32, 32))
train_data = safe_normalize(train_data)
train_data = train_data[..., np.newaxis]

In [10]:
# Split into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [11]:
# Build and train DAE
dae = build_dae(input_shape=(32, 32, 1))
dae.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ReduceLROnPlateau(patience=3)
]
dae.fit(train_data, train_data, epochs=50, batch_size=32, validation_data=(val_data, val_data), callbacks=callbacks)

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 83ms/step - loss: 1676.9875 - val_loss: 1682.7087 - learning_rate: 0.0010
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - loss: 1638.9170 - val_loss: 1681.7561 - learning_rate: 0.0010
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - loss: 1676.2111 - val_loss: 1681.4937 - learning_rate: 0.0010
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - loss: 1676.2505 - val_loss: 1681.3605 - learning_rate: 0.0010
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - loss: 1707.2524 - val_loss: 1681.3127 - learning_rate: 0.0010
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - loss: 1691.1111 - val_loss: 1681.2957 - learning_rate: 0.0010
Epoch 7/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - loss: 1725.5681 - 

<keras.src.callbacks.history.History at 0x1b225360ed0>

In [12]:
# Extract latent representations
latent_train = dae.predict(train_data)
latent_val = dae.predict(val_data)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


In [13]:
# Load and preprocess test data
test_data = load_audio_files(test_data_path)
test_data = resize_data(test_data, target_shape=(32, 32))
test_data = safe_normalize(test_data)
test_data = test_data[..., np.newaxis]
latent_test = dae.predict(test_data)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


In [14]:
# Prepare data for GCRNN
latent_train = latent_train[:, np.newaxis, :, :, :]
latent_val = latent_val[:, np.newaxis, :, :, :]
latent_test = latent_test[:, np.newaxis, :, :, :]

In [15]:
# Train GCRNN
gcrnn = build_gcrnn(input_shape=latent_train.shape[1:])
gcrnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
gcrnn.fit(latent_train, np.zeros(len(latent_train)), epochs=50, batch_size=32, 
          validation_data=(latent_val, np.zeros(len(latent_val))), callbacks=callbacks)

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 107ms/step - accuracy: 0.9192 - loss: 0.1361 - val_accuracy: 1.0000 - val_loss: 0.1768 - learning_rate: 0.0010
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 1.0000 - loss: 2.0038e-07 - val_accuracy: 1.0000 - val_loss: 0.0837 - learning_rate: 0.0010
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 1.0000 - loss: 1.7433e-07 - val_accuracy: 1.0000 - val_loss: 0.0388 - learning_rate: 0.0010
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 1.0000 - loss: 6.9254e-07 - val_accuracy: 1.0000 - val_loss: 0.0162 - learning_rate: 0.0010
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 1.0000 - loss: 9.5642e-08 - val_accuracy: 1.0000 - val_loss: 0.0063 - learning_rate: 0.0010
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x1b22ec35b50>

In [36]:
# Evaluate on validation and test sets
for dataset_name, data, labels in [
    ("Validation", latent_val, np.zeros(len(latent_val))),
    ("Testing", latent_test, np.zeros(len(latent_test))),
]:
    predictions = gcrnn.predict(data)
    threshold = np.percentile(predictions, 95)
    binary_predictions = (predictions > threshold).astype(int)

    accuracy = accuracy_score(labels, binary_predictions)
    precision = precision_score(labels, binary_predictions, average='binary', zero_division=1)
    recall = recall_score(labels, binary_predictions, average='binary', zero_division=1)
    f1 = f1_score(labels, binary_predictions, average='binary', zero_division=1)
    conf_matrix = confusion_matrix(labels, binary_predictions)

    print(f"{dataset_name} Set Evaluation:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", conf_matrix)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Validation Set Evaluation:
Accuracy: 0.9469964664310954
Precision: 0.0
Recall: 1.0
F1 Score: 0.0
Confusion Matrix:
 [[268  15]
 [  0   0]]
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Testing Set Evaluation:
Accuracy: 0.9467680608365019
Precision: 0.0
Recall: 1.0
F1 Score: 0.0
Confusion Matrix:
 [[249  14]
 [  0   0]]
