In [1]:
import librosa
import librosa.display
import numpy as np
import os
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, LSTM, Dense, TimeDistributed, Conv1D, GRU
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import tensorflow as tf


In [8]:
import os
import numpy as np
import librosa
import librosa.display

def preprocess_audio(audio_path, target_sr=22050, n_fft=2048, hop_length=512, n_mels=128, fixed_length=128):
    """
    Preprocess audio by loading, converting to mono, and computing a fixed-size Mel-spectrogram.
    Args:
        audio_path (str): Path to the audio file.
        target_sr (int): Target sampling rate.
        n_fft (int): Number of FFT components.
        hop_length (int): Hop length for STFT.
        n_mels (int): Number of Mel bands.
        fixed_length (int): Fixed time dimension for the spectrogram.
    Returns:
        mel_spectrogram_db (np.ndarray): Preprocessed Mel-spectrogram with fixed shape.
    """
    signal, sr = librosa.load(audio_path, sr=target_sr, mono=True)
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    
    # Ensure a fixed length along the time axis
    if mel_spectrogram_db.shape[1] < fixed_length:
        # Pad with zeros if shorter
        mel_spectrogram_db = np.pad(mel_spectrogram_db, ((0, 0), (0, fixed_length - mel_spectrogram_db.shape[1])), mode='constant')
    else:
        # Truncate if longer
        mel_spectrogram_db = mel_spectrogram_db[:, :fixed_length]
    
    return mel_spectrogram_db

def load_dataset(data_path, fixed_length=128):
    """
    Load and preprocess the dataset into spectrograms and labels.
    Args:
        data_path (str): Path to the dataset.
        fixed_length (int): Fixed time dimension for the spectrogram.
    Returns:
        dataset (np.ndarray): Array of preprocessed spectrograms.
        labels (np.ndarray): Array of labels.
    """
    dataset = []
    labels = []
    for folder in os.listdir(data_path):
        folder_path = os.path.join(data_path, folder)
        # Skip non-directory files
        if not os.path.isdir(folder_path):
            continue
        label = 1 if folder == 'car_crash' else 0
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            # Check if the file is an audio file
            if file_path.endswith('.wav'):
                spectrogram = preprocess_audio(file_path, fixed_length=fixed_length)
                dataset.append(spectrogram)
                labels.append(label)
    return np.array(dataset), np.array(labels)

# Path to the dataset containing "car_crash" and other folders
data_path = "C:/Users/HP/Downloads/archive/Raw Audio/"

# Load dataset
spectrograms, labels = load_dataset(data_path, fixed_length=128)

# Split into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spectrograms, labels, test_size=0.2, random_state=42)

# Normalize data
X_train = X_train / np.max(X_train)
X_test = X_test / np.max(X_test)

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


Training data shape: (1342, 128, 128)
Test data shape: (336, 128, 128)


In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.optimizers import Adam

# Define DAE architecture
def build_dae(input_shape):
    input_img = Input(shape=input_shape)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)

    x = Conv2D(32, (3, 3), activation='relu', padding='same')(encoded)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

    return Model(input_img, decoded, name="DenoisingAutoencoder")

# Normalize data to [0, 1]
X_train_dae = X_train / np.max(X_train)
X_test_dae = X_test / np.max(X_test)

# Reshape data for the DAE
X_train_dae = X_train_dae[..., np.newaxis]
X_test_dae = X_test_dae[..., np.newaxis]

# Build and compile the DAE model
dae = build_dae(X_train_dae[0].shape)
dae.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')  # Use MSE for continuous data

# Train the DAE
history = dae.fit(
    X_train_dae,
    X_train_dae,
    epochs=50,
    batch_size=32,
    validation_split=0.1
)

# Evaluate the DAE
loss = dae.evaluate(X_test_dae, X_test_dae)
print(f"Test Loss: {loss}")


Epoch 1/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 266ms/step - loss: 117357179568128.0000 - val_loss: 113018725728256.0000
Epoch 2/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 267ms/step - loss: 116613042929664.0000 - val_loss: 113018725728256.0000
Epoch 3/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 307ms/step - loss: 119993610010624.0000 - val_loss: 113018725728256.0000
Epoch 4/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 306ms/step - loss: 116747134828544.0000 - val_loss: 113018725728256.0000
Epoch 5/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 303ms/step - loss: 118839622762496.0000 - val_loss: 113018725728256.0000
Epoch 6/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 304ms/step - loss: 118381931921408.0000 - val_loss: 113018725728256.0000
Epoch 7/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 305ms/step - loss: 11474961432

In [15]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GRU, Dense, Flatten, Input, Reshape, TimeDistributed
from tensorflow.keras.models import Sequential

# Define GCRNN architecture
def build_gcrnn(input_shape):
    model = Sequential()
    
    # TimeDistributed wrapper for Conv2D to process time steps separately
    model.add(TimeDistributed(Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'), 
                               input_shape=input_shape))
    model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
    model.add(TimeDistributed(Flatten()))  # Flatten spatial dimensions while keeping time
    model.add(GRU(64, activation='relu', return_sequences=True))  # GRU to capture temporal patterns
    model.add(GRU(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer
    return model

# Use latent representations from the DAE as input
latent_train = dae.predict(X_train_dae)  # Shape: (batch_size, height, width, 1)
latent_test = dae.predict(X_test_dae)    # Shape: (batch_size, height, width, 1)

# Add a "time" dimension to reshape the data for TimeDistributed layers
latent_train = np.expand_dims(latent_train, axis=1)  # Shape: (batch_size, time_steps=1, height, width, channels=1)
latent_test = np.expand_dims(latent_test, axis=1)    # Shape: (batch_size, time_steps=1, height, width, channels=1)

# Build and compile the GCRNN
gcrnn = build_gcrnn(latent_train.shape[1:])  # Shape: (time_steps, height, width, channels)
gcrnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the GCRNN
gcrnn.fit(latent_train, y_train, epochs=50, batch_size=32, validation_split=0.1)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 100ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step
Epoch 1/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 535ms/step - accuracy: 0.9470 - loss: 0.4247 - val_accuracy: 0.9333 - val_loss: 0.2450
Epoch 2/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 519ms/step - accuracy: 0.9353 - loss: 0.2401 - val_accuracy: 0.9333 - val_loss: 0.2537
Epoch 3/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 519ms/step - accuracy: 0.9494 - loss: 0.2005 - val_accuracy: 0.9333 - val_loss: 0.2461
Epoch 4/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 520ms/step - accuracy: 0.9395 - loss: 0.2310 - val_accuracy: 0.9333 - val_loss: 0.2470
Epoch 5/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 521ms/step - accuracy: 0.9419 - loss: 0.2234 - val_accuracy: 0.9333 - val_loss: 0.2466
Epoch 6/50
[1m38/38[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x21387ff2dd0>

In [17]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))

# Train the model
gcrnn.fit(latent_train, y_train, epochs=50, batch_size=32, validation_split=0.1, class_weight=class_weights)

# Make predictions with a custom threshold
threshold = 0.3
y_pred_proba = gcrnn.predict(latent_test)
y_pred = (y_pred_proba > threshold).astype(int)

# Evaluate performance
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC: {roc_auc:.2f}")


Epoch 1/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 519ms/step - accuracy: 0.9404 - loss: 1.0842 - val_accuracy: 0.9333 - val_loss: 0.6617
Epoch 2/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 519ms/step - accuracy: 0.9469 - loss: 0.6814 - val_accuracy: 0.9333 - val_loss: 0.6764
Epoch 3/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 521ms/step - accuracy: 0.9517 - loss: 0.6519 - val_accuracy: 0.9333 - val_loss: 0.6671
Epoch 4/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 521ms/step - accuracy: 0.7752 - loss: 0.7364 - val_accuracy: 0.9333 - val_loss: 0.6903
Epoch 5/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 519ms/step - accuracy: 0.9436 - loss: 0.7006 - val_accuracy: 0.9333 - val_loss: 0.6843
Epoch 6/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 520ms/step - accuracy: 0.9492 - loss: 0.6672 - val_accuracy: 0.9333 - val_loss: 0.6838
Epoch 7/50
[1m38/38[

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
