In [21]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import librosa
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [22]:
def load_audio_files(directory):
    data, labels = [], []
    for label, folder in enumerate(os.listdir(directory)):
        folder_path = os.path.join(directory, folder)
        if not os.path.isdir(folder_path):  # Skip files, only process directories
            continue
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if not file_path.endswith(('.wav', '.mp3')):  # Add valid audio file extensions
                continue
            signal, sr = librosa.load(file_path, sr=22050)
            
            # Generate Mel spectrogram with 128 Mel bands
            mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=128)
            mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
            
            # Resize to (128, 128)
            mel_spec_resized = cv2.resize(mel_spec, (128, 128))

            # Expand dimensions to add channel (for CNN input compatibility)
            data.append(mel_spec_resized)
            labels.append(label)

    # Convert to numpy arrays and add channel dimension
    data = np.array(data)
    data = np.expand_dims(data, -1)  # Shape will be (num_samples, 128, 128, 1)
    labels = np.array(labels)
    
    return data, labels


In [23]:
# Resize data for VAE-GAN input
def resize_data(data, target_shape=(32, 32)):
    resized_data = [cv2.resize(img, target_shape) for img in data]
    resized_data = np.expand_dims(np.array(resized_data), -1)
    return resized_data

In [24]:
# Load data
data, labels = load_audio_files('C:/Users/HP/Downloads/archive/Raw Audio/')
data_resized = resize_data(data, target_shape=(32, 32))

# Split data into train, validation, and test sets
train_data, temp_data, train_labels, temp_labels = train_test_split(data_resized, labels, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(temp_data, temp_labels, test_size=0.5, random_state=42)

In [25]:
# Define VAE-GAN components and training process
class Sampling(layers.Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

def build_encoder(latent_dim):
    inputs = keras.Input(shape=(32, 32, 1))
    x = layers.Conv2D(32, (3, 3), activation="relu", strides=2, padding="same")(inputs)
    x = layers.Conv2D(64, (3, 3), activation="relu", strides=2, padding="same")(x)
    x = layers.Flatten()(x)
    x = layers.Dense(16, activation="relu")(x)
    z_mean = layers.Dense(latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
    z = Sampling()([z_mean, z_log_var])
    return keras.Model(inputs, [z_mean, z_log_var, z], name="encoder")

def build_decoder(latent_dim):
    inputs = keras.Input(shape=(latent_dim,))
    x = layers.Dense(8 * 8 * 64, activation="relu")(inputs)
    x = layers.Reshape((8, 8, 64))(x)
    x = layers.Conv2DTranspose(64, (3, 3), activation="relu", strides=2, padding="same")(x)
    x = layers.Conv2DTranspose(32, (3, 3), activation="relu", strides=2, padding="same")(x)
    outputs = layers.Conv2DTranspose(1, (3, 3), activation="sigmoid", padding="same")(x)
    return keras.Model(inputs, outputs, name="decoder")

def build_discriminator():
    inputs = keras.Input(shape=(32, 32, 1))
    x = layers.Conv2D(64, (3, 3), activation="relu", strides=2, padding="same")(inputs)
    x = layers.Conv2D(128, (3, 3), activation="relu", strides=2, padding="same")(x)
    x = layers.Flatten()(x)
    x = layers.Dense(64, activation="relu")(x)
    outputs = layers.Dense(1)(x)
    return keras.Model(inputs, outputs, name="discriminator")

In [26]:
class VAE_GAN(tf.keras.Model):
    def __init__(self, encoder, decoder, discriminator):
        super(VAE_GAN, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.discriminator = discriminator

    def compile(self, vae_optimizer, disc_optimizer, gen_optimizer, **kwargs):
        super(VAE_GAN, self).compile(**kwargs)
        self.vae_optimizer = vae_optimizer
        self.disc_optimizer = disc_optimizer
        self.gen_optimizer = gen_optimizer
        self.reconstruction_loss_fn = keras.losses.MeanSquaredError()
        self.gan_loss_fn = keras.losses.BinaryCrossentropy(from_logits=True)

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstruction = self.decoder(z)
        return reconstruction

    def train_step(self, data):
        # Handle tuple input (inputs, labels) or standalone inputs
        if isinstance(data, tuple):
            inputs, _ = data
        else:
            inputs = data

        with tf.GradientTape(persistent=True) as tape:
            # VAE forward pass
            z_mean, z_log_var, z = self.encoder(inputs)
            reconstruction = self.decoder(z)

            # VAE Loss
            reconstruction_loss = tf.reduce_mean(self.reconstruction_loss_fn(inputs, reconstruction))
            kl_loss = -0.5 * tf.reduce_mean(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            vae_loss = reconstruction_loss + kl_loss

            # Discriminator Loss
            real_labels = tf.ones((tf.shape(inputs)[0], 1))
            fake_labels = tf.zeros((tf.shape(inputs)[0], 1))

            disc_loss_real = self.gan_loss_fn(real_labels, self.discriminator(inputs))
            disc_loss_fake = self.gan_loss_fn(fake_labels, self.discriminator(reconstruction))
            disc_loss = (disc_loss_real + disc_loss_fake) / 2

            # Generator Loss
            gen_loss = self.gan_loss_fn(real_labels, self.discriminator(reconstruction))

        # Compute Gradients
        vae_gradients = tape.gradient(vae_loss, self.encoder.trainable_weights + self.decoder.trainable_weights)
        disc_gradients = tape.gradient(disc_loss, self.discriminator.trainable_weights)
        gen_gradients = tape.gradient(gen_loss, self.decoder.trainable_weights)

        # Apply Gradients
        self.vae_optimizer.apply_gradients(zip(vae_gradients, self.encoder.trainable_weights + self.decoder.trainable_weights))
        self.disc_optimizer.apply_gradients(zip(disc_gradients, self.discriminator.trainable_weights))
        self.gen_optimizer.apply_gradients(zip(gen_gradients, self.decoder.trainable_weights))

        # Return loss values for logging
        return {"vae_loss": vae_loss, "disc_loss": disc_loss, "gen_loss": gen_loss}

# Define model components
latent_dim = 16
encoder = build_encoder(latent_dim)  # Replace with your actual encoder
decoder = build_decoder(latent_dim)  # Replace with your actual decoder
discriminator = build_discriminator()  # Replace with your actual discriminator

# Create and compile the VAE-GAN model
vae_gan = VAE_GAN(encoder, decoder, discriminator)
vae_gan.compile(
    vae_optimizer=keras.optimizers.Adam(),
    disc_optimizer=keras.optimizers.Adam(),
    gen_optimizer=keras.optimizers.Adam(),
    loss=lambda y_true, y_pred: 0.0  # Dummy loss to satisfy compile() requirements
)

# Train the model
vae_gan.fit(
    train_data,
    epochs=50,
    batch_size=32,
    validation_data=(val_data, val_data)
)


Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 89ms/step - disc_loss: 0.0508 - gen_loss: 6.7977 - vae_loss: 2693.4761 - val_loss: 0.0000e+00
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - disc_loss: 3.1768e-08 - gen_loss: 17.1507 - vae_loss: 2674.0107 - val_loss: 0.0000e+00
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - disc_loss: 5.8806e-09 - gen_loss: 18.2596 - vae_loss: 2676.4641 - val_loss: 0.0000e+00
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - disc_loss: 5.5791e-09 - gen_loss: 18.3111 - vae_loss: 2693.8945 - val_loss: 0.0000e+00
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - disc_loss: 5.5631e-09 - gen_loss: 18.3140 - vae_loss: 2678.6885 - val_loss: 0.0000e+00
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - disc_loss: 5.5573e-09 - gen_loss: 18.3150 - vae

<keras.src.callbacks.history.History at 0x14725517010>

In [27]:
# Calculate reconstruction errors
def calculate_reconstruction_error(data, model):
    z_mean, z_log_var, z = model.encoder(data)
    reconstruction = model.decoder(z)
    return tf.reduce_mean(tf.square(data - reconstruction), axis=[1, 2, 3]).numpy()

# Evaluate performance
for dataset_name, dataset_data, dataset_labels in [
    ("Validation", val_data, val_labels),
    ("Testing", test_data, test_labels)
]:
    errors = calculate_reconstruction_error(dataset_data, vae_gan)
    threshold = np.percentile(errors, 95)
    predictions = errors > threshold

    binary_labels = (dataset_labels == 1)
    accuracy = accuracy_score(binary_labels, predictions)
    precision = precision_score(binary_labels, predictions, zero_division=1)
    recall = recall_score(binary_labels, predictions, zero_division=1)
    f1 = f1_score(binary_labels, predictions, zero_division=1)
    conf_matrix = confusion_matrix(binary_labels, predictions)

    print(f"{dataset_name} Set Evaluation:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  Confusion Matrix:\n{conf_matrix}\n")

Validation Set Evaluation:
  Accuracy: 0.9107
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
  Confusion Matrix:
[[306  17]
 [ 13   0]]

Testing Set Evaluation:
  Accuracy: 0.9018
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
  Confusion Matrix:
[[303  17]
 [ 16   0]]

