In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, GRU, Dense,
                                     Dropout, BatchNormalization, LayerNormalization,
                                     Bidirectional, Add, Attention)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2
from sklearn.metrics import (confusion_matrix, f1_score, roc_curve, auc,
                             classification_report)
import librosa
import soundfile as sf
import noisereduce as nr
import matplotlib.pyplot as plt
from scipy.signal import butter, sosfilt
from scipy.fftpack import dct
import seaborn as sns
from scipy.optimize import brentq
from scipy.interpolate import interp1d
import time # For timing evaluation

# Optional: Suppress TensorFlow/CUDA warnings for cleaner output in notebook
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
tf.get_logger().setLevel('ERROR')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Filter TF messages

2025-04-10 00:44:43.052756: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-10 00:44:43.103385: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744226083.133203 1975018 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744226083.141637 1975018 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744226083.165251 1975018 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Configure TensorFlow to use GPU and manage memory growth
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Use only the first GPU
        tf.config.set_visible_devices(physical_devices[0], 'GPU')
        # Allow memory growth to prevent allocating all GPU memory at once
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        print(f"Using GPU: {physical_devices[0]}")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
else:
    print("No GPU devices found, using CPU.")

Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
def load_and_preprocess_audio(file_path, sr=16000, duration=4, augment=True):
    """Loads audio, applies augmentation (optional), normalizes, and pads/truncates."""
    try:
        target_len = sr * duration
        audio, current_sr = librosa.load(file_path, sr=sr, duration=None) # Load full duration initially

        # Data Augmentation (only if augment=True, typically for training)
        if augment and np.random.random() < 0.5:  # 50% chance
            choice = np.random.choice(['noise', 'pitch', 'speed'])
            if choice == 'noise':
                noise_amp = 0.005 * np.random.uniform(0.5, 1.5) * np.max(np.abs(audio))
                audio = audio + noise_amp * np.random.normal(size=audio.shape[0])
            elif choice == 'pitch':
                pitch_factor = np.random.uniform(-2.5, 2.5)
                audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=pitch_factor)
            elif choice == 'speed':
                speed_factor = np.random.uniform(0.85, 1.15)
                audio = librosa.effects.time_stretch(y=audio, rate=speed_factor)

        # Trim silence from beginning and end
        audio, _ = librosa.effects.trim(audio, top_db=25)

        # Pad or truncate to target duration
        if len(audio) > target_len:
            # Take a random crop if longer
            start = np.random.randint(0, len(audio) - target_len + 1)
            audio = audio[start:start + target_len]
        elif len(audio) < target_len:
            # Pad with zeros if shorter
            audio = np.pad(audio, (0, target_len - len(audio)), mode='constant')
        else:
            audio = audio[:target_len] # Ensure exact length


        # Normalize audio to [-1, 1]
        max_amp = np.max(np.abs(audio))
        if max_amp > 1e-5: # Avoid division by zero
             audio = audio / max_amp

        return audio
    except Exception as e:
        print(f"Error loading/processing {file_path}: {e}")
        return None

In [4]:
# --- Feature Parameters ---
SR = 16000
N_FFT = 512 # Reduced for lighter computation
HOP_LENGTH = 160 # Reduced hop length for more frames
N_MELS = 64   # Reduced number of Mel bins
N_MFCC = 13   # Standard MFCC count
N_CQT_BINS = N_MELS # Use same number of bins as Mels for simplicity/consistency
BINS_PER_OCTAVE = 12
N_CQCC = 13 # Similar count to MFCC

# --- Calculate Total Features ---
TOTAL_FEATURES = N_MELS + N_MFCC + N_CQT_BINS + N_CQCC # 64 + 13 + 64 + 13 = 154

def extract_features(audio, sr=SR, n_mels=N_MELS, n_mfcc=N_MFCC, n_cqt_bins=N_CQT_BINS,
                     bins_per_octave=BINS_PER_OCTAVE, n_cqcc=N_CQCC,
                     n_fft=N_FFT, hop_length=HOP_LENGTH):
    """
    Extracts Mel Spectrogram, MFCC, CQT (magnitude), and CQCC features,
    normalizes each individually, aligns time steps, and concatenates them.
    Returns shape (time_steps, features).
    """
    if audio is None:
        return None

    features_list = []
    min_time_steps = float('inf') # Keep track of minimum time steps

    # 1. Mel Spectrogram
    try:
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
        )
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        log_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / (np.std(log_mel_spec) + 1e-8)
        features_list.append(log_mel_spec)
        min_time_steps = min(min_time_steps, log_mel_spec.shape[1])
    except Exception as e:
        print(f"Error extracting Mel Spectrogram: {e}")
        return None # Fail if fundamental features missing

    # 2. MFCC
    try:
        mfccs = librosa.feature.mfcc(
            y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
        )
        mfccs = (mfccs - np.mean(mfccs)) / (np.std(mfccs) + 1e-8)
        features_list.append(mfccs)
        min_time_steps = min(min_time_steps, mfccs.shape[1])
    except Exception as e:
        print(f"Error extracting MFCC: {e}")
        return None

    # 3. CQT (Magnitude)
    try:
        # Adjust CQT parameters if needed to match time dimension roughly
        cqt = librosa.cqt(
            y=audio, sr=sr, hop_length=hop_length,
            n_bins=n_cqt_bins, bins_per_octave=bins_per_octave,
            fmin=librosa.note_to_hz('C2'), # Adjusted fmin slightly
            tuning=0.0 # No tuning offset
        )
        log_cqt_mag = librosa.amplitude_to_db(np.abs(cqt), ref=np.max)
        log_cqt_mag = (log_cqt_mag - np.mean(log_cqt_mag)) / (np.std(log_cqt_mag) + 1e-8)
        features_list.append(log_cqt_mag)
        min_time_steps = min(min_time_steps, log_cqt_mag.shape[1])

        # 4. CQCC (derived from CQT)
        try:
            cqccs = dct(log_cqt_mag, axis=0, type=2, norm='ortho')[:n_cqcc, :]
            cqccs = (cqccs - np.mean(cqccs)) / (np.std(cqccs) + 1e-8)
            features_list.append(cqccs)
            min_time_steps = min(min_time_steps, cqccs.shape[1])
        except Exception as e:
            print(f"Error extracting CQCC: {e}")
            return None # Fail if CQCC fails

    except Exception as e:
        print(f"Error extracting CQT (required for CQCC): {e}")
        return None # Fail if CQT fails


    # Align time steps by truncating to the minimum length found
    aligned_features = [f[:, :min_time_steps] for f in features_list]

    # Concatenate features along the feature axis (axis=0)
    combined_features = np.concatenate(aligned_features, axis=0)

    # Transpose to get (time_steps, features)
    return combined_features.T

In [5]:
def analyze_class_distribution(data_path):
    """Analyzes and prints the class distribution in a given directory."""
    try:
        real_path = os.path.join(data_path, 'real')
        fake_path = os.path.join(data_path, 'fake')

        real_count = len([f for f in os.listdir(real_path) if f.lower().endswith(('.wav', '.flac'))]) if os.path.exists(real_path) else 0
        fake_count = len([f for f in os.listdir(fake_path) if f.lower().endswith(('.wav', '.flac'))]) if os.path.exists(fake_path) else 0
        total = real_count + fake_count

        print(f"\nClass Distribution for {data_path}:")
        if total > 0:
            print(f"Real: {real_count} ({real_count/total*100:.2f}%)")
            print(f"Fake: {fake_count} ({fake_count/total*100:.2f}%)")
        else:
            print("Real: 0 (0.00%)")
            print("Fake: 0 (0.00%)")
            print("Warning: No audio files found in specified directory.")
        return {'real': real_count, 'fake': fake_count}
    except FileNotFoundError:
        print(f"Error: Directory not found - {data_path}")
        return {'real': 0, 'fake': 0}
    except Exception as e:
        print(f"Error analyzing {data_path}: {e}")
        return {'real': 0, 'fake': 0}

In [6]:
def data_generator(data_path, batch_size=64, shuffle=True, augment=True):
    """Generates batches of data (X, y, sample_weights) with dynamic padding."""
    try:
        real_path = os.path.join(data_path, 'real')
        fake_path = os.path.join(data_path, 'fake')
        real_files = [os.path.join(real_path, f) for f in os.listdir(real_path) if f.lower().endswith(('.wav', '.flac'))] if os.path.exists(real_path) else []
        fake_files = [os.path.join(fake_path, f) for f in os.listdir(fake_path) if f.lower().endswith(('.wav', '.flac'))] if os.path.exists(fake_path) else []
    except FileNotFoundError:
        print(f"Error: Cannot find 'real' or 'fake' subdirectories in {data_path}")
        yield np.array([]), np.array([]), np.array([])
        return

    if not real_files and not fake_files:
        print(f"Warning: No audio files found in {data_path}")
        yield np.array([]), np.array([]), np.array([])
        return

    all_files = real_files + fake_files
    labels = [1] * len(real_files) + [0] * len(fake_files) # Real=1, Fake=0

    total_samples = len(all_files)
    weight_for_0 = (1 / len(fake_files)) * (total_samples / 2.0) if len(fake_files) > 0 else 0
    weight_for_1 = (1 / len(real_files)) * (total_samples / 2.0) if len(real_files) > 0 else 0
    class_weights = {0: weight_for_0, 1: weight_for_1}

    file_label_list = list(zip(all_files, labels))

    while True:
        if shuffle:
            np.random.shuffle(file_label_list)

        for i in range(0, len(file_label_list), batch_size):
            batch_list = file_label_list[i:i+batch_size]
            batch_files, batch_labels = zip(*batch_list) if batch_list else ([], [])

            batch_x_features = []
            batch_y_labels = []
            batch_sample_weights = []
            max_len_in_batch = 0

            for file_path, label in zip(batch_files, batch_labels):
                audio = load_and_preprocess_audio(file_path, sr=SR, augment=augment)
                if audio is None: continue

                features = extract_features(audio, sr=SR)
                if features is None: continue

                if features.shape[0] > 0:
                    batch_x_features.append(features)
                    batch_y_labels.append(label)
                    batch_sample_weights.append(class_weights[label])
                    max_len_in_batch = max(max_len_in_batch, features.shape[0])

            if not batch_x_features:
                continue

            padded_batch_x = np.zeros((len(batch_x_features), max_len_in_batch, TOTAL_FEATURES), dtype=np.float32)
            for idx, x in enumerate(batch_x_features):
                seq_len = x.shape[0]
                padded_batch_x[idx, :seq_len, :] = x

            # <<<--- START DEBUG CHECKS --- >>>
            if np.isnan(padded_batch_x).any() or np.isinf(padded_batch_x).any():
                print(f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                print(f"WARNING: NaN or Inf found in batch features from {data_path} at step {i//batch_size}!")
                print(f"Problematic files might be in batch starting with: {batch_files[0] if batch_files else 'N/A'}")
                print(f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                # Decide how to handle: skip batch (continue) or raise error?
                # For now, let's skip the batch to avoid crashing training
                continue # Skip this batch
            # Optional: Print stats occasionally to check feature scale
            # if i % (100 * batch_size) == 0: # Print stats every 100 batches
            #     print(f"Batch {i//batch_size} stats: Max={np.max(padded_batch_x):.2f}, Min={np.min(padded_batch_x):.2f}, Mean={np.mean(padded_batch_x):.2f}")
            # <<<--- END DEBUG CHECKS --- >>>

            # Final check to ensure batch is not empty after potential skips
            if padded_batch_x.shape[0] > 0:
                 yield padded_batch_x, np.array(batch_y_labels), np.array(batch_sample_weights)

In [7]:
class MFM(Layer):
    """Max-Feature-Map activation function."""
    def __init__(self, **kwargs):
        super(MFM, self).__init__(**kwargs)

    def call(self, inputs):
        # Assumes channels_last format (batch, time_steps, features)
        shape = tf.shape(inputs)
        features_dim = inputs.shape[-1]
        if features_dim is None or features_dim % 2 != 0:
             raise ValueError("MFM activation requires an even number of filters/features.")
        # Split features in half along the last dimension
        split1 = inputs[..., :features_dim // 2]
        split2 = inputs[..., features_dim // 2:]
        return tf.maximum(split1, split2)

    def compute_output_shape(self, input_shape):
        output_shape = list(input_shape)
        output_shape[-1] //= 2
        return tuple(output_shape)

    def get_config(self):
        base_config = super(MFM, self).get_config()
        return base_config

In [8]:
def create_lightweight_cnn_bigru_model(input_shape, l2_reg=0.01):
    """Creates a *further* lightweight CNN-BiGRU model."""
    inputs = Input(shape=input_shape)

    x = BatchNormalization(name='input_bn')(inputs)

    # --- Lightweight Convolutional Blocks (Same as before) ---
    x = Conv1D(32, 5, padding='same', kernel_regularizer=l2(l2_reg), name='conv1')(x)
    x = MFM(name='mfm1')(x) # Output features: 16
    x = MaxPooling1D(pool_size=2, name='pool1')(x)
    x = Dropout(0.3, name='drop1')(x)

    x = Conv1D(64, 3, padding='same', kernel_regularizer=l2(l2_reg), name='conv2')(x)
    x = MFM(name='mfm2')(x) # Output features: 32
    x = MaxPooling1D(pool_size=2, name='pool2')(x)
    x = Dropout(0.3, name='drop2')(x) # x shape: (None, time_steps_pooled, 32)

    # --- Residual Bidirectional GRU Blocks ---
    # <<<--- REDUCED UNITS --->>>
    for i, units in enumerate([16, 8]): # Reduced effective units (e.g., 16 then 8)
        shortcut = x # Store the input to the block

        # Main path: BiGRU
        gru = Bidirectional(GRU(units // 2, return_sequences=True, # units//2 for each direction
                                kernel_regularizer=l2(l2_reg)), name=f'bi_gru_{i+1}')
        gru_output = gru(x) # Output shape: (None, time_steps, units)

        # Shortcut path: Project the original input `shortcut` if needed
        shortcut_channels = K.int_shape(shortcut)[-1]
        if shortcut_channels != units:
            # Project shortcut to match the output dimension of the GRU layer
            shortcut_proj = Dense(units, kernel_regularizer=l2(l2_reg), name=f'shortcut_proj_{i+1}')(shortcut)
        else:
            shortcut_proj = shortcut

        # Add the main path output (gru_output) and the (possibly projected) shortcut path
        x = Add(name=f'add_res_{i+1}')([shortcut_proj, gru_output])

        # Apply post-addition layers
        x = LayerNormalization(name=f'ln_gru_{i+1}')(x)
        x = Dropout(0.4, name=f'drop_gru_{i+1}')(x) # x shape: (..., units) -> (..., 16), then (..., 8)


    # --- Attention Mechanism (Kept for now) ---
    attention_out = Attention(name='attention')([x, x])
    x = Add(name='add_attn')([x, attention_out]) # x shape still (..., 8)

    # --- Final Bi-GRU Aggregation ---
    # <<<--- REDUCED UNITS --->>>
    # units // 2 for GRU layer = 4
    x = Bidirectional(GRU(4, kernel_regularizer=l2(l2_reg)), name='final_bi_gru')(x) # Output: (None, 8)
    x = LayerNormalization(name='ln_final_gru')(x)
    x = Dropout(0.4, name='drop_final_gru')(x)

    # --- Classification Head ---
    # <<<--- REDUCED DENSE UNITS --->>>
    x = Dense(8, activation='relu', kernel_regularizer=l2(l2_reg), name='dense1')(x) # Reduced from 16
    x = Dropout(0.5, name='drop_dense1')(x)

    outputs = Dense(1, activation='sigmoid', name='output')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

In [9]:
# --- Paths to your dataset ---
# !!! IMPORTANT: Update these paths to match your dataset structure !!!
base_data_path = 'datasetNEW' # Or wherever your train/dev/eval folders are
train_data_path = os.path.join(base_data_path, 'train')
dev_data_path = os.path.join(base_data_path, 'dev')
eval_data_path = os.path.join(base_data_path, 'eval')

# --- Analyze class distribution ---
print("Analyzing Data Distribution:")
dist_train = analyze_class_distribution(train_data_path)
dist_dev = analyze_class_distribution(dev_data_path)
dist_eval = analyze_class_distribution(eval_data_path)

# Check if datasets were found
if dist_train['real'] + dist_train['fake'] == 0:
    print("\nError: No training data found. Please check 'train_data_path'.")
if dist_dev['real'] + dist_dev['fake'] == 0:
    print("\nError: No validation data found. Please check 'dev_data_path'.")
if dist_eval['real'] + dist_eval['fake'] == 0:
    print("\nError: No evaluation data found. Please check 'eval_data_path'.")

Analyzing Data Distribution:

Class Distribution for datasetNEW/train:
Real: 2580 (10.17%)
Fake: 22800 (89.83%)

Class Distribution for datasetNEW/dev:
Real: 2548 (10.26%)
Fake: 22296 (89.74%)

Class Distribution for datasetNEW/eval:
Real: 7355 (10.32%)
Fake: 63882 (89.68%)


In [10]:
#Cell 10: Generator Instantiation and Step Calculation

def count_files(path):
    """Counts audio files in 'real' and 'fake' subdirectories."""
    count = 0
    try:
        real_path = os.path.join(path, 'real')
        fake_path = os.path.join(path, 'fake')
        if os.path.exists(real_path):
            count += len([f for f in os.listdir(real_path) if f.lower().endswith(('.wav', '.flac'))])
        if os.path.exists(fake_path):
            count += len([f for f in os.listdir(fake_path) if f.lower().endswith(('.wav', '.flac'))])
    except FileNotFoundError:
        # Error already printed by analyze_class_distribution
        pass
    return count

# --- Create generators ---
BATCH_SIZE = 64 # Can adjust based on GPU memory (64 is often reasonable)
train_gen = data_generator(train_data_path, batch_size=BATCH_SIZE, shuffle=True, augment=True)
dev_gen = data_generator(dev_data_path, batch_size=BATCH_SIZE, shuffle=False, augment=False) # No augmentation for validation
eval_gen_for_eval = data_generator(eval_data_path, batch_size=BATCH_SIZE, shuffle=False, augment=False) # For final evaluate()
eval_gen_for_predict = data_generator(eval_data_path, batch_size=BATCH_SIZE, shuffle=False, augment=False) # Separate instance for predict()

# --- Calculate steps ---
train_samples_count = count_files(train_data_path)
dev_samples_count = count_files(dev_data_path)
eval_samples_count = count_files(eval_data_path)

if BATCH_SIZE == 0:
    raise ValueError("Batch size cannot be zero.")

steps_per_epoch = train_samples_count // BATCH_SIZE if train_samples_count > 0 else 1
validation_steps = dev_samples_count // BATCH_SIZE if dev_samples_count > 0 else 1
eval_steps = eval_samples_count // BATCH_SIZE if eval_samples_count > 0 else 1 # For evaluate/predict loops

print(f"\n--- Generator Setup ---")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Total Train Samples: {train_samples_count}, Steps Per Epoch: {steps_per_epoch}")
print(f"Total Validation Samples: {dev_samples_count}, Validation Steps: {validation_steps}")
print(f"Total Evaluation Samples: {eval_samples_count}, Evaluation Steps: {eval_steps}")

# Simple check generator output shape once
if train_samples_count > 0:
    try:
        sample_x, sample_y, sample_w = next(train_gen)
        print(f"Sample Batch Shapes - X: {sample_x.shape}, y: {sample_y.shape}, weights: {sample_w.shape}")
        # Reset generator after check if needed, or create a separate one for check
        train_gen = data_generator(train_data_path, batch_size=BATCH_SIZE, shuffle=True, augment=True) # Recreate
    except StopIteration:
        print("Could not get a sample batch from the training generator.")
    except Exception as e:
        print(f"Error getting sample batch: {e}")


--- Generator Setup ---
Batch Size: 64
Total Train Samples: 25380, Steps Per Epoch: 396
Total Validation Samples: 24844, Validation Steps: 388
Total Evaluation Samples: 71237, Evaluation Steps: 1113
Sample Batch Shapes - X: (64, 401, 154), y: (64,), weights: (64,)


In [11]:
# Callbacks
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,         # Reduce LR by a factor of 5
    patience=4,         # Reduce LR if no improvement for 4 epochs
    min_lr=1e-7,        # Minimum learning rate
    verbose=1
)

early_stopping = EarlyStopping(
    monitor='val_loss', # Monitor validation loss
    patience=10,        # Stop training if no improvement for 10 epochs
    restore_best_weights=True, # Restore weights from the epoch with the best val_loss
    verbose=1
)

# Optional: Save the best model checkpoint
checkpoint_filepath = 'best_lightweight_model.keras'
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    save_best_only=True, # Only save when val_loss improves
    mode='min',
    verbose=1
)

callbacks_list = [reduce_lr, early_stopping, model_checkpoint]

In [12]:
# --- Feature Parameters (Ensure these match Cell 4 if changed) ---
SR = 16000
N_FFT = 512
HOP_LENGTH = 160
N_MELS = 64
N_MFCC = 13
N_CQT_BINS = N_MELS
BINS_PER_OCTAVE = 12
N_CQCC = 13
TOTAL_FEATURES = N_MELS + N_MFCC + N_CQT_BINS + N_CQCC # 64 + 13 + 64 + 13 = 154


# Create and compile the model using the defined lightweight architecture
input_shape_combined = (None, TOTAL_FEATURES) # (Time steps, Features)
model = create_lightweight_cnn_bigru_model(input_shape_combined) # Using the updated (simpler) model

# <<<--- LOWER LEARNING RATE and ADD GRADIENT CLIPPING --- >>>
model.compile(
    optimizer=Adam(learning_rate=1e-4, clipnorm=1.0), # Lower LR and add clipnorm
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')] # Assign name to AUC metric
)

# Print model summary
model.summary()

I0000 00:00:1744226094.629008 1975018 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2143 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [None]:
# --- Feature Parameters (Ensure these match Cell 4 if changed) ---
SR = 16000
N_FFT = 512
HOP_LENGTH = 160
N_MELS = 64
N_MFCC = 13
N_CQT_BINS = N_MELS
BINS_PER_OCTAVE = 12
N_CQCC = 13
TOTAL_FEATURES = N_MELS + N_MFCC + N_CQT_BINS + N_CQCC # 64 + 13 + 64 + 13 = 154


# Create and compile the model using the defined lightweight architecture
input_shape_combined = (None, TOTAL_FEATURES) # (Time steps, Features)
model = create_lightweight_cnn_bigru_model(input_shape_combined) # Using the updated (simpler) model

# <<<--- LOWER LEARNING RATE and ADD GRADIENT CLIPPING --- >>>
model.compile(
    optimizer=Adam(learning_rate=1e-4, clipnorm=1.0), # Lower LR and add clipnorm
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')] # Assign name to AUC metric
)

# Print model summary
model.summary()

I0000 00:00:1744226094.629008 1975018 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2143 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [13]:
# Train the model
EPOCHS = 60 # Adjust as needed, early stopping will likely trigger sooner

print("\n--- Starting Model Training ---")
history = None
if train_samples_count > 0 and dev_samples_count > 0:
    history = model.fit(
        train_gen,
        steps_per_epoch=steps_per_epoch,
        epochs=EPOCHS,
        validation_data=dev_gen,
        validation_steps=validation_steps,
        callbacks=callbacks_list,
        # Use class_weight from the generator if you didn't include sample_weight
        # class_weight=class_weights, # If generator yielded only X, y
        # Use sample_weight if generator yields X, y, sample_weight (as implemented)
    )
    print("\n--- Training Finished ---")
else:
    print("\nSkipping training due to missing training or validation data.")

# Load best weights saved by ModelCheckpoint (redundant if restore_best_weights=True in EarlyStopping, but safe)
if os.path.exists(checkpoint_filepath):
     print(f"Loading best weights from {checkpoint_filepath}")
     model.load_weights(checkpoint_filepath)


--- Starting Model Training ---
Epoch 1/60


I0000 00:00:1744226112.945243 1975354 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.4952 - auc: 0.5121 - loss: 2.9907
Epoch 1: val_loss improved from inf to 2.63510, saving model to best_lightweight_model.keras
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5205s[0m 13s/step - accuracy: 0.4952 - auc: 0.5120 - loss: 2.9904 - val_accuracy: 0.2471 - val_auc: 0.5705 - val_loss: 2.6351 - learning_rate: 1.0000e-04
Epoch 2/60
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.5108 - auc: 0.5116 - loss: 2.6140
Epoch 2: val_loss improved from 2.63510 to 2.36430, saving model to best_lightweight_model.keras
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2104s[0m 5s/step - accuracy: 0.5108 - auc: 0.5116 - loss: 2.6138 - val_accuracy: 0.2084 - val_auc: 0.5759 - val_loss: 2.3643 - learning_rate: 1.0000e-04
Epoch 3/60
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.5125 - auc: 0.5140 - loss: 2.

KeyboardInterrupt: 

In [None]:
# Plot training history if training was performed
if history is not None:
    print("\n--- Plotting Training History ---")
    plt.figure(figsize=(15, 6))

    # Accuracy Plot
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.plot(history.history.get('auc', []), label='Training AUC') # Use .get for safety
    plt.plot(history.history.get('val_auc', []), label='Validation AUC')
    plt.title('Model Accuracy & AUC')
    plt.xlabel('Epoch')
    plt.ylabel('Metric Value')
    plt.legend()
    plt.grid(True)

    # Loss Plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()
else:
    print("Skipping history plot as training was not performed.")

In [None]:
# Evaluate the model on the evaluation set using model.evaluate()
print("\n--- Evaluating on Evaluation Dataset (using model.evaluate) ---")
eval_results = None
if eval_samples_count > 0:
    eval_results = model.evaluate(eval_gen_for_eval, steps=eval_steps, verbose=1)
    print(f"\nEvaluation Results - Loss: {eval_results[0]:.4f}, Accuracy: {eval_results[1]:.4f}, AUC: {eval_results[2]:.4f}")
else:
    print("Skipping evaluation due to missing evaluation data.")

In [None]:
print("\n--- Performing Advanced Evaluation (Predictions, CM, EER, t-DCF) ---")

y_pred_scores = []
y_true_labels = []
start_time = time.time()

if eval_samples_count > 0:
    print(f"Generating predictions for {eval_samples_count} samples...")
    # Use the separate generator instance for prediction
    for i in range(eval_steps + 1): # Add 1 to ensure all samples are covered if count isn't multiple of batch size
        try:
            batch_x, batch_y, _ = next(eval_gen_for_predict)
            if batch_x.shape[0] == 0: continue # Skip empty batches
            batch_pred = model.predict(batch_x, verbose=0)
            y_pred_scores.extend(batch_pred.flatten())
            y_true_labels.extend(batch_y)
        except StopIteration:
            break # Generator finished
    print(f"Prediction generation finished in {time.time() - start_time:.2f} seconds.")

    # Ensure lists are numpy arrays and have the same length
    y_pred_scores = np.array(y_pred_scores)
    y_true_labels = np.array(y_true_labels)
    min_len = min(len(y_pred_scores), len(y_true_labels), eval_samples_count) # Cap length
    y_pred_scores = y_pred_scores[:min_len]
    y_true_labels = y_true_labels[:min_len]

    if len(y_true_labels) == 0:
        print("No predictions generated or true labels available for advanced evaluation.")
    else:
        # --- F1 Score and Classification Report ---
        y_pred_binary = (y_pred_scores > 0.5).astype(int)
        f1 = f1_score(y_true_labels, y_pred_binary)
        print(f"\nF1 Score (Threshold 0.5): {f1:.4f}")
        print("\nClassification Report (Threshold 0.5):")
        # Use zero_division=0 to handle cases where a class might not be predicted
        print(classification_report(y_true_labels, y_pred_binary, target_names=['Fake (0)', 'Real (1)'], zero_division=0))

        # --- Confusion Matrix ---
        cm = confusion_matrix(y_true_labels, y_pred_binary)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'])
        plt.title('Confusion Matrix')
        plt.ylabel('True Label (0:Fake, 1:Real)')
        plt.xlabel('Predicted Label (0:Fake, 1:Real)')
        plt.show()

        # --- EER Calculation ---
        # Calculate False Positive Rate (FPR) and True Positive Rate (TPR)
        # Note: roc_curve expects true labels (0 or 1) and prediction scores (probabilities)
        fpr, tpr, thresholds = roc_curve(y_true_labels, y_pred_scores, pos_label=1) # Assuming Real=1 is positive
        fnr = 1 - tpr # False Negative Rate

        # Find the EER point where FPR is closest to FNR
        eer_index = np.nanargmin(np.abs(fpr - fnr))
        eer = (fpr[eer_index] + fnr[eer_index]) / 2.0 # Average FPR and FNR at intersection
        eer_threshold = thresholds[eer_index]

        print(f"\nEqual Error Rate (EER): {eer:.4f}")
        print(f"EER Threshold: {eer_threshold:.4f}")

        # Plot ROC Curve with EER point
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc(fpr, tpr):.4f})')
        plt.plot(fpr, fnr, label='FN Rate', linestyle='--') # Plot FNR vs FPR
        plt.plot([0, 1], [1, 0], 'k--', label='Random guess') # Diagonal line
        plt.scatter(fpr[eer_index], tpr[eer_index], color='red', zorder=5, label=f'EER ≈ {eer:.4f}')
        plt.xlabel('False Positive Rate (FPR)')
        plt.ylabel('True Positive Rate (TPR)')
        plt.title('Receiver Operating Characteristic (ROC) Curve & EER')
        plt.legend()
        plt.grid(True)
        plt.show()

        # --- t-DCF Calculation ---
        # ASVspoof 2019 LA parameters (example)
        p_target = 0.05  # Prior probability of target (bona fide/real) - Adjust if needed
        c_miss = 1       # Cost of missing a spoof (FN for spoof detection -> FP for real detection)
        c_fa_bona = 10   # Cost of false alarm on bona fide (FP for spoof detection -> FN for real detection)

        def calculate_t_dcf(frate, farate, p_target, c_miss, c_fa):
             """ Calculate t-DCF """
             dcf = c_miss * p_target * frate + c_fa * (1 - p_target) * farate
             # Normalize DCF by the cost of always missing target and always accepting non-target
             dcf_norm = dcf / min(c_miss * p_target, c_fa * (1 - p_target))
             return dcf_norm # Return normalized t-DCF

        # Calculate FAR and FRR at the EER threshold
        # FAR (Spoof classified as Real) = FPR when Real=1 is positive class
        # FRR (Real classified as Spoof) = FNR when Real=1 is positive class
        far_at_eer = fpr[eer_index]
        frr_at_eer = fnr[eer_index]

        # Calculate t-DCF (using FRR as miss rate for target=Real, FAR as false alarm rate for non-target=Fake)
        # Here, "miss" means missing a Real speaker (FRR), "false alarm" means misclassifying Fake as Real (FAR)
        # Check ASVspoof definitions carefully for your specific task interpretation
        # Assuming standard definition where target=bona fide (1):
        # P_miss = FRR (Bona fide classified as Spoof)
        # P_fa = FAR (Spoof classified as Bona fide)

        t_dcf_eer = calculate_t_dcf(frr_at_eer, far_at_eer, p_target, c_miss=c_fa_bona, c_fa=c_miss) # Swapped costs based on definition above
        print(f"Normalized t-DCF (at EER threshold): {t_dcf_eer:.4f}")

        # Find minimum t-DCF across all thresholds
        tdcf_values = [calculate_t_dcf(fnr[i], fpr[i], p_target, c_miss=c_fa_bona, c_fa=c_miss) for i in range(len(thresholds))]
        min_tdcf_index = np.nanargmin(tdcf_values)
        min_t_dcf = tdcf_values[min_tdcf_index]
        min_tdcf_threshold = thresholds[min_tdcf_index]
        print(f"Minimum Normalized t-DCF: {min_t_dcf:.4f} at Threshold: {min_tdcf_threshold:.4f}")

else:
    print("Skipping advanced evaluation due to missing evaluation data or predictions.")