In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, GRU, Dense, Dropout, BatchNormalization, LayerNormalization, Reshape, Permute, Bidirectional, Add, Attention, Flatten, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.layers import Layer, Concatenate
from tensorflow.keras import backend as K
from sklearn.metrics import f1_score
import librosa
import soundfile as sf
import noisereduce as nr
import matplotlib.pyplot as plt
from scipy.signal import butter, sosfilt

# Folder that collects every figure this notebook saves; created on first run.
FIGURES_DIR = 'training_figures'
if not os.path.isdir(FIGURES_DIR):
    os.makedirs(FIGURES_DIR, exist_ok=True)

In [None]:
# Prefer the first GPU (when one is present) and let TensorFlow grow its memory
# on demand instead of reserving all of it up front.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.set_visible_devices(physical_devices[0], 'GPU')
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except RuntimeError as e:
        # Both settings must be applied before TensorFlow initialises the GPU.
        # Re-running this cell in a live kernel would otherwise abort here.
        print(f"GPU configuration skipped (runtime already initialized): {e}")
else:
    print("No GPU devices found")

In [None]:
# Data augmentation and preprocessing
def load_and_preprocess_audio(file_path, sr=16000, duration=4, augment=True):
    """Load an audio clip, optionally augment it, and return a fixed-length waveform.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        sr: Sample rate the clip is resampled to.
        duration: Clip length in seconds; the result holds ``sr * duration`` samples.
        augment: When True (the default, matching the previous behaviour) one
            random augmentation is applied with 50% probability. Pass False for
            validation/evaluation data so that metrics are computed on
            unmodified audio.

    Returns:
        A 1-D numpy array of exactly ``sr * duration`` samples, mean-removed and
        peak-normalised, or ``None`` if the file could not be loaded/processed
        (the error is printed, not raised).
    """
    try:
        audio, sr = librosa.load(file_path, sr=sr, duration=duration)
        if len(audio) == 0:
            # An empty clip cannot be normalised; report it through the same
            # error path as any other unreadable file.
            raise ValueError("audio file contains no samples")
        target_length = int(round(sr * duration))

        # Data Augmentation (increased probability and variety)
        if augment and np.random.random() < 0.5:  # 50% chance of applying augmentation
            augmentation_type = np.random.choice(['noise', 'pitch', 'speed'])
            if augmentation_type == 'noise':
                noise = np.random.randn(len(audio)) * 0.005
                audio = audio + noise
            elif augmentation_type == 'pitch':
                audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=np.random.uniform(-2, 2))
            else:  # speed
                audio = librosa.effects.time_stretch(audio, rate=np.random.uniform(0.8, 1.2))

        # Normalize audio (remove DC offset, then scale the peak to 1)
        audio = audio - np.mean(audio)
        peak = np.max(np.abs(audio))
        if peak > 0:
            # A silent clip has peak 0; dividing would fill the array with NaN.
            audio = audio / peak

        # Force the fixed length: pad short clips with zeros, and cut clips that
        # time-stretching with rate < 1 made longer than the target.
        if len(audio) < target_length:
            audio = np.pad(audio, (0, target_length - len(audio)))
        elif len(audio) > target_length:
            audio = audio[:target_length]
        return audio
    except Exception as e:
        # Best-effort loading: callers skip files for which None is returned.
        print(f"Error loading {file_path}: {e}")
        return None

In [None]:
def extract_features(audio, sr=16000, n_mels=80, n_fft=2048, hop_length=512):
    """Turn a waveform into a standardised log-mel spectrogram.

    Args:
        audio: 1-D waveform, or ``None`` (propagated as ``None`` so that callers
            can skip clips that failed to load).
        sr: Sample rate of ``audio``.
        n_mels: Number of mel bands (rows of the result).
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.

    Returns:
        A 2-D array with ``n_mels`` rows holding the log-mel spectrogram
        normalised to zero mean and unit variance over the whole clip,
        or ``None`` when ``audio`` is ``None``.
    """
    if audio is None:
        return None

    # Extract mel spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalize features. A constant spectrogram (e.g. a silent clip) has zero
    # standard deviation; dividing by it would produce NaN, so only centre it.
    mean = np.mean(log_mel_spec)
    std = np.std(log_mel_spec)
    if std < 1e-8:
        std = 1.0
    return (log_mel_spec - mean) / std

In [None]:
# Analyze class distribution for debugging class imbalance
def analyze_class_distribution(data_path):
    """Print and return the number of ``.wav`` files per class under ``data_path``.

    Args:
        data_path: Directory that contains the ``real`` and ``fake`` sub-folders.

    Returns:
        ``{'real': <count>, 'fake': <count>}``.

    Raises:
        OSError: if one of the two sub-folders cannot be listed.
    """
    counts = {}
    for class_name in ('real', 'fake'):
        class_dir = os.path.join(data_path, class_name)
        counts[class_name] = len([f for f in os.listdir(class_dir) if f.endswith('.wav')])

    real_count, fake_count = counts['real'], counts['fake']
    total = real_count + fake_count
    # An empty dataset is reported as 0% instead of raising ZeroDivisionError.
    real_pct = real_count / total * 100 if total else 0.0
    fake_pct = fake_count / total * 100 if total else 0.0

    print(f"\nClass Distribution for {data_path}:")
    print(f"Real: {real_count} ({real_pct:.2f}%)")
    print(f"Fake: {fake_count} ({fake_pct:.2f}%)")
    return {'real': real_count, 'fake': fake_count}

In [None]:
# Data generator with sample weights and debugging utilities
def data_generator(data_path, batch_size=128, shuffle=True):
    """Endlessly yield ``(features, labels, sample_weights)`` batches.

    ``.wav`` files are read from ``data_path/real`` (label 1) and
    ``data_path/fake`` (label 0). Each file goes through
    ``load_and_preprocess_audio`` and ``extract_features``; files for which
    feature extraction returns ``None`` are skipped, so a batch may hold fewer
    than ``batch_size`` items.

    Args:
        data_path: Directory containing the ``real`` and ``fake`` sub-folders.
        batch_size: Maximum number of files per batch (must be positive).
        shuffle: Reshuffle the file order at the start of every pass.

    Yields:
        A tuple of three numpy arrays: features of shape
        ``(batch, frames, n_mels)`` zero-padded along the frame axis to the
        longest item of the batch, integer labels, and per-sample weights that
        balance the two classes (``total / (2 * class_count)``).

    Raises:
        ValueError: on the first ``next()`` call if no ``.wav`` file is found
            or ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    real_dir = os.path.join(data_path, 'real')
    fake_dir = os.path.join(data_path, 'fake')
    # Sorted so that the order is reproducible when shuffle=False.
    real_files = [os.path.join(real_dir, f) for f in sorted(os.listdir(real_dir)) if f.endswith('.wav')]
    fake_files = [os.path.join(fake_dir, f) for f in sorted(os.listdir(fake_dir)) if f.endswith('.wav')]

    all_files = real_files + fake_files
    labels = [1] * len(real_files) + [0] * len(fake_files)

    total_samples = len(all_files)
    if total_samples == 0:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError(f"No .wav files found in {real_dir} or {fake_dir}")

    # Inverse-frequency weights; a class without files simply gets no entry
    # (it can never be looked up) instead of triggering a division by zero.
    class_weights = {
        label: total_samples / (2 * count)
        for label, count in ((1, len(real_files)), (0, len(fake_files)))
        if count > 0
    }

    while True:
        if shuffle:
            temp = list(zip(all_files, labels))
            np.random.shuffle(temp)
            all_files, labels = zip(*temp)

        for i in range(0, len(all_files), batch_size):
            batch_files = all_files[i:i+batch_size]
            batch_labels = labels[i:i+batch_size]

            batch_x = []
            batch_y = []
            batch_weights = []

            for file_path, label in zip(batch_files, batch_labels):
                audio = load_and_preprocess_audio(file_path)
                features = extract_features(audio)

                if features is not None:
                    # Transpose to (frames, n_mels) so time is the sequence axis.
                    batch_x.append(features.T)
                    batch_y.append(label)
                    batch_weights.append(class_weights[label])

            if not batch_x:
                # Every file of this slice failed to load; nothing to yield.
                continue

            # Pad sequences to the longest item of the batch
            max_length = max(x.shape[0] for x in batch_x)
            padded_batch_x = [
                np.pad(x, ((0, max_length - x.shape[0]), (0, 0)), mode='constant')
                for x in batch_x
            ]

            yield np.array(padded_batch_x), np.array(batch_y), np.array(batch_weights)

In [None]:
class MFM(Layer):
    """Max-Feature-Map activation.

    Splits the last (channel) axis into two halves and returns their
    element-wise maximum, halving the number of channels.
    """

    def __init__(self, **kwargs):
        super(MFM, self).__init__(**kwargs)

    def call(self, inputs):
        channels = inputs.shape[-1]
        if channels is None:
            # Channel count unknown while building the graph: fall back to
            # fully dynamic slicing (the output's static shape is then unknown).
            shape = tf.shape(inputs)
            half = shape[-1] // 2
            return tf.reshape(
                tf.math.maximum(inputs[:, :, :half], inputs[:, :, half:]),
                (shape[0], shape[1], half),
            )

        if channels % 2 != 0:
            raise ValueError(f"MFM needs an even number of channels, got {channels}")

        # Slicing with Python ints keeps the static channel dimension, which
        # layers applied afterwards (Conv1D, Dense) need in order to build.
        half = channels // 2
        return tf.math.maximum(inputs[..., :half], inputs[..., half:])

    def compute_output_shape(self, input_shape):
        # Same shape as the input with the channel axis halved (when known).
        output_shape = list(input_shape)
        if output_shape[-1] is not None:
            output_shape[-1] = output_shape[-1] // 2
        return tuple(output_shape)
    
def create_enhanced_lc_grnn_model(input_shape, l2_strength=0.02, conv_dropout=0.4, dropout=0.5):
    """Build the light-convolutional + bidirectional-GRU binary classifier.

    Architecture (in order): batch normalisation, two Conv1D -> MFM ->
    MaxPooling1D -> Dropout stages, two residual bidirectional-GRU blocks,
    a self-attention residual, a final bidirectional GRU that collapses the
    sequence, one hidden Dense layer and a single sigmoid output unit.

    Args:
        input_shape: Shape of one sample without the batch axis,
            e.g. ``(None, 80)`` for variable-length sequences of 80 features.
        l2_strength: L2 kernel-regularisation factor used by every
            regularised layer (default 0.02, the previously hard-coded value).
        conv_dropout: Dropout rate after each convolutional stage (default 0.4).
        dropout: Dropout rate after the recurrent and dense layers (default 0.5).

    Returns:
        An uncompiled ``Model`` mapping the input to a probability in [0, 1].
    """
    def l2_reg():
        # A fresh regulariser per layer, as in the original layer-by-layer code.
        return tf.keras.regularizers.l2(l2_strength)

    inputs = Input(shape=input_shape)

    x = BatchNormalization()(inputs)

    # Light Convolutional layers with increased regularization.
    # MFM halves the channels, so the stages output 16 and 32 channels.
    for filters, kernel_size in ((32, 5), (64, 3)):
        x = Conv1D(filters, kernel_size, padding='same', activation='relu', kernel_regularizer=l2_reg())(x)
        x = MFM()(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Dropout(conv_dropout)(x)

    # Bidirectional GRU layers with residual connections and increased regularization
    for units in [64, 32]:
        gru = Bidirectional(GRU(units // 2, return_sequences=True, kernel_regularizer=l2_reg()))
        gru_output = gru(x)
        # Project back to the width of x so the residual Add() shapes match.
        # The static shape is read from the tensor itself rather than through
        # the legacy backend helper K.int_shape.
        gru_output = Dense(x.shape[-1])(gru_output)
        x = Add()([x, gru_output])
        x = LayerNormalization()(x)
        x = Dropout(dropout)(x)

    # Attention mechanism (self-attention: query and value are both x)
    attention = Attention()([x, x])
    x = Add()([x, attention])

    # Final GRU layer (no return_sequences: reduces the sequence to one vector)
    x = Bidirectional(GRU(16, kernel_regularizer=l2_reg()))(x)
    x = LayerNormalization()(x)
    x = Dropout(dropout)(x)

    # Dense layers with increased regularization
    x = Dense(32, activation='relu', kernel_regularizer=l2_reg())(x)
    x = Dropout(dropout)(x)

    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)

    return model

In [None]:
# Locations of the three dataset splits
train_data_path, dev_data_path, eval_data_path = (
    'datasetNEW/train',
    'datasetNEW/dev',
    'datasetNEW/eval',
)

# Report the real/fake balance of every split
print("Training set class distribution:")
for split_path in (train_data_path, dev_data_path):
    analyze_class_distribution(split_path)
# Kept as the cell's final expression so the notebook still echoes the returned counts.
analyze_class_distribution(eval_data_path)

In [None]:
def count_files(path):
    """Return how many ``.wav`` files sit directly in ``path/real`` and ``path/fake`` combined."""
    total = 0
    for class_name in ('real', 'fake'):
        entries = os.listdir(os.path.join(path, class_name))
        total += sum(1 for entry in entries if entry.endswith('.wav'))
    return total

# Create generators
batch_size = 128  # Increased batch size
train_gen = data_generator(train_data_path, batch_size=batch_size)
dev_gen = data_generator(dev_data_path, batch_size=batch_size)
eval_gen = data_generator(eval_data_path, batch_size=batch_size)

# Calculate steps per epoch
train_samples_count = count_files(train_data_path)
dev_samples_count = count_files(dev_data_path)
eval_samples_count = count_files(eval_data_path)

# Ceiling division: the generators also emit the final, smaller batch of each
# pass, so rounding down would skip samples every epoch and would give 0 steps
# for a split that holds fewer files than one batch.
steps_per_epoch = (train_samples_count + batch_size - 1) // batch_size
validation_steps = (dev_samples_count + batch_size - 1) // batch_size
eval_steps = (eval_samples_count + batch_size - 1) // batch_size

In [None]:
# Create a custom callback to save training history plots after each epoch
class PlotTrainingHistory(Callback):
    """Save an accuracy/loss figure to ``FIGURES_DIR`` after every epoch.

    The callback records the ``logs`` passed to ``on_epoch_end`` itself rather
    than reading ``self.model.history``: that attribute is maintained by
    Keras's own History callback, which is not guaranteed to have been
    updated (or to exist yet) when this hook runs.

    Args:
        model_name: Prefix used for the saved file names
            (``<model_name>_epoch_<n>.png``).
    """

    def __init__(self, model_name='model'):
        super().__init__()
        self.model_name = model_name
        # Metric name -> list with one value per finished epoch.
        self.epoch_logs = {}

    def on_train_begin(self, logs=None):
        # Start from an empty history for every call to fit().
        self.epoch_logs = {}

    def on_epoch_end(self, epoch, logs=None):
        for name, value in (logs or {}).items():
            self.epoch_logs.setdefault(name, []).append(value)

        # Plot training history
        fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))
        self._plot_metric(accuracy_ax, 'accuracy', 'Accuracy')
        self._plot_metric(loss_ax, 'loss', 'Loss')
        fig.tight_layout()

        # Save the plot to a file
        filepath = os.path.join(FIGURES_DIR, f'{self.model_name}_epoch_{epoch+1}.png')
        fig.savefig(filepath)
        plt.close(fig)  # release the figure; one is created per epoch
        print(f'Training history plot saved to {filepath}')

    def _plot_metric(self, ax, key, label):
        # Draw the training and validation curves of one metric on `ax`;
        # a metric that was not logged is drawn as an empty curve.
        ax.plot(self.epoch_logs.get(key, []), label=f'Training {label}')
        ax.plot(self.epoch_logs.get(f'val_{key}', []), label=f'Validation {label}')
        ax.set_title(f'Model {label}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.legend()

# Checkpoints: weights are written after every epoch into this directory.
# NOTE(review): some Keras versions may require weight-only checkpoint paths to
# use a specific file suffix — confirm against the installed version.
checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callbacks (the first three all watch the validation loss)
# Multiply the learning rate by 0.2 after 5 epochs without improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6, verbose=1)

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)

# Save every epoch, not just the best
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=False,
    verbose=1,
)

# Writes one accuracy/loss figure per epoch.
plot_training_callback = PlotTrainingHistory(model_name='audio_model')

In [None]:
# Build the network for variable-length sequences of 80 features, then compile it
input_shape = (None, 80)
model = create_enhanced_lc_grnn_model(input_shape)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(),
    ],
)

In [None]:
# Fit on the training generator and validate on the dev generator each epoch
history = model.fit(
    train_gen,
    epochs=50,  # upper bound; early stopping may end training sooner
    steps_per_epoch=steps_per_epoch,
    validation_data=dev_gen,
    validation_steps=validation_steps,
    callbacks=[
        reduce_lr,
        early_stopping,
        checkpoint_callback,
        plot_training_callback,
    ],
)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Score the trained model on the held-out evaluation split
print("\nEvaluating on Evaluation Dataset:")
eval_results = model.evaluate(
    eval_gen,
    steps=eval_steps,
)
# eval_results is ordered as [loss, accuracy, AUC], matching the compile() call.
print(
    f"Evaluation Results - Loss: {eval_results[0]}, "
    f"Accuracy: {eval_results[1]}, AUC: {eval_results[2]}"
)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt

# EER and t-DCF related imports
from scipy.interpolate import interp1d

# Define t-DCF parameters (these should be set according to your task)
p_target = 0.05  # Prior probability of target speaker
c_miss = 1       # Cost of a miss (false negative)
c_false_alarm = 1 # Cost of a false alarm (false positive)

# Start from a fresh generator so this cell does not depend on how many
# batches earlier cells already consumed.
eval_gen = data_generator(eval_data_path, batch_size=batch_size)

# Generate predictions and collect true labels
y_pred = []
y_true = []
for _step in range(eval_steps):
    batch_x, batch_y, _batch_weights = next(eval_gen)
    batch_pred = model.predict(batch_x, verbose=0)
    y_pred.extend(batch_pred.flatten())
    y_true.extend(batch_y)

# Convert to numpy arrays and ensure same length
y_pred = np.array(y_pred)
y_true = np.array(y_true)
min_len = min(len(y_pred), len(y_true))
y_pred = y_pred[:min_len]
y_true = y_true[:min_len]

# Convert predictions to binary (0 or 1)
y_pred_binary = (y_pred > 0.5).astype(int)


# Calculate F1 Score
f1 = f1_score(y_true, y_pred_binary)
print(f"F1 Score: {f1:.4f}")


# Create confusion matrix. The labels are fixed so the matrix is always 2x2
# (row/column 0 = fake, 1 = real), even if one class is absent from the data
# or from the predictions.
cm = confusion_matrix(y_true, y_pred_binary, labels=[0, 1])

# Convert confusion matrix to row percentages; a class with no samples keeps a
# row of zeros instead of producing NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percentage = np.divide(
    cm * 100.0,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0,
)

# Visualize confusion matrix as percentages
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm_percentage,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    cbar_kws={'format': '%.0f%%'},
    xticklabels=['fake', 'real'],
    yticklabels=['fake', 'real'],
)
plt.title('Confusion Matrix (Percentage)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig(os.path.join(FIGURES_DIR, 'confusion_matrix.png'))  # Save confusion matrix
plt.show()


#---------------------------------------------------------------
# EER Calculation
#---------------------------------------------------------------
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)

# The equal error rate sits where the false-positive rate meets the
# false-negative rate (1 - tpr); take the ROC point closest to that crossing.
fnr = 1 - tpr
eer_index = np.argmin(np.abs(fpr - fnr))

# Threshold and error rate at that operating point
eer_threshold = thresholds[eer_index]
eer = fpr[eer_index]

print(f"EER: {eer:.4f}")

#---------------------------------------------------------------
# t-DCF Calculation
#---------------------------------------------------------------
def calculate_t_dcf(y_true, y_pred, p_target, c_miss, c_false_alarm, threshold):
    """Compute a detection cost at a fixed decision threshold.

    The value returned is
    ``c_miss * p_target * miss_rate + c_false_alarm * (1 - p_target) * false_alarm_rate``.
    NOTE(review): this is a single-system detection cost; "t-DCF" usually
    refers to the tandem detection cost function, which is a different
    quantity — confirm which metric is intended before reporting it.

    Args:
        y_true: Ground-truth labels, 1 for the positive (target) class and 0
            for the negative class.
        y_pred: Scores; a sample is accepted as positive when its score is
            ``>= threshold``.
        p_target: Prior probability of the positive class.
        c_miss: Cost of rejecting a positive sample (miss).
        c_false_alarm: Cost of accepting a negative sample (false alarm).
        threshold: Decision threshold applied to ``y_pred``.

    Returns:
        The cost as a float. A rate whose class has no samples counts as 0.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred, dtype=float).ravel()

    # Apply threshold to get binary decisions
    accepted = scores >= threshold
    positives = labels == 1
    negatives = labels == 0

    # Confusion-matrix counts, computed directly so that data containing a
    # single class still yields all four values.
    tp = int(np.sum(positives & accepted))
    fn = int(np.sum(positives & ~accepted))
    fp = int(np.sum(negatives & accepted))
    tn = int(np.sum(negatives & ~accepted))

    # False Alarm Rate (FAR) and Miss Rate (FR); guard empty classes
    far = fp / (tn + fp) if (tn + fp) else 0.0
    fr = fn / (tp + fn) if (tp + fn) else 0.0

    # Weighted detection cost
    t_dcf = c_miss * p_target * fr + c_false_alarm * (1 - p_target) * far

    return t_dcf

# Evaluate the detection cost at the operating point found for the EER
t_dcf = calculate_t_dcf(
    y_true=y_true,
    y_pred=y_pred,
    p_target=p_target,
    c_miss=c_miss,
    c_false_alarm=c_false_alarm,
    threshold=eer_threshold,
)
print(f"t-DCF: {t_dcf:.4f}")

In [None]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import noisereduce as nr
from scipy.signal import butter, sosfilt
import soundfile as sf

def denoise_and_amplify(audio, sr):
    """Apply non-stationary noise reduction to ``audio``.

    Despite the name, no amplification step is performed here.

    Args:
        audio: Waveform passed to ``nr.reduce_noise`` as ``y``.
        sr: Sample rate of ``audio``.

    Returns:
        The noise-reduced signal, or ``audio`` unchanged if noise reduction
        raised an exception (the error is printed).
    """
    try:
        # Noise Reduction
        cleaned = nr.reduce_noise(y=audio, sr=sr, stationary=False)
    except Exception as e:
        print(f"Error in denoise_and_amplify: {e}")
        cleaned = audio  # fall back to the untouched input
    return cleaned

def preprocess_and_visualize(file_path, sr=16000, duration=4):
    """Denoise one clip and save before/after audio files and mel spectrograms.

    Side effects: writes ``original_audio.wav`` and ``noise_reduced_audio.wav``
    to the current working directory and ``mel_spectrograms.png`` to
    ``FIGURES_DIR``.

    Args:
        file_path: Audio file to load.
        sr: Sample rate the clip is resampled to.
        duration: Maximum number of seconds loaded from the file.
    """
    # Load the audio file
    audio, sr = librosa.load(file_path, sr=sr, duration=duration)

    # Save original audio
    sf.write('original_audio.wav', audio, sr)

    # Noise Reduction using denoise_and_amplify function
    reduced_noise = denoise_and_amplify(audio, sr)
    sf.write('noise_reduced_audio.wav', reduced_noise, sr)

    # Two panels side by side: original and denoised. (A 1x3 grid with only
    # the outer panels drawn would leave an empty gap in the middle.)
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    panels = (
        (axes[0], audio, 'Original Mel Spectrogram'),
        (axes[1], reduced_noise, 'Final Mel Spectrogram'),
    )
    for ax, signal, title in panels:
        mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=80, n_fft=2048, hop_length=512)
        librosa.display.specshow(
            librosa.power_to_db(mel_spec, ref=np.max),
            sr=sr,
            hop_length=512,
            x_axis='time',
            y_axis='mel',
            ax=ax,
        )
        ax.set_title(title)

    fig.tight_layout()
    fig.savefig(os.path.join(FIGURES_DIR, 'mel_spectrograms.png')) # saves graphs
    plt.close(fig)

# Example run on a single clip (path is relative to the working directory)
file_path = 'A_2582_0_A.wav'
preprocess_and_visualize(file_path=file_path)