# Audio Deepfake Detection Using ECAPA-TDNN Model
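This notebook demonstrates the complete workflow for detecting audio deepfakes using a pretrained ECAPA-TDNN model and a custom classifier. Note that in the cells below the pretrained ECAPA-TDNN model is only loaded; the custom classifier is trained and evaluated directly on flattened Mel spectrograms.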

Dataset : https://drive.google.com/drive/folders/18Wz86no7pKuXLqo0LKCF-_wQx3R-nf83?usp=sharing

(The data above is already preprocessed; the preprocessing code is in the `preprocessing_data_ecapa.ipynb` notebook.)


## Installation of Required Libraries

In [1]:
pip install torch torchaudio librosa speechbrain pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import torchaudio
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram, Resample

## Data Preparation and Custom Dataset Class

In [None]:
import os
import pandas as pd
import torchaudio
from torchaudio.transforms import Resample, MelSpectrogram
from torch.utils.data import Dataset, DataLoader

# Dataset class for Audio Deepfake Detection
class AudioDeepfakeDataset(Dataset):
    """Dataset yielding (Mel spectrogram, binary label) pairs for deepfake detection.

    Every row of the labels CSV is read through its ``filename`` and ``label``
    columns; ``filename`` is resolved relative to ``folder_path``.
    """

    def __init__(self, folder_path, labels_file, target_sr=16000):
        """
        Initializes the dataset for audio deepfake detection.

        Args:
            folder_path (str): Path to the folder containing audio files.
            labels_file (str): Path to the CSV file with filenames and labels.
            target_sr (int): Target sample rate for audio files.
        """
        self.folder_path = folder_path
        self.labels = pd.read_csv(labels_file)
        self.target_sr = target_sr
        self.mel_transform = MelSpectrogram(
            sample_rate=target_sr,
            n_fft=400,
            win_length=400,
            hop_length=160,
            n_mels=80
        )
        # Resample transforms keyed by source sample rate, so a transform is
        # built once per distinct rate instead of once per loaded file.
        self._resamplers = {}

    def __len__(self):
        """Returns the number of rows in the labels table."""
        return len(self.labels)

    def _get_resampler(self, orig_sr):
        """Returns the cached Resample transform from ``orig_sr`` to ``target_sr``."""
        resampler = self._resamplers.get(orig_sr)
        if resampler is None:
            resampler = Resample(orig_freq=orig_sr, new_freq=self.target_sr)
            self._resamplers[orig_sr] = resampler
        return resampler

    def __getitem__(self, idx):
        """
        Returns the Mel spectrogram and label for a given index.

        Args:
            idx (int): Index of the data point.

        Returns:
            torch.Tensor: Mel spectrogram of the audio.
            int: Label (0 for bonafide, 1 for spoof).
        """
        # Get file name and label
        row = self.labels.iloc[idx]
        file_path = os.path.join(self.folder_path, row['filename'])
        # Encode labels: spoof = 1, every other value = 0
        label = 1 if row['label'] == "spoof" else 0

        # Load the audio and bring it to the target sample rate if needed
        waveform, sr = torchaudio.load(file_path)
        if sr != self.target_sr:
            waveform = self._get_resampler(sr)(waveform)

        # Apply Mel spectrogram transform (feature extraction)
        mel_spec = self.mel_transform(waveform)

        return mel_spec, label


# Root folder of the preprocessed dataset. Set the DEEPFAKE_DATA_ROOT
# environment variable to point somewhere else; the default is the original
# location used by this notebook.
DATA_ROOT = os.environ.get(
    "DEEPFAKE_DATA_ROOT",
    "/Users/samruddhikale/Desktop/CAIR/OG1/Preprocessed_Dataset",
)

# Paths to your dataset and CSV labels (one folder + one CSV per split)
train_folder = os.path.join(DATA_ROOT, "train")
train_labels = os.path.join(DATA_ROOT, "train_labels.csv")
dev_folder = os.path.join(DATA_ROOT, "dev")
dev_labels = os.path.join(DATA_ROOT, "dev_labels.csv")
eval_folder = os.path.join(DATA_ROOT, "eval")
eval_labels = os.path.join(DATA_ROOT, "eval_labels.csv")

# Create datasets
train_dataset = AudioDeepfakeDataset(train_folder, train_labels)
dev_dataset = AudioDeepfakeDataset(dev_folder, dev_labels)
eval_dataset = AudioDeepfakeDataset(eval_folder, eval_labels)

# Create DataLoaders; only the training split is shuffled
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)





## Pretrained ECAPA-TDNN Model Setup



In [3]:
# The pretrained interfaces live in `speechbrain.inference` in current
# SpeechBrain releases; `speechbrain.pretrained` is the deprecated location and
# is kept only as a fallback for older installs.
try:
    from speechbrain.inference.speaker import SpeakerRecognition
except ImportError:
    from speechbrain.pretrained import SpeakerRecognition

# Pick the compute device: CUDA first, then Apple MPS, then CPU.
# `torch.backends.mps.is_available()` replaces the deprecated `torch.has_mps`.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

# Load pretrained ECAPA-TDNN model
# NOTE(review): no later cell in this notebook references `classifier`.
classifier = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp/speechbrain_ecapa",
    run_opts={"device": device}
)


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  from speechbrain.pretrained import SpeakerRecognition
  device = "cuda" if torch.cuda.is_available() else "mps" if torch.has_mps else "cpu"
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
IN

##  Custom Classifier for Deepfake Detection 

In [4]:
import torch.nn.functional as F

class DeepfakeClassifier(nn.Module):
    """Fully connected binary classifier that emits one raw logit per example.

    Layer widths run input_dim -> 512 -> 256 -> 128 -> 1. No sigmoid is applied
    inside the network.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Four linear stages narrowing down to a single output unit
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 1)
        # One dropout module, reused after the first two hidden stages
        self.dropout = nn.Dropout(0.5)
        # Normalizes the 512-wide activations that follow fc1
        self.batch_norm = nn.BatchNorm1d(512)

    def forward(self, x):
        """Maps a (batch, input_dim) tensor to (batch, 1) logits."""
        # Stage 1: linear -> ReLU -> batch norm -> dropout
        hidden = self.batch_norm(F.relu(self.fc1(x)))
        hidden = self.dropout(hidden)
        # Stage 2: linear -> ReLU -> dropout
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        # Stage 3: linear -> ReLU, then the output projection
        hidden = F.relu(self.fc3(hidden))
        return self.fc4(hidden)

## Training Loop

In [5]:
# Feature layout and training hyper-parameters, named once so the values used
# by the model, the optimizer, the loop and the log line cannot drift apart.
N_MELS = 80            # matches n_mels of the MelSpectrogram transform
N_FRAMES = 301         # time steps per clip
# NOTE(review): 301 frames corresponds to 3-second clips at 16 kHz with
# hop_length=160, assuming the spectrogram uses centered framing -- confirm
# against the preprocessing notebook.
NUM_EPOCHS = 10
LEARNING_RATE = 0.0001

# Initialize the classification model; its input is the whole flattened
# spectrogram of a clip (Mel bands * time steps)
input_dim = N_MELS * N_FRAMES
deepfake_classifier = DeepfakeClassifier(input_dim).to(device)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()  # Expects raw logits; applies the sigmoid internally
optimizer = optim.Adam(deepfake_classifier.parameters(), lr=LEARNING_RATE)

# Training loop with accuracy
for epoch in range(NUM_EPOCHS):
    deepfake_classifier.train()
    correct_predictions = 0
    total_predictions = 0
    running_loss = 0

    for mel_spec, labels in train_loader:
        mel_spec, labels = mel_spec.to(device), labels.to(device)

        # Flatten each Mel spectrogram to one feature vector per example
        mel_spec = mel_spec.view(mel_spec.size(0), -1)

        # Forward pass (raw logits, no sigmoid applied here)
        outputs = deepfake_classifier(mel_spec).squeeze(1)

        # Calculate loss
        loss = criterion(outputs, labels.float())
        running_loss += loss.item()

        # Metrics only: detach so the thresholding stays out of autograd
        predictions = torch.sigmoid(outputs.detach()) >= 0.5

        # Accuracy calculation
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Epoch loss is the mean of the per-batch losses; accuracy is per example
    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct_predictions / total_predictions

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%")

Epoch [1/10], Loss: 0.3667, Accuracy: 83.36%
Epoch [2/10], Loss: 0.2701, Accuracy: 88.74%
Epoch [3/10], Loss: 0.2269, Accuracy: 90.61%
Epoch [4/10], Loss: 0.2123, Accuracy: 91.59%
Epoch [5/10], Loss: 0.1839, Accuracy: 92.72%
Epoch [6/10], Loss: 0.1744, Accuracy: 93.09%
Epoch [7/10], Loss: 0.1607, Accuracy: 93.85%
Epoch [8/10], Loss: 0.1491, Accuracy: 94.27%
Epoch [9/10], Loss: 0.1431, Accuracy: 94.58%
Epoch [10/10], Loss: 0.1301, Accuracy: 94.98%


## Model Saving



In [7]:
import torch  # Make sure torch is imported

# Save the trained model
model_path = "/Users/samruddhikale/tensorflow-cair/model/deepfake_classifier.pth"  # Change this path to where you want to save the model

# torch.save does not create missing parent folders, so make sure the target
# directory exists first (skipped when the path has no directory component).
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Only the weights (state_dict) are stored, not the model object itself
torch.save(deepfake_classifier.state_dict(), model_path)

print(f"Model saved to {model_path}")


Model saved to /Users/samruddhikale/tensorflow-cair/model/deepfake_classifier.pth


## Evaluation Loop

In [8]:
# Evaluation pass over the eval split
deepfake_classifier.eval()  # switch the model out of training mode
running_loss = 0
correct_predictions = 0
total_predictions = 0

with torch.no_grad():  # gradients are not needed while scoring
    for batch_features, batch_labels in eval_loader:
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)

        # One flat feature row per example
        flat_features = batch_features.view(batch_features.size(0), -1)

        # Raw logits, shape (batch,)
        logits = deepfake_classifier(flat_features).squeeze(1)

        # Accumulate the batch-mean loss
        running_loss += criterion(logits, batch_labels.float()).item()

        # Threshold the sigmoid probability at 0.5, then count matches
        is_spoof = torch.sigmoid(logits) >= 0.5
        correct_predictions += (is_spoof == batch_labels).sum().item()
        total_predictions += batch_labels.size(0)

# Mean of the per-batch losses, and accuracy as a percentage of examples
eval_loss = running_loss / len(eval_loader)
eval_accuracy = 100 * correct_predictions / total_predictions

print(f"Evaluation Loss: {eval_loss:.4f}, Evaluation Accuracy: {eval_accuracy:.2f}%")


Evaluation Loss: 1.2472, Evaluation Accuracy: 93.13%


## Predicting on New Audio Files

In [9]:
# Function for making predictions on new data
def predict(model, audio_path, target_sr=16000):
    """Classify a single audio file as "spoof" or "bonafide".

    The model is switched to evaluation mode for the forward pass and put back
    into its previous mode afterwards, so the result does not depend on an
    earlier cell having called ``model.eval()``.

    Args:
        model (torch.nn.Module): Classifier that maps a flattened Mel
            spectrogram of shape (1, features) to a single logit.
        audio_path (str): Path of the audio file, loaded with torchaudio.
        target_sr (int): Sample rate the audio is resampled to when the file's
            own rate differs.

    Returns:
        str: "spoof" when sigmoid(logit) >= 0.5, otherwise "bonafide".
    """
    # Load and preprocess the audio file
    waveform, sr = torchaudio.load(audio_path)

    if sr != target_sr:
        waveform = Resample(orig_freq=sr, new_freq=target_sr)(waveform)

    # Apply Mel spectrogram transform (same settings as the training dataset)
    mel_transform = MelSpectrogram(
        sample_rate=target_sr, n_fft=400, win_length=400, hop_length=160, n_mels=80
    )
    mel_spec = mel_transform(waveform)

    # Flatten the Mel spectrogram into a single row (batch of one)
    mel_spec = mel_spec.reshape(1, -1)

    # Follow the model's own device; fall back to the notebook-level `device`
    # only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device
    mel_spec = mel_spec.to(target_device)

    # A batch of one cannot go through BatchNorm1d in training mode, and
    # dropout must be inactive, so force eval mode for the forward pass.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # No gradients needed for inference
            output = model(mel_spec).squeeze(1)
    finally:
        model.train(was_training)

    # Apply sigmoid to the output and threshold at 0.5 for binary classification
    prediction = torch.sigmoid(output) >= 0.5
    return "spoof" if prediction.item() else "bonafide"

# Example: run the trained classifier on one audio file and print the verdict.
# NOTE: the sample path below points into the `dev` folder of the preprocessed dataset.
audio_file = "/Users/samruddhikale/Desktop/CAIR/OG1/Preprocessed_Dataset/dev/LA_T_2312160.wav"  # Replace with the actual path to your new audio file
prediction = predict(deepfake_classifier, audio_file)
print(f"Prediction for the audio file: {prediction}")


Prediction for the audio file: spoof


## Sampling and Testing on Smaller Data Subset
Randomly samples 500 files from the dev set for quick testing.
Copies the sampled files to a new folder and saves their labels to a new CSV file.

In [39]:
import os
import pandas as pd
import shutil
import random

# Paths, all derived from one dataset root. Set the DEEPFAKE_DATA_ROOT
# environment variable to relocate the data; the default is the original path.
DATA_ROOT = os.environ.get(
    "DEEPFAKE_DATA_ROOT",
    "/Users/samruddhikale/Desktop/CAIR/OG1/Preprocessed_Dataset",
)
dev_folder = os.path.join(DATA_ROOT, "dev")
dev_labels_file = os.path.join(DATA_ROOT, "dev_labels.csv")
new_folder = os.path.join(DATA_ROOT, "new_dev")
new_labels_file = os.path.join(DATA_ROOT, "new_dev_labels.csv")

# Size and seed of the random subset
SUBSET_SIZE = 500
SUBSET_SEED = 42

# Create the new folder if it doesn't exist
os.makedirs(new_folder, exist_ok=True)

# Load the dev CSV. A separate name is used so that `dev_labels` (the CSV path
# passed to the dataset in an earlier cell) is not overwritten by a DataFrame.
dev_labels_df = pd.read_csv(dev_labels_file)

# Randomly sample up to SUBSET_SIZE rows; capped at the table size so a smaller
# dev set does not raise. The fixed seed keeps the selection reproducible.
sampled_labels = dev_labels_df.sample(
    n=min(SUBSET_SIZE, len(dev_labels_df)), random_state=SUBSET_SEED
)

# Copy the sampled files to the new folder
for filename in sampled_labels["filename"]:
    shutil.copy(os.path.join(dev_folder, filename), os.path.join(new_folder, filename))

# Save the sampled labels to a new CSV
sampled_labels.to_csv(new_labels_file, index=False)

print(f"Randomly selected {len(sampled_labels)} files have been copied to {new_folder}.")
print(f"Corresponding labels CSV has been saved to {new_labels_file}.")


Randomly selected 500 files have been copied to /Users/samruddhikale/Desktop/CAIR/OG1/Preprocessed_Dataset/new_dev.
Corresponding labels CSV has been saved to /Users/samruddhikale/Desktop/CAIR/OG1/Preprocessed_Dataset/new_dev_labels.csv.


## Predictions and Accuracy
Compares predictions with ground truth labels from the new dataset.



In [11]:
import os
import pandas as pd
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram, Resample

# Function for making predictions on an audio file
def predict(model, audio_path, target_sr=16000):
    """Classify a single audio file as "spoof" or "bonafide".

    NOTE(review): this re-defines the `predict` function from an earlier cell;
    the definition here is the one used by the loop below.

    The model is switched to evaluation mode for the forward pass and put back
    into its previous mode afterwards, so the result does not depend on an
    earlier cell having called ``model.eval()``.

    Args:
        model (torch.nn.Module): Classifier that maps a flattened Mel
            spectrogram of shape (1, features) to a single logit.
        audio_path (str): Path of the audio file, loaded with torchaudio.
        target_sr (int): Sample rate the audio is resampled to when the file's
            own rate differs.

    Returns:
        str: "spoof" when sigmoid(logit) >= 0.5, otherwise "bonafide".
    """
    # Load and preprocess the audio file
    waveform, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        waveform = Resample(orig_freq=sr, new_freq=target_sr)(waveform)

    # Apply Mel spectrogram transform (same settings as the training dataset)
    mel_transform = MelSpectrogram(
        sample_rate=target_sr, n_fft=400, win_length=400, hop_length=160, n_mels=80
    )
    mel_spec = mel_transform(waveform)

    # Flatten the Mel spectrogram into a single row (batch of one)
    mel_spec = mel_spec.reshape(1, -1)

    # Follow the model's own device; fall back to the notebook-level `device`
    # only for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device
    mel_spec = mel_spec.to(target_device)

    # A batch of one cannot go through BatchNorm1d in training mode, and
    # dropout must be inactive, so force eval mode for the forward pass.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # No gradients needed for inference
            output = model(mel_spec).squeeze(1)
    finally:
        model.train(was_training)

    # Apply sigmoid to the output and threshold at 0.5 for binary classification
    prediction = torch.sigmoid(output) >= 0.5
    return "spoof" if prediction.item() else "bonafide"

# Paths to the sampled subset, its labels CSV and the predictions output, all
# derived from one dataset root. Set the DEEPFAKE_DATA_ROOT environment
# variable to relocate the data; the default is the original path.
DATA_ROOT = os.environ.get(
    "DEEPFAKE_DATA_ROOT",
    "/Users/samruddhikale/Desktop/CAIR/OG1/Preprocessed_Dataset",
)
new_folder = os.path.join(DATA_ROOT, "new_dev")
new_labels_file = os.path.join(DATA_ROOT, "new_dev_labels.csv")
predictions_file = os.path.join(DATA_ROOT, "predictions.csv")

# Load the new labels CSV
new_labels = pd.read_csv(new_labels_file)

# Run the model on every listed file, keeping the ground truth next to the
# prediction so the two can be compared afterwards
results = []
for filename, label in zip(new_labels["filename"], new_labels["label"]):
    audio_path = os.path.join(new_folder, filename)
    prediction = predict(deepfake_classifier, audio_path)  # Model prediction
    results.append({"filename": filename, "label": label, "prediction": prediction})

# Save predictions to a new CSV file. Explicit columns keep the frame well
# formed even when the labels file has no rows.
predictions_df = pd.DataFrame(results, columns=["filename", "label", "prediction"])
predictions_df.to_csv(predictions_file, index=False)

print(f"Predictions saved to {predictions_file}")

# Compare predictions with ground truth
n_files = len(predictions_df)
correct = (predictions_df["label"] == predictions_df["prediction"]).sum()
accuracy = correct / n_files * 100 if n_files else 0.0
# Report the real number of evaluated files rather than a hard-coded count
print(f"Accuracy on the {n_files} files: {accuracy:.2f}%")


Predictions saved to /Users/samruddhikale/Desktop/CAIR/OG1/Preprocessed_Dataset/predictions.csv
Accuracy on the 100 files: 93.40%


#### This notebook provides a structured pipeline for audio deepfake detection, combining feature extraction, a pretrained model, and a custom classifier. 