In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.transforms as T
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
import os
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

# Function to extract MFCC features
def extract_features(file_path, sr=22050, n_mfcc=13):
    audio, sr = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfccs, axis=1)  # Convert to fixed-size feature vector

# Function to load dataset dynamically
def load_dataset(base_path):
    data, labels = [], []
    for category in ["real", "fake"]:  # Two categories
        folder_path = os.path.join(base_path, category)
        for file in os.listdir(folder_path):
            if file.endswith(".wav"):  # Only process .wav files
                file_path = os.path.join(folder_path, file)
                features = extract_features(file_path)
                data.append(features)
                labels.append(category)

    return np.array(data), np.array(labels)

# Load training, validation, and testing data
train_data, train_labels = load_dataset("./for-2sec/training")
val_data, val_labels = load_dataset("./for-2sec/validation")
test_data, test_labels = load_dataset("./for-2sec/testing")

# Encode labels (Real = 0, Fake = 1)
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)


In [10]:
class DeepfakeAudioModel(nn.Module):
    def __init__(self):
        super(DeepfakeAudioModel, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2))
        self.dropout = nn.Dropout(0.3)
        
        self.fc1 = nn.Linear(64 * 32 * 32, 128)  # Adjust based on input shape
        self.fc2 = nn.Linear(128, 2)  # 2 classes: Real/Fake
        
    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = torch.flatten(x, start_dim=1)
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeepfakeAudioModel().to(device)



In [4]:
import torch.nn as nn
import torch.optim as optim

class DeepfakeAudioModel(nn.Module):
    def __init__(self, input_size):
        super(DeepfakeAudioModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)  # Output: Real (0) or Fake (1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return self.softmax(x)

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = train_data.shape[1]  # Number of MFCC features
model = DeepfakeAudioModel(input_size).to(device)


In [12]:
# Loss function with class weights (for imbalance)
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
train_labels = [train_dataset[i][1].item() for i in range(len(train_dataset))]  # Extract labels properly
class_weights = compute_class_weight("balanced", classes=np.array([0, 1]), y=np.array(train_labels))
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training function
def train_model(model, train_loader, val_loader, epochs=10):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct, total = 0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        epoch_loss = running_loss / len(train_loader)
        epoch_acc = correct / total * 100
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

        # Evaluate
        evaluate_model(model, val_loader)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total * 100
    print(f"Validation Accuracy: {accuracy:.2f}%")

# Train the model
train_model(model, train_loader, val_loader, epochs=10)


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [16, 13]

In [13]:
from sklearn.metrics import classification_report

def evaluate_test_set(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().numpy())  # Convert to NumPy
            all_labels.extend(labels.cpu().numpy())

    return all_labels, all_preds

# Get predictions
true_labels, pred_labels = evaluate_test_set(model, test_loader)


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [16, 13]

In [8]:
report = classification_report(true_labels, pred_labels, target_names=["Real", "Fake"])
print(report)


              precision    recall  f1-score   support

        Real       0.00      0.00      0.00       544
        Fake       0.50      1.00      0.67       544

    accuracy                           0.50      1088
   macro avg       0.25      0.50      0.33      1088
weighted avg       0.25      0.50      0.33      1088



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight="balanced", classes=[0,1], y=true_labels)  
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)  

criterion = nn.CrossEntropyLoss(weight=class_weights)  # Use weighted loss  


InvalidParameterError: The 'classes' parameter of compute_class_weight must be an instance of 'numpy.ndarray'. Got [0, 1] instead.

In [14]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Feature Extraction Function
def extract_features(file_path, sr=22050, n_mfcc=20):
    audio, sr = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    delta_mfccs = librosa.feature.delta(mfccs)
    combined_features = np.hstack((mfccs.mean(axis=1), delta_mfccs.mean(axis=1)))  # [2 * n_mfcc]
    return combined_features

# Function to Load Dataset
def load_dataset(base_path):
    data, labels = [], []
    for category in ["real", "fake"]:  # Two categories
        folder_path = os.path.join(base_path, category)
        for file in os.listdir(folder_path):
            if file.endswith(".wav"):
                file_path = os.path.join(folder_path, file)
                features = extract_features(file_path)
                data.append(features)
                labels.append(category)

    return np.array(data), np.array(labels)

# Load Datasets
train_data, train_labels = load_dataset("./for-2sec/training")
val_data, val_labels = load_dataset("./for-2sec/validation")
test_data, test_labels = load_dataset("./for-2sec/testing")

# Encode Labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)  # 0 = Real, 1 = Fake
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

# Normalize Features
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
test_data = scaler.transform(test_data)

# Convert to Torch Tensors
train_data, train_labels = torch.tensor(train_data, dtype=torch.float32), torch.tensor(train_labels, dtype=torch.long)
val_data, val_labels = torch.tensor(val_data, dtype=torch.float32), torch.tensor(val_labels, dtype=torch.long)
test_data, test_labels = torch.tensor(test_data, dtype=torch.float32), torch.tensor(test_labels, dtype=torch.long)

# Create Dataset & DataLoader
class AudioDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

train_loader = DataLoader(AudioDataset(train_data, train_labels), batch_size=16, shuffle=True)
val_loader = DataLoader(AudioDataset(val_data, val_labels), batch_size=16, shuffle=False)
test_loader = DataLoader(AudioDataset(test_data, test_labels), batch_size=16, shuffle=False)

# Define Neural Network Model
class DeepfakeAudioModel(nn.Module):
    def __init__(self, input_size):
        super(DeepfakeAudioModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)  # Output 2 classes (Real/Fake)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Model Initialization
input_size = train_data.shape[1]  # Number of MFCC features
model = DeepfakeAudioModel(input_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Function
def train_model(model, train_loader, val_loader, epochs=20):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        # Validation Accuracy
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                preds = torch.argmax(outputs, axis=1)
                val_preds.extend(preds.numpy())
                val_true.extend(labels.numpy())

        val_acc = accuracy_score(val_true, val_preds)
        print(f"Epoch {epoch+1}, Loss: {running_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

# Train the Model
train_model(model, train_loader, val_loader, epochs=20)

# Test Model Performance
model.eval()
test_preds, test_true = [], []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        preds = torch.argmax(outputs, axis=1)
        test_preds.extend(preds.numpy())
        test_true.extend(labels.numpy())

# Print Classification Report
print("\nTest Classification Report:\n", classification_report(test_true, test_preds, target_names=["Real", "Fake"]))


Epoch 1, Loss: 243.2051, Validation Accuracy: 0.9505
Epoch 2, Loss: 85.0980, Validation Accuracy: 0.9632
Epoch 3, Loss: 56.1333, Validation Accuracy: 0.9770
Epoch 4, Loss: 42.2342, Validation Accuracy: 0.9749
Epoch 5, Loss: 32.3017, Validation Accuracy: 0.9770
Epoch 6, Loss: 27.9505, Validation Accuracy: 0.9841
Epoch 7, Loss: 21.3185, Validation Accuracy: 0.9816
Epoch 8, Loss: 16.9975, Validation Accuracy: 0.9851
Epoch 9, Loss: 13.9361, Validation Accuracy: 0.9851
Epoch 10, Loss: 11.3487, Validation Accuracy: 0.9844
Epoch 11, Loss: 12.8547, Validation Accuracy: 0.9837
Epoch 12, Loss: 10.8675, Validation Accuracy: 0.9866
Epoch 13, Loss: 11.5887, Validation Accuracy: 0.9883
Epoch 14, Loss: 7.4008, Validation Accuracy: 0.9862
Epoch 15, Loss: 8.6952, Validation Accuracy: 0.9858
Epoch 16, Loss: 4.9800, Validation Accuracy: 0.9802
Epoch 17, Loss: 7.9735, Validation Accuracy: 0.9869
Epoch 18, Loss: 4.9051, Validation Accuracy: 0.9869
Epoch 19, Loss: 7.1266, Validation Accuracy: 0.9858
Epoch 2

In [15]:
import torch
import pickle

# Save the model's state dictionary
torch.save(model.state_dict(), "deepfake_audio_model.pth")

# Save the LabelEncoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print("Model and label encoder saved successfully!")


Model and label encoder saved successfully!


In [16]:
print(model)


DeepfakeAudioModel(
  (fc1): Linear(in_features=40, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)


In [17]:
import os
import numpy as np
import librosa
from sklearn.preprocessing import LabelEncoder
import pickle

def extract_features(file_path, sr=22050, n_mfcc=40):
    """Extract MFCC features from an audio file."""
    audio, sr = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfccs, axis=1)  # Convert to fixed-size feature vector

def load_dataset(base_path):
    """Load dataset dynamically and extract features."""
    data, labels = [], []
    for category in ["real", "fake"]:  
        folder_path = os.path.join(base_path, category)
        for file in os.listdir(folder_path):
            if file.endswith(".wav"):  
                file_path = os.path.join(folder_path, file)
                features = extract_features(file_path)
                data.append(features)
                labels.append(category)

    return np.array(data), np.array(labels)

# Load training, validation, and test sets
train_data, train_labels = load_dataset("./for-2sec/training")
val_data, val_labels = load_dataset("./for-2sec/validation")
test_data, test_labels = load_dataset("./for-2sec/testing")

# Encode labels (Real = 0, Fake = 1)
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

# Save extracted features & labels
with open("features.pkl", "wb") as f:
    pickle.dump((train_data, train_labels, val_data, val_labels, test_data, test_labels, label_encoder), f)

print("Feature extraction complete! Data saved.")
import torch
import torch.nn as nn
import torch.optim as optim
import pickle

# Load extracted features
with open("features.pkl", "rb") as f:
    train_data, train_labels, val_data, val_labels, test_data, test_labels, label_encoder = pickle.load(f)

# Convert data to PyTorch tensors
train_data, train_labels = torch.tensor(train_data, dtype=torch.float32), torch.tensor(train_labels, dtype=torch.long)
val_data, val_labels = torch.tensor(val_data, dtype=torch.float32), torch.tensor(val_labels, dtype=torch.long)
test_data, test_labels = torch.tensor(test_data, dtype=torch.float32), torch.tensor(test_labels, dtype=torch.long)

# Define the Deepfake Audio Model
class DeepfakeAudioModel(nn.Module):
    def __init__(self, input_size=40):
        super(DeepfakeAudioModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)  

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize model, loss function, and optimizer
model = DeepfakeAudioModel(input_size=40)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_data)
    loss = criterion(outputs, train_labels)
    loss.backward()
    optimizer.step()
    
    # Validation accuracy
    model.eval()
    with torch.no_grad():
        val_outputs = model(val_data)
        val_preds = torch.argmax(val_outputs, dim=1)
        accuracy = (val_preds == val_labels).float().mean().item()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.4f}")

# Save model
torch.save(model.state_dict(), "deepfake_audio_model.pth")
print("Model training complete! Model saved.")


Feature extraction complete! Data saved.
Epoch 1, Loss: 1.5685, Validation Accuracy: 0.4494
Epoch 2, Loss: 1.0044, Validation Accuracy: 0.4232
Epoch 3, Loss: 0.9406, Validation Accuracy: 0.5142
Epoch 4, Loss: 1.0792, Validation Accuracy: 0.5184
Epoch 5, Loss: 1.0618, Validation Accuracy: 0.5092
Epoch 6, Loss: 0.9434, Validation Accuracy: 0.4483
Epoch 7, Loss: 0.8296, Validation Accuracy: 0.4317
Epoch 8, Loss: 0.7878, Validation Accuracy: 0.5050
Epoch 9, Loss: 0.8118, Validation Accuracy: 0.5375
Epoch 10, Loss: 0.8402, Validation Accuracy: 0.5467
Epoch 11, Loss: 0.8294, Validation Accuracy: 0.5502
Epoch 12, Loss: 0.7801, Validation Accuracy: 0.5464
Epoch 13, Loss: 0.7206, Validation Accuracy: 0.5481
Epoch 14, Loss: 0.6816, Validation Accuracy: 0.5648
Epoch 15, Loss: 0.6744, Validation Accuracy: 0.5538
Epoch 16, Loss: 0.6831, Validation Accuracy: 0.5577
Epoch 17, Loss: 0.6828, Validation Accuracy: 0.5771
Epoch 18, Loss: 0.6623, Validation Accuracy: 0.6263
Epoch 19, Loss: 0.6294, Validati

In [19]:
model.eval()
with torch.no_grad():
    test_outputs = model(test_data)
    test_preds = torch.argmax(test_outputs, dim=1)

# Generate classification report
report = classification_report(test_labels.numpy(), test_preds.numpy(), target_names=label_encoder.classes_)
print("\nTest Classification Report:\n")
print(report)


Test Classification Report:

              precision    recall  f1-score   support

        fake       0.59      0.71      0.64       544
        real       0.64      0.51      0.57       544

    accuracy                           0.61      1088
   macro avg       0.61      0.61      0.61      1088
weighted avg       0.61      0.61      0.61      1088

