In [1]:
import torchaudio
print(torchaudio.__version__)


2.6.0+cpu


In [None]:
import torchaudio
print(torchaudio.__version__)


2.6.0+cpu


In [None]:
import torch
import torch.nn as nn
import torchaudio
import torchvision.models as models
import torchaudio.transforms as transforms
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import os
import joblib

In [None]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Define the dataset class
class AudioDataset(Dataset):
    def __init__(self, audio_dir, label_file, transform=None):
        self.audio_dir = audio_dir
        self.transform = transform
        self.file_paths, self.labels = self.load_labels(label_file)
    
    def load_labels(self, label_file):
        file_paths = []
        labels = []
        with open(label_file, 'r') as f:
            for line in f:
                parts = line.strip().split()
                filename = parts[1] + ".flac"  # Assuming second column is filename
                label = 0 if parts[-1] == "bonafide" else 1  # Assuming last column is label
                file_paths.append(os.path.join(self.audio_dir, filename))
                labels.append(label)
        return file_paths, torch.tensor(labels)
    
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.file_paths[idx])
        if self.transform:
            waveform = self.transform(waveform, sample_rate)
        label = self.labels[idx]
        return waveform.to(device), label.to(device)


In [None]:

# ResNet Feature Extractor
class ResNetFeatureExtractor(nn.Module):
    def __init__(self):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=True)
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:-1])  # Remove last FC layer

    def forward(self, x):
        x = self.feature_extractor(x)
        return x.view(x.size(0), -1)  # Flatten output



In [None]:
# Feature extraction function
def extract_features(model, dataloader):
    model.to(device)
    model.eval()
    features = []
    labels = []
    with torch.no_grad():
        for waveforms, lbls in dataloader:
            waveforms = waveforms.to(device)
            feats = model(waveforms)
            features.append(feats.cpu().numpy())
            labels.append(lbls.cpu().numpy())
    return np.vstack(features), np.hstack(labels)


In [None]:
def preprocess_audio(waveform, sample_rate, target_sample_rate=16000):
    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)  # Average across channels
    
    # Downsample if needed
    if sample_rate != target_sample_rate:
        resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resample_transform(waveform)
    
    # Convert to Mel Spectrogram
    transform = transforms.MelSpectrogram(sample_rate=target_sample_rate, n_mels=128)
    mel_spec = transform(waveform)  # Shape: [1, n_mels, time]

    # Resize to 224x224 for ResNet
    mel_spec = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224))  # Shape: [1, 1, 224, 224]

    # Ensure it's 3 channels
    mel_spec = mel_spec.squeeze(0)  # Shape: [1, 224, 224]
    mel_spec = mel_spec.expand(3, -1, -1)  # Convert to [3, 224, 224]

    return mel_spec


In [None]:
# Define dataset paths
audio_dir = r"E:\Dataset MAIN\LA\LA\ASVspoof2019_LA_train\flac"
label_file = r"E:\Dataset MAIN\LA\LA\ASVspoof2019_LA_cm_protocols\ASVspoof2019.LA.cm.train.trn.txt"

# Create dataset and dataloader
dataset = AudioDataset(audio_dir, label_file, transform=preprocess_audio)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [None]:
# Initialize ResNet feature extractor
resnet_model = ResNetFeatureExtractor().to(device)




In [None]:
# Extract features
features, labels = extract_features(resnet_model, dataloader)




In [None]:
from sklearn.model_selection import train_test_split

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# SVM model
svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(X_train, y_train)


In [None]:
import joblib

# Save models
joblib.dump(rf_model, "random_forest_model.pkl")  # Save RF model
joblib.dump(svm_model, "svm_model.pkl")  # Save SVM model
torch.save(resnet_model.state_dict(), "resnet_feature_extractor.pth")  # Save ResNet model


In [None]:
# Predictions
rf_probs = rf_model.predict_proba(X_test)[:, 1]
svm_probs = svm_model.predict_proba(X_test)[:, 1]
ensemble_probs = (rf_probs + svm_probs) / 2  # Averaging
ensemble_preds = (ensemble_probs > 0.5).astype(int)

# Evaluate ensemble model
accuracy = accuracy_score(y_test, ensemble_preds)
print(f"Ensemble Accuracy: {accuracy:.2f}%")


Ensemble Accuracy: 0.90%


In [None]:
# Prediction function
def predict_audio(model, rf_model, svm_model, audio_path):
    model.eval()
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = preprocess_audio(waveform, sample_rate).unsqueeze(0).to(device)
    
    with torch.no_grad():
        features = model(waveform).cpu().numpy()
    
    rf_prob = rf_model.predict_proba(features)[:, 1]
    svm_prob = svm_model.predict_proba(features)[:, 1]
    ensemble_prob = (rf_prob + svm_prob) / 2
    prediction = int(ensemble_prob > 0.5)
    
    return "Bonafide" if prediction == 0 else "Spoofed"


In [None]:
# Example usage
example_audio = "E:\\FINAL YEAR PROJECT\\Testdata\\Fake\\Fake 2.flac"
prediction = predict_audio(resnet_model, rf_model, svm_model, example_audio)
print(f"Prediction: {prediction}")


Prediction: Spoofed


  prediction = int(ensemble_prob > 0.5)


In [2]:
import torch
import torch.nn as nn
import torchaudio
import torchvision.models as models
import torchaudio.transforms as transforms
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import os
import joblib

In [3]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
# Define the dataset class
class AudioDataset(Dataset):
    def __init__(self, audio_dir, label_file, transform=None):
        self.audio_dir = audio_dir
        self.transform = transform
        self.file_paths, self.labels = self.load_labels(label_file)
    
    def load_labels(self, label_file):
        file_paths = []
        labels = []
        with open(label_file, 'r') as f:
            for line in f:
                parts = line.strip().split()
                filename = parts[1] + ".flac"  # Assuming second column is filename
                label = 0 if parts[-1] == "bonafide" else 1  # Assuming last column is label
                file_paths.append(os.path.join(self.audio_dir, filename))
                labels.append(label)
        return file_paths, torch.tensor(labels)
    
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.file_paths[idx])
        if self.transform:
            waveform = self.transform(waveform, sample_rate)
        label = self.labels[idx]
        return waveform.to(device), label.to(device)


In [5]:

# ResNet Feature Extractor
class ResNetFeatureExtractor(nn.Module):
    def __init__(self):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=True)
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:-1])  # Remove last FC layer

    def forward(self, x):
        x = self.feature_extractor(x)
        return x.view(x.size(0), -1)  # Flatten output



In [6]:
# Feature extraction function
def extract_features(model, dataloader):
    model.to(device)
    model.eval()
    features = []
    labels = []
    with torch.no_grad():
        for waveforms, lbls in dataloader:
            waveforms = waveforms.to(device)
            feats = model(waveforms)
            features.append(feats.cpu().numpy())
            labels.append(lbls.cpu().numpy())
    return np.vstack(features), np.hstack(labels)


In [7]:
def preprocess_audio(waveform, sample_rate, target_sample_rate=16000):
    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)  # Average across channels
    
    # Downsample if needed
    if sample_rate != target_sample_rate:
        resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resample_transform(waveform)
    
    # Convert to Mel Spectrogram
    transform = transforms.MelSpectrogram(sample_rate=target_sample_rate, n_mels=128)
    mel_spec = transform(waveform)  # Shape: [1, n_mels, time]

    # Resize to 224x224 for ResNet
    mel_spec = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224))  # Shape: [1, 1, 224, 224]

    # Ensure it's 3 channels
    mel_spec = mel_spec.squeeze(0)  # Shape: [1, 224, 224]
    mel_spec = mel_spec.expand(3, -1, -1)  # Convert to [3, 224, 224]

    return mel_spec


In [8]:
# Define dataset paths
audio_dir = r"E:\Dataset MAIN\LA\LA\ASVspoof2019_LA_train\flac"
label_file = r"E:\Dataset MAIN\LA\LA\ASVspoof2019_LA_cm_protocols\ASVspoof2019.LA.cm.train.trn.txt"

# Create dataset and dataloader
dataset = AudioDataset(audio_dir, label_file, transform=preprocess_audio)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [9]:
# Initialize ResNet feature extractor
resnet_model = ResNetFeatureExtractor().to(device)




In [10]:
# Extract features
features, labels = extract_features(resnet_model, dataloader)




In [11]:
from sklearn.model_selection import train_test_split

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# SVM model
svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(X_train, y_train)


In [12]:
import joblib

# Save models
joblib.dump(rf_model, "random_forest_model.pkl")  # Save RF model
joblib.dump(svm_model, "svm_model.pkl")  # Save SVM model
torch.save(resnet_model.state_dict(), "resnet_feature_extractor.pth")  # Save ResNet model


In [13]:
# Predictions
rf_probs = rf_model.predict_proba(X_test)[:, 1]
svm_probs = svm_model.predict_proba(X_test)[:, 1]
ensemble_probs = (rf_probs + svm_probs) / 2  # Averaging
ensemble_preds = (ensemble_probs > 0.5).astype(int)

# Evaluate ensemble model
accuracy = accuracy_score(y_test, ensemble_preds)
print(f"Ensemble Accuracy: {accuracy:.2f}%")


Ensemble Accuracy: 0.90%


In [14]:
# Prediction function
def predict_audio(model, rf_model, svm_model, audio_path):
    model.eval()
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = preprocess_audio(waveform, sample_rate).unsqueeze(0).to(device)
    
    with torch.no_grad():
        features = model(waveform).cpu().numpy()
    
    rf_prob = rf_model.predict_proba(features)[:, 1]
    svm_prob = svm_model.predict_proba(features)[:, 1]
    ensemble_prob = (rf_prob + svm_prob) / 2
    prediction = int(ensemble_prob > 0.5)
    
    return "Bonafide" if prediction == 0 else "Spoofed"


In [16]:
# Example usage
example_audio = "E:\\FINAL YEAR PROJECT\\Testdata\\Fake\\Fake 2.flac"
prediction = predict_audio(resnet_model, rf_model, svm_model, example_audio)
print(f"Prediction: {prediction}")


Prediction: Spoofed


  prediction = int(ensemble_prob > 0.5)
