In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
pip install datasets



In [9]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, Audio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [10]:
real_audio_dir = "/content/drive/MyDrive/mini_bonafide"
fake_audio_dir = "/content/drive/MyDrive/mini_spoof"

In [11]:
def load_audio_dataset(real_audio_dir, fake_audio_dir):
    data = {
        'audio': [],
        'label': []
    }

    for filename in os.listdir(real_audio_dir):
        if filename.endswith(('.wav', '.mp3', '.flac')):
            data['audio'].append(os.path.join(real_audio_dir, filename))
            data['label'].append(0)

    for filename in os.listdir(fake_audio_dir):
        if filename.endswith(('.wav', '.mp3', '.flac')):
            data['audio'].append(os.path.join(fake_audio_dir, filename))
            data['label'].append(1)

    df = pd.DataFrame(data)

    train_df = df.sample(frac=0.8, random_state=42)
    val_df = df.drop(train_df.index)

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
    val_dataset = val_dataset.cast_column("audio", Audio(sampling_rate=16000))

    return train_dataset, val_dataset

train_dataset, val_dataset = load_audio_dataset(real_audio_dir, fake_audio_dir)

In [12]:
model_checkpoint = "facebook/wav2vec2-base"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [13]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]

    max_duration_in_seconds = 10
    max_length = feature_extractor.sampling_rate * max_duration_in_seconds
    processed_arrays = []

    for audio in audio_arrays:
        if len(audio) > max_length:
            processed_arrays.append(audio[:max_length])
        else:
            padding = np.zeros(max_length - len(audio), dtype=np.float32)
            processed_arrays.append(np.concatenate([audio, padding]))

    inputs = feature_extractor(
        processed_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

    inputs["labels"] = examples["label"]
    return inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

In [14]:
num_labels = 2
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

for param in model.wav2vec2.feature_extractor.parameters():
    param.requires_grad = False

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def compute_metrics(pred):
    predictions = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [17]:
os.environ["WANDB_DISABLED"] = "true"
training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned-deepfake-detection",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model("./wav2vec2-finetuned-deepfake-detection")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.569532,0.818182,0.764706,1.0,0.619048
2,No log,0.257916,0.954545,0.95,1.0,0.904762
3,No log,0.280652,0.909091,0.894737,1.0,0.809524
4,No log,0.02972,1.0,1.0,1.0,1.0
5,No log,0.046004,0.977273,0.976744,0.954545,1.0
6,No log,0.104494,0.977273,0.976744,0.954545,1.0
7,No log,0.08431,0.977273,0.976744,0.954545,1.0
8,No log,0.034408,0.977273,0.976744,0.954545,1.0
9,No log,0.015373,1.0,1.0,1.0,1.0
10,No log,0.012653,1.0,1.0,1.0,1.0


In [26]:
def predict_audio(audio_path, model, feature_extractor):

    max_duration_in_seconds = 10

    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    audio = dataset[0]["audio"]

    inputs = feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        padding="max_length",
        max_length=feature_extractor.sampling_rate * max_duration_in_seconds,
        truncation=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=1).cpu().numpy()[0]
    predicted_class = np.argmax(probabilities)

    labels = ["Real", "Deepfake"]
    return {
        "prediction": labels[predicted_class],
        "confidence": float(probabilities[predicted_class]),
        "probabilities": {labels[i]: float(prob) for i, prob in enumerate(probabilities)}
    }

In [27]:
model = Wav2Vec2ForSequenceClassification.from_pretrained("./wav2vec2-finetuned-deepfake-detection")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_checkpoint)
result = predict_audio("/content/drive/MyDrive/D_0000406645.flac", model, feature_extractor)
print(result)

{'prediction': 'Real', 'confidence': 0.9749166369438171, 'probabilities': {'Real': 0.9749166369438171, 'Deepfake': 0.02508331462740898}}


# D_2362 D_0000406645 M - - - - bonafide bonafide -
### from ASVspoof5.dev.track_1.tsv file