**Distinguish real speech from fake - Shahaf Wagner**

Imports

In [1]:
import os
import logging
import librosa
import IPython.display as ipd
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import wandb
from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
from scipy.optimize import brentq
from scipy.interpolate import interp1d
from datasets import DatasetDict, load_dataset, load_metric
from transformers import (
    HubertForSequenceClassification,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    EarlyStoppingCallback,
)

# Root logger: INFO and above, each record prefixed with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s: %(message)s",
)

Getting pretrained feature extractor

In [2]:
# Load the waveform preprocessing configuration published with the HuBERT base checkpoint.
# NOTE(review): the classifier loaded later in this notebook comes from
# "facebook/hubert-large-ls960-ft" — confirm that pairing it with the base
# checkpoint's preprocessing settings is intended.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

Downloading (…)rocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

Define a data collator that pads the variable-length examples with the feature extractor so that they can be grouped into batches

In [3]:
# Dictionary keys used for each example and for the collated batch.
INPUT_FIELD = "input_values"  # key under which an example's model input is stored
LABEL_FIELD = "labels"  # key under which an example's class label is stored


@dataclass
class DataCollatorCTCWithPadding:
    """Collate a list of examples into a single padded batch.

    Every example is a mapping that stores its model input under
    ``INPUT_FIELD`` and its label under ``LABEL_FIELD``. The inputs are padded
    by ``processor.pad`` and returned as PyTorch tensors; the labels are
    gathered into one tensor and stored in the batch under ``LABEL_FIELD``.
    """

    # Object whose ``pad`` method does the padding. This notebook passes a
    # Wav2Vec2FeatureExtractor here even though the annotation names the processor type.
    processor: Wav2Vec2Processor
    # Forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    # The two ``*_labels`` options are accepted but never read by ``__call__``.
    max_length_labels: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, examples: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad the inputs of ``examples`` and attach their labels.

        Args:
            examples: one mapping per dataset row, each holding
                ``INPUT_FIELD`` and ``LABEL_FIELD``.

        Returns:
            The batch produced by ``processor.pad`` with an extra
            ``LABEL_FIELD`` entry containing the labels as a tensor.
        """
        # Split every row into the part that gets padded and the part that does not.
        model_inputs = []
        targets = []
        for item in examples:
            model_inputs.append({INPUT_FIELD: item[INPUT_FIELD]})
        for item in examples:
            targets.append(item[LABEL_FIELD])

        padded = self.processor.pad(
            model_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        padded[LABEL_FIELD] = torch.tensor(targets)
        return padded
    
# Collator instance used by the Trainer: batches are padded dynamically
# (padding=True), with the feature extractor acting as the padding processor.
data_collator = DataCollatorCTCWithPadding(processor=feature_extractor, padding=True)

Log in to Weights & Biases (W&B) to track the training run easily. You can create your own W&B account on their website and use your own user name below to track your runs

In [5]:
# Authenticate this session with Weights & Biases before a run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mwag[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
# W&B entity (account) and project under which this run is recorded.
# Replace USER with your own W&B user name when re-running the notebook.
USER = "wag"
WANDB_PROJECT = "audio-classifier"
wandb.init(entity=USER, project=WANDB_PROJECT)

Load the pretrained HuBERT model (background: https://ai.facebook.com/blog/hubert-self-supervised-representation-learning-for-speech-recognition-generation-and-compression/)

In [7]:
# Load the checkpoint into a sequence-classification architecture and keep a
# handle on its configuration.
model_path = 'facebook/hubert-large-ls960-ft'
hubert_model = HubertForSequenceClassification.from_pretrained(model_path)
hubert_model_config = hubert_model.config
# Sanity check: the number of output classes the classification head is configured for.
print("Num of labels:", hubert_model_config.num_labels)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['projector.weight', 'classifier.weight', 'classifier.bias', 'projector.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Num of labels: 2


Freeze all weights, then unfreeze only the last few layers so that they can be fine-tuned (transfer learning)

In [8]:
# Step 1: disable gradients for every parameter of the model.
for param in hubert_model.parameters():
    param.requires_grad_(False)

# Step 2: re-enable gradients for the tail of the parameter list.
# NOTE: despite its name, `layers_freeze_num` counts the trailing encoder
# layers that are made TRAINABLE again, not the ones that stay frozen.
layers_freeze_num = 2
# 4 = weights and biases of the projector and the classifier.
# 16 is presumably the number of parameter tensors in one encoder layer —
# TODO confirm against hubert_model.named_parameters().
n_layers = layers_freeze_num * 16 + 4
for name, param in list(hubert_model.named_parameters())[-n_layers:]:
    param.requires_grad_(True)

Load the protocol files into dataframes, assuming the dataset is located in BASE_PATH, which is ../input/asvpoof-2019-dataset/LA/LA

In [9]:
# Global configuration shared by the cells below.
SAMPLE_RATE = 16000
DURATION = 5.0  # duration in seconds
AUDIO_LEN = int(SAMPLE_RATE * DURATION)  # number of samples in DURATION seconds
BASE_PATH = '../input/asvpoof-2019-dataset/LA/LA'
FOLDS = 10
SEED = 101
DEBUG = False  # when True, work on a small class-balanced subsample

# The protocol file is space separated and has no header row.
train_df = pd.read_csv(f'{BASE_PATH}/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt',
                       sep=" ", header=None)
train_df.columns = ['speaker_id', 'filename', 'system_id', 'null', 'class_name']
# Return a new frame instead of mutating in place so the cell stays idempotent.
train_df = train_df.drop(columns=['null'])
train_df['filepath'] = f'{BASE_PATH}/ASVspoof2019_LA_train/flac/' + train_df.filename + '.flac'
train_df['target'] = (train_df.class_name == 'spoof').astype('int32')  # set labels 1 for fake and 0 for real
if DEBUG:
    # Seed the subsample so that debug runs are reproducible.
    train_df = (
        train_df.groupby(['target'])
        .sample(800, random_state=SEED)
        .reset_index(drop=True)
    )
print(f'Train Samples: {len(train_df)}')
train_df.head(2)

Train Samples: 25380


Unnamed: 0,speaker_id,filename,system_id,class_name,filepath,target
0,LA_0079,LA_T_1138215,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0
1,LA_0079,LA_T_1271820,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0


In [10]:
# Development (validation) protocol: space separated, no header row.
valid_df = pd.read_csv(f'{BASE_PATH}/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt',
                       sep=" ", header=None)
valid_df.columns = ['speaker_id', 'filename', 'system_id', 'null', 'class_name']
# Return a new frame instead of mutating in place so the cell stays idempotent.
valid_df = valid_df.drop(columns=['null'])
valid_df['filepath'] = f'{BASE_PATH}/ASVspoof2019_LA_dev/flac/' + valid_df.filename + '.flac'
valid_df['target'] = (valid_df.class_name == 'spoof').astype('int32')  # 1 = spoof, 0 = bonafide
if DEBUG:
    # Seed the subsample so that debug runs are reproducible.
    valid_df = (
        valid_df.groupby(['target'])
        .sample(800, random_state=SEED)
        .reset_index(drop=True)
    )
print(f'Valid Samples: {len(valid_df)}')
valid_df.head(2)

Valid Samples: 24844


Unnamed: 0,speaker_id,filename,system_id,class_name,filepath,target
0,LA_0069,LA_D_1047731,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0
1,LA_0069,LA_D_1105538,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0


In [11]:
# Evaluation (test) protocol: space separated, no header row.
test_df = pd.read_csv(f'{BASE_PATH}/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt',
                      sep=" ", header=None)
test_df.columns = ['speaker_id', 'filename', 'system_id', 'null', 'class_name']
# Return a new frame instead of mutating in place so the cell stays idempotent.
test_df = test_df.drop(columns=['null'])
test_df['filepath'] = f'{BASE_PATH}/ASVspoof2019_LA_eval/flac/' + test_df.filename + '.flac'
test_df['target'] = (test_df.class_name == 'spoof').astype('int32')  # 1 = spoof, 0 = bonafide
if DEBUG:
    # Seed the subsample so that debug runs are reproducible.
    test_df = (
        test_df.groupby(['target'])
        .sample(800, random_state=SEED)
        .reset_index(drop=True)
    )
print(f'Test Samples: {len(test_df)}')
test_df.head(2)

Test Samples: 71237


Unnamed: 0,speaker_id,filename,system_id,class_name,filepath,target
0,LA_0039,LA_E_2834763,-,spoof,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,1
1,LA_0014,LA_E_8877452,-,spoof,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,1


Helper functions to load an audio file and to listen to it

In [12]:
def load_audio(path, sr=16000):
    """Read an audio file with librosa.

    Args:
        path: location of the audio file (this notebook passes .flac paths).
        sr: sample rate argument forwarded to ``librosa.load``.
    Returns:
        The ``(audio, sr)`` pair produced by ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(path, sr=sr)
    return (waveform, sample_rate)

def listen_audio(audio, sr=16000):
    """Show an inline audio player for a waveform.

    Args:
        audio: waveform samples to play.
        sr: sample rate the player uses for playback.
    """
    # Go through the imported IPython.display module rather than the bare
    # `display` name, which this notebook never imports explicitly.
    ipd.display(ipd.Audio(audio, rate=sr))


We load and listen to one bona fide example (target 0) and one spoofed example (target 1)

In [13]:
# (target, positional index within that class) of the examples to listen to:
# one bona fide clip (target 0) and one spoofed clip (target 1).
for target, position in ((0, 100), (1, 200)):
    row = train_df[train_df.target == target].iloc[position]
    print(f'> Filename: {row.filename} | Label: {row.class_name}')
    # Load at SAMPLE_RATE so that the AUDIO_LEN truncation below (which is
    # derived from SAMPLE_RATE) and the playback rate agree with the data.
    audio, sr = load_audio(row.filepath, sr=SAMPLE_RATE)
    audio = audio[:AUDIO_LEN]

    print('# Listen')
    # Play back at the rate the clip was actually loaded with.
    listen_audio(audio, sr=sr)

> Filename: LA_T_8458654 | Label: bonafide
# Listen


> Filename: LA_T_1760550 | Label: spoof
# Listen


We create a PyTorch dataset: each item loads the audio for one dataframe row, runs the feature extractor on it, and returns the result together with the row's label.

In [14]:
class ASVspoofDataset(torch.utils.data.Dataset):
    """Dataset that turns one dataframe row into a model-ready example.

    Each item is loaded from ``row.filepath``, optionally transformed, passed
    through the module-level ``feature_extractor`` and returned with its label.

    Args:
        audio_df: dataframe with at least ``filepath`` and ``target`` columns.
        max_length: stored on the instance; not applied to the waveform by
            this class.
        sample_rate: rate used both for loading the audio and for the feature
            extractor call.
        transform: optional callable invoked as ``transform(waveform, sample_rate)``.
    """

    def __init__(self, audio_df, max_length, sample_rate=16000, transform=None):
        self.audio_df = audio_df
        self.max_length = max_length
        self.sample_rate = sample_rate
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.audio_df)

    def __getitem__(self, idx):
        """Return ``{INPUT_FIELD: features, LABEL_FIELD: label}`` for row ``idx``."""
        df_line = self.audio_df.iloc[idx]
        waveform = load_audio(df_line.filepath, self.sample_rate)[0]
        label = df_line.target

        # Compare against None so that a callable that happens to be falsy
        # (e.g. an empty container-like transform) is still applied.
        if self.transform is not None:
            waveform = self.transform(waveform, self.sample_rate)

        # Declare the same rate the audio was loaded with, rather than a
        # hard-coded 16000, so the two can never silently disagree.
        waveform = feature_extractor(
            waveform, sampling_rate=self.sample_rate, padding=True, return_tensors="pt"
        ).input_values[0]

        # Same keys as the data collator expects.
        return {INPUT_FIELD: waveform, LABEL_FIELD: torch.tensor(label).long()}


We wrap each dataframe in the dataset class we just created

In [15]:
# Value handed to every dataset below. The dataset class stores it but does not
# use it to crop the audio, so changing it has no effect on the loaded waveforms.
max_length = 4096

# One dataset per split, all built the same way.
train_dataset, valid_dataset, test_dataset = (
    ASVspoofDataset(split_df, max_length)
    for split_df in (train_df, valid_df, test_df)
)

# Extra handle on a training-split dataset (kept from the original cell).
ds = ASVspoofDataset(train_df, max_length)

We set the settings and arguments for the Hugging Face Trainer, and we also define how to compute the metrics we want (EER + accuracy)

In [16]:
# All tunable training settings in one place.
trainer_config = dict(
    OUTPUT_DIR="results",
    TRAIN_EPOCHS=7,
    TRAIN_BATCH_SIZE=4,
    EVAL_BATCH_SIZE=4,
    GRADIENT_ACCUMULATION_STEPS=4,
    WARMUP_STEPS=500,
    DECAY=0.01,
    LOGGING_STEPS=10,
    MODEL_DIR="models/test-hubert-model",  # not a TrainingArguments option; read when saving the model
    SAVE_STEPS=2700,
)

# Fine-tuning arguments for the Trainer, taken from trainer_config.
training_args = TrainingArguments(
    # where the Trainer writes its outputs
    output_dir=trainer_config["OUTPUT_DIR"],
    # gradients are accumulated over this many batches before an optimizer step
    gradient_accumulation_steps=trainer_config["GRADIENT_ACCUMULATION_STEPS"],
    # total number of training epochs
    num_train_epochs=trainer_config["TRAIN_EPOCHS"],
    # batch sizes per device for training and for evaluation
    per_device_train_batch_size=trainer_config["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=trainer_config["EVAL_BATCH_SIZE"],
    # warmup steps of the learning-rate scheduler
    warmup_steps=trainer_config["WARMUP_STEPS"],
    # checkpoint interval in steps (SAVE_STEPS above)
    save_steps=trainer_config["SAVE_STEPS"],
    # strength of weight decay
    weight_decay=trainer_config["DECAY"],
    logging_steps=trainer_config["LOGGING_STEPS"],
    # report metrics at the end of each epoch
    evaluation_strategy="epoch",
    # send logs to W&B
    report_to="wandb",
)

from datasets import load_metric
def compute_eer(predictions, labels):
    """Compute the equal error rate (EER) of a set of scores.

    Args:
        predictions: score per example; higher means more likely class 1.
        labels: ground-truth class per example, with 1 as the positive class.

    Returns:
        The false-positive rate ``x`` at which ``1 - x`` equals the
        (linearly interpolated) true-positive rate, found with ``brentq``
        on the interval [0, 1].
    """
    fpr, tpr, thresholds = roc_curve(labels, predictions, pos_label=1)
    # Build the interpolator once instead of on every root-finder evaluation.
    tpr_at = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1.0 - x - tpr_at(x), 0.0, 1.0)
    return eer

def compute_metrics(eval_pred):
    """Compute accuracy and EER for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; ``logits`` is indexed as
            ``[example, class]`` and column 1 is used as the score for EER.

    Returns:
        ``{"accuracy": float, "eer": float}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Accuracy is the fraction of exact matches. Computing it directly avoids
    # re-loading (and possibly re-downloading) a metric script on every
    # evaluation call.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    # EER is computed from the raw score of class 1, not from the hard predictions.
    eer_result = compute_eer(predictions=logits[:, 1], labels=labels)

    # Return both accuracy and EER
    return {"accuracy": accuracy, "eer": eer_result}


We create the Trainer for the HuBERT model

In [17]:
# START TRAINING
trainer = Trainer(
    model=hubert_model,  #
    args=training_args,  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=valid_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,
)

Let's train!

In [18]:
# Run the fine-tuning loop; metrics are evaluated on the validation set as configured above.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Eer
0,0.1953,0.229835,0.896595,0.093559
1,0.0931,0.347026,0.882869,0.062402
2,0.0635,0.216813,0.930687,0.054046
4,0.0558,0.275898,0.921349,0.047273
4,0.0446,0.21938,0.939664,0.043192
5,0.049,0.163141,0.955563,0.043564
6,0.0473,0.238028,0.941515,0.041442


Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]



TrainOutput(global_step=5551, training_loss=0.11331456702028407, metrics={'train_runtime': 35077.7426, 'train_samples_per_second': 5.065, 'train_steps_per_second': 0.158, 'total_flos': 3.09106295909403e+19, 'train_loss': 0.11331456702028407, 'epoch': 7.0})

Now we can evaluate on the test set and save the results and the model

In [28]:
save_dir = '/kaggle/working'
# Absolute directory the fine-tuned model is written to.
model_dir = os.path.join(save_dir, trainer_config["MODEL_DIR"])

# Evaluate on the held-out test split and record the metrics.
test_results = trainer.predict(test_dataset)
logging.info("Test Set Result: %s", test_results.metrics)
wandb.log({"test_accuracy": test_results.metrics["test_accuracy"], "test_eer": test_results.metrics["test_eer"]})

trainer.save_model(model_dir)

# logging trained models to wandb
# base_path is derived from the same absolute directory as the glob, so the
# upload does not depend on the current working directory.
wandb.save(
    os.path.join(model_dir, "*"),
    base_path=os.path.dirname(model_dir),
    policy="end",
)



['/kaggle/working/wandb/run-20230426_061506-1tpr64nq/files/test-hubert-model/training_args.bin',
 '/kaggle/working/wandb/run-20230426_061506-1tpr64nq/files/test-hubert-model/config.json',
 '/kaggle/working/wandb/run-20230426_061506-1tpr64nq/files/test-hubert-model/pytorch_model.bin',
 '/kaggle/working/wandb/run-20230426_061506-1tpr64nq/files/test-hubert-model/training_args.bin',
 '/kaggle/working/wandb/run-20230426_061506-1tpr64nq/files/test-hubert-model/config.json',
 '/kaggle/working/wandb/run-20230426_061506-1tpr64nq/files/test-hubert-model/pytorch_model.bin']

The final EER results on the test set (0.06)

In [31]:
# Equal error rate measured on the test split.
test_results.metrics['test_eer']

0.060367097213557444

Get a link to download the saved model

In [33]:
# Switch to the save directory so that the relative model path below resolves.
os.chdir(save_dir)
# Derive the path from the configured model directory instead of repeating it,
# so the link stays correct if MODEL_DIR is changed.
ipd.FileLink(os.path.join(trainer_config["MODEL_DIR"], "pytorch_model.bin"))