# Setup

In [1]:
%load_ext extensions
%cd_repo_root

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/rubchume1/code/Users/rubchume/VoiceCloningFakeAudioDetection'

In [2]:
import random
from pathlib import Path
from typing import Iterable, List

import mlflow
import numpy as np
import pandas as pd
from pydub import AudioSegment
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

import directory_structure



In [3]:
def reproduce_audio_file_with_pydub(audio_file):
    audio = AudioSegment.from_file(audio_file)
    display(audio)

# Load dataset

In [4]:
hugging_face_dataset = "HuggingFaceDataset"
hugging_face_dataset_path = directory_structure.data_path / hugging_face_dataset
hugging_face_dataset_script = (hugging_face_dataset_path / hugging_face_dataset).with_suffix(".py")

In [5]:
cloned_voices_path = ""
real_voices_path = directory_structure.data_path / "Common Voice/cv-corpus-15-delta-2023-09-08/en"
real_voices_info_file = real_voices_path / "validated.tsv"

In [6]:
real_info = pd.read_csv(real_voices_info_file, delimiter="\t")["path"].map(
    lambda path: str(real_voices_path / "clips" / path)
)
cloned_info = pd.Series([str(path) for path in Path("outputs/OOTB-YourTTS/TIMITexamples/").glob("*.wav")]).rename("path")


# Pytorch Lightning

In [7]:
from IPython.display import Audio
def reproduce_audio_from_pcm_samples(pcm_samples: np.array, sample_rate: int):
    audio = Audio(data=pcm_samples, rate=sample_rate, autoplay=True)
    display(audio)

In [8]:
class AudioBinaryDataset(Dataset):
    def __init__(
        self,
        negative_audio_files: Iterable,
        postive_audio_files: Iterable,
        target_sample_rate: int,
        num_samples: int,
        max_imbalance=1,
        random_seed=0,
    ):
        self.negative_audio_files = list(negative_audio_files)
        self.positive_audio_files = list(postive_audio_files)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        
        self.random_instance = random.Random(random_seed)
        
        negative_samples, positive_samples = self._undersample_unbalanced_dataset(
            self.negative_audio_files,
            self.positive_audio_files,
            max_imbalance
        )
        
        negative_samples_with_label = [
            (sample, 0)
            for sample in negative_samples
        ]
        
        positive_samples_with_label = [
            (sample, 1)
            for sample in negative_samples
        ]
        
        self.samples = self.random_instance.sample(
            negative_samples_with_label + positive_samples_with_label,
            len(negative_samples_with_label) + len(positive_samples_with_label)
        )
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, index):
        audio_file, label = self.samples[index]
        audio_segment = AudioSegment.from_file(audio_file)
        audio_resampled = audio_segment.set_frame_rate(self.target_sample_rate)
        pcm_samples = self._bytes_to_numpy(
            audio_resampled.raw_data,
            audio_resampled.sample_width
        )
        resized_samples = np.zeros(self.num_samples)
        resized_samples[:len(pcm_samples)] = pcm_samples[:self.num_samples]
        return torch.Tensor(resized_samples), label
    
    def _undersample_unbalanced_dataset(self, dataset_A: List, dataset_B: List, max_imbalance):
        if len(dataset_A) > len(dataset_B):
            dataset_big = dataset_A
            dataset_small = dataset_B
            a_bigger_than_b = True
        else:
            dataset_big = dataset_B
            dataset_small = dataset_A
            a_bigger_than_b = True
        
        if max_imbalance < 1:
            max_imbalance = 1 / max_imbalance
            
        max_samples = int(len(dataset_small) * max_imbalance)
        samples_big = self.random_instance.sample(dataset_big, min(max_samples, len(dataset_big)))
        samples_small = self.random_instance.sample(dataset_small, len(dataset_small))
        
        if a_bigger_than_b:
            return samples_big, samples_small
        else:
            return samples_small, samples_big
    
    @staticmethod
    def _bytes_to_numpy(bytes_stream: bytes, sample_width=2) -> np.array:
        """
        sample_width: number of bytes per sample
        """
        dtype_map = {
            1: np.int8,
            2: np.int16,
            4: np.int32
        }

        if sample_width not in dtype_map:
            raise ValueError(f"Unsupported sample width: {sample_width}")

        return np.frombuffer(bytes_stream, dtype=dtype_map[sample_width])

In [9]:
from torch.utils.data import DataLoader, random_split


class DataModule(pl.LightningDataModule):
    def __init__(self, batch_size, target_sample_rate, num_samples, cloned_samples: pd.Series, real_samples: pd.Series):
        super().__init__()
        self.batch_size = batch_size
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.cloned_samples = cloned_samples
        self.real_samples = real_samples
    
    def prepare_data(self):
        dataset = AudioBinaryDataset(
            self.real_samples,
            self.cloned_samples,
            self.target_sample_rate,
            self.num_samples
        )
        
        self.dataset_training, self.dataset_validation, self.dataset_test = random_split(
            dataset,
            [0.7, 0.1, 0.2],
            generator=torch.Generator().manual_seed(0)
        )
        
    def train_dataloader(self):
        return DataLoader(self.dataset_training, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.dataset_validation, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.dataset_test, batch_size=self.batch_size)

In [10]:
from enum import Enum
from collections import defaultdict

import pytorch_lightning as pl
import torch.nn as nn
import torchmetrics


class Stage(Enum):
    TRAIN = "TRAIN"
    VALIDATION = "VALIDATION"
    TEST = "TEST"


class ClonedAudioDetector(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self._create_model()
        self._prepare_metrics()
        
    def _create_model(self):
        num_labels = 2

        label2id = dict(
            cloned=1,
            real=0)

        id2label = {
            1: "cloned",
            0: "real"
        }

        self.model = AutoModelForAudioClassification.from_pretrained(
            "facebook/wav2vec2-base",
            num_labels=num_labels,
            label2id=label2id,
            id2label=id2label
        )
             
    def _prepare_metrics(self):
        self.precision = torchmetrics.Precision(task='binary')
        self.recall = torchmetrics.Recall(task='binary')
        self.f1 = torchmetrics.F1Score(task='binary')
        self.confmat = torchmetrics.ConfusionMatrix(task="binary")

        self.targets_scores = {}
        self.targets_predicted = {}
        self.targets = {}
        
        self._reset_target_registries(Stage.TRAIN)
        self._reset_target_registries(Stage.VALIDATION)
        self._reset_target_registries(Stage.TEST)
 
    def forward(self, x):
        return self.model.forward(x)
    
    def criterion(self, logits, labels):
        return nn.functional.cross_entropy(logits, labels)
    
    def training_step(self, batch, batch_index):
        return self._step(batch, Stage.TRAIN)

    def validation_step(self, batch, batch_index):
        return self._step(batch, Stage.VALIDATION)
        
    def test_step(self, batch, batch_index):
        return self._step(batch, Stage.TEST)
    
    def on_train_epoch_start(self):
        self._reset_target_registries(Stage.TRAIN)
    
    def on_train_epoch_end(self):
        self._log_epoch_metrics(Stage.TRAIN)
    
    def on_validation_epoch_start(self):
        self._reset_target_registries(Stage.VALIDATION)
    
    def on_validation_epoch_end(self):
        self._log_epoch_metrics(Stage.VALIDATION)
    
    def on_test_epoch_start(self):
        self._reset_target_registries(Stage.TEST)
    
    def on_test_epoch_end(self):
        self._log_epoch_metrics(Stage.TEST)
                                
    def _reset_target_registries(self, stage: Stage):
        self.targets_scores[stage] = []
        self.targets_predicted[stage] = []
        self.targets[stage] = []

    def _step(self, batch, stage: Stage):
        audios, targets = batch
        logits, targets_predicted = self._predict(audios)

        self.targets_scores[stage].append(logits)
        self.targets_predicted[stage].append(targets_predicted)
        self.targets[stage].append(targets)
        
        loss = self.criterion(logits, targets)
        
        metric_name = {
            stage.TRAIN: "train_loss",
            stage.VALIDATION: "val_loss",
            stage.TEST: "test_loss",
        }
        
        self.log(metric_name[stage], loss, prog_bar=True)
        return loss
        
    def _predict(self, data):
        logits = self.forward(data).logits
        targets_predicted = (logits[:, 1] > logits[:, 0]) * 1
        return logits, targets_predicted
        
    def _log_epoch_metrics(self, stage: Stage):
        targets_predicted = torch.cat(self.targets_predicted[stage], dim=0).squeeze()
        targets = torch.cat(self.targets[stage], dim=0)

        precision = self.precision(targets_predicted, targets)
        recall = self.recall(targets_predicted, targets)
        f1_score = self.f1(targets_predicted, targets)

        self.log(f'{stage.value}_precision', precision, prog_bar=True)
        self.log(f'{stage.value}_recall', recall, prog_bar=True)
        self.log(f'{stage.value}_f1', f1_score, prog_bar=True)
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    
    def get_targets(self, stage: Stage):
        return torch.cat(self.targets[stage], dim=0).to(torch.device("cpu"))
    
    def get_targets_scores(self, stage: Stage):
        return torch.cat(self.targets_scores[stage], dim=0).squeeze().to(torch.device("cpu"))
    
    def get_targets_predicted(self, stage: Stage):
        return torch.cat(self.targets_predicted[stage], 0).squeeze().to(torch.device("cpu"))
        

In [11]:
from pytorch_lightning.loggers import TensorBoardLogger


data_module = DataModule(4, 16000, 64000, real_info, cloned_info)

logger = TensorBoardLogger(str(directory_structure.training_artifacts_path), name="wav2vec2")
detector = ClonedAudioDetector()
trainer = pl.Trainer(
    logger=logger,
    max_epochs=3,
    accelerator="auto",
    log_every_n_steps=10,
    callbacks=[],
)

trainer.fit(detector, data_module)
trainer.test(detector, data_module)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'projector.weight', 'classifier.bias', 'projector.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type                              | Params
----------------------------------------------------------------
0 | model     | Wav2Vec2ForSequenceClassification | 94.6 M
1 | precision | BinaryPrecision                   | 0     
2 | recall    | BinaryRecall                      | 0     
3 | f1        | BinaryF1Score                     | 0     
4 | confmat   | BinaryConfusionMatrix             | 0     
----------------------------------------------------------------
94.6 M    Trainable params
0

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: 0it [00:00, ?it/s]

[{'test_loss': nan, 'TEST_precision': 0.0, 'TEST_recall': 0.0, 'TEST_f1': 0.0}]

In [250]:
detector.forward(batch[0]).logits

tensor([[-0.0097, -0.0379],
        [-0.0489, -0.0492],
        [-0.0232, -0.0551],
        [ 0.0078, -0.0246]], grad_fn=<AddmmBackward0>)

# Tutorial example

https://huggingface.co/docs/transformers/tasks/audio_classification

In [92]:
next(iter(dataset["train"]))
# encoded_minds["train"][0]

{'input_values': array([ 0.02907566,  0.02133443,  0.01332626, ..., -0.00162231,
        -0.00162231, -0.00162231], dtype=float32),
 'label': 1}

In [113]:
from datasets import load_dataset, Audio

minds = load_dataset("PolyAI/minds14", name="en-US", split="train")
minds = minds.train_test_split(test_size=0.2)
minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])

labels = minds["train"].features["intent_class"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
    
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    return inputs

encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
encoded_minds = encoded_minds.rename_column("intent_class", "label")

import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np


def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)



Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.bias', 'classifier.weight', 'projector.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [121]:
next(itertools.islice(dataset["train"], 0, 1))["input_values"]


array([ 0.02907566,  0.02133443,  0.01332626, ..., -0.00162231,
       -0.00162231, -0.00162231], dtype=float32)

In [129]:
import torch


torch.Tensor(encoded_minds["train"][0]["input_values"]).unsqueeze(0).shape

torch.Size([1, 16000])

In [128]:
model.forward(torch.Tensor(encoded_minds["train"][0]["input_values"]).unsqueeze(0))

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0455,  0.0232,  0.0506, -0.0133, -0.0818,  0.0139, -0.1356,  0.0208,
          0.0164,  0.0020, -0.0913, -0.0053,  0.0378, -0.0143]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [131]:
model.forward(np.array(encoded_minds["train"][0]["input_values"]))

TypeError: conv1d() received an invalid combination of arguments - got (numpy.ndarray, Parameter, NoneType, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!numpy.ndarray!, !Parameter!, !NoneType!, !tuple of (int,)!, !tuple of (int,)!, !tuple of (int,)!, int)
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!numpy.ndarray!, !Parameter!, !NoneType!, !tuple of (int,)!, !tuple of (int,)!, !tuple of (int,)!, int)


In [7]:
import mlflow


mlflow.set_tracking_uri(f"file:./{directory_structure.runs_path}")
experiment_name = "my-experiment"
experiment = mlflow.set_experiment(experiment_name)


training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# try:
trainer.train()
# except Exception:
#     import pdb; pdb.post_mortem()

Exception ignored in: <function MLflowCallback.__del__ at 0x7f2a96a0af80>
Traceback (most recent call last):
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/transformers/integrations/integration_utils.py", line 1064, in __del__
    self._ml_flow.end_run()
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/mlflow/tracking/fluent.py", line 409, in end_run
    MlflowClient().set_terminated(run.info.run_id, status)
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/mlflow/tracking/client.py", line 1856, in set_terminated
    self._tracking_client.set_terminated(run_id, status, end_time)
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/mlflow/tracking/_tracking_service/client.py", line 502, in set_terminated
    self.store.update_run_info(
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 593, in update_run_info
    run_info = self._get_run_info(run_id)
  Fil

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 