In [1]:
%load_ext extensions
%cd_repo_root

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/rubchume1/code/Users/rubchume/VoiceCloningFakeAudioDetection'

In [75]:
import pandas as pd

import directory_structure

# Load dataset

In [5]:
directory_structure.paths

{'data_path': PosixPath('data'),
 'job_definitions_path': PosixPath('job_definitions'),
 'computes_path': PosixPath('job_definitions/computes'),
 'environments_path': PosixPath('job_definitions/environments'),
 'source_path': PosixPath('src'),
 'audio_output_path': PosixPath('outputs'),
 'training_artifacts_path': PosixPath('artifacts'),
 'models_path': PosixPath('models')}

In [26]:
hugging_face_dataset = "HuggingFaceDataset"
hugging_face_dataset_path = directory_structure.data_path / hugging_face_dataset
hugging_face_dataset_script = (hugging_face_dataset_path / hugging_face_dataset).with_suffix(".py")

In [101]:
cloned_voices_path = ""
real_voices_path = directory_structure.data_path / "Common Voice/cv-corpus-15-delta-2023-09-08/en"

In [105]:
real_voices_info_file = real_voices_path / "validated.tsv"

In [112]:
pd.read_csv(real_voices_info_file, delimiter="\t")["path"].map(lambda path: str(real_voices_path / path))

0       data/Common Voice/cv-corpus-15-delta-2023-09-0...
1       data/Common Voice/cv-corpus-15-delta-2023-09-0...
2       data/Common Voice/cv-corpus-15-delta-2023-09-0...
3       data/Common Voice/cv-corpus-15-delta-2023-09-0...
4       data/Common Voice/cv-corpus-15-delta-2023-09-0...
                              ...                        
3061    data/Common Voice/cv-corpus-15-delta-2023-09-0...
3062    data/Common Voice/cv-corpus-15-delta-2023-09-0...
3063    data/Common Voice/cv-corpus-15-delta-2023-09-0...
3064    data/Common Voice/cv-corpus-15-delta-2023-09-0...
3065    data/Common Voice/cv-corpus-15-delta-2023-09-0...
Name: path, Length: 3066, dtype: object

In [52]:
from abc import ABC, abstractmethod


class AudioFilesLoader(ABC):
    def get_info(self) -> pd.DataFrame:
        pass
    
    
class CommonVoiceLoader(AudioFilesLoader):

ModuleNotFoundError: No module named 'collections.abstract'

In [71]:
min(4,6)

4

In [121]:
%%rendertemplate {hugging_face_dataset_script}
from pathlib import Path

import datasets
import pandas as pd


_DESCRIPTION = """\
Dataset for Apziva Voice cloning project
"""


real_info = pd.read_csv("[[real_voices_info_file]]", delimiter="\t")["path"].map(
    lambda path: str(Path("[[real_voices_path]]") / "clips" / path)
)
cloned_info = pd.read_csv("outputs/OOTB-YourTTS/TIMITexamples/index.csv")["voice_sample"].rename("path")


class HuggingFaceDatasetConfig(datasets.BuilderConfig):
    def __init__(self, name, max_imbalance: float, test_ratio: float, **kwargs):
        self.max_imbalance = max_imbalance
        self.test_ratio = test_ratio
        
        super(HuggingFaceDatasetConfig, self).__init__(
            name=name,
            description="HuggingFaceDataset configuration",
            **kwargs
        )


class HuggingFaceDataset(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.0")
    
    DEFAULT_CONFIG_NAME = "balanced"

    BUILDER_CONFIGS = [
        HuggingFaceDatasetConfig(
            name="balanced",
            max_imbalance=1,
            test_ratio=0.2
        )
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "path": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "cloned": datasets.Value("bool")
                }
            ),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        def split_train_test(df: pd.DataFrame, test_proportion):
            train_df = df.sample(frac=1 - test_proportion, random_state=0)
            test_df = df.drop(train_df.index)
            return train_df, test_df
        
        def generate_balanced_dataset(data_A: pd.DataFrame, data_B: pd.DataFrame, max_B_to_A_imbalance: float):
            max_samples = int(len(data_A) * max_B_to_A_imbalance)
            samples_A = data_A.sample(len(data_A), random_state=0)
            samples_B = data_B.sample(max_samples, random_state=0)
            return samples_A, samples_B
            
        train_cloned, test_cloned = split_train_test(cloned_info, self.config.test_ratio)
        train_real, test_real = split_train_test(real_info, self.config.test_ratio)
        
        cloned_samples_train, real_samples_train = generate_balanced_dataset(train_cloned, train_real, self.config.max_imbalance)
        cloned_samples_test, real_samples_test = generate_balanced_dataset(test_cloned, test_real, self.config.max_imbalance)
            
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "cloned_samples": cloned_samples_train,
                    "real_samples": real_samples_train,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "cloned_samples": cloned_samples_test,
                    "real_samples": real_samples_test,
                },
            ),
        ]

    def _generate_examples(self, cloned_samples, real_samples):
        cloned_samples = pd.DataFrame(cloned_samples.rename("path"))
        cloned_samples["cloned"] = True
        
        real_samples = pd.DataFrame(real_samples.rename("path"))
        real_samples["cloned"] = False
        
        samples = pd.concat([cloned_samples, real_samples], axis="index").sample(frac=1, random_state=0)
        
        for index, sample in samples.iterrows():
            path = sample["path"]
            with Path(path).open("rb") as file:
                audio = {"path": path, "bytes": file.read()}
            yield index, {**sample.to_dict(), "audio": audio}

'data/HuggingFaceDataset/HuggingFaceDataset.py'

In [122]:
from datasets import load_dataset


dataset = load_dataset(str(hugging_face_dataset_path), "balanced", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [123]:
dataset

Dataset({
    features: ['path', 'audio', 'cloned'],
    num_rows: 32
})