In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [25]:
# Scratch check: 1-D integer tensor counting from 0 up to (but excluding) 10.
torch.arange(0, 10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [30]:

# Draw a 3x5 matrix of standard-normal values, echo it, then display its softmax
# taken over the last dimension (each output row sums to 1).
inputs = torch.randn((3, 5))
print(inputs)
F.softmax(inputs, dim=-1)

tensor([[ 2.3232, -0.5501,  0.7347, -0.0715, -1.0714],
        [-0.5967, -1.8648, -0.3850,  1.4584,  0.6718],
        [-1.4820,  0.8257, -0.3200, -1.1902, -0.5884]])


tensor([[0.7218, 0.0408, 0.1474, 0.0658, 0.0242],
        [0.0720, 0.0203, 0.0890, 0.5625, 0.2562],
        [0.0555, 0.5575, 0.1773, 0.0743, 0.1355]])

In [6]:
def hf_load_dataset(cfg: configuration.CFG) -> DatasetDict:
    """ Fetch a dataset through Huggingface Datasets' `load_dataset`
    Args:
        cfg: configuration.CFG, supplies `hf_dataset` (first positional argument of `load_dataset`)
             and `language` (second positional argument of `load_dataset`)
    Returns:
        the object returned by `load_dataset(cfg.hf_dataset, cfg.language)`
    Notes:
        Temporary helper, written with only the Wikipedia dataset in mind
    References:
        https://github.com/huggingface/datasets/blob/main/src/datasets/load.py#L2247
    """
    return load_dataset(cfg.hf_dataset, cfg.language)


def hf_split_dataset(cfg: configuration.CFG, dataset: Dataset) -> Tuple[Dataset, Dataset]:
    """ Divide a Huggingface dataset in two with its own "train_test_split" method
    Args:
        cfg: configuration.CFG, supplies `split_ratio` (passed as the first positional argument of
             `train_test_split`) and `seed` (passed as `seed=`)
        dataset: Huggingface Datasets object to split
    Returns:
        (train, valid): the 'train' and 'test' entries of the split result, in that order
    Notes:
        Temporary helper, written with only the Wikipedia dataset & MLM Task in mind
    """
    splits = dataset.train_test_split(cfg.split_ratio, seed=cfg.seed)
    return splits['train'], splits['test']


def chunking(sequences: Dict, cfg: configuration.CFG = CFG) -> Dict:
    """ Tokenize a batch of documents with the pretrained tokenizer
    Args:
        sequences: dict, batch whose 'text' entry holds one item per example; each item is either
                   a single string or a list of string pieces
        cfg: configuration.CFG, needed to load pretrained tokenizer
    Returns:
        the output of `cfg.tokenizer` for the whole batch, unchanged
        (presumably a dict-like of field name -> per-example lists; confirm against the tokenizer)
    Notes:
        A list of pieces is joined with single spaces before tokenizing. A plain string is passed
        through untouched: `" ".join("abc")` would yield "a b c", i.e. the document would be
        tokenized character by character.
    References:
        https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling
    """
    texts = [x if isinstance(x, str) else " ".join(x) for x in sequences['text']]
    return cfg.tokenizer(texts)


def group_texts(sequences: Dict, cfg: configuration.CFG = CFG) -> Dict:
    """ Dealing Problem: some of data instances are longer than the maximum input length for the model,
    This function is ONLY used to HF Dataset Object
    1) Concatenate all sequences of the batch, separately for every key
    2) Drop the small remainder that does not fill a whole chunk (padding could be added instead
       if the model supported it; customize this part to your needs)
    3) Split by chunks of max_len
    Args:
        sequences: dict, batch mapping each key to a list of per-example lists
        cfg: configuration.CFG, supplies `max_seq`, the chunk length
    Returns:
        dict with the same keys, each mapped to a list of chunks of length `cfg.max_seq`
    Notes:
        The remainder is only dropped when the batch holds at least `cfg.max_seq` items in total;
        a shorter batch is returned as one chunk that is shorter than `cfg.max_seq`.
        The total length is measured on the first key only and reused for every key.
    """
    # Flatten in one pass. `sum(list_of_lists, [])` copies the growing accumulator for every
    # example, which is quadratic in the number of items in the batch.
    concatenated_sequences = {
        k: [item for sequence in sequences[k] for item in sequence]
        for k in sequences.keys()
    }
    total_length = len(concatenated_sequences[list(sequences.keys())[0]])
    if total_length >= cfg.max_seq:
        # Round down to a multiple of max_seq so every chunk is full.
        total_length = (total_length // cfg.max_seq) * cfg.max_seq
    return {
        k: [t[i: i + cfg.max_seq] for i in range(0, total_length, cfg.max_seq)]
        for k, t in concatenated_sequences.items()
    }


def apply_preprocess(dataset: Dataset, function: Callable, batched: bool = True, num_proc: int = 4, remove_columns: any = None) -> Dataset:
    """ Run a preprocessing function over a Huggingface dataset through its "map()" method,
    for pretrained training (MLM, CLM)
    Args:
        dataset: Huggingface Datasets object to transform
        function: Callable, forwarded to "map()" as the function to apply
        batched: bool, default True, forwarded to "map()" as `batched`
        num_proc: int, default 4, forwarded to "map()" as `num_proc`
        remove_columns: any, default None, forwarded to "map()" as `remove_columns`
    Returns:
        the dataset returned by "map()"
    References:
        https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling
    """
    # NOTE(review): the `any` annotation above is the builtin function, not `typing.Any`.
    map_options = {
        'batched': batched,
        'num_proc': num_proc,
        'remove_columns': remove_columns,
    }
    return dataset.map(function, **map_options)

In [3]:
"""
1) Load Dataset, Tokenizer
2) Split Dataset, preprocess dataset for MLM Task
"""
# Load the dataset described by CFG.
ds = hf_load_dataset(CFG)
# First split: keep only the second ('test') part of ds['train'] as the working subset and
# discard the first part. Presumably this is a way to subsample; it depends on CFG.split_ratio.
# NOTE(review): binding `_` in a notebook shadows IPython's last-output variable - confirm this is intended.
_, sub_ds = hf_split_dataset(CFG, ds['train'])
# Second split: divide that subset into the train / valid sets used by the cells below.
train, valid = hf_split_dataset(CFG, sub_ds)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [4]:
""" Apply preprocessing to dataset """

# Tokenize both splits with `chunking`; every original column is removed so that only
# the columns produced by `chunking` remain.
chunked_train = apply_preprocess(dataset=train, function=chunking, remove_columns=train.column_names)

chunked_valid = apply_preprocess(dataset=valid, function=chunking, remove_columns=valid.column_names)

Map (num_proc=4):   0%|          | 0/1025250 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/256313 [00:00<?, ? examples/s]

In [8]:
""" Grouping text data to fit the maximum input length for the model """

# Re-chunk the tokenized training split into blocks with `group_texts`.
grouped_train = apply_preprocess(
    chunked_train,
    group_texts,
    num_proc=8,
)

# The validation blocks must come from the tokenized validation split. Passing `chunked_train`
# here would make `grouped_valid` a copy of the training data.
grouped_valid = apply_preprocess(
    chunked_valid,
    group_texts,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/1025250 [00:00<?, ? examples/s]

TimeoutError: 

In [5]:
# Probe how the tokenizer handles a literal "[PAD]" written inside raw text.
# In the recorded output the result has 7 input ids and attention_mask is 1 at every position.
# Id 0 appears second-to-last, presumably the id assigned to "[PAD]" (confirm against the vocabulary).
text = 'I am a boy [PAD]'
CFG.tokenizer(text)


{'input_ids': [1, 273, 481, 266, 2388, 0, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [7]:
metric_list = ['accuracy', 'recall', 'precision', 'f_beta']
# Look up the attribute of the same name on `metric` for every entry; an unknown name raises
# AttributeError. The comprehension keeps its loop variable out of the notebook namespace.
metric_module = [getattr(metric, metric_name) for metric_name in metric_list]
metric_module

[<function metrics.metric.accuracy(y_true: <built-in function array>, y_pred: <built-in function array>) -> float>,
 <function metrics.metric.recall(y_true: numpy.ndarray, y_pred: numpy.ndarray) -> float>,
 <function metrics.metric.precision(y_true, y_pred) -> float>,
 <function metrics.metric.f_beta(y_true: numpy.ndarray, y_pred: numpy.ndarray, beta: float = 2) -> float>]