In [17]:
import re
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict, load_dataset
from typing import Dict, List, Tuple, Callable, Any
from configuration import CFG

In [20]:
""" Experiment for Huggingface Tokenizer for Building MLM Algorithm 
meaning of special token in Huggingface Tokenizer is corrspoding to [CLS], [SEP], [MASK], [PAD] ... etc
"""

text = 'I am a boy [MASK] [MASK] are a girl [PAD]'
tokens = CFG.tokenizer(text)
input_ids = [torch.tensor(x) for x in tokens["input_ids"]]
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)

special_tokens_mask = CFG.tokenizer.get_special_tokens_mask(input_ids, already_has_special_tokens=True)
special_tokens_mask

IndexError: Dimension specified as 0 but tensor has no dimensions

In [16]:
""" Experiment for Huggingface Tokenizer for Building MLM Algorithm 
"""
mlm_probability = 0.15
probability_matrix = torch.full(input_ids.shape, mlm_probability)
probability_matrix

AttributeError: 'list' object has no attribute 'shape'

In [6]:
def hf_load_dataset(cfg: CFG) -> DatasetDict:
    """ Load dataset from Huggingface Datasets
    Notes:
        This function is temporary just fit-able for Wikipedia dataset
    References:
        https://github.com/huggingface/datasets/blob/main/src/datasets/load.py#2247
    """
    dataset = load_dataset(cfg.hf_dataset, cfg.language)
    return dataset


def hf_split_dataset(cfg: CFG, dataset: Dataset) -> Tuple[Dataset, Dataset]:
    """ Split dataset from Huggingface Datasets with huggingface method "train_test_split"
    Args:
        cfg: configuration.CFG, needed to load split ratio, seed value
        dataset: Huggingface Datasets object, dataset from Huggingface Datasets
    Notes:
        This function is temporary just fit-able for Wikipedia dataset & MLM Task
    """
    dataset = dataset.train_test_split(cfg.split_ratio, seed=cfg.seed)
    train, valid = dataset['train'], dataset['test']
    return train, valid


def chunking(sequences: Dict, cfg: CFG = CFG) -> List[str]:
    """ Chunking sentence to token using pretrained tokenizer
    Args:
        cfg: configuration.CFG, needed to load pretrained tokenizer
        sequences: list, sentence to chunking
    References:
        https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling
    """
    return cfg.tokenizer([" ".join(x) for x in sequences['text']])


def group_texts(sequences: Dict, cfg: CFG = CFG) -> Dict:
    """ Dealing Problem: some of data instances are longer than the maximum input length for the model,
    This function is ONLY used to HF Dataset Object
    1) Concatenate all texts
    2) We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    3) customize this part to your needs
    4) Split by chunks of max_len
    """
    concatenated_sequences = {k: sum(sequences[k], []) for k in sequences.keys()}
    total_length = len(concatenated_sequences[list(sequences.keys())[0]])
    if total_length >= cfg.max_seq:
        total_length = (total_length // cfg.max_seq) * cfg.max_seq
    result = {
        k: [t[i: i + cfg.max_seq] for i in range(0, total_length, cfg.max_seq)]
        for k, t in concatenated_sequences.items()
    }
    return result


def apply_preprocess(dataset: Dataset, function: Callable, batched: bool = True, num_proc: int = 4, remove_columns: any = None) -> Dataset:
    """ Apply preprocessing to text data, which is using huggingface dataset method "map()"
    for pretrained training (MLM, CLM)
    Args:
        dataset: Huggingface Datasets object, dataset from Huggingface Datasets
        function: Callable, function that you want to apply
        batched: bool, default True, if you want to apply function to batched data, set True
        num_proc: int, default 4, number of process for multiprocessing
        remove_columns: any, default None, if you want to remove some columns, set column name
    References:
        https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling
    """
    mapped_dataset = dataset.map(
        function,
        batched=batched,
        num_proc=num_proc,
        remove_columns=remove_columns,
    )
    return mapped_dataset


def load_data(data_path: str) -> pd.DataFrame:
    """ Load data_folder from csv file like as train.csv, test.csv, val.csv
    """
    df = pd.read_csv(data_path)
    return df


def no_char(text):
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\^[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\s+[a-zA-Z]$", " ", text)
    return text


def no_multi_spaces(text):
    return re.sub(r"\s+", " ", text, flags=re.I)


def underscore_to_space(text: str):
    text = text.replace("_", " ")
    text = text.replace("-", " ")
    return text


def preprocess_text(source):
    """ Remove all the special characters
    """
    source = re.sub(r'\W', ' ', str(source))
    source = re.sub(r'^b\s+', '', source)
    source = source.lower()
    return source


def cleaning_words(text: str) -> str:
    """ Apply all of cleaning process to text data
    """
    tmp_text = underscore_to_space(text)
    tmp_text = no_char(tmp_text)
    tmp_text = preprocess_text(tmp_text)
    tmp_text = no_multi_spaces(tmp_text)
    return tmp_text


def split_token(inputs: str):
    """ Convert malform list to Python List Object & elementwise type casting
    """
    inputs = cleaning_words(inputs)
    tmp = inputs.split()
    result = list(map(int, tmp))
    return result


def split_list(inputs: List, max_length: int) -> List[List]:
    """ Split List into sub shorter list, which is longer than max_length
    """
    result = [inputs[i:i + max_length] for i in range(0, len(inputs), max_length)]
    return result


def flatten_sublist(inputs: List[List], max_length: int = 512) -> List[List]:
    """ Flatten Nested List to 1D-List """
    result = []
    for instance in tqdm(inputs):
        tmp = split_token(instance)
        if len(tmp) > max_length:
            tmp = split_list(tmp, max_length)
            for i in range(len(tmp)):
                result.append(tmp[i])
        else:
            result.append(tmp)
    return result


def preprocess4tokenizer(input_ids: List, token_type_ids: List, attention_mask: List):
    for i, inputs in tqdm(enumerate(input_ids)):
        if inputs[0] != 1:
            inputs.insert(0, 1)
            token_type_ids[i].insert(0, 0)
            attention_mask[i].insert(0, 1)
        if inputs[-1] != 2:
            inputs.append(2)
            token_type_ids[i].append(0)
            attention_mask[i].append(1)
    return input_ids, token_type_ids, attention_mask


def cut_instance(input_ids: List, token_type_ids: List, attention_mask: List, min_length: int = 256):
    n_input_ids, n_token_type_ids, n_attention_mask = [], [], []
    for i, inputs in tqdm(enumerate(input_ids)):
        if len(inputs) >= min_length:
            n_input_ids.append(inputs)
            n_token_type_ids.append(token_type_ids[i])
            n_attention_mask.append(attention_mask[i])
    return n_input_ids, n_token_type_ids, n_attention_mask


def save_pkl(input_dict: Any, filename: str) -> None:
    with open(f'{filename}.pkl', 'wb') as file:
        pickle.dump(input_dict, file)


def load_pkl(filepath: str) -> Any:
    """  Load pickle file
    Examples:
        filepath = './dataset_class/data_folder/train'
    """
    with open(f'{filepath}.pkl', 'rb') as file:
        output = pickle.load(file)
    return output


In [2]:
"""
1) Load Dataset, Tokenizer
2) Split Dataset, preprocess dataset for MLM Task
"""
ds = hf_load_dataset(CFG)
_, sub_ds = hf_split_dataset(CFG, ds['train'])
train, valid = hf_split_dataset(CFG, sub_ds)

NameError: name 'hf_load_dataset' is not defined

In [4]:
""" Apply preprocessing to dataset """

chunked_train = apply_preprocess(
    train,
    chunking,
    remove_columns=train.column_names
)

chunked_valid = apply_preprocess(
    valid,
    chunking,
    remove_columns=valid.column_names
)

Map (num_proc=4):   0%|          | 0/1025250 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/256313 [00:00<?, ? examples/s]

In [8]:
""" Grouping text data to fit the maximum input length for the model """

grouped_train = apply_preprocess(
    chunked_train,
    group_texts,
    num_proc=8,
)

grouped_valid = apply_preprocess(
    chunked_train,
    group_texts,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/1025250 [00:00<?, ? examples/s]

TimeoutError: 