In [1]:
import re
import pickle
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict, load_dataset
from typing import Dict, List, Tuple, Callable, Any
from configuration import CFG

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [96]:
""" Huggingface Tokenizer Experiment """
from transformers import AutoTokenizer

bpe_tokenizer = AutoTokenizer.from_pretrained('gpt2')  # roberta, gpt2
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # bert
sentencepiece_tokenizer = CFG.tokenizer  # deberta, T5

text = "trained train Pretrained Pretraining Pretrained"
print(f'BPE Tokenizer Output: {bpe_tokenizer.tokenize(text)}', end="\n\n")
print(f'Bert Tokenizer in HF Output: {bert_tokenizer.tokenize(text)}', end="\n\n")
print(f'Sentencepiece Tokenizer (BPE) Output: {sentencepiece_tokenizer.tokenize(text)}', end="\n\n")

BPE Tokenizer Output: ['trained', 'Ġtrain', 'ĠPret', 'rained', 'ĠPret', 'raining', 'ĠPret', 'rained']

Bert Tokenizer in HF Output: ['trained', 'train', 'pre', '##train', '##ed', 'pre', '##train', '##ing', 'pre', '##train', '##ed']

Sentencepiece Tokenizer (BPE) Output: ['▁trained', '▁train', '▁Pre', 'trained', '▁Pre', 'training', '▁Pre', 'trained']


In [92]:
""" Find tokenizer type in huggingface pretrained tokenizer """

bpe_tokenizer.__class__.__name__

'XLMRobertaTokenizerFast'

In [60]:
test = 'spm.model'
test[-6:] == '.model'

True

In [7]:
""" Helper Function """

def select_alphanumeric_and_non_english(text: str) -> str:
    pattern = re.compile(r'[^\w\d\s]|_')
    result = pattern.sub('', text)
    return result

def select_tokens(tokens: List[str]) -> List[str]:
    selected_tokens = [token for token in tokens if re.match(r'^[\w\d\s]+$', token) and '_' not in token]
    return selected_tokens

def select_post_string(token: str) -> str:
    pattern = re.compile(r'[^\w\d\s]|_')
    flag = False if re.match(pattern, token) else True
    return flag

def select_src_string(token: str) -> bool:
    """ set flag value for selecting src tokens to mask in sub-word
    Args:
        token: str, token to check
    """
    flag = False
    if tokenizer_type == 'SPM':
        flag = True if token.startswith("▁") else False

    elif tokenizer_type == 'BPE':
        flag = True if token.startswith("Ġ") else False

    elif tokenizer_type == 'WORDPIECE':
        pattern = re.compile(r'[^\w\d\s]|_')
        flag = False if re.match(pattern, token) else True
    return flag


text = "_Hello, World! 123 / Example_____ 테스트 文字列"
test = "나는는"
result = select_alphanumeric_and_non_english(text)
print(select_post_string(test))
print(select_src_string(test))

True
True


In [64]:
from transformers import AutoTokenizer, DataCollatorForWholeWordMask
import torch

# Example input text
input_text = "This is an example sentence for Span Masking algorithm. It is important to keep the token length below 100 for demonstration purposes."

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')

# Tokenize input text
input_tokens = tokenizer.tokenize(input_text)

# Whole Word Masking: Create a list of random labels (1 for [MASK], 0 for others)
mask_labels = [1 if token.startswith("▁") else 0 for token in input_tokens]

# Whole Word Masking: Apply masking to input tokens
input_tokens = tokenizer.convert_tokens_to_ids(input_tokens)
masked_tokens = DataCollatorForWholeWordMask(tokenizer)([input_tokens], mask_labels=mask_labels)

# Convert masked tokens back to text for visualization
masked_text = tokenizer.decode(masked_tokens[0], skip_special_tokens=True)

# Print results
print("Input Text:", input_text)
print("Masked Text:", masked_text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


KeyError: 0

In [44]:
# """ Test for Whole Word Masking """
# 
# mlm_probability = 0.15
# # input_tokens = """
# # trained train Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining trained train Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining trained train Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining Pretrained Pretraining  
# # """
# input_tokens = "trained! train! Pretrained? Pretraining. Pretrained. Pretraining. Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained! Pretraining! Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining? Pretrained? Pretraining, /Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining. Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining"
# 
# 
# # tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = CFG.tokenizer
# input_tokens = tokenizer(
#     input_tokens,
# )
# input_tokens["input_ids"]
# tokenizer_type = 'SPM'
# 
# 
# def select_post_string(token: str) -> bool:
#     """ set flag value for selecting post tokens to mask in sub-word
#     Args:
#         token: str, token to check
#     """
#     flag = False
#     if tokenizer_type == 'SPM':
#         pattern = re.compile(r'[^\w\d\s]|_')
#         flag = False if re.match(pattern, token[0]) else True
# 
#     elif tokenizer_type == 'BPE':
#         flag = False if token.startswith("Ġ") else True
# 
#     elif tokenizer_type == 'WORDPIECE':
#         flag = True if token.startswith("##") else False
# 
#     return flag
# 
# def select_src_string(token: str) -> bool:
#     """ set flag value for selecting src tokens to mask in sub-word
#     Args:
#         token: str, token to check
#     """
#     flag = False
#     if tokenizer_type == 'SPM':
#         flag = True if token.startswith("▁") else False
# 
#     elif tokenizer_type == 'BPE':
#         flag = True if token.startswith("Ġ") else False
# 
#     elif tokenizer_type == 'WORDPIECE':
#         pattern = re.compile(r'[^\w\d\s]|_')
#         flag = False if re.match(pattern, token) else True
#     return flag
# 
# 
# def get_padding_mask(input_id: Tensor) -> Tensor:
#     return torch.zeros(input_id.shape).bool()
# 
# 
# def _whole_word_mask(
#         input_tokens: List[str],
#         max_predictions: int = CFG.max_seq
# ) -> List[int]:
#     """ 
#     1) split input_tokens by space into single token
#     2) check if token is src token or post token
#         - if cand_indexes not empty and token is post token, append index to cand_indexes
#         - if token is src token, append index list to cand_indexes
#     """
#     cand_indexes = []
#     for i, token in enumerate(input_tokens):
#         if token == "[CLS]" or token == "[SEP]":
#             continue
#         if len(cand_indexes) >= 1 and select_post_string(token): 
#             cand_indexes[-1].append(i)
#         elif select_src_string(token):
#             cand_indexes.append([i])
#     print(cand_indexes)  # 여기서부터 변형해서 만들면 되겠다.
#     random.shuffle(cand_indexes)  # shuffle cand_indexes list
#     num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * mlm_probability))))
#     masked_lms = []
#     covered_indexes = set()
#     for index_set in cand_indexes:
#         if len(masked_lms) >= num_to_predict:
#             break
#         if len(masked_lms) + len(index_set) > num_to_predict:
#             continue
#         is_any_index_covered = False
#         for index in index_set:
#             if index in covered_indexes:
#                 is_any_index_covered = True
#                 break
#         if is_any_index_covered:
#             continue
#         for index in index_set:
#             covered_indexes.add(index)
#             masked_lms.append(index)
# 
#     if len(covered_indexes) != len(masked_lms):
#         raise ValueError("Length of covered_indexes is not equal to length of masked_lms.")
#     mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
#     return mask_labels
# 
# def get_mask_tokens(inputs, mask_labels):
#     """ Prepare masked tokens inputs/labels for masked language modeling(15%):
#     80% MASK, 10% random, 10% original. Set 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref
#     """
#     labels = inputs.clone()
#     probability_matrix = mask_labels
# 
#     special_tokens_mask = [
#         tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
#     ]
#     probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
#     if tokenizer.pad_token is not None:
#         padding_mask = labels.eq(tokenizer.pad_token_id)
#         probability_matrix.masked_fill_(padding_mask, value=0.0)
# 
#     masked_indices = probability_matrix.bool()
#     labels[~masked_indices] = -100  # We only compute loss on masked tokens
# 
#     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
#     indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
#     inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
# 
#     # 10% of the time, we replace masked input tokens with random word
#     indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
#     random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
#     inputs[indices_random] = random_words[indices_random]
# 
#     # The rest of the time (10% of the time) we keep the masked input tokens unchanged
#     return inputs, labels
# 
# def testing(batched):
#     """ Masking for MLM with whole-word tokenizing """
#     batched = batched["input_ids"]
#     input_ids = [torch.tensor(batched)]
#     padding_mask = [get_padding_mask(x) for x in input_ids]
#     padding_mask = pad_sequence(padding_mask, batch_first=True, padding_value=True)
#     input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
# 
#     mask_labels = []
#     ref_tokens = []
#     for input_id in batched:
#         token = tokenizer._convert_id_to_token(input_id)
#         ref_tokens.append(token)
#     mask_labels.append(_whole_word_mask(ref_tokens))
# 
#     mask_labels = [torch.tensor(x) for x in mask_labels]
#     mask_labels = pad_sequence(mask_labels, batch_first=True, padding_value=0)
#     inputs, labels = get_mask_tokens(
#         input_ids,
#         mask_labels
#     )
#     return inputs, labels
# 
# inputs, labels = testing(input_tokens)
# print(inputs, end="\n\n")
# print(labels)

[[1], [3], [5, 6], [8, 9], [11, 12], [14, 15], [17, 18], [20, 21], [23, 24], [26, 27], [29, 30], [32, 33], [35, 36], [38, 39], [41, 42], [44, 45], [47, 48], [50, 51], [53, 54], [56, 57], [59, 60], [62, 63], [65, 66], [68, 69], [71, 72], [74, 75], [77, 78], [80, 81], [83, 84, 85], [87, 88], [90, 91], [93, 94], [96, 97], [99, 100], [102, 103], [105, 106], [108, 109], [111, 112], [114, 115], [117, 118], [120, 121], [123, 124], [126, 127], [129, 130], [132, 133], [135, 136], [138, 139], [141, 142], [144, 145], [147, 148], [150, 151], [153, 154], [156, 157], [159, 160], [162, 163], [165, 166], [168, 169], [171, 172], [174, 175], [177, 178], [180, 181], [183, 184]]
tensor([[     1,   3266,    300,   2184,    300, 128000, 128000,    302,   3810,
          18782,    260,   3810,  16676,    260,   3810,  18782,    260,   3810,
          16676,    261,   3810,  18782,    261,   3810,  16676,    261,   3810,
          18782,    261,   3810,  16676,    261,   3810,  18782,    261,   3810,
        

In [62]:
""" Test for Span Masking Algorithm """

input_tokens = "trained! train! Pretrained? Pretraining. Pretrained. Pretraining. Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained! Pretraining! Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining? Pretrained? Pretraining, /Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining. Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining"

mlm_probability = 0.15
masking_budget = 0.15
span_probability = 0.2
max_span_length = 10

tokenizer = CFG.tokenizer
input_tokens = tokenizer(
    input_tokens,
)
input_tokens["input_ids"]
tokenizer_type = 'SPM'


def random_non_negative_integer(max_value: int):
    return random.randint(0, max_value)


def select_post_string(token: str) -> bool:
    """ set flag value for selecting post tokens to mask in sub-word
    Args:
        token: str, token to check
    """
    flag = False
    if tokenizer_type == 'SPM':
        pattern = re.compile(r'[^\w\d\s]|_')
        flag = False if re.match(pattern, token[0]) else True

    elif tokenizer_type == 'BPE':
        flag = False if token.startswith("Ġ") else True

    elif tokenizer_type == 'WORDPIECE':
        flag = True if token.startswith("##") else False

    return flag

def select_src_string(token: str) -> bool:
    """ set flag value for selecting src tokens to mask in sub-word
    Args:
        token: str, token to check
    """
    flag = False
    if tokenizer_type == 'SPM':
        flag = True if token.startswith("▁") else False

    elif tokenizer_type == 'BPE':
        flag = True if token.startswith("Ġ") else False

    elif tokenizer_type == 'WORDPIECE':
        pattern = re.compile(r'[^\w\d\s]|_')
        flag = False if re.match(pattern, token) else True
    return flag


def get_padding_mask(input_id: Tensor) -> Tensor:
    return torch.zeros(input_id.shape).bool()


def _whole_word_mask(
        input_tokens: List[str],
) -> List[int]:
    """
    0) apply Whole Word Masking Algorithm for make gathering original token index in natural language 
    1) calculate number of convert into masking tokens with masking budget*len(input_tokens)
    2) define span length of this iteration
        - span length follow geometric distribution
        - span length is limited by max_span_length
    """
    cand_indexes = []
    for i, token in enumerate(input_tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        if len(cand_indexes) >= 1 and select_post_string(token): 
            cand_indexes[-1].append(i)
        elif select_src_string(token):
            cand_indexes.append([i])
    
    l = len(input_tokens)
    src_l = len(cand_indexes)
    num_convert_tokens = int(masking_budget * l)  # 27
    budget = num_convert_tokens  # int is immutable object, so not need to copy manually 
    masked_lms = []
    covered_indexes = set()
    while budget:
        span_length = max(1, min(10, int(torch.distributions.Geometric(probs=span_probability).sample())))
        src_index = random_non_negative_integer(src_l-1)
        if span_length > budget:
            if budget < 5:  # 남은 예산이 너무 적은 경우 수많은 Iteration 발생을 막기 위해서 스팬 길이를 budget으로 설정
                span_length = budget
            else:
                continue 
        if cand_indexes[src_index][0] + span_length > l-1:  # 스팬의 마지막 토큰의 인덱스가 시퀀스 범위를 벗어나는 경우
            continue
        if len(cand_indexes[src_index]) > span_length:  # 처음부터 형태소를 마스킹하게 되는 경우
            continue
        span_token_index = cand_indexes[src_index][0]  # init span token index: src
        while span_length:    
            if span_length == 0:
                break
            if span_token_index in covered_indexes:  # 이미 마스킹 된 index 만나면 끝내고, 다음 순회 시작
                break
            else:  # 스팬 길이가 처음 선택 되었던 시작 토큰 인덱스가 해당되는 리스트 길이를 넘는 경우, 이후 선택되는 토큰은 wwm 위배 가능성
                covered_indexes.add(span_token_index)
                masked_lms.append(span_token_index)
                span_length -= 1
                budget -= 1
                span_token_index += 1
                continue

    if len(covered_indexes) != len(masked_lms):
        raise ValueError("Length of covered_indexes is not equal to length of masked_lms.")
    mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
    return mask_labels

def get_mask_tokens(inputs, mask_labels):
    """ All of masking tokens are replaced by tokenizer.mask_token ([MASK]) unlikely BERT 
    """
    labels = inputs.clone()
    probability_matrix = mask_labels
    
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer.pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
        
    masked_indices = probability_matrix.bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens
    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    return inputs, labels

def testing(batched):
    """ Masking for MLM with whole-word tokenizing """
    batched = batched["input_ids"]
    input_ids = [torch.tensor(batched)]
    padding_mask = [get_padding_mask(x) for x in input_ids]
    padding_mask = pad_sequence(padding_mask, batch_first=True, padding_value=True)
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)

    mask_labels = []
    ref_tokens = []
    for input_id in batched:
        token = tokenizer._convert_id_to_token(input_id)
        ref_tokens.append(token)
    mask_labels.append(_whole_word_mask(ref_tokens))
    mask_labels = [torch.tensor(x) for x in mask_labels]
    mask_labels = pad_sequence(mask_labels, batch_first=True, padding_value=0)
    inputs, labels = get_mask_tokens(
        input_ids,
        mask_labels
    )
    return inputs, labels

inputs, labels = testing(input_tokens)
print(inputs, end="\n\n")
print(labels)

[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]
27
tensor([[     1,   3266,    300, 128000,    300,   3810,  16676,    302,   3810,
          18782,    260,   3810,  16676,    260,   3810,  18782,    260,   3810,
          16676,    261,   3810,  18782,    261,   3810,  16676,    261,   3810,
          18782,    261,   3810,  16676,    261, 128000, 128000, 128000, 128000,
         128000, 128000, 128000,  18782,    261,   3810,  16676,    300,   3810,
          18782,    300,   3810

In [61]:
tokenizer._convert_id_to_token(2184)

'▁train'

In [49]:
tokenizer.vocab_files_names

{'vocab_file': 'spm.model', 'tokenizer_file': 'tokenizer.json'}

In [8]:
""" Experiment for ELECTRA get discriminator input
1) flatten logit tensor and label tensor
2) get highest logit
3) masked select for mlm masking index
4) get index of mlm masking index
5) index select for discriminator input 
"""
flat_logit = torch.tensor([99, 98, 97, 96])
test = torch.tensor([-100, -100, 1, 2, 3, 4])
mlm_mask_idx = torch.where(test != -100)

test2 = test.clone()
test[mlm_mask_idx] = flat_logit
test


tensor([-100, -100,   99,   98,   97,   96])

In [10]:
torch.eq(test, test2).long()

tensor([1, 1, 0, 0, 0, 0])

In [18]:

a = torch.tensor([1, 2])

test = torch.tensor([-100, -100, 1, 2, 3, 4])
test.view(-1, a.size(0))

tensor([[-100, -100],
        [   1,    2],
        [   3,    4]])

In [34]:
""" Experiment for Huggingface Tokenizer for Building MLM Algorithm 
meaning of special token in Huggingface Tokenizer is corrspoding to [CLS], [SEP], [MASK], [PAD] ... etc
"""

text = 'I am a boy [MASK] [MASK] are a girl [PAD]'
tokens = CFG.tokenizer(text)
input_ids = [torch.tensor(x) for x in tokens["input_ids"]]
special_tokens_mask = CFG.tokenizer.get_special_tokens_mask(input_ids, already_has_special_tokens=True)
special_tokens_mask

[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1]

In [35]:
""" Experiment for Huggingface Tokenizer for Building MLM Algorithm 
"""
mlm_probability = 0.15
probability_matrix = torch.full(torch.tensor(input_ids).shape, mlm_probability)

tensor([0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,
        0.1500, 0.1500, 0.1500])

In [36]:
""" Experiment for Huggingface Tokenizer for Building MLM Algorithm 
"""
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
masked_indices

TypeError: masked_fill_() received an invalid combination of arguments - got (list, value=float), but expected one of:
 * (Tensor mask, Tensor value)
      didn't match because some of the arguments have invalid types: (!list of [int, int, int, int, int, int, int, int, int, int, int, int]!, !value=float!)
 * (Tensor mask, Number value)
      didn't match because some of the arguments have invalid types: (!list of [int, int, int, int, int, int, int, int, int, int, int, int]!, !value=float!)


In [32]:
input_ids[~masked_indices] = -100

TypeError: only integer tensors of a single element can be converted to an index

In [99]:
def hf_load_dataset(cfg: CFG) -> DatasetDict:
    """ Load dataset from Huggingface Datasets
    Notes:
        This function is temporary just fit-able for Wikipedia dataset
    References:
        https://github.com/huggingface/datasets/blob/main/src/datasets/load.py#2247
    """
    dataset = load_dataset(cfg.hf_dataset, cfg.language)
    return dataset


def hf_split_dataset(cfg: CFG, dataset: Dataset) -> Tuple[Dataset, Dataset]:
    """ Split dataset from Huggingface Datasets with huggingface method "train_test_split"
    Args:
        cfg: configuration.CFG, needed to load split ratio, seed value
        dataset: Huggingface Datasets object, dataset from Huggingface Datasets
    Notes:
        This function is temporary just fit-able for Wikipedia dataset & MLM Task
    """
    dataset = dataset.train_test_split(cfg.split_ratio, seed=cfg.seed)
    train, valid = dataset['train'], dataset['test']
    return train, valid


def chunking(sequences: Dict, cfg: CFG = CFG) -> List[str]:
    """ Chunking sentence to token using pretrained tokenizer
    Args:
        cfg: configuration.CFG, needed to load pretrained tokenizer
        sequences: list, sentence to chunking
    References:
        https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling
    """
    return cfg.tokenizer([" ".join(x) for x in sequences['text']])


def group_texts(sequences: Dict, cfg: CFG = CFG) -> Dict:
    """ Dealing Problem: some of data instances are longer than the maximum input length for the model,
    This function is ONLY used to HF Dataset Object
    1) Concatenate all texts
    2) We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    3) customize this part to your needs
    4) Split by chunks of max_len
    """
    concatenated_sequences = {k: sum(sequences[k], []) for k in sequences.keys()}
    total_length = len(concatenated_sequences[list(sequences.keys())[0]])
    if total_length >= cfg.max_seq:
        total_length = (total_length // cfg.max_seq) * cfg.max_seq
    result = {
        k: [t[i: i + cfg.max_seq] for i in range(0, total_length, cfg.max_seq)]
        for k, t in concatenated_sequences.items()
    }
    return result


def apply_preprocess(dataset: Dataset, function: Callable, batched: bool = True, num_proc: int = 4, remove_columns: any = None) -> Dataset:
    """ Apply preprocessing to text data, which is using huggingface dataset method "map()"
    for pretrained training (MLM, CLM)
    Args:
        dataset: Huggingface Datasets object, dataset from Huggingface Datasets
        function: Callable, function that you want to apply
        batched: bool, default True, if you want to apply function to batched data, set True
        num_proc: int, default 4, number of process for multiprocessing
        remove_columns: any, default None, if you want to remove some columns, set column name
    References:
        https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling
    """
    mapped_dataset = dataset.map(
        function,
        batched=batched,
        num_proc=num_proc,
        remove_columns=remove_columns,
    )
    return mapped_dataset


def load_data(data_path: str) -> pd.DataFrame:
    """ Load data_folder from csv file like as train.csv, test.csv, val.csv
    """
    df = pd.read_csv(data_path)
    return df


def no_char(text):
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\^[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\s+[a-zA-Z]$", " ", text)
    return text


def no_multi_spaces(text):
    return re.sub(r"\s+", " ", text, flags=re.I)


def underscore_to_space(text: str):
    text = text.replace("_", " ")
    text = text.replace("-", " ")
    return text


def preprocess_text(source):
    """ Remove all the special characters
    """
    source = re.sub(r'\W', ' ', str(source))
    source = re.sub(r'^b\s+', '', source)
    source = source.lower()
    return source


def cleaning_words(text: str) -> str:
    """ Apply all of cleaning process to text data
    """
    tmp_text = underscore_to_space(text)
    tmp_text = no_char(tmp_text)
    tmp_text = preprocess_text(tmp_text)
    tmp_text = no_multi_spaces(tmp_text)
    return tmp_text


def split_token(inputs: str):
    """ Convert malform list to Python List Object & elementwise type casting
    """
    inputs = cleaning_words(inputs)
    tmp = inputs.split()
    result = list(map(int, tmp))
    return result


def split_list(inputs: List, max_length: int) -> List[List]:
    """ Split List into sub shorter list, which is longer than max_length
    """
    result = [inputs[i:i + max_length] for i in range(0, len(inputs), max_length)]
    return result


def flatten_sublist(inputs: List[List], max_length: int = 512) -> List[List]:
    """ Flatten Nested List to 1D-List """
    result = []
    for instance in tqdm(inputs):
        tmp = split_token(instance)
        if len(tmp) > max_length:
            tmp = split_list(tmp, max_length)
            for i in range(len(tmp)):
                result.append(tmp[i])
        else:
            result.append(tmp)
    return result


def preprocess4tokenizer(input_ids: List, token_type_ids: List, attention_mask: List):
    for i, inputs in tqdm(enumerate(input_ids)):
        if inputs[0] != 1:
            inputs.insert(0, 1)
            token_type_ids[i].insert(0, 0)
            attention_mask[i].insert(0, 1)
        if inputs[-1] != 2:
            inputs.append(2)
            token_type_ids[i].append(0)
            attention_mask[i].append(1)
    return input_ids, token_type_ids, attention_mask


def cut_instance(input_ids: List, token_type_ids: List, attention_mask: List, min_length: int = 256):
    n_input_ids, n_token_type_ids, n_attention_mask = [], [], []
    for i, inputs in tqdm(enumerate(input_ids)):
        if len(inputs) >= min_length:
            n_input_ids.append(inputs)
            n_token_type_ids.append(token_type_ids[i])
            n_attention_mask.append(attention_mask[i])
    return n_input_ids, n_token_type_ids, n_attention_mask


def save_pkl(input_dict: Any, filename: str) -> None:
    with open(f'{filename}.pkl', 'wb') as file:
        pickle.dump(input_dict, file)


def load_pkl(filepath: str) -> Any:
    """  Load pickle file
    Examples:
        filepath = './dataset_class/data_folder/train'
    """
    with open(f'{filepath}.pkl', 'rb') as file:
        output = pickle.load(file)
    return output


In [2]:
"""
1) Load Dataset, Tokenizer
2) Split Dataset, preprocess dataset for MLM Task
"""
ds = hf_load_dataset(CFG)
_, sub_ds = hf_split_dataset(CFG, ds['train'])
train, valid = hf_split_dataset(CFG, sub_ds)

NameError: name 'hf_load_dataset' is not defined

In [4]:
""" Apply preprocessing to dataset """

chunked_train = apply_preprocess(
    train,
    chunking,
    remove_columns=train.column_names
)

chunked_valid = apply_preprocess(
    valid,
    chunking,
    remove_columns=valid.column_names
)

Map (num_proc=4):   0%|          | 0/1025250 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/256313 [00:00<?, ? examples/s]

In [8]:
""" Grouping text data to fit the maximum input length for the model """

grouped_train = apply_preprocess(
    chunked_train,
    group_texts,
    num_proc=8,
)

grouped_valid = apply_preprocess(
    chunked_train,
    group_texts,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/1025250 [00:00<?, ? examples/s]

TimeoutError: 

In [100]:
input_text = "trained, train, Pretrained, Pretraining. Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining. Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining, Pretrained, Pretraining"

cleaning_words(input_text)

'trained train pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining pretrained pretraining'