In [6]:
import os; import psutil; import timeit
from datasets import load_dataset
import logging
from transformers import AutoTokenizer
from icecream import ic
# Configure icecream: prefix every ic() line with its call-site context and
# render each argument with plain str().
ic.configureOutput(includeContext=True, argToStringFunction=lambda _: str(_))
# Set icecream's line-wrap width to 120 characters.
ic.lineWrapWidth = 120

# Root logger used by this cell.
logger = logging.getLogger()
# force=True: basicConfig() silently does nothing when the root logger already
# has handlers (this cell was re-run, or another cell configured logging first in
# the same kernel). Forcing replaces them so the format below always applies.
logging.basicConfig(
    level=logging.INFO, datefmt='%y-%m-%d %H:%M',
    format='%(asctime)s %(filename)s %(lineno)d: %(message)s',
    force=True)

# Local filesystem locations of the dataset cache and the model directories.
cache_dir = '/mnt/nas1/huggingface/cache'
gpt2_model_name_or_path = '/mnt/nas1/models/gpt2'
bert_model_name_or_path = '/mnt/nas1/models/bert-base-cased'

# Load the GPT-2 tokenizer from the local model directory.
tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name_or_path)
# Two sample inputs: a sentence mixing English, Chinese and digits, and a
# heading-style line with surrounding spaces and a trailing newline.
example = "My name is Sylvain, and I work at Hugging Face in Brooklyn 我是中国人81st."
example2 = " = Valkyria Chronicles III = \n"
examples = [example, example2]
# Encode the second sample on its own and ask for per-token character offsets.
encoding2 = tokenizer(example2, return_offsets_mapping=True)
# Show the tokens, their word ids and their (start, end) character offsets.
ic(encoding2.tokens(), encoding2.word_ids(), encoding2['offset_mapping'])

# Switch this tokenizer to left padding (set explicitly here, not relying on its default).
tokenizer.padding_side = "left"
# No pad token is defined at this point; the recorded output prints None.
ic(tokenizer.pad_token)  # Default pad token is None
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token
ic(tokenizer.eos_token)
# Tokenize both samples as a padded batch of PyTorch tensors, truncated to at most
# 12 tokens; the shorter sample is padded on the left with the EOS id.
pad_result = tokenizer(examples, padding=True, return_tensors="pt", truncation=True, max_length=12)
ic(pad_result)

# Encode both samples as one batch, this time without offset mappings.
encodings = tokenizer(examples, return_offsets_mapping=False)
# Inspect the type of the batch container and of a single indexed entry.
ic(type(encodings), type(encodings[0]))
# NOTE(review): tokens() / word_ids() are called without a batch index, so they
# presumably describe only the first sample - confirm against the BatchEncoding docs.
# The third argument is the `word_ids` attribute of the first entry (not a call).
ic(encodings.tokens(), encodings.word_ids(), encodings[0].word_ids)
ic(encodings)
# Character span of word index 3 in the first entry, sliced back out of `example`.
start, end = encodings[0].word_to_chars(3)
ic(start, end, example[start:end])

# Rebinds `examples` (a flat list of strings earlier in this cell) to two
# two-element lists - presumably treated as (first, second) sentence pairs; confirm.
examples = [["Hello, y'all!", "How are you doing?"], 
            ["playing video game developed by Sega", "Valkyria Chronicles III is a tactical role"]]
# Load the cased BERT tokenizer from the local model directory.
tokenizer_bert = AutoTokenizer.from_pretrained(bert_model_name_or_path)
# Encode the batch and request per-token character offsets.
inputs_with_offsets = tokenizer_bert(
    # [example, 'test'], [example2, 'test2'],
    examples,
    return_offsets_mapping=True)
# NOTE(review): tokens() is called without a batch index while "offset_mapping"
# is read for the whole batch, so the two values presumably do not line up
# one-to-one - confirm before comparing them.
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
# The trailing semicolon keeps the notebook from echoing ic()'s return value.
ic(tokens, offsets);

ic| 1736770681.py:23 in <module>
    encoding2.tokens(): ['Ġ=', 'ĠV', 'alky', 'ria', 'ĠChronicles', 'ĠIII', 'Ġ=', 'Ġ', 'Ċ']
    encoding2.word_ids(): [0, 1, 1, 1, 2, 3, 4, 5, 5]
    encoding2['offset_mapping']: [(0, 2), (2, 4), (4, 8), (8, 11), (11, 22), (22, 26), (26, 28), (28, 29), (29, 30)]
Using pad_token, but it is not set yet.
ic| 1736770681.py:27 in <module>- tokenizer.pad_token: None
ic| 1736770681.py:30 in <module>- tokenizer.eos_token: <|endoftext|>
ic| 1736770681.py:32 in <module>
    pad_result: {'input_ids': tensor([[ 3666,  1438,   318, 24286,   391,    11,   290,   314,   670,   379,
                         12905,  2667],
                        [50256, 50256, 50256,   796,   569, 18354,  7496, 17740,  6711,   796,
                           220,   198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                        [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
ic| 1736770681.py:36 in <module>
    type(encodings): <class 'transformers.tokenization_u

In [1]:
import transformers
import logging, sys, os
from datasets import load_dataset
from transformers import AutoTokenizer
sys.path.append(os.path.abspath('.'))
import logging
from icecream import ic
ic.configureOutput(includeContext=True, argToStringFunction=lambda _: str(_))
ic.lineWrapWidth = 120
from itertools import chain


# Module-level logger; records propagate to the root handler configured below.
logger = logging.getLogger(__name__)
# force=True: basicConfig() is a silent no-op when the root logger already has
# handlers (cell re-run, or logging configured by an earlier cell in the same
# kernel). Forcing replaces them so this format and the stdout handler apply.
logging.basicConfig(
    format="%(asctime)s - %(lineno)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)
# Local filesystem locations of the dataset cache and the GPT-2 model directory.
cache_dir = '/mnt/nas1/huggingface/cache'
gpt2_model_name_or_path = '/mnt/nas1/models/gpt2'
# Load the GPT-2 tokenizer from the local model directory; test() further down in
# this cell reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name_or_path)
# Print the tokenizer's configured maximum length (a very large number in the
# recorded output).
ic(tokenizer.model_max_length)


def load_wikitext_2_raw_v1(offline=True, verbose=True, preview_rows=10):
    """Load the WikiText-2 (raw, v1) dataset and optionally log a short preview.

    Uses the module-level ``cache_dir`` and ``logger``.

    Args:
        offline: When true, read the ``train`` / ``test`` / ``validation`` parquet
            files from the local directory hard-coded below. Otherwise call
            ``load_dataset('wikitext', 'wikitext-2-raw-v1')``.
        verbose: When true, log the first ``preview_rows`` rows of the train split.
        preview_rows: Number of train rows to log when ``verbose`` is set.

    Returns:
        Whatever ``load_dataset`` returns. In the recorded run it was::

            DatasetDict({
                train: Dataset({
                    features: ['text'],
                    num_rows: 36718
                })
                test: Dataset({
                    features: ['text'],
                    num_rows: 4358
                })
                validation: Dataset({
                    features: ['text'],
                    num_rows: 3760
                })
            })
    """
    wikitext_2_raw_v1_dir = '/mnt/nas1/huggingface/wikitext/wikitext-2-raw-v1'
    logger.info('load_wikitext_2_raw_v1')
    if offline:
        # One parquet shard per split, laid out as <dir>/<split>/0000.parquet.
        data_files = {
            split: f'{wikitext_2_raw_v1_dir}/{split}/0000.parquet'
            for split in ('train', 'test', 'validation')
        }
        raw_datasets = load_dataset(
            'parquet',
            data_files=data_files,
            cache_dir=cache_dir,
        )
    else:
        raw_datasets = load_dataset(
            'wikitext',
            'wikitext-2-raw-v1',
            cache_dir=cache_dir,
        )
    logger.info(raw_datasets)
    if verbose:
        # The previous counter-based loop tested `count > 10` after incrementing and
        # therefore logged 11 rows; stop after exactly `preview_rows` rows instead.
        for row_index, item in enumerate(raw_datasets['train']):
            if row_index >= preview_rows:
                break
            logger.info(item)
    return raw_datasets


def test():
    """Tokenize WikiText-2 with the module-level tokenizer and group it into LM blocks.

    Steps:
      1. Load the raw dataset with ``load_wikitext_2_raw_v1`` and log a few train rows.
      2. Tokenize the first four train rows directly, then tokenize every split
         through ``map`` and assert that both paths agree on those four rows.
      3. Concatenate the tokenized rows of each batch and cut them into chunks of
         ``block_size`` tokens, adding a ``labels`` column copied from ``input_ids``.

    Relies on the module-level ``tokenizer`` and ``logger``. Returns ``None``;
    all results are only logged.

    Raises:
        AssertionError: If the mapped tokenization of the first four train rows
            differs from tokenizing them directly.
    """
    from transformers.testing_utils import CaptureLogger

    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
    text_column_name = 'text'
    raw_datasets = load_wikitext_2_raw_v1(verbose=False)

    def tokenize_function(examples):
        """Tokenize the text column of one batch, capturing the tokenizer's log output."""
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output

    train_dataset = raw_datasets['train']
    logger.info(train_dataset[10])
    # Log the character length and the text of the first four raw rows.
    for i in range(4):
        logger.info(len(train_dataset[i]['text']))
        logger.info(train_dataset[i]['text'])
    # Reference result: tokenize the first four rows directly, outside of map().
    output = tokenize_function(train_dataset[:4])
    logger.info(output)

    column_names = list(raw_datasets["train"].features)
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on dataset",
    )
    logger.info(tokenized_datasets)
    # Log the token count and the token ids of the first four tokenized rows.
    for i in range(4):
        logger.info(len(tokenized_datasets['train'][i]['input_ids']))
        logger.info(tokenized_datasets['train'][i]['input_ids'])
    # The mapped result must match the direct tokenization of the same rows.
    assert (tokenized_datasets['train'][:4] == output)

    block_size = 1024

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        """Concatenate every column of one batch and split it into block_size chunks."""
        # Concatenate all texts.
        # len(examples), list(examples.keys()), len(examples['input_ids'])
        # 2, ['input_ids', 'attention_mask'], 1000 which is batch_size.
        logger.info('%s, %s, %s', len(examples), list(examples.keys()), len(examples['input_ids']))
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=4,
        # load_from_cache_file=False,
        desc=f"Grouping texts in chunks of {block_size}",
    )
    logger.info(lm_datasets)
    # Previously `lm_datasets["train"]` was bound to a local that was never read;
    # log the number of grouped training blocks instead of leaving a dead assignment.
    logger.info(len(lm_datasets["train"]))


# Run the tokenize-and-group walkthrough when this cell executes.
test()

  from .autonotebook import tqdm as notebook_tqdm
ic| 4180611331.py:23 in <module>- tokenizer.model_max_length: 1000000000000000019884624838656


10/02/2023 14:27:25 - 21 - torch.distributed.nn.jit.instantiator - Created a temporary directory at /tmp/tmpzr47u8nj
10/02/2023 14:27:25 - 76 - torch.distributed.nn.jit.instantiator - Writing /tmp/tmpzr47u8nj/_remote_module_non_scriptable.py
10/02/2023 14:27:25 - 44 - __main__ - load_wikitext_2_raw_v1
10/02/2023 14:27:26 - 62 - __main__ - DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
10/02/2023 14:27:26 - 95 - __main__ - {'text': ' The game \'s battle system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn 

Grouping texts in chunks of 1024 (num_proc=4):   0%|          | 0/36718 [00:00<?, ? examples/s]

10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):   3%|▎         | 1000/36718 [00:00<00:09, 3753.39 examples/s]

10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):   8%|▊         | 3000/36718 [00:00<00:04, 7471.10 examples/s]

10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  19%|█▉        | 7000/36718 [00:00<00:02, 11483.15 examples/s]

10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:27 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  38%|███▊      | 14000/36718 [00:01<00:01, 16646.98 examples/s]

10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  44%|████▎     | 16000/36718 [00:01<00:01, 14914.66 examples/s]

10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  49%|████▉     | 18000/36718 [00:01<00:01, 15787.59 examples/s]

10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  54%|█████▍    | 20000/36718 [00:01<00:01, 14790.73 examples/s]

10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  65%|██████▌   | 24000/36718 [00:01<00:00, 16636.94 examples/s]

10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  71%|███████   | 26000/36718 [00:01<00:00, 15585.84 examples/s]

10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:28 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  84%|████████▍ | 31000/36718 [00:02<00:00, 16338.42 examples/s]

10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 179
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  90%|█████████ | 33179/36718 [00:02<00:00, 16193.12 examples/s]

10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 179
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 180


Grouping texts in chunks of 1024 (num_proc=4):  99%|█████████▉| 36358/36718 [00:02<00:00, 17953.34 examples/s]

10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 180


Grouping texts in chunks of 1024 (num_proc=4): 100%|██████████| 36718/36718 [00:02<00:00, 13777.95 examples/s]
Grouping texts in chunks of 1024 (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:29 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000
10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 1000


Grouping texts in chunks of 1024 (num_proc=4):  23%|██▎       | 1000/4358 [00:00<00:01, 3350.62 examples/s]

10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 90
10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 90
10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 89
10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 89


Grouping texts in chunks of 1024 (num_proc=4): 100%|██████████| 4358/4358 [00:00<00:00, 8176.10 examples/s] 
Grouping texts in chunks of 1024 (num_proc=4):   0%|          | 0/3760 [00:00<?, ? examples/s]

10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 940
10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 940
10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 940
10/02/2023 14:27:30 - 124 - __main__ - 2, ['input_ids', 'attention_mask'], 940


Grouping texts in chunks of 1024 (num_proc=4): 100%|██████████| 3760/3760 [00:00<00:00, 5843.95 examples/s]

10/02/2023 14:27:31 - 146 - __main__ - DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2314
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 272
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 240
    })
})
10/02/2023 14:27:31 - 147 - __main__ - 2314





In [2]:
from transformers.testing_utils import CaptureLogger
import logging, sys, os

# Message logged inside the capture context and expected back from it.
msg = "Testing 1, 2, 3"
logger = logging.getLogger(__name__)
# force=True: basicConfig() is a silent no-op when the root logger already has
# handlers (e.g. another cell configured logging earlier in this kernel); forcing
# guarantees the INFO level and this format are actually in effect.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
    force=True,
)
# Everything `logger` emits inside the context is collected into cl.out.
with CaptureLogger(logger) as cl:
    logger.info(msg)

logger.info('ok')
logger.info(cl.out)
# The earlier `assert cl.out, msg + "\n"` only checked that cl.out was non-empty
# (the second operand is the failure message); compare the captured text instead.
assert cl.out == msg + "\n", repr(cl.out)

10/01/2023 16:22:16 - INFO - __main__ - Testing 1, 2, 3
10/01/2023 16:22:16 - INFO - __main__ - ok
10/01/2023 16:22:16 - INFO - __main__ - Testing 1, 2, 3

