In [1]:
import os; import psutil; import timeit
from datasets import load_dataset
import logging
from transformers import AutoTokenizer
from icecream import ic
ic.configureOutput(includeContext=True, argToStringFunction=lambda _: str(_))
ic.lineWrapWidth = 120


logger = logging.getLogger()
logging.basicConfig(
    level=logging.INFO, datefmt='%y-%m-%d %H:%M',
    format='%(asctime)s %(filename)s %(lineno)d: %(message)s')

cache_dir = '/mnt/nas1/huggingface/cache'
gpt2_model_name_or_path = '/mnt/nas1/models/gpt2'
bert_model_name_or_path = '/mnt/nas1/models/bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name_or_path)
example = "My name is Sylvain, and I work at Hugging Face in Brooklyn 我是中国人81st Street"
example2 = " = Valkyria Chronicles III = \n"
encoding = tokenizer(example2, return_offsets_mapping=True)
ic(type(encoding), tokenizer.is_fast)
ic(encoding.tokens(), encoding.word_ids())
ic(encoding)

start, end = encoding.word_to_chars(3)
ic(example[start:end])

inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
ic(example, tokens, offsets)

tokenizer_bert = AutoTokenizer.from_pretrained(bert_model_name_or_path)
inputs_with_offsets = tokenizer_bert(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
ic(tokens, offsets);

  from .autonotebook import tqdm as notebook_tqdm
ic| 1155108691.py:23 in <module>
    type(encoding): <class 'transformers.tokenization_utils_base.BatchEncoding'>
    tokenizer.is_fast: True
ic| 1155108691.py:24 in <module>
    encoding.tokens(): ['Ġ=', 'ĠV', 'alky', 'ria', 'ĠChronicles', 'ĠIII', 'Ġ=', 'Ġ', 'Ċ']
    encoding.word_ids(): [0, 1, 1, 1, 2, 3, 4, 5, 5]
ic| 1155108691.py:25 in <module>
    encoding: {'input_ids': [796, 569, 18354, 7496, 17740, 6711, 796, 220, 198], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 2), (2, 4), (4, 8), (8, 11), (11, 22), (22, 26), (26, 28), (28, 29), (29, 30)]}
ic| 1155108691.py:28 in <module>- example[start:end]: d I 
ic| 1155108691.py:33 in <module>
    example: My name is Sylvain, and I work at Hugging Face in Brooklyn 我是中国人81st Street
    tokens: ['My', 'Ġname', 'Ġis', 'ĠSylv', 'ain', ',', 'Ġand', 'ĠI', 'Ġwork', 'Ġat', 'ĠHug', 'ging', 'ĠFace', 'Ġin', 'ĠBrooklyn', 'Ġæ', 'Ī', 'ĳ', 'æĺ¯', 'ä¸Ń', 'åĽ', '½', 'äºº', '81', 's

In [1]:
import transformers
import logging, sys, os
from datasets import load_dataset
from transformers import AutoTokenizer
sys.path.append(os.path.abspath('.'))
import logging
from icecream import ic
ic.configureOutput(includeContext=True, argToStringFunction=lambda _: str(_))
ic.lineWrapWidth = 120
from itertools import chain


logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)
cache_dir = '/mnt/nas1/huggingface/cache'
gpt2_model_name_or_path = '/mnt/nas1/models/gpt2'
tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name_or_path)
ic(tokenizer.model_max_length)


def load_wikitext_2_raw_v1(offline=True, verbose=True):
    """  
    DatasetDict({
        train: Dataset({
            features: ['text'],
            num_rows: 36718
        })
        test: Dataset({
            features: ['text'],
            num_rows: 4358
        })
        validation: Dataset({
            features: ['text'],
            num_rows: 3760
        })
    })    
    """
    wikitext_2_raw_v1_dir = '/mnt/nas1/huggingface/wikitext/wikitext-2-raw-v1'
    logger.info('load_wikitext_2_raw_v1')
    if offline:
        data_files = {
            'train': wikitext_2_raw_v1_dir + '/train/' + '0000.parquet',
            'test': wikitext_2_raw_v1_dir + '/test/' + '0000.parquet',
            'validation': wikitext_2_raw_v1_dir + '/validation/' + '0000.parquet',
        }
        raw_datasets = load_dataset(
            'parquet',
            data_files=data_files,
            cache_dir=cache_dir,
        )
    else:
        raw_datasets = load_dataset(
            'wikitext',
            'wikitext-2-raw-v1',
            cache_dir=cache_dir,
        )
    if verbose:
        logger.info(raw_datasets)
        train_dataset = raw_datasets['train']
        count = 0
        for item in train_dataset:
            logger.info(item)
            count += 1
            if count > 10:
                break
    return raw_datasets


def test():
    """  """
    from transformers.testing_utils import CaptureLogger

    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
    text_column_name = 'text'
    raw_datasets = load_wikitext_2_raw_v1(verbose=False)

    def tokenize_function(examples):
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output
    
    train_dataset = raw_datasets['train']
    logger.info(train_dataset[10])
    output = tokenize_function(train_dataset[10])
    logger.info(output)

    column_names = list(raw_datasets["train"].features)
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on dataset",
    )
    # logger.info(tokenized_datasets['train'])
    logger.info(tokenized_datasets['train'][10])

    block_size = 1024

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        logger.info(examples)
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
    
    out = group_texts(tokenized_datasets['train'][10])
    logger.info(out)
    # lm_datasets = tokenized_datasets.map(
    #     group_texts,
    #     batched=True,
    #     num_proc=4,
    #     load_from_cache_file=True,
    #     desc=f"Grouping texts in chunks of {block_size}",
    # )

test()

  from .autonotebook import tqdm as notebook_tqdm
ic| 1337538979.py:23 in <module>- tokenizer.model_max_length: 1000000000000000019884624838656


10/01/2023 17:53:18 - INFO - torch.distributed.nn.jit.instantiator - Created a temporary directory at /tmp/tmpemmv_m6u
10/01/2023 17:53:18 - INFO - torch.distributed.nn.jit.instantiator - Writing /tmp/tmpemmv_m6u/_remote_module_non_scriptable.py
10/01/2023 17:53:18 - INFO - __main__ - load_wikitext_2_raw_v1
10/01/2023 17:53:19 - INFO - __main__ - {'text': ' The game \'s battle system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if someth

TypeError: 'int' object is not iterable

In [2]:
from transformers.testing_utils import CaptureLogger
import logging, sys, os

msg = "Testing 1, 2, 3"
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
with CaptureLogger(logger) as cl:
    logger.info(msg)

logger.info('ok')
logger.info(cl.out)
assert cl.out, msg + "\n"

10/01/2023 16:22:16 - INFO - __main__ - Testing 1, 2, 3
10/01/2023 16:22:16 - INFO - __main__ - ok
10/01/2023 16:22:16 - INFO - __main__ - Testing 1, 2, 3

