In [1]:
import multiprocessing, torch

from transformers import AutoTokenizer, PreTrainedTokenizerFast, GPT2LMHeadModel, AutoConfig, PreTrainedModel, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from tqdm.auto import tqdm
from time import strftime, time, localtime
from os import listdir
from functools import partial

print(torch.__version__)

2.0.0+cu117


In [2]:
model_path = "skt/kogpt2-base-v2"
# tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path, 
                                                    # bos_token='</s>', eos_token='</s>', unk_token='<unk>',
                                                    # pad_token='<pad>', mask_token='<mask>', max_length=512)
# model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
model = GPT2LMHeadModel.from_pretrained(model_path)

In [3]:
# dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
# dataset = load_dataset("wikipedia", "20220301.en")
# dataset = dataset.remove_columns([col for col in dataset.column_names if col != 'text'])
dataset = load_dataset("bookcorpus", split="train", streaming=True).with_format('torch')
# dataset = load_dataset("bookcorpus", split="train")
# len(dataset)

In [4]:
def get_time_dir(): return f"tokenizer_{strftime('%m-%d-%H-%M', localtime(time()))}"

In [5]:
def batch_iterator(batch_size=10000):
    for i in tqdm(range(0, len(dataset), batch_size)):
        yield dataset[i:i+batch_size]['text']
if [_dir for _dir in listdir() if "tokenizer" in _dir] != []:
    latest_tokenizer_path = sorted([_dir for _dir in listdir() if "tokenizer" in _dir])[-1]
    tokenizer = PreTrainedTokenizerFast.from_pretrained(latest_tokenizer_path)
else:
    tokenizer = PreTrainedTokenizerFast.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=32_000)
    tokenizer.save_pretrained(get_time_dir())

tokenizer.pad_token = '<pad>'
tokenizer.eos_token = '</s>'
tokenizer.bos_token = '</s>'
tokenizer.unk_token = '<unk>'
tokenizer.mask_token = '<mask>'
tokenizer.model_max_length = 128

In [6]:
num_proc = multiprocessing.cpu_count()
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

The max length for the tokenizer is: 128


In [7]:
def group_texts(examples, tokenizer=tokenizer):
    tokenized_inputs = tokenizer(
        examples['text'], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length, padding=True
    )
    return tokenized_inputs

In [8]:
# tokenized_dataset = dataset.map(partial(group_texts, tokenizer=tokenizer), batched=True, remove_columns=['text'], num_proc=num_proc)
tokenized_dataset = dataset.map(partial(group_texts, tokenizer=tokenizer), batched=True, remove_columns=['text'])
tokenized_dataset.features

In [9]:
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size = len(tokenizer),
    n_ctx = tokenizer.model_max_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [10]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 110.4M parameters


In [11]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [12]:
args = TrainingArguments(
    output_dir=f"AutocompleteLM-{get_time_dir()}",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    # num_train_epochs=1,
    weight_decay=.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    max_steps=74004228 * (num_train_epochs:=1),
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [13]:
trainer.train()



  0%|          | 0/74004228 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
s = "안녕하세요, "
tokens = tokenizer(s, return_tensors="pt")
output = model.generate(**tokens)



In [None]:
tokens['input_ids'].shape, output.shape

In [None]:
tokenizer.decode(output[0])