In [1]:
import torch
import torch.nn as nn
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, Trainer, default_data_collator, TrainingArguments
from datasets import load_dataset
import math
import os

In [2]:
assert torch.cuda.is_available()
os.chdir(os.path.abspath(""))
os.getcwd()

'/root/CS408-Team-2/model'

In [3]:
NUM_WORKERS = 8
TRAIN_TEST_RATIO = 90
BLOCK_SIZE = 512
NUM_EPOCHS = 2

In [4]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
    pad_token='<pad>', mask_token='<mask>')

koGPT2 = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [5]:
dataset = load_dataset("text", data_files="/root/CS408-Team-2/preprocessed.txt")
dataset["train"] = load_dataset("text", data_files="/root/CS408-Team-2/preprocessed.txt", split = f"train[:{TRAIN_TEST_RATIO}%]")
dataset["val"] = load_dataset("text", data_files="/root/CS408-Team-2/preprocessed.txt", split = f"train[{TRAIN_TEST_RATIO}%:]")
col_names = dataset["train"].column_names

def _tokenizer(example):
        return tokenizer(example["text"])

tokenized_dataset = dataset.map(
        _tokenizer,
        batched = True,
        num_proc = NUM_WORKERS,
        remove_columns = col_names
)



Using custom data configuration default-9759f7442d8339e3
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-9759f7442d8339e3
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
Using custom data configuration default-9759f7442d8339e3
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-6b85a7399629baab.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-21f2a9f521f61c52.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5

In [6]:
def group_text(text):
    concated_text = {k: sum(text[k], []) for k in text.keys()}
    length = len(concated_text[list(text.keys())[0]])
    length = length // BLOCK_SIZE * BLOCK_SIZE
    groupped = {
        k: [l[i: i+ BLOCK_SIZE] for i in range(0, length, BLOCK_SIZE)]
        for k, l in concated_text.items()
    }
    groupped["labels"] = groupped["input_ids"].copy()
    return groupped

groupped_dataset = tokenized_dataset.map(
    group_text,
    batched = True,
    num_proc= NUM_WORKERS,
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-53fc6ffd08169412.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-f7eda142ad27eaea.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-e927132e2206735e.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-4b101fabb8b72296.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-9759f7442d8339e3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-aba5fb9aadbe90ed.arrow


In [7]:
args = TrainingArguments(
    output_dir="./run/",
    per_device_train_batch_size=4,
    fp16=True
)
trainer = Trainer(
    model = koGPT2,
    args = args,
    train_dataset = groupped_dataset["train"],
    eval_dataset = groupped_dataset["val"],
    tokenizer = tokenizer,
    data_collator = default_data_collator
)

Using amp fp16 backend


In [8]:
for i in range(NUM_EPOCHS):
    train_result = trainer.train()

    trainer.save_model()
    trainer.log_metrics("train", train_result.metrics)
    trainer.save_metrics("train", train_result.metrics)
    trainer.save_state()

    val_result = trainer.evaluate()

    try:
        perplexity = math.exp(val_result["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    val_result["perplexity"] = perplexity

    trainer.log_metrics("eval", val_result)
    trainer.save_metrics("eval", val_result)

    trainer.create_model_card()

***** Running training *****
  Num examples = 278
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 210


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./model/run/
Configuration saved in ./model/run/config.json
Model weights saved in ./model/run/pytorch_model.bin
tokenizer config file saved in ./model/run/tokenizer_config.json
Special tokens file saved in ./model/run/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 23528
  Batch size = 8


***** train metrics *****
  epoch                    =        3.0
  total_flos               =   202951GF
  train_loss               =     3.8265
  train_runtime            = 0:01:22.98
  train_samples_per_second =      10.05
  train_steps_per_second   =      2.531


KeyboardInterrupt: 

In [13]:
model = GPT2LMHeadModel.from_pretrained("/root/CS408-Team-2/model/run/", local_files_only = True)

In [21]:
from transformers import pipeline
lm = pipeline(
    "text2text-generation",
    model = model,
    tokenizer = tokenizer
)

The model 'GPT2LMHeadModel' is not supported for text2text-generation. Supported models are ['BigBirdPegasusForConditionalGeneration', 'M2M100ForConditionalGeneration', 'LEDForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'MT5ForConditionalGeneration', 'T5ForConditionalGeneration', 'PegasusForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BartForConditionalGeneration', 'FSMTForConditionalGeneration', 'EncoderDecoderModel', 'XLMProphetNetForConditionalGeneration', 'ProphetNetForConditionalGeneration'].


In [24]:
lm("그런데 나는")

[{'generated_text': '그런데 나는 그 말을 듣고도 아무 대꾸도 하지 않았다. 그 말을 들은 나는 다시 한번 그 말을'}]