# Import

In [None]:
import os
import nltk
import json
from time import gmtime, strftime
# from nltk import sent_tokenize
from tqdm import tqdm
from itertools import chain

import multiprocessing
import parmap

from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer, AutoConfig, TrainingArguments, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer
from accelerate import notebook_launcher

# Load Dataset

In [None]:
# --- Filesystem layout and artifact names shared by the cells below ---
data_path = "./data"
model_cache_dir = "./gpt2_cache"

# UTC timestamp appended to the output directory so each run gets its own folder.
current_time = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
# NOTE(review): the prefix says "BERT" although the model trained below is GPT-2.
model_output_dir = f"./BERT_{current_time}"

# Original train/valid dataset folders and the sentence-list text files.
original_train_datasets_path = f"{data_path}/original_datasets/TS1"
original_valid_datasets_path = f"{data_path}/original_datasets/VS1"
train_sentence_list_file_name = "train_sentence_list.txt"
valid_sentence_list_file_name = "valid_sentence_list.txt"

# Cache directory handed to `load_dataset`.
# raw_datasets_folder_name = "raw_datasets"
# raw_datasets_path = os.path.join(data_path, raw_datasets_folder_name)
raw_datasets_cache_name = ".raw_datasets_cache"
raw_datasets_cache_path = os.path.join(data_path, raw_datasets_cache_name)

# Tokenizer save location and tokenized-dataset folders:
# index 0 is the concatenated dataset, indices 1-3 are the individual shards.
tokenizer_name = "tokenizer_aihub_news_gpt2"
tokenized_datasets_folder_name = ["gpt2_tokenized_datasets"] + [
    f"gpt2_tokenized_datasets_{shard}" for shard in (1, 2, 3)
]
grouped_tokenized_datasets_folder_name = "gpt2_grouped_tokenized_datasets"

In [None]:
# Load the two sentence-list text files as the "train" and "valid" splits.
# The rows are read from the "text" column by the cells below.
raw_datasets = load_dataset(
    "text",
    data_files={
        "train": os.path.join(data_path, train_sentence_list_file_name),
        "valid": os.path.join(data_path, valid_sentence_list_file_name),
    },
    cache_dir=raw_datasets_cache_path,
)

# Tokenizing

### from scratch

In [None]:
# Train a new tokenizer from scratch on the training corpus, starting from the
# pretrained GPT-2 tokenizer.
def batch_iterator(batch_size=10000):
    """Yield the training corpus as successive lists of raw text rows.

    Args:
        batch_size: Maximum number of rows in each yielded batch.

    Yields:
        A list of strings taken from the "text" column of the "train" split.
    """
    # `raw_datasets` holds several splits: its len() is the number of splits and
    # it cannot be sliced by row, so iterate over the "train" split explicitly.
    train_split = raw_datasets["train"]
    for start in tqdm(range(0, len(train_split), batch_size)):
        yield train_split[start:start + batch_size]["text"]


old_tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer = old_tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=51200)
# Save the trained tokenizer so it can be reloaded without retraining.
tokenizer.save_pretrained(tokenizer_name)

In [None]:
# Split the training corpus into three shards that are tokenized and saved
# separately. `select` is a per-split operation, so it must be called on the
# "train" split rather than on the multi-split `raw_datasets` object.
# NOTE(review): the fixed boundaries assume the train split has more than
# 40,000,000 rows; the "valid" split is not sharded here.
train_split = raw_datasets["train"]
raw_datasets_1 = train_split.select(range(20000000))
raw_datasets_2 = train_split.select(range(20000000, 40000000))
raw_datasets_3 = train_split.select(range(40000000, len(train_split)))

In [None]:
def preprocess_texts(examples):
    """Tokenize a batch of raw text rows with the notebook-level `tokenizer`.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, requested with the special-tokens
        mask and with truncation at `tokenizer.model_max_length`.
    """
    encode_options = dict(
        return_special_tokens_mask=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )
    return tokenizer(examples["text"], **encode_options)

In [None]:
# Worker-process count for the tokenization `map` calls below: one per CPU
# reported by `multiprocessing.cpu_count()`.
num_proc = multiprocessing.cpu_count()

In [None]:
# Tokenize shard 1 in batches across `num_proc` workers, drop the raw "text"
# column, and save the result to disk.
tokenized_datasets_1 = raw_datasets_1.map(
    preprocess_texts,
    batched=True,
    remove_columns=["text"],
    num_proc=num_proc,
)
tokenized_datasets_1.save_to_disk(
    os.path.join(data_path, tokenized_datasets_folder_name[1])
)

In [None]:
# Tokenize shard 2 in batches across `num_proc` workers, drop the raw "text"
# column, and save the result to disk.
tokenized_datasets_2 = raw_datasets_2.map(
    preprocess_texts,
    batched=True,
    remove_columns=["text"],
    num_proc=num_proc,
)
tokenized_datasets_2.save_to_disk(
    os.path.join(data_path, tokenized_datasets_folder_name[2])
)

In [None]:
# Tokenize shard 3 in batches across `num_proc` workers, drop the raw "text"
# column, and save the result to disk.
tokenized_datasets_3 = raw_datasets_3.map(
    preprocess_texts,
    batched=True,
    remove_columns=["text"],
    num_proc=num_proc,
)
tokenized_datasets_3.save_to_disk(
    os.path.join(data_path, tokenized_datasets_folder_name[3])
)

In [None]:
# Merge the three tokenized shards back into one dataset, in shard order, and
# save it. Each shard must appear exactly once: listing shard 1 twice would
# duplicate its rows and silently drop shard 3.
tokenized_datasets = concatenate_datasets(
    [tokenized_datasets_1, tokenized_datasets_2, tokenized_datasets_3]
)
tokenized_datasets.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[0]))

### load pretrained

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [None]:
# tokenized_datasets_1 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[1]))
# tokenized_datasets_2 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[2]))
# tokenized_datasets_3 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[3]))

In [None]:
# tokenized_datasets = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[0]))

# Grouping

### from scratch

In [None]:
# Main data processing function: concatenates all token sequences of a batch
# and re-splits them into fixed-size chunks.
def group_texts(examples, block_size=None):
    """Concatenate every column of a batch and cut it into equal-length chunks.

    Args:
        examples: Batch mapping of column name -> list of sequences (for
            example "input_ids" -> list of token-id lists). All columns are
            expected to have the same total length once concatenated.
        block_size: Length of each output chunk. Defaults to
            `tokenizer.model_max_length`, read from the notebook-level
            tokenizer only when no explicit value is given.

    Returns:
        A dict with the same keys as `examples`, each mapping to a list of
        chunks of `block_size` items. If the batch holds at least one full
        block, the trailing partial block is dropped. If it holds less than
        one block, the whole batch is kept as a single shorter chunk. An empty
        batch yields an empty dict.

    Raises:
        ValueError: If `block_size` is not a positive number.
    """
    if block_size is None:
        block_size = tokenizer.model_max_length
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Flatten each column into one long sequence.
    concatenated_examples = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    if not concatenated_examples:
        return {}

    # The first column's length drives the chunking for all columns.
    total_length = len(next(iter(concatenated_examples.values())))
    # Round down to a whole number of blocks; a batch shorter than one block is
    # left untouched so that its tokens are not discarded entirely.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    return {
        key: [
            sequence[start:start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for key, sequence in concatenated_examples.items()
    }

In [None]:
# Regroup the tokenized rows into fixed-size blocks with 16 worker processes,
# then shuffle with a fixed seed so the order is reproducible.
tokenized_datasets = (
    tokenized_datasets
    .map(group_texts, batched=True, num_proc=16)
    .shuffle(seed=34)
)

# Token count estimate: number of rows times the block length.
print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")

In [None]:
# Save the grouped and shuffled dataset so the grouping step can be skipped
# on later runs.
tokenized_datasets.save_to_disk(
    os.path.join(data_path, grouped_tokenized_datasets_folder_name)
)

### load pre-grouped

In [None]:
# tokenized_datasets = load_from_disk(os.path.join(data_path, grouped_tokenized_datasets_folder_name))

In [None]:
# Bare expression: shows the dataset summary as the notebook cell output.
tokenized_datasets

# DDP Train

In [None]:
# Configuration of the pretrained "gpt2" checkpoint; passed to the model
# constructor inside train_trainer_ddp.
model_config = AutoConfig.from_pretrained('gpt2')

In [None]:
# DDP Train
def train_trainer_ddp():
    """Fine-tune the pretrained GPT-2 model as a causal LM on the grouped dataset.

    Takes no arguments and returns nothing. It reads `model_config`,
    `model_cache_dir`, `model_output_dir`, `tokenizer` and `tokenized_datasets`
    from notebook scope, so those cells must have been run first.
    `model_output_dir` is used as the output directory and `runs/` plus that
    name as the logging directory.
    """
    model = AutoModelForCausalLM.from_pretrained('gpt2', config=model_config, cache_dir=model_cache_dir)
    # The tokenizer was retrained with its own vocabulary, so the embedding
    # matrix has to match the new vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    training_args = TrainingArguments(
        output_dir=model_output_dir,
        logging_dir="runs/" + model_output_dir,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=48,
        per_device_eval_batch_size=48,
        evaluation_strategy="steps",
        eval_steps=1000,
        save_strategy="steps",
        save_steps=5000,
        logging_steps=100,
        learning_rate=5e-5,
        weight_decay=0,
        adam_epsilon=1e-8,
        max_grad_norm=1.0,
        num_train_epochs=2,
        # Must be a real boolean: the string "false" is truthy and would turn
        # the progress bars off.
        disable_tqdm=False,
    )

    # mlm=False selects causal (next-token) language modeling.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Evaluate on at most the first 100000 validation rows. `select` keeps the
    # result a dataset object; slicing with [:100000] would instead return a
    # plain dict of columns, which the Trainer cannot iterate as examples.
    valid_split = tokenized_datasets['valid']
    eval_subset = valid_split.select(range(min(100000, len(valid_split))))

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=eval_subset,
    )

    trainer.train()

# Launch train_trainer_ddp from the notebook in 4 processes.
# NOTE(review): presumably one process per GPU — confirm the machine has 4 devices.
notebook_launcher(train_trainer_ddp, args=(), num_processes=4)