# Import

In [2]:
import os
import nltk
import json
from nltk import sent_tokenize
from tqdm import tqdm
from itertools import chain

import multiprocessing
import parmap


from datasets import Dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer, AutoConfig, TrainingArguments, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer

import pickle



# Load Dataset

In [21]:
data_path = "./data"
model_cache_dir = "./BERT_cache"
model_output_dir = './BERT'
original_datasets_path = data_path + "/original_datasets"
total_sentence_list_file_name = "total_sentence_list.pickle"
raw_datasets_folder_name = "raw_datasets"
tokenizer_name = "tokenizer_aihub_news"
tokenized_datasets_folder_name = ["tokenized_datasets", "tokenized_datasets_1", "tokenized_datasets_2", "tokenized_datasets_3"]
grouped_tokenized_datasets_folder_name = "grouped_tokenized_datasets"

In [3]:
def listize_dataset(json_file_path):    
    sentence_list = []

    with open(json_file_path, 'r', encoding='UTF-8') as f:
        json_object = json.load(f)

    for line in json_object['SJML']['text']:
        raw_text = line['content'].replace('..', '.')
        raw_text_list = sent_tokenize(raw_text)
        
        for item in raw_text_list:
            if len(item) < 5:
                continue
            else:
                sentence_list.append(item.strip())

    return sentence_list

In [4]:
num_proc = multiprocessing.cpu_count()

In [5]:
file_path_list = []
upper_folder_list = os.listdir(original_datasets_path)
for upper_dir_name in upper_folder_list:
    file_list = os.listdir(os.path.join(original_datasets_path, upper_dir_name))
    for dir_name in file_list:
        file_path = os.path.join(original_datasets_path, upper_dir_name, dir_name)

        json_file_list = os.listdir(file_path)
        for json_file_name in json_file_list:
            json_file_path = os.path.join(file_path, json_file_name)

            file_path_list.append(json_file_path)

sentence_list_of_list = parmap.map(listize_dataset, file_path_list, pm_pbar=True, pm_processes=num_proc)

  0%|          | 0/58997 [00:00<?, ?it/s]

In [6]:
total_sentence_list = list(chain(*sentence_list_of_list))

In [9]:
with open(os.path.join(data_path, total_sentence_list_file_name), 'wb') as fw:
    pickle.dump(total_sentence_list, fw)

In [14]:
with open(os.path.join(data_path, total_sentence_list_file_name), 'rb') as f:
    total_sentence_list = pickle.load(f)

In [10]:
raw_datasets = Dataset.from_dict({"text":total_sentence_list})

In [11]:
print(len(raw_datasets))
raw_datasets[10]

60011706


{'text': '여러 통신사와 복수계약을 맺고 있는 대형 유통점 혹은 판매점들은 그나마 사정이 나은 편.'}

In [15]:
raw_datasets.save_to_disk(os.path.join(data_path, raw_datasets_folder_name))

Saving the dataset (0/24 shards):   0%|          | 0/60011706 [00:00<?, ? examples/s]

In [16]:
raw_datasets = load_from_disk(os.path.join(data_path, raw_datasets_folder_name))

# Tokenizing

In [30]:
# # training a tokenizer from scratch
# def batch_iterator(batch_size=10000):
#     for i in tqdm(range(0, len(raw_datasets), batch_size)):
#         yield raw_datasets[i:i+batch_size]["text"]
        
# old_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = old_tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=15000)
# tokenizer.save_pretrained(tokenizer_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  3%|▎         | 159/6002 [03:01<2:29:25,  1.53s/it]

In [18]:
raw_datasets_1 = raw_datasets.select(range(20000000))
raw_datasets_2 = raw_datasets.select(range(20000000, 40000000))
raw_datasets_3 = raw_datasets.select(range(40000000, len(raw_datasets)))

In [19]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [20]:
def preprocess_texts(examples):
    tokenized_inputs = tokenizer(
       examples["text"], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

In [None]:
tokenized_datasets_1 = raw_datasets_1.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
tokenized_datasets_1.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[1]))

In [None]:
tokenized_datasets_2 = raw_datasets_2.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
tokenized_datasets_2.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[2]))

In [None]:
tokenized_datasets_3 = raw_datasets_3.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
tokenized_datasets_3.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[3]))

In [22]:
tokenized_datasets_1 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[1]))

In [23]:
tokenized_datasets_2 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[2]))

In [24]:
tokenized_datasets_3 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[3]))

In [25]:
tokenized_datasets = concatenate_datasets([tokenized_datasets_1, tokenized_datasets_2, tokenized_datasets_1])

In [26]:
tokenized_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 60000000
})

In [None]:
tokenized_datasets.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[0]))

In [None]:
tokenized_datasets = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[0]))

# Grouping

In [None]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= tokenizer.model_max_length:
        total_length = (total_length // tokenizer.model_max_length) * tokenizer.model_max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + tokenizer.model_max_length] for i in range(0, total_length, tokenizer.model_max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

NameError: name 'tokenized_datasets' is not defined

In [None]:
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=16)
# shuffle dataset
tokenized_datasets = tokenized_datasets.shuffle(seed=34)

print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")

In [None]:
tokenized_datasets.save_to_disk(os.path.join(data_path, grouped_tokenized_datasets_folder_name))

In [None]:
tokenized_datasets = load_from_disk(os.path.join(data_path, grouped_tokenized_datasets_folder_name))

# Prepare Training

In [None]:
model_config = AutoConfig.from_pretrained('bert-base-uncased')

In [None]:
model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased', config=model_config, cache_dir=model_cache_dir)
model.resize_token_embeddings(len(tokenizer))

In [None]:
training_args = TrainingArguments(
    output_dir = model_output_dir,
    overwrite_output_dir = True,
    do_train = True,
    do_eval = True,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    logging_steps = 50,
    prediction_loss_only = True,
    learning_rate = 5e-5,
    weight_decay = 0,
    adam_epsilon = 1e-8,
    max_grad_norm = 1.0,
    num_train_epochs = 1,
    save_steps = -1
)

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=True, 
                                                mlm_probability=0.15,)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Train

In [10]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,12.178
2,9.0122
3,8.871
4,10.2788
5,8.6166
6,8.4548
7,8.3969
8,8.2701
9,8.145
10,8.0277
