# Import

In [1]:
import os
import nltk
import json
from time import gmtime, strftime
from nltk import sent_tokenize
from tqdm import tqdm
from itertools import chain

import multiprocessing
import parmap


from datasets import Dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer, AutoConfig, TrainingArguments, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer
from accelerate import notebook_launcher

import pickle



# Load Dataset

In [None]:
# --- Storage roots -----------------------------------------------------------
data_path = "./data"
model_cache_dir = "./BERT_cache"

# Directories scanned for the raw train / validation JSON files.
original_train_datasets_path = f"{data_path}/original_datasets/TS1"
original_valid_datasets_path = f"{data_path}/original_datasets/VS1"

# --- Per-run output ----------------------------------------------------------
# The output directory is stamped with the UTC time at which this cell ran.
current_time = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
model_output_dir = f"./BERT_{current_time}"

# --- Artifact names (all resolved relative to data_path by the cells below) --
train_sentence_list_file_name = "train_sentence_list.pickle"
valid_sentence_list_file_name = "valid_sentence_list.pickle"
raw_datasets_folder_name = "raw_datasets"
tokenizer_name = "tokenizer_aihub_news"
# Index 0 is used for the concatenated tokenized dataset, indices 1-3 for the
# three partial ones (see the commented-out tokenizing cells).
tokenized_datasets_folder_name = ["tokenized_datasets"] + [
    f"tokenized_datasets_{part}" for part in (1, 2, 3)
]
grouped_tokenized_datasets_folder_name = "grouped_tokenized_datasets"

In [5]:
def listize_dataset(json_file_path, min_sentence_length=5):
    """Read one corpus JSON file and return its text as a flat list of sentences.

    Each entry of ``json_object['SJML']['text']`` contributes its ``'content'``
    string, which is split into sentences with ``sent_tokenize``.

    Args:
        json_file_path: Path to a UTF-8 encoded JSON file with the layout above.
        min_sentence_length: Sentences shorter than this many characters
            (measured after stripping surrounding whitespace) are dropped.
            Defaults to 5.

    Returns:
        list[str]: Stripped sentences in document order.

    Raises:
        KeyError: If the file lacks the ``'SJML'`` / ``'text'`` / ``'content'`` keys.
    """
    with open(json_file_path, 'r', encoding='UTF-8') as f:
        json_object = json.load(f)

    sentence_list = []
    for line in json_object['SJML']['text']:
        # Collapse doubled periods before sentence splitting. str.replace makes a
        # single left-to-right pass, so a run of three dots still becomes two.
        raw_text = line['content'].replace('..', '.')

        for item in sent_tokenize(raw_text):
            # Strip BEFORE measuring: the stored value is the stripped sentence,
            # so the length filter must apply to that same value. Otherwise a
            # whitespace-padded fragment could pass the filter.
            sentence = item.strip()
            if len(sentence) >= min_sentence_length:
                sentence_list.append(sentence)

    return sentence_list

In [6]:
# Number of worker processes for the parallel steps below: one per CPU reported by the OS.
num_proc = multiprocessing.cpu_count()

In [7]:
# Collect every file found under <train root>/<sub-directory>/.
# Both directory listings are sorted because os.listdir returns entries in
# arbitrary order; sorting keeps the sentence order (and therefore any later
# index-based split of the dataset) reproducible across runs and machines.
file_path_list = []
for dir_name in sorted(os.listdir(original_train_datasets_path)):
    dir_path = os.path.join(original_train_datasets_path, dir_name)
    if not os.path.isdir(dir_path):
        # A stray top-level file would make the nested os.listdir raise.
        continue
    file_path_list.extend(
        os.path.join(dir_path, json_file_name)
        for json_file_name in sorted(os.listdir(dir_path))
    )

# One list of sentences per input file, computed in parallel with a progress bar.
train_sentence_list_of_list = parmap.map(listize_dataset, file_path_list, pm_pbar=True, pm_processes=num_proc)

  0%|          | 0/51830 [00:00<?, ?it/s]

In [8]:
# Collect every file found under <validation root>/<sub-directory>/.
# Both directory listings are sorted because os.listdir returns entries in
# arbitrary order; sorting keeps the resulting sentence order reproducible.
file_path_list = []
for dir_name in sorted(os.listdir(original_valid_datasets_path)):
    dir_path = os.path.join(original_valid_datasets_path, dir_name)
    if not os.path.isdir(dir_path):
        # A stray top-level file would make the nested os.listdir raise.
        continue
    file_path_list.extend(
        os.path.join(dir_path, json_file_name)
        for json_file_name in sorted(os.listdir(dir_path))
    )

# One list of sentences per input file, computed in parallel with a progress bar.
valid_sentence_list_of_list = parmap.map(listize_dataset, file_path_list, pm_pbar=True, pm_processes=num_proc)

  0%|          | 0/7167 [00:00<?, ?it/s]

In [6]:
# Flatten the per-file sentence lists into a single list per split, keeping order.
train_sentence_list = list(chain.from_iterable(train_sentence_list_of_list))
valid_sentence_list = list(chain.from_iterable(valid_sentence_list_of_list))

In [7]:
# with open(os.path.join(data_path, train_sentence_list_file_name), 'wb') as fw:
#     pickle.dump(train_sentence_list, fw)

# with open(os.path.join(data_path, valid_sentence_list_file_name), 'wb') as fw:
#     pickle.dump(valid_sentence_list, fw)

In [8]:
# with open(os.path.join(data_path, train_sentence_list_file_name), 'rb') as f:
#     train_sentence_list = pickle.load(f)

# with open(os.path.join(data_path, valid_sentence_list_file_name), 'rb') as f:
#     valid_sentence_list = pickle.load(f)

In [9]:
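# NOTE(review): `total_sentence_list` is not defined anywhere in this notebook; presumably it was
# train_sentence_list (possibly plus valid_sentence_list) -- confirm before re-enabling this cell.
# raw_datasets = Dataset.from_dict({"text":total_sentence_list})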

In [10]:
# print(len(raw_datasets))
# raw_datasets[10]

In [11]:
# raw_datasets.save_to_disk(os.path.join(data_path, raw_datasets_folder_name))

In [12]:
# raw_datasets = load_from_disk(os.path.join(data_path, raw_datasets_folder_name))

# Tokenizing

In [13]:
# # training a tokenizer from scratch
# def batch_iterator(batch_size=10000):
#     for i in tqdm(range(0, len(raw_datasets), batch_size)):
#         yield raw_datasets[i:i+batch_size]["text"]
        
# old_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = old_tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=15000)
# tokenizer.save_pretrained(tokenizer_name)

In [14]:
# raw_datasets_1 = raw_datasets.select(range(20000000))
# raw_datasets_2 = raw_datasets.select(range(20000000, 40000000))
# raw_datasets_3 = raw_datasets.select(range(40000000, len(raw_datasets)))

In [15]:
# Load the tokenizer identified by `tokenizer_name` -- the same name the commented-out
# tokenizer-training cell above passes to save_pretrained.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [16]:
# def preprocess_texts(examples):
#     tokenized_inputs = tokenizer(
#        examples["text"], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
#     )
#     return tokenized_inputs

In [17]:
# tokenized_datasets_1 = raw_datasets_1.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
# tokenized_datasets_1.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[1]))

In [18]:
# tokenized_datasets_2 = raw_datasets_2.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
# tokenized_datasets_2.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[2]))

In [19]:
# tokenized_datasets_3 = raw_datasets_3.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
# tokenized_datasets_3.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[3]))

In [20]:
# tokenized_datasets_1 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[1]))

In [21]:
# tokenized_datasets_2 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[2]))

In [22]:
# tokenized_datasets_3 = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[3]))

In [23]:
# tokenized_datasets = concatenate_datasets([tokenized_datasets_1, tokenized_datasets_2, tokenized_datasets_3])

In [24]:
# tokenized_datasets

In [25]:
# tokenized_datasets.save_to_disk(os.path.join(data_path, tokenized_datasets_folder_name[0]))

In [26]:
# tokenized_datasets = load_from_disk(os.path.join(data_path, tokenized_datasets_folder_name[0]))

# Grouping

In [27]:
# # Main data processing function that will concatenate all texts from our dataset and generate chunks of
# # max_seq_length.
# def group_texts(examples):
#     # Concatenate all texts.
#     concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
#     total_length = len(concatenated_examples[list(examples.keys())[0]])
#     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
#     # customize this part to your needs.
#     if total_length >= tokenizer.model_max_length:
#         total_length = (total_length // tokenizer.model_max_length) * tokenizer.model_max_length
#     # Split by chunks of max_len.
#     result = {
#         k: [t[i : i + tokenizer.model_max_length] for i in range(0, total_length, tokenizer.model_max_length)]
#         for k, t in concatenated_examples.items()
#     }
#     return result

In [28]:
# tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=16)
# # shuffle dataset
# tokenized_datasets = tokenized_datasets.shuffle(seed=34)

# print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")

In [29]:
# tokenized_datasets.save_to_disk(os.path.join(data_path, grouped_tokenized_datasets_folder_name))

In [30]:
# Load the grouped dataset from <data_path>/<grouped_tokenized_datasets_folder_name>.
# The cells that would build and save it are commented out above, so this directory
# must already exist on disk.
tokenized_datasets = load_from_disk(os.path.join(data_path, grouped_tokenized_datasets_folder_name))

# DDP Train

In [31]:
# Configuration of the 'bert-base-uncased' checkpoint; passed to from_pretrained
# inside train_trainer_ddp below.
model_config = AutoConfig.from_pretrained('bert-base-uncased')

In [32]:
# DDP Train
def train_trainer_ddp(eval_dataset=None):
    """Train a BERT masked-language model on ``tokenized_datasets``.

    Intended to be passed to ``notebook_launcher``, which runs it in each
    worker process. Reads the module-level ``model_config``, ``tokenizer``,
    ``tokenized_datasets``, ``model_output_dir`` and ``model_cache_dir``.

    Args:
        eval_dataset: Optional dataset evaluated at the end of every epoch.
            When omitted (the default) evaluation is switched off, because the
            Trainer cannot evaluate without an evaluation dataset.
    """
    model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased', config=model_config, cache_dir=model_cache_dir)
    # NOTE(review): the weights come from 'bert-base-uncased' while the tokenizer
    # is loaded separately; if its vocabulary differs, the resized embedding rows
    # presumably no longer match their original tokens -- confirm this is intended.
    model.resize_token_embeddings(len(tokenizer))

    # Only request evaluation when there is something to evaluate on.
    run_eval = eval_dataset is not None

    training_args = TrainingArguments(
        output_dir = model_output_dir,
        logging_dir="runs/"+model_output_dir,
        do_train = True,
        do_eval = run_eval,
        per_device_train_batch_size = 48,
        per_device_eval_batch_size = 48,
        evaluation_strategy = "epoch" if run_eval else "no",
        save_strategy="steps",
        save_steps=5000,
        logging_steps = 50,
        prediction_loss_only = True,
        learning_rate = 5e-5,
        weight_decay = 0,
        adam_epsilon = 1e-8,
        max_grad_norm = 1.0,
        num_train_epochs = 2,
        # Must be a real boolean: the string "false" is truthy and would
        # disable the progress bars instead of enabling them.
        disable_tqdm=False,
    )

    # Dynamic masking: 15% of the tokens are selected for the MLM objective per batch.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15,)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_datasets,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    # Checkpoints are only written every `save_steps`; persist the final weights
    # too so the steps after the last checkpoint are not lost.
    trainer.save_model()

# Launch train_trainer_ddp (called with no arguments) in 4 processes from inside the notebook.
notebook_launcher(train_trainer_ddp, args=(), num_processes=4)

Launching training on 4 GPUs.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model

{'loss': 8.7211, 'learning_rate': 4.994944592736391e-05, 'epoch': 0.0}
{'loss': 6.7045, 'learning_rate': 4.989889185472782e-05, 'epoch': 0.0}
{'loss': 6.1564, 'learning_rate': 4.984833778209173e-05, 'epoch': 0.01}
{'loss': 5.9144, 'learning_rate': 4.979778370945564e-05, 'epoch': 0.01}
{'loss': 5.7327, 'learning_rate': 4.974722963681954e-05, 'epoch': 0.01}
{'loss': 5.5685, 'learning_rate': 4.969667556418345e-05, 'epoch': 0.01}
{'loss': 5.4161, 'learning_rate': 4.964612149154736e-05, 'epoch': 0.01}
{'loss': 5.2591, 'learning_rate': 4.959556741891127e-05, 'epoch': 0.02}
{'loss': 5.0988, 'learning_rate': 4.9545013346275176e-05, 'epoch': 0.02}
{'loss': 4.9543, 'learning_rate': 4.949445927363909e-05, 'epoch': 0.02}
{'loss': 4.8139, 'learning_rate': 4.9443905201002995e-05, 'epoch': 0.02}
{'loss': 4.697, 'learning_rate': 4.939335112836691e-05, 'epoch': 0.02}
{'loss': 4.5746, 'learning_rate': 4.9342797055730814e-05, 'epoch': 0.03}
{'loss': 4.4999, 'learning_rate': 4.929224298309472e-05, 'epoch'

KeyboardInterrupt: 

{'loss': 2.2576, 'learning_rate': 4.049583434441479e-05, 'epoch': 0.38}
{'loss': 2.2298, 'learning_rate': 4.0445280271778696e-05, 'epoch': 0.38}
{'loss': 2.2318, 'learning_rate': 4.039472619914261e-05, 'epoch': 0.38}
{'loss': 2.2288, 'learning_rate': 4.0344172126506514e-05, 'epoch': 0.39}
{'loss': 2.2412, 'learning_rate': 4.029361805387043e-05, 'epoch': 0.39}
{'loss': 2.2214, 'learning_rate': 4.024306398123433e-05, 'epoch': 0.39}
{'loss': 2.2224, 'learning_rate': 4.019250990859824e-05, 'epoch': 0.39}
{'loss': 2.2284, 'learning_rate': 4.0141955835962145e-05, 'epoch': 0.39}
{'loss': 2.2156, 'learning_rate': 4.009140176332605e-05, 'epoch': 0.4}
{'loss': 2.2186, 'learning_rate': 4.0040847690689964e-05, 'epoch': 0.4}
{'loss': 2.2218, 'learning_rate': 3.999029361805387e-05, 'epoch': 0.4}
{'loss': 2.2168, 'learning_rate': 3.993973954541778e-05, 'epoch': 0.4}
{'loss': 2.2098, 'learning_rate': 3.988918547278169e-05, 'epoch': 0.4}
{'loss': 2.2, 'learning_rate': 3.98386314001456e-05, 'epoch': 0.4

# Prepare Training

In [None]:
# model_config = AutoConfig.from_pretrained('bert-base-uncased')

In [None]:
# model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased', config=model_config, cache_dir=model_cache_dir)
# model.resize_token_embeddings(len(tokenizer))

In [None]:
# training_args = TrainingArguments(
#     output_dir = model_output_dir,
#     overwrite_output_dir = True,
#     do_train = True,
#     do_eval = True,
#     per_device_train_batch_size = 32,
#     per_device_eval_batch_size = 32,
#     logging_steps = 50,
#     prediction_loss_only = True,
#     learning_rate = 5e-5,
#     weight_decay = 0,
#     adam_epsilon = 1e-8,
#     max_grad_norm = 1.0,
#     num_train_epochs = 2,
#     save_steps = -1
# )

In [None]:
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
#                                                 mlm=True, 
#                                                 mlm_probability=0.15,)

In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=tokenized_datasets,
# )

In [None]:
# trainer.train()