## Prepare the dataset

In [1]:
import os
import nltk
import json
from nltk import sent_tokenize
from tqdm import tqdm

path = "./data/aihub_web/TS1/"
file_list = os.listdir(path)

total_sentence_list = []
for dir_name in file_list:
    file_path = os.path.join(path, dir_name)
    json_file_list = os.listdir(file_path)
    
    for json_file_name in tqdm(json_file_list):
        json_file_path = os.path.join(file_path, json_file_name)
        
        with open(json_file_path, 'r', encoding='UTF-8') as f:
            json_object = json.load(f)

        for line in json_object['SJML']['text']:
            raw_text = line['content'].replace('..', '.')
            raw_text_list = sent_tokenize(raw_text)
            
            for item in raw_text_list:
                if len(item) < 5:
                    continue
                else:
                    total_sentence_list.append(item.strip())
        
    break
# import json

# with open('input.json') as f:
#     json_object = json.load(f)

100%|███████████████████████████████████████████████████████████████████████████| 1217/1217 [01:16<00:00, 15.92it/s]


In [2]:
from datasets import Dataset

raw_datasets = Dataset.from_dict({"text":total_sentence_list})

In [3]:
print(len(raw_datasets))
raw_datasets[1]

2217912


{'text': '위법성 논란부터 불공정 경쟁 시비, 미디어 시장의 특수성 보호 여부 등 쟁점의 폭도 넓어지고 있다.'}

In [4]:
from transformers import AutoTokenizer
import multiprocessing
# from tqdm import tqdm


# training a tokenizer from scratch
def batch_iterator(batch_size=10000):
    for i in tqdm(range(0, len(raw_datasets), batch_size)):
        yield raw_datasets[i:i+batch_size]["text"]
        
old_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = old_tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=15000)
tokenizer.save_pretrained("tokenizer_aihub_news2")

100%|█████████████████████████████████████████████████████████████████████████████| 222/222 [00:11<00:00, 19.32it/s]







('tokenizer_aihub_news2/tokenizer_config.json',
 'tokenizer_aihub_news2/special_tokens_map.json',
 'tokenizer_aihub_news2/vocab.txt',
 'tokenizer_aihub_news2/added_tokens.json',
 'tokenizer_aihub_news2/tokenizer.json')

In [5]:
tokenizer = AutoTokenizer.from_pretrained("tokenizer_aihub_news2")
num_proc = multiprocessing.cpu_count()

def group_texts(examples):
    tokenized_inputs = tokenizer(
       examples["text"], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

# preprocess dataset
tokenized_datasets = raw_datasets.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
tokenized_datasets.features

Map (num_proc=64):   0%|          | 0/2217912 [00:00<?, ? examples/s]

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'special_tokens_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [6]:
from itertools import chain

# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= tokenizer.model_max_length:
        total_length = (total_length // tokenizer.model_max_length) * tokenizer.model_max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + tokenizer.model_max_length] for i in range(0, total_length, tokenizer.model_max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=num_proc)
# shuffle dataset
tokenized_datasets = tokenized_datasets.shuffle(seed=34)

print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")

Map (num_proc=64):   0%|          | 0/2217912 [00:00<?, ? examples/s]

the dataset contains in total 68260864 tokens


In [7]:
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained('bert-base-uncased')

In [8]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased', config=model_config, cache_dir="./BERT_cache")
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(15000, 768)

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir = "./BERT",
    overwrite_output_dir = True,
    do_train = True,
    do_eval = True,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    logging_strategy = "steps",
    logging_steps = 1,
    prediction_loss_only = True,
    learning_rate = 5e-5,
    weight_decay = 0,
    adam_epsilon = 1e-8,
    max_grad_norm = 1.0,
    num_train_epochs = 1,
    save_steps = -1
)

In [10]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=True, 
                                                mlm_probability=0.15,)

In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,10.7012
2,10.2319
3,10.0193
4,9.7796
5,9.7226
6,9.7015
7,9.6315
8,9.5956
9,9.5528
10,9.4873
