# Import

In [1]:
import os
import nltk
import json
from time import gmtime, strftime
# from nltk import sent_tokenize
from tqdm import tqdm
from itertools import chain

import multiprocessing

import torch

from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer, AutoConfig, TrainingArguments, AutoModelForSeq2SeqLM, Trainer, DataCollatorForLanguageModeling
from accelerate import notebook_launcher

from data_collator import DataCollatorForDenoisingTasks

[nltk_data] Downloading package punkt to /home/work/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Dataset

In [2]:
# --- Input data locations -------------------------------------------------
data_path = "../data"
original_train_datasets_path = f"{data_path}/original_datasets/TS1"
original_valid_datasets_path = f"{data_path}/original_datasets/VS1"

# --- Run-specific output / cache locations --------------------------------
current_dir = os.getcwd()
# UTC timestamp, so every run of this cell gets its own output directory.
current_time = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
model_output_dir = f"./bart_{current_time}"
model_cache_dir = "./bart_cache"
raw_datasets_cache_name = ".raw_datasets_cache"
raw_datasets_cache_path = os.path.join(current_dir, raw_datasets_cache_name)

# --- Artifact names shared by the tokenizing / grouping cells --------------
train_sentence_list_file_name = "train_sentence_list.txt"
valid_sentence_list_file_name = "valid_sentence_list.txt"
tokenizer_name = "tokenizer_aihub_news_bart"
# Index 0 is the full tokenized dataset; indices 1-3 are its three shards.
tokenized_datasets_folder_name = ["bart_tokenized_datasets"] + [
    f"bart_tokenized_datasets_{shard}" for shard in (1, 2, 3)
]
grouped_tokenized_datasets_folder_name = "bart_grouped_tokenized_datasets"

# Checkpoint whose config/weights (and, in the commented-out cells, tokenizer)
# serve as the starting point.
old_model_name = "facebook/bart-base"

In [3]:
# CPU count reported by the OS. The (currently commented-out) `.map(...)`
# calls in this notebook pass it as their `num_proc` argument.
num_proc = multiprocessing.cpu_count()

In [4]:
# raw_datasets = load_dataset('text', data_files={"train": os.path.join(data_path, train_sentence_list_file_name), "valid": os.path.join(data_path, valid_sentence_list_file_name)}, cache_dir=raw_datasets_cache_path)

# Tokenizing

### from scratch

In [5]:
# def get_training_corpus(batch_size=10000):
#     for dataset in [raw_datasets['train'], raw_datasets['valid']]:
#         for start_idx in range(0, len(dataset), batch_size):
#             yield dataset[start_idx : start_idx + batch_size]["text"]
        
# old_tokenizer = AutoTokenizer.from_pretrained(old_model_name)
# tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 30000)
# tokenizer.save_pretrained(tokenizer_name)

In [6]:
# raw_datasets_1 = raw_datasets.select(range(20000000))
# raw_datasets_2 = raw_datasets.select(range(20000000, 40000000))
# raw_datasets_3 = raw_datasets.select(range(40000000, len(raw_datasets)))

In [7]:
# def preprocess_texts(examples):
#     tokenized_inputs = tokenizer(
#        examples["text"], return_special_tokens_mask=True, truncation=True, max_length=512
#     )
#     return tokenized_inputs

In [8]:
# tokenized_datasets = raw_datasets.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
# tokenized_datasets.save_to_disk(os.path.join(current_dir, tokenized_datasets_folder_name[0]))

In [9]:
# tokenized_datasets_1 = raw_datasets_1.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
# tokenized_datasets_1.save_to_disk(os.path.join(current_dir, tokenized_datasets_folder_name[1]))

In [10]:
# tokenized_datasets_2 = raw_datasets_2.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
# tokenized_datasets_2.save_to_disk(os.path.join(current_dir, tokenized_datasets_folder_name[2]))

In [11]:
# tokenized_datasets_3 = raw_datasets_3.map(preprocess_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
# tokenized_datasets_3.save_to_disk(os.path.join(current_dir, tokenized_datasets_folder_name[3]))

In [12]:
# tokenized_datasets = concatenate_datasets([tokenized_datasets_1, tokenized_datasets_2, tokenized_datasets_3])
# tokenized_datasets.save_to_disk(os.path.join(current_dir, tokenized_datasets_folder_name[0]))

### load pre-tokenized datasets

In [13]:
# tokenized_datasets_1 = load_from_disk(os.path.join(current_dir, tokenized_datasets_folder_name[1]))
# tokenized_datasets_2 = load_from_disk(os.path.join(current_dir, tokenized_datasets_folder_name[2]))
# tokenized_datasets_3 = load_from_disk(os.path.join(current_dir, tokenized_datasets_folder_name[3]))

In [14]:
# tokenized_datasets = load_from_disk(os.path.join(current_dir, tokenized_datasets_folder_name[0]))

# Grouping

### from scratch

In [15]:
# # Main data processing function that will concatenate all texts from our dataset and generate chunks of
# # max_seq_length.
# model_max_length = 512
# def group_texts(examples):
#     # Concatenate all texts.
#     concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
#     total_length = len(concatenated_examples[list(examples.keys())[0]])
#     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
#     # customize this part to your needs.
#     if total_length >= model_max_length:
#         total_length = (total_length // model_max_length) * model_max_length
#     # Split by chunks of max_len.
#     result = {
#         k: [t[i : i + model_max_length] for i in range(0, total_length, model_max_length)]
#         for k, t in concatenated_examples.items()
#     }
#     return result

In [16]:
# tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=num_proc)
# # shuffle dataset
# tokenized_datasets = tokenized_datasets.shuffle(seed=34)

# print(f"the dataset contains in total {len(tokenized_datasets)*model_max_length} tokens")

In [17]:
# tokenized_datasets.save_to_disk(os.path.join(current_dir, grouped_tokenized_datasets_folder_name))

### load tokenizer and pre-grouped datasets

In [18]:
# Load the tokenizer identified by `tokenizer_name`; the commented-out
# "from scratch" tokenizing cell saves a tokenizer under that same name.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [19]:
# Load the dataset stored at <cwd>/<grouped_tokenized_datasets_folder_name>,
# the location the commented-out grouping cell saves to. The training cell
# reads its 'train' and 'valid' entries.
tokenized_datasets = load_from_disk(os.path.join(current_dir, grouped_tokenized_datasets_folder_name))

# DDP Train

In [20]:
# Configuration of the base checkpoint (`old_model_name`); handed to
# `AutoModelForSeq2SeqLM.from_pretrained(..., config=model_config)` later on.
model_config = AutoConfig.from_pretrained(old_model_name)

In [21]:
# # DDP Train
# Disabled multi-process variant of the training cell, kept for reference.
# Every line must stay commented out together: a single uncommented, indented
# line makes the whole cell fail with "IndentationError: unexpected indent".
# To use it, uncomment the entire cell.
# def train_trainer_ddp():
#     model = AutoModelForSeq2SeqLM.from_pretrained(old_model_name, config=model_config, cache_dir=model_cache_dir)
#     model.resize_token_embeddings(len(tokenizer))

#     training_args = TrainingArguments(
#         output_dir = model_output_dir,
#         logging_dir="runs/"+model_output_dir,
#         do_train = True,
#         do_eval = True,
#         no_cuda = False,
#         per_device_train_batch_size = 28,
#         per_device_eval_batch_size = 28,
#         evaluation_strategy = "steps",
#         eval_steps=1000,
#         save_strategy="steps",
#         save_steps=5000,
#         logging_steps = 100,
#         learning_rate = 5e-5,
#         weight_decay = 0,
#         adam_epsilon = 1e-8,
#         max_grad_norm = 1.0,
#         num_train_epochs = 10,
#         disable_tqdm=False,  # boolean, not the (truthy) string "false"
#         report_to="tensorboard"
#     )

#     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         data_collator=data_collator,
#         train_dataset=tokenized_datasets['train'],
#         eval_dataset=tokenized_datasets['valid'].select(range(100000))
#     )

#     trainer.train()

# notebook_launcher(train_trainer_ddp, args=(), num_processes=4)

In [22]:
# Single-process training run: start from the base checkpoint and train it
# on the grouped, tokenized dataset.
model = AutoModelForSeq2SeqLM.from_pretrained(old_model_name, config=model_config, cache_dir=model_cache_dir)
# Resize the embedding matrix to the size of the loaded tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir = model_output_dir,
    logging_dir="runs/"+model_output_dir,
    do_train = True,
    do_eval = True,
    no_cuda = False,
    per_device_train_batch_size = 28,
    per_device_eval_batch_size = 28,
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=5000,
    logging_steps = 100,
    learning_rate = 5e-5,
    weight_decay = 0,
    adam_epsilon = 1e-8,
    max_grad_norm = 1.0,
    num_train_epochs = 10,
    # Must be a real boolean: the string "false" is non-empty and therefore
    # truthy, which switched the progress bars OFF instead of leaving them on.
    disable_tqdm=False,
    report_to="tensorboard"
)

# NOTE(review): `DataCollatorForDenoisingTasks` is imported at the top of the
# notebook but never used; this cell relies on the generic language-modeling
# collator with its default settings. Confirm which collator is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

# Evaluate on at most the first 100000 validation examples. Clamping to the
# split length keeps `.select` from raising when the split is smaller.
eval_dataset = tokenized_datasets['valid']
eval_subset_size = min(100000, len(eval_dataset))

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=eval_dataset.select(range(eval_subset_size))
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: