In [None]:
import os

import datasets

# Environment tweaks that quiet noisy libraries later in the notebook.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",  # silences the tokenizers warning
    "TF_CPP_MIN_LOG_LEVEL": "2",        # reduces the warnings seen around DataCollatorForLanguageModeling
})

# Load the already-tokenized dataset saved under ../4_str_to_token and show its splits.
tokenized_datasets = datasets.load_from_disk('../4_str_to_token/tokenized_datasets')
print(tokenized_datasets)

In [None]:
from transformers import LlamaConfig, LlamaForCausalLM
from transformers import PreTrainedTokenizerFast
from transformers import set_seed

# LlamaForCausalLM(config) draws random initial weights. Trainer only seeds the RNGs
# when it is constructed (after this cell), so seed here to make the starting weights
# identical across "Restart & Run All" runs.
set_seed(42)  # same value as the TrainingArguments default seed

# Build an untrained model from the architecture described in configs/<model type>.json.
my_model_type="llama_60m"
config = LlamaConfig.from_json_file(f"configs/{my_model_type}.json")
model = LlamaForCausalLM(config)

# Wrap the tokenizer JSON found in the sentencepiece folder as a fast tokenizer.
spm_folder = "../3_make_tokenizer/sentencepiece/"
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file = spm_folder + "spm_tokenizer.json",
)
# Sanity check: ids produced for the "[NL]" token text and the base vocabulary size.
print(tokenizer.encode("[NL]"),tokenizer.vocab_size)

In [None]:
import torch

# Convert every parameter of the model to bfloat16.
model = model.to(dtype=torch.bfloat16)

# Report the total number of parameters (in millions) followed by the full config.
model_size = sum(param.numel() for param in model.parameters())
print(f"{model_size / 1_000_000:.1f}M parameters")
print(config)

In [None]:
from transformers import DataCollatorForLanguageModeling

# The collator pads every batch, so the tokenizer needs a pad token registered.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# If '[PAD]' was not already in the vocabulary it receives a new id at the end of the
# tokenizer, which may lie outside the model's embedding table. Padded input ids are
# still looked up in the embeddings, so grow the table when the tokenizer is larger.
# This is a no-op when the model already covers every tokenizer id.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import TrainingArguments

# Options noted for reference but not enabled in this run:
#   num_train_epochs=1, gradient_checkpointing=True,
#   activation_checkpointing=True, logging_dir=<out_dir>/run

training_args = TrainingArguments(
    # --- output ---
    output_dir=f"model_{my_model_type}",
    push_to_hub=False,
    # NOTE(review): the Transformers docs describe this field as not being read by
    # Trainer itself; resuming presumably also needs
    # trainer.train(resume_from_checkpoint=...) - confirm.
    resume_from_checkpoint=True,

    # --- schedule (all counts are optimizer steps) ---
    max_steps=5000,
    warmup_steps=300,
    lr_scheduler_type="cosine",
    learning_rate=1e-4,

    # --- evaluation / checkpointing ---
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=5000,  # equal to max_steps

    # --- optimizer ---
    optim="adamw_torch",
    weight_decay=0.1,
    adam_beta2=0.95,

    # --- batch size: 2 x 32 accumulation steps = 64 sequences per device per update ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=32,

    # --- precision / compilation ---
    bf16=True,
    bf16_full_eval=True,
    torch_compile=True,

    # --- logging: report to TensorBoard at every step ---
    report_to="tensorboard",
    logging_strategy="steps",
    logging_steps=1,
)

In [None]:
from transformers import Trainer

# Bring together the model, the run configuration and the data for training.
trainer = Trainer(
    model=model,
    args=training_args,
    # Splits taken from the dataset loaded at the top of the notebook.
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    # Batching is delegated to the causal-LM collator; the tokenizer is passed along too.
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Setting resume_from_checkpoint on TrainingArguments alone does not make Trainer
# resume; the checkpoint has to be handed to train() explicitly.
# Look for the newest checkpoint-* folder in the output directory and continue from it.
# When the directory or a checkpoint does not exist yet, None is passed and training
# starts from scratch.
# Note: if the newest checkpoint is already at max_steps, no further steps are run.
checkpoint_root = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
tensorboard --logdir ./model_llama_60m/runs