In [21]:
# https://huggingface.co/docs/trl/sft_trainer

In [22]:
import torch
import random
import torch.backends.cudnn as cudnn

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
cudnn.benchmark = False
cudnn.deterministic = True
random.seed(0)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

import os
# os.environ['CUDA_VISIBLE_DEVICES']="0"

In [23]:
max_length = 512
MODEL = "EleutherAI/polyglot-ko-1.3b"
MODEL = os.path.join('model/further_train', "checkpoint-last")
# MODEL = "facebook/xglm-564M"
# MODEL = "facebook/xglm-1.7B"
# MODEL = "ai-forever/mGPT"
# MODEL = "maywell/Synatra-42dot-1.3B"

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
# model = model.half()

tokenizer = AutoTokenizer.from_pretrained(MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
tokenizer.model_max_length = max_length
tokenizer.max_len = max_length

In [25]:
# from datasets import load_dataset
# dataset = load_dataset("csv", data_files="./data/preproc_train.csv", split="train").
# dataset['question']

In [26]:
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

raw_datasets = load_dataset("csv", data_files="./data/add_data/preproc_dictionary.csv", split="train")

In [27]:
def tokenize(element):
    outputs = tokenizer(
        element['text']
    )
    
    input_batch = []
    for input_ids in outputs['input_ids']:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(tokenize, batched=True)
tokenized_datasets

Map:   0%|          | 0/6666 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 6666/6666 [00:00<00:00, 17449.96 examples/s]


Dataset({
    features: ['text', 'input_ids'],
    num_rows: 6666
})

In [28]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="model/further_train",
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=32,
    # evaluation_strategy="steps",
    # eval_steps=5000,
    # logging_steps=5000,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    weight_decay=0.1,
    warmup_steps=1000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5000,
    save_total_limit=1,
    fp16=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    # eval_dataset=tokenized_datasets["valid"],
)

In [29]:
trainer.train()

100%|██████████| 416/416 [3:07:56<00:00, 27.11s/it]  

{'train_runtime': 11276.8747, 'train_samples_per_second': 1.182, 'train_steps_per_second': 0.037, 'train_loss': 1.9272014911358173, 'epoch': 2.0}





TrainOutput(global_step=416, training_loss=1.9272014911358173, metrics={'train_runtime': 11276.8747, 'train_samples_per_second': 1.182, 'train_steps_per_second': 0.037, 'train_loss': 1.9272014911358173, 'epoch': 2.0})

In [30]:
import os
model.save_pretrained(os.path.join('model/further_train', "checkpoint-last"))
tokenizer.save_pretrained(os.path.join('model/further_train', "checkpoint-last"))

('model/further_train\\checkpoint-last\\tokenizer_config.json',
 'model/further_train\\checkpoint-last\\special_tokens_map.json',
 'model/further_train\\checkpoint-last\\tokenizer.json')

In [31]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [32]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
# from peft import LoraConfig

MODEL_PATH = "model/"

DEBUG = False
if DEBUG:
    epochs = 1
    dataset = load_dataset("csv", data_files="./data/preproc_train_debug.csv", split="train")
else:
    epochs = 5
    dataset = load_dataset("csv", data_files="./data/preproc_train.csv", split="train")

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['quenstion'])):
        text = f"""###질문: {example['quenstion'][i]}
###답변: {example['answer'][i]}"""
        output_texts.append(text)
    return output_texts

# instruction_template = "###질문:"
response_template = "###답변:"
collator = DataCollatorForCompletionOnlyLM(
    # instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer)

training_args = TrainingArguments(
    learning_rate=1e-4,
    output_dir=MODEL_PATH + MODEL + "_sft_" + str(epochs),         # output directory to where save model checkpoint
    # evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,
    num_train_epochs=epochs,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=1, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    # per_device_eval_batch_size=16,  # evaluation batch size
    logging_steps=200,             # evaluate, log and save model checkpoints every 1000 step
    # save_steps=50000,
    fp16=True,                      # memory save
    # weight_decay=0.01,
    # warmup_steps=2970,
    #dataloader_num_workers=2,
#     remove_unused_columns=False
    #optim="adafactor",
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=2,           # whether you don't have much space so you let only 3 model weights saved in the disk
#     report_to=False               # when you don't want to log with wandb
)

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=training_args,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_length,
)

trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 20%|█▉        | 200/1005 [1:19:59<5:17:01, 23.63s/it]

{'loss': 1.2091, 'learning_rate': 8.00995024875622e-05, 'epoch': 0.99}


 40%|███▉      | 400/1005 [2:40:07<4:05:52, 24.38s/it]

{'loss': 0.4157, 'learning_rate': 6.019900497512438e-05, 'epoch': 1.99}


 60%|█████▉    | 600/1005 [4:00:20<2:41:42, 23.96s/it]

{'loss': 0.1186, 'learning_rate': 4.029850746268657e-05, 'epoch': 2.98}


 80%|███████▉  | 800/1005 [5:20:28<1:22:53, 24.26s/it]

{'loss': 0.0541, 'learning_rate': 2.0398009950248755e-05, 'epoch': 3.98}


100%|█████████▉| 1000/1005 [6:40:55<02:04, 24.86s/it] 

{'loss': 0.0367, 'learning_rate': 4.975124378109453e-07, 'epoch': 4.97}


100%|██████████| 1005/1005 [6:43:20<00:00, 24.08s/it]

{'train_runtime': 24200.4037, 'train_samples_per_second': 1.331, 'train_steps_per_second': 0.042, 'train_loss': 0.3652077594206701, 'epoch': 4.99}





TrainOutput(global_step=1005, training_loss=0.3652077594206701, metrics={'train_runtime': 24200.4037, 'train_samples_per_second': 1.331, 'train_steps_per_second': 0.042, 'train_loss': 0.3652077594206701, 'epoch': 4.99})

In [33]:
# import random
# import pandas as pd

# train_df = pd.read_csv('./data/train.csv')
# idx = random.randint(0, 664)
# data = train_df.iloc[1, 1:3]
# ans = train_df.iloc[1, 4]
# q1, q2 = data
# tokened = tokenizer(q1, return_tensors='pt').to(device=f"cuda")
# del tokened['token_type_ids']
# result = model(**tokened)
# result.logits

In [34]:
import os
model.save_pretrained(os.path.join(MODEL_PATH + MODEL + "_f-sft_" + str(epochs), "checkpoint-last"))
tokenizer.save_pretrained(os.path.join(MODEL_PATH + MODEL + "_f-sft_" + str(epochs), "checkpoint-last"))

('model/model/further_train\\checkpoint-last_f-sft_5\\checkpoint-last\\tokenizer_config.json',
 'model/model/further_train\\checkpoint-last_f-sft_5\\checkpoint-last\\special_tokens_map.json',
 'model/model/further_train\\checkpoint-last_f-sft_5\\checkpoint-last\\tokenizer.json')