In [6]:
# https://huggingface.co/docs/trl/sft_trainer

In [1]:
import torch
import random
import torch.backends.cudnn as cudnn

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
cudnn.benchmark = False
cudnn.deterministic = True
random.seed(0)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

import os
# os.environ['CUDA_VISIBLE_DEVICES']="0"

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
from bitnet_llama.modeling_llama import LlamaForCausalLM, BitLinear
from transformers import AutoTokenizer
from transformers import LlamaConfig

tokenizer = AutoTokenizer.from_pretrained("beomi/llama-2-ko-7b")
config = LlamaConfig(
    vocab_size=len(tokenizer),
    hidden_size=512,
    intermediate_size=2048,
    max_position_embeddings=512,
    num_attention_heads=32,
    num_hidden_layers=4,
    num_key_value_heads=32,
    pretraining_tp=1,
)
model = LlamaForCausalLM(config)
model.save_pretrained('test_1bit', save_binarized_weights=True)

In [19]:
max_length = 512
model = AutoModelForCausalLM.from_pretrained('test_1bit')

Some weights of the model checkpoint at test_1bit were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.2.mlp.down_proj.weight', 'model.layers.2.mlp.gate_proj.weight', 'model.layers.2.mlp.up_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.3.mlp.down_proj.weight', 'model.laye

In [23]:
max_length = 512
MODEL = "EleutherAI/polyglot-ko-1.3b"
MODEL = os.path.join('model/further_train', "checkpoint-last")
# MODEL = "facebook/xglm-564M"
# MODEL = "facebook/xglm-1.7B"
# MODEL = "ai-forever/mGPT"
# MODEL = "maywell/Synatra-42dot-1.3B"

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
# model = model.half()

tokenizer = AutoTokenizer.from_pretrained(MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
tokenizer.model_max_length = max_length
tokenizer.max_len = max_length

In [25]:
# from datasets import load_dataset
# dataset = load_dataset("csv", data_files="./data/preproc_train.csv", split="train").
# dataset['question']

In [4]:
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
data_files = "/Users/kwon/Desktop/repository/dacon/도배하자질의/data/preproc_train.csv"
raw_datasets = load_dataset("csv", data_files=data_files, split="train")

In [19]:
raw_datasets

Dataset({
    features: ['quenstion', 'category', 'answer'],
    num_rows: 6440
})

In [None]:
def tokenize(element):
    outputs = tokenizer(
        element['text']
    )
    
    input_batch = []
    for input_ids in outputs['input_ids']:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(tokenize, batched=True)
tokenized_datasets

In [18]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="model/further_train",
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=32,
    # evaluation_strategy="steps",
    # eval_steps=5000,
    # logging_steps=5000,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    weight_decay=0.1,
    warmup_steps=1000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5000,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    # eval_dataset=tokenized_datasets["valid"],
)

NameError: name 'tokenized_datasets' is not defined

In [12]:
trainer.train()

NameError: name 'trainer' is not defined

In [30]:
import os
model.save_pretrained(os.path.join('model/further_train', "checkpoint-last"))
tokenizer.save_pretrained(os.path.join('model/further_train', "checkpoint-last"))

('model/further_train\\checkpoint-last\\tokenizer_config.json',
 'model/further_train\\checkpoint-last\\special_tokens_map.json',
 'model/further_train\\checkpoint-last\\tokenizer.json')

In [20]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [24]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
# from peft import LoraConfig

MODEL_PATH = "model/"
MODEL = "bit_llama"
DEBUG = False
if DEBUG:
    epochs = 1
    dataset = load_dataset("csv", data_files="./data/preproc_train_debug.csv", split="train")
else:
    epochs = 5
    dataset = load_dataset("csv", data_files="/Users/kwon/Desktop/repository/dacon/도배하자질의/data/preproc_train.csv", split="train")

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['quenstion'])):
        text = f"""###질문: {example['quenstion'][i]}
###답변: {example['answer'][i]}"""
        output_texts.append(text)
    return output_texts

# instruction_template = "###질문:"
response_template = "###답변:"
collator = DataCollatorForCompletionOnlyLM(
    # instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer)

training_args = TrainingArguments(
    learning_rate=1e-4,
    output_dir=MODEL_PATH + MODEL + "_sft_" + str(epochs),         # output directory to where save model checkpoint
    # evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,
    num_train_epochs=epochs,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=1, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    # per_device_eval_batch_size=16,  # evaluation batch size
    logging_steps=200,             # evaluate, log and save model checkpoints every 1000 step
    # save_steps=50000,
    # fp16=True,                      # memory save
    # weight_decay=0.01,
    # warmup_steps=2970,
    #dataloader_num_workers=2,
#     remove_unused_columns=False
    #optim="adafactor",
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=2,           # whether you don't have much space so you let only 3 model weights saved in the disk
#     report_to=False               # when you don't want to log with wandb
)

trainer = SFTTrainer(
    model,
    tokenizer = tokenizer,
    train_dataset=dataset,
    args=training_args,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_length,
)

trainer.train()

Map: 100%|██████████| 6440/6440 [00:00<00:00, 27602.17 examples/s]
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mqja1998[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

###답변: 발포우레탄폼은 경량 재료로 제작되어 우수한 단열성능을 보유하고 있습니다. 또한, 발포우레탄폼은 쉽게 형태를 변형시킬 수 있어 구조물에 적합하게 사용할 수 있는 유연성을 가지고 있습니다. 뿐만 아니라 발포우레탄폼은 난연성, 방수성, 밀폐성 등의 우수한 물리적 특성을 갖추고 있어 다양한 건축 및 공사 현장에서 다목적으로 활용될 수 있습니다. This instance will be ignored in loss calculation. Note, if this happens often, consider increasing the `max_seq_length`.
###답변: 아이가 있는 집에는 두꺼운 장판을 사용하는 것이 좋습니다. 아이가 있는 경우 층간 소음이 발생할 수 있으며, 또한 음료 등을 쏟을 위험이 있기 때문에 소음을 줄이고 오염을 제거하기 쉬운 두꺼운 장판을 사용하는 것이 좋습니다. 추가로 두꺼운 장판은 따뜻한 느낌을 주어 아이가 땀을 많이 흘릴 수 있는 활발한 모습을 보일 때 편안함을 줄 수 있습니다. This instance will be ignored in loss calculation. Note, if this happens often, consider increasing the `max_seq_length`.


TypeError: BFloat16 is not supported on MPS

In [33]:
# import random
# import pandas as pd

# train_df = pd.read_csv('./data/train.csv')
# idx = random.randint(0, 664)
# data = train_df.iloc[1, 1:3]
# ans = train_df.iloc[1, 4]
# q1, q2 = data
# tokened = tokenizer(q1, return_tensors='pt').to(device=f"cuda")
# del tokened['token_type_ids']
# result = model(**tokened)
# result.logits

In [34]:
import os
model.save_pretrained(os.path.join(MODEL_PATH + MODEL + "_f-sft_" + str(epochs), "checkpoint-last"))
tokenizer.save_pretrained(os.path.join(MODEL_PATH + MODEL + "_f-sft_" + str(epochs), "checkpoint-last"))

('model/model/further_train\\checkpoint-last_f-sft_5\\checkpoint-last\\tokenizer_config.json',
 'model/model/further_train\\checkpoint-last_f-sft_5\\checkpoint-last\\special_tokens_map.json',
 'model/model/further_train\\checkpoint-last_f-sft_5\\checkpoint-last\\tokenizer.json')