In [1]:
import gc

def cleanup():
    if 'model' in globals():
        del globals()['model']
    if 'dataset' in globals():
        del globals()['dataset']
    gc.collect()
    torch.cuda.empty_cache()

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

In [3]:
import torch

In [4]:
def load_model_and_tokenizer(model_id, peft=None):

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if peft is None:
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map={"":0})

    elif peft == 'lora':
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16,  device_map={"":0})
        lora_config = LoraConfig(
                    r=8,
                    lora_alpha=32,
                    target_modules=[
                                        "q_proj", "k_proj", "v_proj", "o_proj",
                                        "gate_proj", "up_proj", "down_proj"
                                    ],
                    lora_dropout=0.05,
                    bias="none",
                    task_type="CAUSAL_LM"
                )

        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    print_gpu_utilization()
    return model, tokenizer

In [5]:
import torch
def print_gpu_utilization():
    if torch.cuda.is_available():
        used_memory = torch.cuda.memory_allocated() / 1024**3
        print(f"GPU 메모리 사용량: {used_memory:.3f} GB")
    else:
        print("런타임 유형을 GPU로 변경하세요")

In [6]:
model_id = "Bllossom/llama-3.2-Korean-Bllossom-AICA-5B"

In [7]:
model, tokenizer = load_model_and_tokenizer(model_id, peft='lora')

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/835M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.58G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

trainable params: 15,196,160 || all params: 4,326,660,878 || trainable%: 0.3512
GPU 메모리 사용량: 8.088 GB


In [8]:
tokenizer("심장이 매우 뛰어요")

{'input_ids': [128000, 102612, 114784, 121520, 123151, 105807], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [9]:
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"""trainable params: {trainable_params}
all params: {all_param}
trainable%: {100 * trainable_params / all_param}""")

trainable params: 15196160
all params: 4326660878
trainable%: 0.3512214252165848


In [11]:
from datasets import load_dataset

In [12]:
data = load_dataset("csv", data_files={"train" : "./train_data.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
data2 = data.map(lambda x : {'text' : f"""User: {x['question']} 
                    Assistant: {x['answer']}<|endoftext|>"""})

Map:   0%|          | 0/12228 [00:00<?, ? examples/s]

In [14]:
for x in data2['train']:
    print(x['text'])
    break

User: 10세 소년이 충치 치료를 위해 치과에 방문했다. 치과의사는 수복 재료로 아말감 대신 복합 레진을 권장하고 있다. 복합 레진 사용의 장점으로 가장 적절한 것은?  
1) 심미성이 좋다.  
2) 수복물의 강도가 높다.  
3) 수은 함유로 인한 독성 위험이 있다.  
4) 비용이 저렴하다.  
5) 이차 우식증 발생 위험이 낮다. 
                    Assistant: 1) 심미성이 좋다.<|endoftext|>


In [15]:
from transformers import TrainingArguments, Trainer

In [16]:
argu = TrainingArguments(
         per_device_train_batch_size=1,
        num_train_epochs=2, 
        gradient_accumulation_steps=4,
        learning_rate = 0.0001,
        fp16=True,
        output_dir="./output/"
    )

In [17]:
train_dataset = data2.map(lambda samples: tokenizer(samples["text"]), batched=True)

Map:   0%|          | 0/12228 [00:00<?, ? examples/s]

In [18]:
from transformers import DataCollatorForLanguageModeling

In [19]:
trainer = Trainer(
    model = model,
    train_dataset=train_dataset['train'],
    args=argu,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

Step,Training Loss
500,1.4013
1000,1.2456
1500,1.2116
2000,1.1833
2500,1.1472
3000,1.1248
3500,0.9881
4000,0.99
4500,0.9739
5000,0.9664


TrainOutput(global_step=6114, training_loss=1.0923873532001669, metrics={'train_runtime': 5366.4132, 'train_samples_per_second': 4.557, 'train_steps_per_second': 1.139, 'total_flos': 9.5783623801107e+16, 'train_loss': 1.0923873532001669, 'epoch': 2.0})