In [1]:
import os
import sys

import torch
from datasets import load_dataset
from peft import get_peft_config, LoraConfig, get_peft_model, get_peft_model_state_dict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, LlamaForCausalLM, LlamaTokenizer, DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- model / data locations ---
model_name = "meta-llama/Llama-3.2-3B-Instruct"
data_path = "train_data.json"
output_path = "mycheckpoint"

# --- training hyperparameters ---
batch_size = 128  # divided by micro_batch_size to get the gradient-accumulation step count
micro_batch_size = 4  # per-device batch size handed to the Trainer
num_epochs = 3
learning_rate = 3e-4
cutoff_len = 256  # max_length used when tokenizing (longer prompts are truncated)
val_set_size = 0  # values > 0 switch evaluation on in the Trainer arguments

# --- LoRA hyperparameters ---
lora_r = 16
lora_alpha = 32
lora_dropout = 0.05
lora_target_modules = ["q_proj", "v_proj"]

# --- LLM hyperparameters ---
train_on_inputs = True  # if False, masks out inputs in loss
group_by_length = False  # faster, but produces an odd training loss curve

# --- wandb parameters ---
wandb_project = ""
wandb_run_name = ""
wandb_watch = ""  # options: false | gradients | all
wandb_log_model = ""  # options: false | true

# Either a training checkpoint or a final adapter; None starts from scratch.
resume_from_checkpoint = None

In [3]:
# Hugging Face access token.
# Credentials must never be hardcoded in a notebook (they leak through version
# control and shared copies), so the token is read from the HF_TOKEN environment
# variable. ACCESS_TOKEN is None when the variable is unset.
# NOTE(review): the token that was previously committed on this line should be
# revoked in the Hugging Face account settings.
ACCESS_TOKEN = os.environ.get("HF_TOKEN")

In [5]:
# Load the pretrained base model in half precision.
# `token=` replaces the deprecated `use_auth_token=` keyword.
model = LlamaForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        token=ACCESS_TOKEN,
    )
# Author's note: the tokenizer was expecting a local file path. Specifying
# `tokenizer_class` (and not passing `add_eos_token`) works around that for this
# specific model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=ACCESS_TOKEN,
    tokenizer_class=LlamaTokenizer,
)
# Pad with token id 0 so that the pad token differs from the eos token.
# NOTE(review): the original comment called id 0 "unk"; whether id 0 is an
# unknown-token placeholder in this model's vocabulary is not established here —
# confirm against the tokenizer.
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"  # Allow batched inference

# Dataset preparation (example records; the visible cells load the real
# training data from `data_path` instead).
data = [
    {"input": "Question: What is AI?", "output": "AI stands for Artificial Intelligence."},
    {"input": "Explain machine learning.", "output": "Machine learning is a subset of AI."}
]




In [6]:
# LoRA adapter settings, all driven by the hyperparameters from the config cell.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",               # task type (e.g. CAUSAL_LM, SEQ_2_SEQ_LM, ...)
    r=lora_r,                            # low-rank dimension
    lora_alpha=lora_alpha,               # scaling factor
    lora_dropout=lora_dropout,           # dropout rate
    target_modules=lora_target_modules,  # modules to adapt (depends on the model architecture)
    bias="none",                         # bias handling mode
)

# Apply LoRA to the model. `model` is rebound to the wrapped model, so this
# cell should only be run once per freshly loaded base model.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable.
model.print_trainable_parameters()


trainable params: 4,587,520 || all params: 3,217,337,344 || trainable%: 0.1426


In [7]:
# Load the JSON training file referenced by `data_path`; later cells read its "train" split.
data_set = load_dataset("json", data_files=data_path)


Generating train split: 610 examples [00:00, 3618.58 examples/s]

In [8]:
def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=True):
    """Tokenize ``prompt`` and attach ``labels`` for causal-LM training.

    Args:
        prompt: Text to encode.
        tokenizer: Callable tokenizer that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` lists and exposes
            ``eos_token_id``.
        cutoff_len: Passed as ``max_length``; the encoding is truncated to it.
        add_eos_token: When True, append the eos token id if the encoding does
            not already end with it and is still shorter than ``cutoff_len``.

    Returns:
        The tokenizer result, with ``"labels"`` set to a copy of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]

    # Check `add_eos_token` and the length first, and guard the `[-1]` lookup,
    # so an empty encoding no longer raises IndexError.
    needs_eos = (
        add_eos_token
        and len(input_ids) < cutoff_len
        and (not input_ids or input_ids[-1] != tokenizer.eos_token_id)
    )
    if needs_eos:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Labels start as a copy of the inputs, so later masking cannot alias them.
    result["labels"] = input_ids.copy()

    return result

def generate_prompt(data_point):
    """Build the training prompt text for one record.

    Args:
        data_point: Mapping with ``"instruction"`` and ``"output"`` entries and
            an optional ``"input"`` entry.

    Returns:
        The formatted prompt string. Records whose ``"input"`` is missing or
        empty get a template without the input line; previously they fell
        through and returned ``None``, which broke tokenization downstream.
    """
    if data_point.get("input"):
        return "{instruction}  \ninput : {input}  \noutput : {output}".format(
            instruction=data_point["instruction"],
            input=data_point["input"],
            output=data_point["output"],
        )
    # No input text: same layout, minus the "input :" line.
    return "{instruction}  \noutput : {output}".format(
        instruction=data_point["instruction"],
        output=data_point["output"],
    )

def generate_and_tokenize_prompt(data_point, tokenizer, cutoff_len):
    """Format one record as a prompt and tokenize it for training.

    Args:
        data_point: Record passed to ``generate_prompt``.
        tokenizer: Tokenizer forwarded to ``tokenize``.
        cutoff_len: Maximum token length forwarded to ``tokenize``.

    Returns:
        The tokenized full prompt. When the module-level ``train_on_inputs``
        flag is False, the labels covering the prompt part (everything before
        the output) are replaced with -100 so the inputs are masked out of the
        loss.
    """
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt, tokenizer, cutoff_len)
    if not train_on_inputs:
        # Tokenize the prompt with an empty output to find how many leading
        # tokens belong to the input. The tokenizer and cutoff_len must be
        # passed here; the call previously omitted them and raised TypeError.
        user_prompt = generate_prompt({**data_point, "output": ""})
        tokenized_user_prompt = tokenize(
            user_prompt, tokenizer, cutoff_len, add_eos_token=False
        )
        labels = tokenized_full_prompt["labels"]
        # Clamp so the masked labels can never grow longer than input_ids.
        user_prompt_len = min(len(tokenized_user_prompt["input_ids"]), len(labels))

        tokenized_full_prompt["labels"] = (
            [-100] * user_prompt_len + labels[user_prompt_len:]
        )
    return tokenized_full_prompt

In [9]:
def _prepare_example(example):
    """Format and tokenize one dataset record with the notebook's tokenizer settings."""
    return generate_and_tokenize_prompt(example, tokenizer, cutoff_len)


if val_set_size > 0:
    # The Trainer cell enables evaluation when val_set_size > 0, so a held-out
    # split must exist in that case. The fixed seed keeps the held-out set
    # stable across runs.
    split_data = (
        data_set["train"]
        .train_test_split(test_size=val_set_size, shuffle=True, seed=42)
        .shuffle()
        .map(_prepare_example)
    )
    train_data = split_data["train"]
    validation_data = split_data["test"]
else:
    # Default path: train on everything, no validation set.
    split_data = data_set.shuffle().map(_prepare_example)
    train_data = split_data["train"]
    validation_data = None


Map: 100%|█████████████████████████████████████████████████████████████| 610/610 [00:02<00:00, 234.44 examples/s]

In [10]:
# Build the Trainer from the hyperparameters defined in the config cell.
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=validation_data,
    args=TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        # Accumulate micro-batches up to the configured batch_size.
        gradient_accumulation_steps=batch_size // micro_batch_size,
        # NOTE(review): the recorded run below finished at global_step=12, so a
        # 100-step warmup never completes for this dataset size — confirm this
        # is intended.
        warmup_steps=100,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword.
        eval_strategy="steps" if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=200 if val_set_size > 0 else None,
        save_steps=200,
        output_dir=output_path,
        save_total_limit=3,
        load_best_model_at_end=val_set_size > 0,
        group_by_length=group_by_length,
        ),
    # Pads each batch dynamically; lengths are rounded up to a multiple of 8.
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
# Turn off the model's `use_cache` config flag before training.
model.config.use_cache = False

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
# The former `model.state_dict` override (which wrapped the result in
# `get_peft_model_state_dict`) has been removed on purpose:
#   * re-running the cell made the override call itself through the
#     module-level `old_state_dict` name, recursing without end;
#   * saving a PEFT model already extracts the adapter weights from
#     `state_dict()`, so filtering an already-filtered dict leaves no adapter
#     weights in the saved checkpoint.

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Compare the major version numerically; comparing the raw version strings
# would order "10.0" before "2".
if int(torch.__version__.split(".")[0]) >= 2 and sys.platform != "win32":
    # NOTE(review): `trainer` was constructed from `model` before this
    # rebinding, so it presumably still holds the uncompiled module — confirm
    # whether compilation is meant to affect training.
    model = torch.compile(model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Run training; `resume_from_checkpoint` comes from the config cell (None there).
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

Step,Training Loss
10,1.9282



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.


TrainOutput(global_step=12, training_loss=1.8688889543215434, metrics={'train_runtime': 169.7165, 'train_samples_per_second': 10.783, 'train_steps_per_second': 0.071, 'total_flos': 6661082764541952.0, 'train_loss': 1.8688889543215434, 'epoch': 2.810457516339869})

In [13]:
# Write the trained model and the tokenizer files side by side into ./final_model.
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/tokenizer.json')