In [1]:
import os, torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig, 
    pipeline, 
    Trainer, 
    TrainingArguments)
from accelerate import Accelerator
accelerator = Accelerator()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [2]:
torch.cuda.empty_cache()

In [3]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

BASE_MODEL = 'meta-llama/Meta-Llama-3-8B-Instruct'
# BASE_MODEL = 'BM-K/KoChatBART'

In [4]:
dataset = load_dataset("beomi/KoAlpaca-v1.1a", split='train')
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0")

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

messages = [
    {"role": "system", "content": "You are 'Llama', a helpful assistant. Reply in 한국어 only."},
    {"role": "user", "content": "니 한국어 잘하나?"},
]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
def preprocess_function(examples):
    instructions = examples['instruction']
    responses = examples['output']
    return tokenizer.prepare_seq2seq_batch(
        src_texts=instructions,
        tgt_texts=responses,
        return_tensors='pt',
        max_length=364,  # Adjust the max length as needed
        padding='max_length',
        # truncation=True  # Enable truncation
    )

In [6]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size=2)

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

In [11]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


네, 저는 한국어를 잘하겠습니다! 저는 Llama라는 이름의 언어 지원 도구입니다. 당신에게 도움이 필요하면 언제든지 물어보세요!


In [12]:
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    num_train_epochs=3,           # Adjust based on your needs
    per_device_train_batch_size=1,  # Adjust based on your hardware
    logging_strategy='steps',
    save_steps=10000,             # Adjust based on your preferences
    eval_steps=5000,              # Adjust based on your preferences
    # Add additional arguments like learning rate, weight decay, etc.
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # eval_dataset=tokenized_dataset["validation"],
    # Add data collator if needed
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
# Start training
trainer.train()

: 

In [None]:
# Save the fine-tuned model
trainer.save_model("./finetuned_model")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
