In [1]:
!nvidia-smi

Mon Jun 10 06:28:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
#!pip install datasets
#!pip install accelerate -U
#!pip install peft
#!pip install trl

In [3]:
import torch
import transformers
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import (
        get_peft_model,
        prepare_model_for_kbit_training,
        LoraConfig
    )
from trl import SFTTrainer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct",
                                            #load_in_8bit=True,   --> pip install accerelate
                                            #device_map="auto"
                                            )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
device = 'cuda'

In [36]:
data = load_dataset("heegyu/open-korean-instructions", split = 'train')

In [37]:
dataset = data.train_test_split(test_size = 0.2)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'text'],
        num_rows: 300127
    })
    test: Dataset({
        features: ['source', 'text'],
        num_rows: 75032
    })
})


# Generation 1

In [38]:
model.to('cuda')
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to('cuda')

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [39]:
print(response)

A large language model is a type of artificial intelligence that can generate human-like text based on patterns in natural language data. These models are trained using large amounts of data and have been shown to be capable of generating coherent, meaningful, and informative responses to questions or prompts.

Large language models use advanced algorithms and techniques to learn from vast amounts of data and improve their performance over time. They can understand the meaning behind complex sentences and phrases, recognize patterns in language usage, and generate human-like language with varying degrees of coherence and fluency.

Some examples of large language models include GPT-3, which has surpassed the capabilities of many current AI systems in several domains such as language translation, summarization, and question answering. Additionally, BERT, an advanced variant of the Transformer architecture, has become popular for its ability to generate human-like text, particularly in th

In [59]:
text

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nGive me a short introduction to large language model.<|im_end|>\n<|im_start|>assistant\n'

In [69]:
def generate_prompt(input, eos_token="<|im_end|>"):
  instruction = "<|im_start|>system\n당신은 한국어 AI 어시스턴트입니다.<|im_end|>\n"
  prompt = input
  return prompt

In [70]:
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(151646, 896)

In [71]:
lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

In [72]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [73]:
output_dir = "./"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
per_device_eval_batch_size = 4
eval_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 5e-4
max_grad_norm = 0.3
max_steps = 50
warmup_ratio = 0.03
evaluation_strategy="steps"
lr_scheduler_type = "constant"

training_args = transformers.TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            evaluation_strategy=evaluation_strategy,
            save_steps=save_steps,
            learning_rate=learning_rate,
            logging_steps=logging_steps,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=True,
            lr_scheduler_type=lr_scheduler_type,
            ddp_find_unused_parameters=False,
            eval_accumulation_steps=eval_accumulation_steps,
            per_device_eval_batch_size=per_device_eval_batch_size,
        )



In [74]:
dataset['train']

Dataset({
    features: ['source', 'text'],
    num_rows: 300127
})

In [75]:
def formatting_func(prompt):
  output = []

  for t in prompt["text"]:

    op = generate_prompt(t)
    output.append(op)

  return output


In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

trainer.train()
trainer.save_model(f"{output_dir}/final")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/300127 [00:00<?, ? examples/s]