In [1]:
!pip list  # Check installed library versions

Package                   Version             
------------------------- --------------------
-ransformers              4.45.2              
accelerate                0.29.1              
aiohttp                   3.9.3               
aiosignal                 1.3.1               
annotated-types           0.6.0               
anyio                     4.3.0               
appdirs                   1.4.4               
apturl                    0.5.2               
argon2-cffi               23.1.0              
argon2-cffi-bindings      21.2.0              
asttokens                 2.4.1               
async-lru                 2.0.4               
async-timeout             4.0.3               
attrs                     23.2.0              
Babel                     2.14.0              
backcall                  0.2.0               
bcrypt                    3.1.7               
beautifulsoup4            4.12.3              
bitsandbytes              0.42.0              
bleach       

In [2]:
# Silence warnings emitted while running and turn off the wandb
# (experiment-visualization library) integration.

import logging
# logging.disable(level) suppresses every record at or below `level`, so
# disabling WARNING already covers INFO; a separate disable(INFO) call would
# simply be overridden by this one.
logging.disable(logging.WARNING)

import warnings
warnings.filterwarnings('ignore')

import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
from datasets import load_dataset
from random import randrange

# Fetch the dataset from the Hugging Face Hub; only the first 3000 examples of
# the "stories" subset of cosmopedia are used here.
dataset = load_dataset("HuggingFaceTB/cosmopedia", "stories", split="train[:3000]")

# Report how many examples were loaded (the cell's recorded output shows this
# line, but no statement produced it).
print(f"dataset size: {len(dataset)}")

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

dataset size: 3000


In [5]:
# Template that every training example is rendered into for instruction tuning.
# Feeding only the raw data, without an instruction, makes it much less likely
# that the model produces the desired output.

def format_instruction(sample):
    """Render one training example as an Instruction / Input / Response prompt.

    `sample` is indexed with the keys 'prompt' and 'text': the prompt fills the
    "### Input:" section and the text fills the "### Response:" section.
    Returns the formatted string.
    """
    story_prompt = sample['prompt']
    story_text = sample['text']
    return f"""### Instruction:
You are provided with a prompt. 
Your task is to generate a coherent and creative continuation of the given prompt, crafting a well-developed story. 
The story should maintain relevance to the context provided by the prompt 
and expand upon it with logical and imaginative progression.
 
### Input:
{story_prompt}
 
### Response:
{story_text}
"""

In [6]:
# Show one randomly chosen example rendered in the instruction format.

from random import randrange

sample_index = randrange(len(dataset))
print(format_instruction(dataset[sample_index]))

### Instruction:
You are provided with a prompt. 
Your task is to generate a coherent and creative continuation of the given prompt, crafting a well-developed story. 
The story should maintain relevance to the context provided by the prompt 
and expand upon it with logical and imaginative progression.
 
### Input:
Write an educational story (3-5 paragraphs) targeted at young children using simple words. The story should be inspired from this text snippet: 
“Q:I have been preparing our house for the market and in doing so, I have gotten rid of a lot of stuff! I am definitely the hoarder in our house. My husband could live out of two bags, use about five kitchen items each year, and doesn’t gather anything for future use or hang on to much for sentimental value. (Which probably means the items he has hung on to mean a whole lot more!) I am always tucking something away here or stashing materials there…all in preparation for “some day.” It’s also part of the teacher in me. Do you know man

In [7]:
# Load the model that will be fine-tuned.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from accelerate import PartialState

use_flash_attention = False

# Pretrained checkpoint pulled from the Hugging Face Hub.
model_id = "meta-llama/Llama-3.2-1B"

# 4-bit quantization config: NF4 quant type with double quantization,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# `use_flash_attention_2=` is a deprecated from_pretrained argument; the
# supported spelling is `attn_implementation="flash_attention_2"`. It is only
# passed when flash attention is requested, so the library default attention
# implementation is used otherwise.
attn_kwargs = {"attn_implementation": "flash_attention_2"} if use_flash_attention else {}

# Load model and tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto",
    **attn_kwargs,
)

model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [8]:
# Check the model size (number of parameters).
# NOTE(review): the recorded output (361821120) does not look like a
# 1B-parameter model; it may be stale from a run with a different
# checkpoint — confirm.

model.num_parameters()

361821120

In [9]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Module names handed to LoRA as `target_modules`.
targetmodule = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "lm_head",
]

# LoRA config based on the QLoRA paper.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=targetmodule,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Prepare the quantized model for k-bit training, then wrap it with the
# LoRA configuration above.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

from transformers import TrainingArguments

# NOTE(review): `dir_name` is not defined in any visible cell; it must already
# exist in the kernel (e.g. from a cell not shown here) — confirm, otherwise
# this cell fails with NameError on Restart & Run All.
args = TrainingArguments(
    output_dir=dir_name,  # where checkpoints and the trained model are saved
    num_train_epochs=3,
    # Both branches of the former `1 if use_flash_attention else 1` were 1.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    # NOTE(review): with the plain "constant" schedule the warmup_ratio above
    # presumably has no effect ("constant_with_warmup" would apply it) —
    # confirm which one is intended.
    lr_scheduler_type="constant",
    disable_tqdm=False,  # progress bar stays enabled
)

In [11]:
from trl import SFTTrainer

# Supervised fine-tuning trainer built from the objects defined in the
# previous cells.
trainer = SFTTrainer(
    # model, tokenizer and training configuration
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    # data: each example is rendered with the instruction template
    train_dataset=dataset,
    formatting_func=format_instruction,
    packing=True,
    max_seq_length=2048,
)

In [12]:
# train
trainer.train() # progress is reported during training (`disable_tqdm=False` in the training arguments)
 
# save model
trainer.save_model()

Step,Training Loss
10,2.126
20,1.9837
30,1.8762
40,1.7681
50,1.6695
60,1.6108
70,1.5338
80,1.5079
90,1.4699
100,1.4339


In [13]:
# Only relevant when flash attention was enabled: call the project helper
# (presumably it reverts the flash-attention patch — confirm in utils.llama_patch).
if use_flash_attention:
    from utils.llama_patch import unplace_flash_attn_with_attn
    unplace_flash_attn_with_attn()

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Directory where the checkpoints / trained model were saved.
args.output_dir = dir_name

# Load the saved model (4-bit, float16) and the tokenizer from that directory.
model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir)

In [14]:
# Instruction format used for text generation rather than for tuning.

def format_instruction_inference(sample):
    """Render an example as a generation prompt with an empty response section.

    `sample` is indexed with the key 'prompt', which fills the "### Input:"
    section; the string ends right after the "### Response:" header line.
    """
    story_prompt = sample['prompt']
    return f"""### Instruction:
You are provided with a prompt. 
Your task is to generate a coherent and creative continuation of the given prompt, crafting a well-developed story. 
The story should maintain relevance to the context provided by the prompt 
and expand upon it with logical and imaginative progression.
 
### Input:
{story_prompt}
 
### Response:
"""

In [15]:
from tqdm import tqdm

# Load examples 3000-3099 from the hub; this slice does not overlap the
# train[:3000] slice used for fine-tuning.
dataset = load_dataset("HuggingFaceTB/cosmopedia", "stories", split="train[3000:3100]")

test_result = []

# text generation
for i in tqdm(range(len(dataset))):
    prompt = format_instruction_inference(dataset[i])

    # Keep the whole encoding (input ids + attention mask) and move it to the
    # device the model lives on instead of hard-coding `.cuda()`.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
    )

    # Drop the prompt by token count, not by `len(prompt)` characters: the
    # decoded prompt is not guaranteed to be character-identical to the
    # original string (special tokens removed, whitespace clean-up,
    # truncation), so slicing the decoded text can cut at the wrong offset.
    prompt_length = inputs["input_ids"].shape[1]
    gen_story = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    test_result.append(gen_story)

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

100%|██████████| 100/100 [1:00:45<00:00, 36.45s/it]


In [16]:
# Store the generated text next to the dataset's original text in a DataFrame
# and write it out as CSV.

import os

import pandas as pd

result = pd.DataFrame()
result['prompt'] = dataset['prompt']
result['gen_story'] = test_result
result['ground_truth'] = dataset['text']

# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists before writing.
output_path = "./result/"+dir_name+".csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result.to_csv(output_path, encoding='utf-8')