In [1]:
import numpy as np
import pandas as pd
import os, gc
from tqdm.auto import tqdm
from datetime import datetime, timezone, timedelta

import torch
from torch.utils.data import DataLoader ,Dataset

import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM 

# HuggingFace peft 라이브러리
# pip install peft 
from peft import get_peft_model, PeftModel, TaskType, LoraConfig

In [2]:
tokenizer = AutoTokenizer.from_pretrained("kakaobrain/kogpt", revision="KoGPT6B-ryan1.5b") 

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)n1.5b/tokenizer.json:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

In [3]:
train_df = pd.read_csv("train.csv") 
train_set = datasets.Dataset.from_pandas(train_df) 

In [4]:
def train_batch_preprocess(batch): 
    prompt = "{text} 한줄 요약:" 
    query_text = [prompt.format(text=text) for text in batch["text"]] 
    target_text = batch["summary"] 
    query = tokenizer(query_text) 
    target = tokenizer(target_text) 
    input_ids = [q + t + [tokenizer.eos_token_id] for q, t in zip(query["input_ids"], target["input_ids"])] 
    attention_mask = [q + t + [1] for q, t in zip(query["attention_mask"], target["attention_mask"])] 
    labels = [[-100] * len(q) + t + [tokenizer.eos_token_id] for q, t in zip(query["input_ids"], target["input_ids"])] # some masking applied I guess 
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} 
    

In [5]:
train_set = train_set.map(
    train_batch_preprocess, 
    remove_columns = ["id", "text", "summary"], 
    batched = True,  
    batch_size = 1000, 
) 

Map:   0%|          | 0/40400 [00:00<?, ? examples/s]

In [6]:
# GPTJ는 문장의 오른쪽부터 생성하는 autoregressive 모델이므로 오른쪽 끝이 같아야합니다. Left padding을 사용합니다. 

def left_pad(sequence, value, max_len): 
    return [value] * (max_len - len(sequence)) + sequence 

def collate_fn(batch, device="cuda"): 
    length = max(len(row["input_ids"]) for row in batch)
    input_ids = [
        left_pad(row["input_ids"], tokenizer.pad_token_id, length) 
        for row in batch
    ] 
    attention_mask = [
        left_pad(row["attention_mask"], 0, length) 
        for row in batch
    ] 
    labels = [
        left_pad(row["input_ids"], -100, length) 
        for row in batch
    ]  
    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long, device=device), 
        "attention_mask": torch.tensor(attention_mask, dtype=torch.long, device=device), 
        "labels": torch.tensor(labels, dtype=torch.long, device=device) 
    } 

In [7]:
train_loader = DataLoader(train_set, batch_size=4, shuffle=True, collate_fn=collate_fn) 

In [8]:
base_model = AutoModelForCausalLM.from_pretrained(
    "kakaobrain/kogpt", revision = "KoGPT6B-ryan1.5b-float16", 
    torch_dtype = torch.float16, 
    device_map="auto") 

Downloading (…)-float16/config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/12.3G [00:00<?, ?B/s]

In [10]:
peft_config = LoraConfig(
    task_type = TaskType.CAUSAL_LM, 
    r = 8, 
    lora_alpha = 32, 
    lora_dropout = 0.1, 
    target_modules = ["q_proj", "v_proj"])  

print(peft_config)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=['q_proj', 'v_proj'], lora_alpha=32, lora_dropout=0.1, merge_weights=False, fan_in_fan_out=False, enable_lora=None, bias='none', modules_to_save=None)


In [11]:
peft_model = get_peft_model(base_model, peft_config) 
peft_model.to("cuda") 
peft_model.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTJForCausalLM(
      (transformer): GPTJModel(
        (wte): Embedding(64512, 4096)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-27): 28 x GPTJBlock(
            (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attn): GPTJAttention(
              (attn_dropout): Dropout(p=0.1, inplace=False)
              (resid_dropout): Dropout(p=0.0, inplace=False)
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): Dropout(p=0.1, inplace=False)
                (lora_A): Linear(in_features=4096, out_features=8, bias=False)
                (lora_B): Linear(in_features=8, out_features=4096, bias=False)
              )
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
       

In [17]:
pd.DataFrame([
    (param.dtype, param.shape, param.device, param.requires_grad, name) 
    for name, param in peft_model.named_parameters()], 
    columns = ["dtype", "shape", "device", "requires_grad", "name"]) 

Unnamed: 0,dtype,shape,device,requires_grad,name
0,torch.float16,"(64512, 4096)",cuda:0,False,base_model.model.transformer.wte.weight
1,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.h.0.ln_1.weight
2,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.h.0.ln_1.bias
3,torch.float16,"(4096, 4096)",cuda:0,False,base_model.model.transformer.h.0.attn.k_proj.w...
4,torch.float16,"(4096, 4096)",cuda:0,False,base_model.model.transformer.h.0.attn.v_proj.w...
...,...,...,...,...,...
392,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.h.27.mlp.fc_out.bias
393,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.ln_f.weight
394,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.ln_f.bias
395,torch.float16,"(64512, 4096)",cuda:0,False,base_model.model.lm_head.weight


In [12]:
learning_rate = 3e-5 

optimizer = torch.optim.Adam(peft_model.parameters(), lr = learning_rate) 
scaler = torch.cuda.amp.GradScaler() 

In [13]:
def training_step(model, batch, optimizer, scaler): 
    optimizer.zero_grad() 
    with torch.cuda.amp.autocast(): 
        outputs = model(
            input_ids = batch["input_ids"], 
            attention_mask = batch["attention_mask"], 
            labels = batch["labels"],
        ) 
        step_loss = outputs[0] 
    scaler.scale(step_loss).backward() 
    scaler.step(optimizer) 
    scaler.update() 
    return step_loss.detach() 

In [14]:
NUM_EPOCHS = 3 # just train for one epoch 

peft_model.train() 
for epoch in range(NUM_EPOCHS): 
    total_loss = 0 
    tr_loss = torch.tensor(0.0).to("cuda") 
    for batch_idx, batch in enumerate(tqdm(train_loader), start=1):
        step_loss = training_step(peft_model, batch, optimizer, scaler) 
        tr_loss += step_loss 
        if batch_idx % 100 == 0: 
            print("{}. tr_loss: {}".format(batch_idx, tr_loss.item())) 
            tr_loss = torch.tensor(0.0).to("cuda") 

  0%|          | 0/10100 [00:00<?, ?it/s]

100. tr_loss: 248.7427978515625
200. tr_loss: 239.6448211669922
300. tr_loss: 229.6232147216797
400. tr_loss: 227.33828735351562
500. tr_loss: 225.2498779296875
600. tr_loss: 224.90945434570312
700. tr_loss: 222.11276245117188
800. tr_loss: 221.3392333984375
900. tr_loss: 221.53042602539062
1000. tr_loss: 218.19895935058594
1100. tr_loss: 220.40008544921875
1200. tr_loss: 219.1332550048828
1300. tr_loss: 217.6351776123047
1400. tr_loss: 217.30918884277344
1500. tr_loss: 220.42088317871094
1600. tr_loss: 218.1422576904297
1700. tr_loss: 219.2422332763672
1800. tr_loss: 217.1498565673828
1900. tr_loss: 215.6063995361328
2000. tr_loss: 217.3845977783203
2100. tr_loss: 213.9405517578125


KeyboardInterrupt: 

In [24]:
TIME_SERIAL = datetime.now(timezone(timedelta(hours=9))).strftime("%y%m%d-%H%M%S") 
peft_model.save_pretrained(f"exp_{TIME_SERIAL}")

print("done saving!") 

done saving!


# Inference

In [27]:
import numpy as np 
import pandas as pd 
import torch 
from torch.utils.data import DataLoader, Dataset 
from transformers import *
from peft import PeftModel 
import os, gc   
from datetime import datetime, timezone, timedelta 
from tqdm.auto import tqdm 



RuntimeError: Failed to import transformers.sagemaker because of the following error (look up to see its traceback):
No module named 'torch._six'

In [26]:
tokenizer = AutoTokenizer.form_pretrained("kakaobrain/kogpt", revision="KoGPT6B-ryan1.5b-float16")

AttributeError: type object 'AutoTokenizer' has no attribute 'form_pretrained'