In [1]:
# 1. 라이브러리 설치
!pip install transformers torch pandas tqdm

# 2. 라이브러리 임포트
import os
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import tqdm

# 3. GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
# 4. KoGPT-2 모델과 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>'
)
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.to(device)

# 5. 특별 토큰 ID 확인
print("bos_token_id:", tokenizer.bos_token_id)
print("eos_token_id:", tokenizer.eos_token_id)
print("pad_token_id:", tokenizer.pad_token_id)
print('-' * 10)
for i in [1,2,3,4]:
    print(i, '->', tokenizer.decode(i))

# 6. 챗봇 데이터 다운로드 및 로드
url = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv"
file_name = "ChatBotData.csv"
if not os.path.exists(file_name):
    import urllib.request
    urllib.request.urlretrieve(url, filename=file_name)

train_data = pd.read_csv(file_name)
print('챗봇 샘플의 개수 :', len(train_data))
train_data.head()


bos_token_id: 1
eos_token_id: 1
pad_token_id: 3
----------
1 -> </s>
2 -> <usr>
3 -> <pad>
4 -> <sys>
챗봇 샘플의 개수 : 11823


Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [4]:

# 7. 데이터셋 및 데이터로더 정의
class ChatDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        q = self.data.Q.iloc[idx]
        a = self.data.A.iloc[idx]
        sent = '<usr>' + q + '<sys>' + a
        tokens = self.tokenizer.encode(sent, add_special_tokens=False)
        ids = [self.tokenizer.bos_token_id] + tokens + [self.tokenizer.eos_token_id]
        return torch.tensor(ids, dtype=torch.long)

def collate_fn(batch):
    return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)

batch_size = 32
dataset = ChatDataset(train_data, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

In [5]:
# 8. 옵티마이저 및 학습 설정
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5, eps=1e-08)
EPOCHS = 3
steps = len(dataset) // batch_size + 1

# 9. 학습 루프
for epoch in range(1, EPOCHS+1):
    model.train()
    epoch_loss = 0
    for batch in tqdm.tqdm(dataloader, total=steps, desc=f"Epoch {epoch}"):
        batch = batch.to(device)
        labels = batch.clone()
        optimizer.zero_grad()
        outputs = model(input_ids=batch, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() / steps
    print(f"[Epoch {epoch}] Average Loss: {epoch_loss:.6f}")


Epoch 1:   0%|          | 0/370 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1: 100%|██████████| 370/370 [01:34<00:00,  3.93it/s]


[Epoch 1] Average Loss: 2.214216


Epoch 2: 100%|██████████| 370/370 [01:34<00:00,  3.91it/s]


[Epoch 2] Average Loss: 1.821813


Epoch 3: 100%|██████████| 370/370 [01:37<00:00,  3.80it/s]

[Epoch 3] Average Loss: 1.597599





In [6]:
# 10. 챗봇 응답 함수 정의
def get_response(user_text, max_length=50, do_sample=False, top_k=10):
    sent = '<usr>' + user_text + '<sys>'
    tokens = tokenizer.encode(sent, add_special_tokens=False)
    input_ids = [tokenizer.bos_token_id] + tokens
    input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
    output = model.generate(
        input_ids,
        max_length=max_length,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=do_sample,
        top_k=top_k
    )
    decoded = tokenizer.decode(output[0].tolist())
    # <sys> 뒤의 답변만 추출
    return decoded.split('<sys>')[-1].replace('</s>', '')

In [7]:
# 11. 테스트 예시
examples = [
    "안녕! 반가워~",
    "너는 누구야?",
    "영화 해리포터 재밌어?",
    "너 딥 러닝 잘해?"
]
for ex in examples:
    print(f"User: {ex}\nBot : {get_response(ex, do_sample=True)}\n")


User: 안녕! 반가워~
Bot :  연락하고 싶어요.

User: 너는 누구야?
Bot :  연락처를 다 까먹었는지 알아보는 것도 중요해요.

User: 영화 해리포터 재밌어?
Bot :  좋은 선물이 나올 거예요.

User: 너 딥 러닝 잘해?
Bot :  먼저 잘할 수 있는 분야를 알려주거나 먼저 연락해 주는 게 더 나을지도 모르겠네요.

