# QA bot baseline
## Pretrained Model Load(SKT-KoGPT2)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
bos_token='</s>', eos_token='</s>', unk_token='<unk>',
pad_token='<pad>', mask_token='<mask>')
tokenizer.tokenize("</s> 안녕하세요. 한국어 GPT-2 입니다.😤:)l^o </s>")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


['</s>',
 '▁안녕',
 '하',
 '세',
 '요.',
 '▁한국어',
 '▁G',
 'P',
 'T',
 '-2',
 '▁입',
 '니다.',
 '😤',
 ':)',
 'l^o',
 '▁',
 '</s>']

In [11]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

# JSON 파일 로드 함수
def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    pairs = []
    for entry in data["data_info"]:
        question = entry["question"]
        answer = entry["answer"]["contents"]
        pairs.append((question, answer))

    return pairs


# Dataset 클래스
class QADataset(Dataset):
    def __init__(self, qa_pairs, tokenizer, max_length=256):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        question, answer = self.qa_pairs[idx]
        input_text = f"</s> 질문: {question} 답변: {answer} </s>"

        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        # labels를 input_ids의 복사본으로 생성
        labels = input_ids.clone()

        # 패딩 토큰에 대한 손실을 무시하기 위해 패딩 토큰 인덱스를 -100으로 설정
        labels[input_ids == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


# 파일 경로
train_filepath = '/content/drive/MyDrive/멋사자 AI 2차 실전 프로젝트/한국어 성능이 개선된 초거대AI 언어모델 개발 및 데이터/Training/02.라벨링데이터/TL_02.RLHF데이터/SFTlabel.json'
valid_filepath = '/content/drive/MyDrive/멋사자 AI 2차 실전 프로젝트/한국어 성능이 개선된 초거대AI 언어모델 개발 및 데이터/Validation/02.라벨링데이터/VL/SFTlabel.json'

# JSON 파일 데이터 로드
train_pairs = load_data(train_filepath)
valid_pairs = load_data(valid_filepath)

# Tokenizer 초기화 (예: Hugging Face KoGPT2 사용)
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
    pad_token='<pad>', mask_token='<mask>'
)

# Dataset 및 DataLoader 생성
train_dataset = QADataset(train_pairs, tokenizer)
valid_dataset = QADataset(valid_pairs, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

# 학습용 모델 불러오기 (예: Hugging Face GPT)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2').to(device)

# 옵티마이저 및 스케줄러 정의
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 2
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# 학습 및 검증 루프
for epoch in range(num_epochs):
    # === 학습 단계 ===
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} | Train Loss: {avg_train_loss:.4f}")

    # === 검증 단계 ===
    model.eval()
    total_valid_loss = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_valid_loss += loss.item()

    avg_valid_loss = total_valid_loss / len(valid_dataloader)
    print(f"Epoch {epoch + 1} | Validation Loss: {avg_valid_loss:.4f}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Epoch 1 | Train Loss: 2.7731
Epoch 1 | Validation Loss: 2.5639
Epoch 2 | Train Loss: 2.4258
Epoch 2 | Validation Loss: 2.4964


In [13]:
import json

def load_rm_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    qa_pairs = []
    for entry in data["data_info"]:
        question = entry["question"]
        answers = []

        for i in range(1, 6):  # 최대 5개의 답변을 처리
            answer_key = f"answer{i:02d}"
            if answer_key in entry:
                answer = entry[answer_key]["contents"]
                answers.append(answer)

        qa_pairs.append((question, answers))

    return qa_pairs


In [14]:
from transformers import GPT2LMHeadModel

# 학습된 모델 불러오기
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2").to(device)


In [15]:
import torch.nn.functional as F

def compute_reward_model_output(model, tokenizer, question, answers, device):
    inputs = [f"</s> 질문: {question} 답변: {ans} </s>" for ans in answers]

    # 각 답변에 대해 모델에 입력하여 로짓을 얻음
    inputs_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt").to(device)
    logits = model(**inputs_encodings).logits

    # 각 답변에 대한 확률을 계산 (softmax 사용)
    probs = F.softmax(logits, dim=-1)

    # 가장 높은 확률을 가진 토큰 인덱스를 기준으로 보상 점수 산출
    reward_scores = torch.max(probs, dim=-1).values
    return reward_scores


In [16]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup

# 데이터 로드
train_filepath = '/content/drive/MyDrive/멋사자 AI 2차 실전 프로젝트/한국어 성능이 개선된 초거대AI 언어모델 개발 및 데이터/Training/02.라벨링데이터/TL_02.RLHF데이터/RMlabel.json'
train_pairs = load_rm_data(train_filepath)

# Tokenizer 초기화
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2", bos_token='</s>', eos_token='</s>', unk_token='<unk>',
    pad_token='<pad>', mask_token='<mask>'
)

# 옵티마이저 정의
optimizer = AdamW(model.parameters(), lr=5e-5)

# DataLoader 준비
train_dataloader = DataLoader(train_pairs, batch_size=8, shuffle=True)

# 학습 루프
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        questions, answers = batch

        # 답변에 대한 보상 점수 계산
        reward_scores = compute_reward_model_output(model, tokenizer, questions, answers, device)

        # 손실 함수 정의 (예: 보상 점수에 대한 MSE 또는 CrossEntropyLoss)
        loss = -reward_scores.mean()  # 보상 점수를 최대화하도록 학습

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch import nn
from torch.optim import AdamW
import torch.nn.functional as F
import gym
from torch.distributions import Categorical
import numpy as np
from datasets import load_dataset

# Load the pre-trained koGPT2 model
model_name = "skt/kogpt2-base-v2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reward Model (RM) (Assumed as a placeholder for the actual Reward Model)
class RewardModel(nn.Module):
    def forward(self, text):
        # Placeholder for actual reward model implementation
        return torch.tensor([np.random.random() for _ in text])  # Random rewards for now

reward_model = RewardModel()

# PPO Hyperparameters
gamma = 0.99       # Discount factor
eps_clip = 0.2     # PPO clipping epsilon
lr = 1e-5          # Learning rate
betas = (0.9, 0.999)
K_epochs = 3       # Number of PPO epochs
batch_size = 4     # Batch size
max_timesteps = 1000  # Max timesteps per episode
eps = 1e-6

optimizer = AdamW(model.parameters(), lr=lr, betas=betas)

# Define PPO policy model
class PPO(nn.Module):
    def __init__(self, model, tokenizer, reward_model):
        super(PPO, self).__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.reward_model = reward_model

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask=attention_mask, labels=input_ids)

    def get_action(self, input_ids, attention_mask):
        logits = self.model(input_ids, attention_mask=attention_mask).logits
        dist = Categorical(F.softmax(logits, dim=-1))
        action = dist.sample()
        return action, dist

ppo_model = PPO(model, tokenizer, reward_model)

# Sample environment (Dummy Environment for Text Generation Task)
class TextGenEnv(gym.Env):
    def __init__(self, tokenizer):
        super(TextGenEnv, self).__init__()
        self.tokenizer = tokenizer
        self.current_state = None
        self.max_len = 50  # Max sequence length

    def reset(self):
        self.current_state = self.tokenizer.encode("안녕하세요, 오늘 날씨는", return_tensors="pt")  # Start sentence
        return self.current_state

    def step(self, action):
        # Add the token from the action into the sequence
        new_state = torch.cat([self.current_state, action.unsqueeze(0)], dim=-1)

        # Reward Calculation based on Reward Model
        reward = self.reward_model(new_state)

        # Check if the length exceeded the max length (terminate if so)
        done = len(new_state[0]) >= self.max_len
        return new_state, reward, done, {}

    def render(self):
        return self.tokenizer.decode(self.current_state[0], skip_special_tokens=True)

# Instantiate environment and PPO agent
env = TextGenEnv(tokenizer)
ppo_model.train()

# Training loop
for epoch in range(K_epochs):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        input_ids = state
        attention_mask = torch.ones_like(input_ids)  # Assuming no padding for simplicity

        # Get the action (next token prediction) and the probability distribution
        action, dist = ppo_model.get_action(input_ids, attention_mask)

        # Step through the environment with the action
        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        # Calculate the advantage
        reward = reward.item()  # Convert tensor to scalar

        # Calculate the log probability of the action taken
        log_prob = dist.log_prob(action)

        # Compute the PPO objective (loss)
        loss = -log_prob * reward  # The reward would be the advantage
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update state
        state = next_state

    print(f"Epoch {epoch+1}, Total Reward: {total_reward}")



## Pretrained Model Test

In [9]:
text = '</s> 근육을 키우려면'
input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
gen_ids = model.generate(input_ids,
                         max_length=128,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

</s> 근육을 키우려면 운동량을 늘려야 한다.
또한 운동을 할 때 체중을 줄이거나 식이요법을 병행하는 것이 좋다.
운동량이 부족하면 근육이 약해져 살이 찌기 쉽다.
이때는 적당한 운동과 함께 충분한 수면을 취하는 것도 중요하다.
운동을 하면 근육의 양이 늘어나게 되고 이는 곧 지방 축적을 촉진하게 된다.
따라서 운동은 체중 감량에 도움이 되지만 무리해서 하는 것은 오히려 독이 될 수도 있다.
운동은 하루 30분 정도만 해도 효과가 있지만 일주일에 한 번 정도는 반드시 해야 한다.</d> 지난해 12월 31일 오후 2시쯤 서울 강남구 역삼동 강남역 인근에서 A(30)씨가 몰던 승용차가 중앙선을 넘어 마주


In [10]:
text = '</s> 청소를 할 때 가장 먼저 해야할 것은'
input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
gen_ids = model.generate(input_ids,
                         max_length=128,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

</s> 청소를 할 때 가장 먼저 해야할 것은 바로 청소다.
청소 후에는 반드시 세정제를 이용해 깨끗이 씻어내고 먼지를 털어내야 한다.
세탁 후 물기가 마르기 전에 세탁물을 충분히 헹궈낸 다음 물기를 제거한 뒤 중성 세제 등으로 닦아내면 된다.
또한 빨래 건조기를 사용해 젖은 옷이나 이불 등을 말끔히 말려주는 것도 잊지 말아야 한다.</d> 지난해 12월 31일 오후 2시 서울 강남구 삼성동 코엑스 3층 컨퍼런스코에서 열린 ‘2018 대한민국 국제의료기기전시회(KIMES 2018)’에서 만난 김지현 원장은 “국내외 의료기기를 한 자리에서 볼 수 있는


# Data Preprocessing

In [None]:
import json

def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    pairs = []
    for entry in data["data_info"]:
        question = entry["question"]
        answer = entry["answer"]["contents"]
        pairs.append((question, answer))

    return pairs


filepath = 'SFTlabel.json'
pairs = load_data(filepath)
print(pairs[3])

('유리공예 작업할 때 필요한 도구와 재료가 뭐야?', '유리공예 작업을 위해 필요한 도구 및 재료는 다음과 같습니다:\n\n도구\n1. 유리 절단 칼: 유리를 원하는 크기로 절단하기 위해 사용됩니다.\n2. 선과: 직선을 그리기 위해 사용됩니다.\n3. 절단 접착제: 접합을 위해 사용되며, 강력한 접착제로 유리 조각이 붙는 데 도움을 줍니다.\n4. 모래 지속기: 유리의 끝을 다듬기 위해 사용됩니다.\n5. 소화기: 작업 도중 작은 불꽃이 발생할 경우 안전을 위해 사용됩니다.\n6. 연마 도구: 유리의 표면을 연마하여 다듬을 수 있습니다.\n7. 벤치 그라인더: 유리를 모양을 다듬는 데 사용되며, 모양을 좀 더 정확하게 조정할 수 있습니다.\n8. 페인트 브러시: 디자인과 색상을 유리에 그릴 때 사용됩니다.\n\n재료\n1. 유리 조각: 다양한 크기, 모양, 색상의 유리 조각을 사용할 수 있습니다.\n2. 테두리: 유리 작품을 보호하거나 견고하게 만들기 위해 사용됩니다.\n3. 줄무늬: 작품을 걸거나 장식할 때 사용되며, 다양한 색상과 재질을 사용할 수 있습니다.\n4. 트림: 테두리를 꾸미거나 작품의 외관을 강조하기 위해 사용됩니다.\n5. 유리그릇: 작품에 사용될 수 있는 꽃, 모래, 컬러드 샌드 등의 재료를 담을 수 있습니다.\n6. 폴리싱 제품: 유리의 표면을 광택 내기 위해 사용됩니다.\n7. 에나멜 그릇: 유리에 그림을 그리고 색칠하기 위해 사용됩니다.\n8. 용제: 유리 조각을 부착하거나 다른 재료를 붙이기 위해 사용됩니다.\n\n다양한 도구와 재료를 사용하여 유리공예 작업을 즐기고 다양한 작품을 만들 수 있습니다!')


In [None]:
# for param in model.transformer.parameters():
#     param.requires_grad = False

# for param in model.lm_head.parameters():
#     param.requires_grad = True

In [None]:
import torch
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, qa_pairs, tokenizer, max_length=256):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        question, answer = self.qa_pairs[idx]
        input_text = f"</s> 질문: {question} 답변: {answer} </s>"

        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        # labels를 input_ids의 복사본으로 생성
        labels = input_ids.clone()

        # 패딩 토큰에 대한 손실을 무시하기 위해 패딩 토큰 인덱스를 -100으로 설정
        labels[input_ids == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }



In [None]:
from torch.utils.data import DataLoader

# Dataset과 DataLoader 생성
dataset = QADataset(pairs, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# DataLoader에서 배치 샘플 확인
for batch in dataloader:
    print(batch['input_ids'].shape)  # (batch_size, max_length)
    print(batch['attention_mask'].shape)  # (batch_size, max_length)
    print(batch['labels'].shape)  # (batch_size, max_length)
    break

torch.Size([16, 256])
torch.Size([16, 256])
torch.Size([16, 256])


## Model Fine-tuning

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

num_epochs = 2


# Dataset과 DataLoader 정의
# train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# 옵티마이저와 스케줄러 정의
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# 학습 루프
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch: {epoch}, Loss: {loss.item()}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch: 0, Loss: 2.743095874786377
Epoch: 1, Loss: 2.508897542953491


In [6]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup


# JSON 파일 로드 함수
def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    pairs = []
    for entry in data["data_info"]:
        question = entry["question"]
        answer = entry["answer"]["contents"]
        pairs.append((question, answer))

    return pairs


# Dataset 클래스
class QADataset(Dataset):
    def __init__(self, qa_pairs, tokenizer, max_length=256):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        question, answer = self.qa_pairs[idx]
        input_text = f"</s> 질문: {question} 답변: {answer} </s>"

        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        # labels를 input_ids의 복사본으로 생성
        labels = input_ids.clone()

        # 패딩 토큰에 대한 손실을 무시하기 위해 패딩 토큰 인덱스를 -100으로 설정
        labels[input_ids == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


# 파일 경로
train_filepath = '/content/drive/MyDrive/멋사자 AI 2차 실전 프로젝트/한국어 성능이 개선된 초거대AI 언어모델 개발 및 데이터/Training/02.라벨링데이터/TL_02.RLHF데이터/SFTlabel.json'
valid_filepath = '/content/drive/MyDrive/멋사자 AI 2차 실전 프로젝트/한국어 성능이 개선된 초거대AI 언어모델 개발 및 데이터/Validation/02.라벨링데이터/VL/SFTlabel.json'

# JSON 파일 데이터 로드
train_pairs = load_data(train_filepath)
valid_pairs = load_data(valid_filepath)

# Tokenizer 초기화 (예: Hugging Face KoGPT2 사용)
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")

# Dataset 및 DataLoader 생성
train_dataset = QADataset(train_pairs, tokenizer)
valid_dataset = QADataset(valid_pairs, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

# 학습용 모델 불러오기 (예: Hugging Face GPT)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2").to(device)

# 옵티마이저 및 스케줄러 정의
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 2
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# 학습 및 검증 루프
for epoch in range(num_epochs):
    # === 학습 단계 ===
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} | Train Loss: {avg_train_loss:.4f}")

    # === 검증 단계 ===
    model.eval()
    total_valid_loss = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_valid_loss += loss.item()

    avg_valid_loss = total_valid_loss / len(valid_dataloader)
    print(f"Epoch {epoch + 1} | Validation Loss: {avg_valid_loss:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

## QA bot Test

In [None]:
text = '근육을 키우려면 어떻게 해야할까요?'
input_text = f"</s> 질문: {text} 답변:"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
gen_ids = model.generate(input_ids,
                         max_length=128,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

</s> 질문: 근육을 키우려면 어떻게 해야할까요? 답변: 근육이 형성되는 과정은 다음과 같습니다 :
1. 균형 잡힌 운동 및 식단 관리 - 근력을 키우기 위해 적절한 운동을 시작합니다. 
2. 스트레스를 관리하기 위한 식사 조절하기, 스트레칭과 같은 유산소 운동과 함께 하는 활동 등을 포함합니다.
3. 신체활동 계획 수립- 건강한 신체를 유지하기 위해서는 충분한 휴식과 수면을 취하는 것이 중요합니다,
 규칙적인 생활과 꾸준한 운동은 근육의 활동을 촉진하고 긴장을 완화시키는 데 도움이 됩니다. 또한, 과도한 음식은 피해야 합니다. 캥거루나 바렛더 등의 가벼운 스포츠 활동은 몸의 균형을 깨뜨려 부상을 예방하거나


In [None]:
text = '청소를 할 때 무엇을 먼저 해야할까요?'
input_text = f"</s> 질문: {text} 답변:"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
gen_ids = model.generate(input_ids,
                         max_length=128,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

</s> 질문: 청소를 할 때 무엇을 먼저 해야할까요? 답변: 청소 후에는 다음과 같은 단계를 거쳐야 합니다 :
1. 먼지 제거 및 클리닝을 위한 세트 사용법 설명하시오. 
2. 세탁기 필터링과 공기청정기 사용을 위해 세척기를 사용해야 합니다. 이 경우 세제 또는 섬유유를 사용하여 깨끗하게 닦아낼 수 있습니다. 다음은 몇 가지 예시입니다
- 팁(Tip) - 깨끗한 물을 사용하는 것이 중요합니다. 이는 오염물질이나 세균을 효과적으로 제거하는 데 도움이 됩니다. 또한, 오염된 물도 사용할 필요가 있습니다.
3. 적절한 물과 함께 사용하면 더 잘 닦일 수도 있습니다(Therm


In [None]:
text = "산림의 주요 기능은 무엇이며, 환경 보호와 관련하여 어떤 역할을 하나요?"
input_text = f"</s> 질문: {text} 답변:"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
gen_ids = model.generate(input_ids,
                         max_length=128,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

</s> 질문: 산림의 주요 기능은 무엇이며, 환경 보호와 관련하여 어떤 역할을 하나요? 답변: 산림 보호는 다양한 역할과 책임을 가지고 있습니다. 

1. 생태계 보전 및 지속 가능한 이용 촉진 : 산림은 생물 다양성과 생태적 다양성을 유지하고 유지하는데 중요한 기능을 합니다. 이는 다음과 같은 역할들을 포함합니다.
첫째, 생태계의 보전과 복원"은 기후변화와 자연재해로부터 보호하는 데 도움을 줍니다. 이를 위해 정부는 숲 가꾸기, 나무 심기 등 숲을 위한 사업을 추진합니다. 이러한 사업은 숲의 가치를 높이고 환경을 보호하기 때문에 많은 사람들이 혜택을 볼 수 있습니다.
둘째로, 임업 생산성 향상 (예: 목재 생산) 은 임산물 생산을 통해 수익을 창출할 수도 있지만, 다른


## Model Save

In [None]:
save_directory = './fine_tuned_kogpt2'

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./fine_tuned_kogpt2/tokenizer_config.json',
 './fine_tuned_kogpt2/special_tokens_map.json',
 './fine_tuned_kogpt2/tokenizer.json')

## Model Load

In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

model = GPT2LMHeadModel.from_pretrained(save_directory)
tokenizer = PreTrainedTokenizerFast.from_pretrained(save_directory)

model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [None]:
text = '근육을 키우려면 어떻게 해야할까요?'
input_text = f"</s> 질문: {text} 답변:"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
gen_ids = model.generate(input_ids,
                         max_length=128,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

</s> 질문: 근육을 키우려면 어떻게 해야할까요? 답변: 근육이 성장하려면 다음과 같은 단계를 거쳐야 합니다 :
1. 운동량 조절 - 근력 운동을 통해 근육의 힘을 키웁니다. 
2. 스트레칭과 유산소운동- 그리고 균형 잡힌 식단 유지와 적절한 운동으로 체중을 조절하는 것이 중요합니다.
3. 유연성 향상 및 밸런스 관리– 유연한 신체 구조를 유지하는 것은 중요합니다. 예를 들어, 발바닥, 팔뚝 등 다양한 부위의 움직임을 부드럽게 만들어주는 동작을 반복하면 됩니다.
4. 호흡 훈련의 중요성 확인 ­ 호흡을 통한 몸의 움직임에 대한 이해와 훈련을 실시해야 합니다. 이를 위해 충분한 휴식과 함께 규칙적인 수면


In [None]:
text = "산림의 주요 기능은 무엇이며, 환경 보호와 관련하여 어떤 역할을 하나요?"
input_text = f"</s> 질문: {text} 답변:"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
gen_ids = model.generate(input_ids,
                         max_length=128,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_id=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)
generated = tokenizer.decode(gen_ids[0])
print(generated)

</s> 질문: 산림의 주요 기능은 무엇이며, 환경 보호와 관련하여 어떤 역할을 하나요? 답변: 산림청은 다양한 기능을 수행하는 대표적인 임업 기관입니다. 

1. 생태계 보전 및 관리 : 산림은 생물 다양성과 생태계를 보호하고, 생태계의 균형을 유지합니다. 이를 위해 숲 가꾸기, 나무 심기 등 자연친화적인 활동을 장려하고 지속 가능한 관리를 제공합니다.
2. 기후변화 대응과 자원 재활용 촉진 노력 - 정부는 온실가스 배출을 줄이기 위한 노력을 기울이고 있습니다.
3. 수질 개선 활동 지원 (예: 대기오염 정화) 사업은 하천의 수질을 개선하기 위하여 물을 공급하고 오염물질을 제거하는 데 도움을 줍니다.
4. 재해 예방활동 강화와 복구 계획 수립에 대한 지원을 통해 국민의


### enhancing variability

In [None]:
text = "산림의 주요 기능은 무엇이며, 환경 보호와 관련하여 어떤 역할을 하나요?"
input_text = f"</s> 질문: {text} 답변:"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

gen_ids = model.generate(
    input_ids,
    max_length=128,
    repetition_penalty=2.0,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    use_cache=True,
    top_k=50,
    top_p=0.95,
    temperature=0.5,
    do_sample=True
)

generated = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
print(generated)

질문: 산림의 주요 기능은 무엇이며, 환경 보호와 관련하여 어떤 역할을 하나요? 답변: 산림 생태계의 중요한 역할은 다음과 같습니다 :
1. 자원 보존 및 보전에 관한 기본 원칙, 보전 계획 수립과 유지 보수 절차의 준수 여부 등입니다.
2. 생태계 복원과 보호 관리 계획의 수립, 지속 가능한 관리를 위한 노력 등을 포함합니다.
3. 기후 변화 대응에 대한 관심과 참여도 필요합니다. 이를 위해 다양한 정책 제안이 이루어집니다.
4. 생물 다양성 보호를 통한 서식지 보호, 토양 오염 예방 등에 관심을 기울여야 합니다.
5. 자연 재해 예방을 위해서는 지속적인 관리와 관리가 필요합니다.
6. 대기 질 개선을 통해 숲 훼손을 방지하고 숲의 생태적 가치를 증진
