In [12]:
import pandas as pd
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
from pathlib import Path
import torch

# Load dataset
input_path = '../static/data/음식점_train.csv'
output_path = '../static/data/augmented_questions.csv'
data = pd.read_csv(input_path)

# Keep only the question rows (QA여부 == 'q', case-insensitive) and take the first 10.
# .copy() detaches the result from the original frame so that adding a column to
# `data` later does not trigger pandas' SettingWithCopyWarning.
data = data[data["QA여부"].str.lower() == 'q'].head(10).copy()

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the KoGPT2 tokenizer and model.
# The special tokens are passed explicitly; unk/mask are included alongside
# bos/eos/pad so the tokenizer has the full set the model card lists.
# NOTE(review): transformers warns that the checkpoint declares 'GPT2Tokenizer'
# while PreTrainedTokenizerFast is used here; the fast class is kept as-is.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token="</s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2").to(device)

# Function for augmentation by continuation: the model appends generated text to the prompt
def augment_with_context(sentence, tokenizer, model, max_length=50):
    """Extend ``sentence`` with text generated by a causal language model.

    Args:
        sentence: Prompt text. Values that are not strings (e.g. NaN coming
            from a pandas column) or that are blank are returned unchanged.
        tokenizer: Object providing ``encode(text, return_tensors="pt")``,
            ``decode(ids, skip_special_tokens=True)`` and ``pad_token_id``.
        model: Object providing ``generate(...)`` and a ``device`` attribute.
        max_length: Passed to ``generate`` as ``max_length``. In transformers
            this limit counts the prompt tokens too, so a prompt that is
            already this long leaves no room for new text.

    Returns:
        The decoded output sequence (prompt plus generated continuation) with
        special tokens removed, or ``sentence`` itself when it was skipped.
    """
    # Missing or blank values cannot form a usable prompt; hand them back
    # untouched instead of crashing in the middle of DataFrame.apply.
    if not isinstance(sentence, str) or not sentence.strip():
        return sentence

    # The prompt has to live on the same device as the model weights,
    # otherwise generate() fails once the model has been moved to CUDA.
    input_ids = tokenizer.encode(sentence, return_tensors="pt").to(model.device)

    # A single, unpadded sequence: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        # Explicit pad id so generate() does not have to guess one.
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Apply augmentation to every selected utterance and store it in a new column
data["증강된 발화문"] = data["발화문"].apply(lambda x: augment_with_context(x, tokenizer, model))

# Save the results
# Create the directory that output_path actually points into. The previous
# hard-coded "static/data" was relative to the current directory, while
# output_path lives under "../static/data", so the wrong folder was created.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)
print(f"Augmented data saved to {output_path}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


KeyboardInterrupt: 

In [None]:
!pip install protobuf
!pip install transformers
!pip install torch torchvision torchaudio