# In [None]:
import os
import pickle
import random
import re

import pandas as pd
from transformers import PreTrainedTokenizerFast

# Prepare the KoGPT2 tokenizer; every special token is passed explicitly.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

# Load the WordNet synonym dictionary used by get_synonyms().
# NOTE(review): pickle.load can execute code embedded in the file; make sure
# wordnet.pickle only ever comes from a trusted source.
with open("../static/data/wordnet.pickle", "rb") as wordnet_file:
    wordnet = pickle.load(wordnet_file)

# Synonym Replacement
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct tokens in ``words`` with a random synonym.

    Distinct tokens are visited in random order. For each token that has at
    least one synonym different from itself, every occurrence of that token
    is replaced by the same randomly chosen synonym. Tokens without a usable
    synonym are skipped and do not count toward ``n``.

    Args:
        words: List of tokens. A token may carry leading '▁' characters
            (the tokenizer's word-start marker); these are kept on the
            replacement.
        n: Maximum number of distinct tokens to replace. Values <= 0 disable
            replacement entirely.

    Returns:
        A new list of tokens; ``words`` itself is never modified.
    """
    new_words = list(words)
    if n <= 0:
        # The limit is otherwise only checked after a replacement has already
        # happened, so n == 0 would still replace one token.
        return new_words

    candidates = list(set(words))
    random.shuffle(candidates)
    num_replaced = 0

    for random_word in candidates:
        bare_word = random_word.lstrip('▁')
        # Synonyms come back without the '▁' marker. Re-attach it so the
        # replaced token still starts a new word when the tokens are joined
        # back into a string.
        prefix = random_word[:len(random_word) - len(bare_word)]

        # A "synonym" equal to the word itself would not augment anything.
        synonyms = [s for s in get_synonyms(random_word) if s != bare_word]
        if not synonyms:
            continue

        synonym = random.choice(synonyms)
        replacement = prefix + synonym if prefix else synonym
        new_words = [replacement if word == random_word else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return new_words

def get_synonyms(word):
    """Return the flattened synonym candidates stored in ``wordnet`` for ``word``.

    Leading '▁' characters (KoGPT2 token marker) are stripped before the
    lookup. A word that is not a key of ``wordnet`` yields an empty list.

    NOTE(review): each entry under ``wordnet[key]`` is extended into the
    result, so entries are presumably lists of synonym strings; if an entry
    were a plain string it would be split into single characters. Confirm
    against the pickle's actual structure.
    """
    lookup_key = word.lstrip('▁')  # drop KoGPT2's '▁' marker
    try:
        collected = [item for entry in wordnet[lookup_key] for item in entry]
    except KeyError:
        collected = []
    return collected

# Text augmentation function
def augment_text(text, num_replacements=2):
    """Augment ``text`` by swapping some of its KoGPT2 tokens for synonyms.

    Args:
        text: Sentence to augment. Non-string values (e.g. ``None`` or the
            NaN pandas produces for an empty CSV cell) and the empty string
            are returned unchanged instead of being sent to the tokenizer.
        num_replacements: Maximum number of distinct tokens to replace
            (forwarded to ``synonym_replacement``).

    Returns:
        The sentence rebuilt from the modified tokens, or ``text`` itself
        when there is nothing to tokenize.
    """
    if not isinstance(text, str) or not text:
        return text

    # Tokenize with KoGPT2
    tokens = tokenizer.tokenize(text)

    # Synonym replacement on the token list
    augmented_tokens = synonym_replacement(tokens, num_replacements)

    # Detokenize back into a single string
    return tokenizer.convert_tokens_to_string(augmented_tokens)

# Input / output locations
data_file_path = '../static/data/'
training_file_path = 'training/'
augment_file_path = 'augment/'
csv_name = '기타_train.csv'
output_file_path = data_file_path + augment_file_path + 'SR/SR_' +  csv_name

# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists before augmenting; a path problem then surfaces
# before any work is done.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Load the CSV file
data = pd.read_csv(data_file_path + training_file_path + csv_name)

# Column whose text is augmented
target_column = '발화문'

# Apply the augmentation; the column is overwritten in place
data[target_column] = data[target_column].apply(lambda x: augment_text(x, num_replacements=2))

# Save the result (utf-8-sig writes a BOM at the start of the file)
data.to_csv(output_file_path, index=False, encoding='utf-8-sig')
print(f"증강 완료! 결과가 '{output_file_path}'에 저장되었습니다.")
