In [11]:
!pip install googletrans==4.0.0-rc1 pickle5



In [2]:
import random
from googletrans import Translator
import re
import numpy as np
import pandas as pd
import json
import unicodedata
import pickle5 as pickle
import os

In [3]:
# KorEDA
# https://github.com/catSirup/KorEDA/tree/master
# 기반 논문: EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks

def random_deletion(words, p):
	if len(words) == 1:
		return words

	new_words = []
	for word in words:
		r = random.uniform(0, 1)
		if r > p:
			new_words.append(word)

	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

def random_swap(words, n):
	new_words = words.copy()
	for _ in range(n):
		new_words = swap_word(new_words)

	return new_words

def swap_word(new_words):
	random_idx_1 = random.randint(0, len(new_words)-1)
	random_idx_2 = random_idx_1
	counter = 0

	while random_idx_2 == random_idx_1:
		random_idx_2 = random.randint(0, len(new_words)-1)
		counter += 1
		if counter > 3:
			return new_words

	new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
	return new_words

def EDA(sentence, alpha_rs=0.1, p_rd=0.1, num_aug=1):
	words = sentence.split(' ')
	words = [word for word in words if word is not ""]
	num_words = len(words)

	num_new_per_technique = int(num_aug/4) + 1

	n_rs = max(1, int(alpha_rs*num_words))

	# rs
	for _ in range(num_new_per_technique):
		a_words = random_swap(words, n_rs)
		rs_result = " ".join(a_words)

	# rd
	for _ in range(num_new_per_technique):
		a_words = random_deletion(words, p_rd)
		rd_result = " ".join(a_words)

	return rs_result, rd_result

In [4]:
# Back translattion
# 참고 논문: Data expansion using back translation and paraphrasing for hate speech detection

def back_translattion_using_google(text):
  translator = Translator()
  result = translator.translate(text, src="ko", dest="en")
  result = translator.translate(result.text, src="en", dest="ko")

  return result.text

In [5]:
def data_qc(paragrahp:str):
    paragrahp = unicodedata.normalize("NFKD", paragrahp)
    paragrahp = re.sub(r"(\(.*?\))", "", paragrahp)
    paragrahp = re.sub("((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*", "", paragrahp)
    paragrahp = re.sub("'^[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'", "", paragrahp)
    return paragrahp

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
cd "./drive/MyDrive/취업/프리온보딩/기업과제/nlu"

/content/drive/MyDrive/취업/프리온보딩/기업과제/nlu


In [8]:
with open("./klue-sts-v1.1_train.json", "rt", encoding='utf8') as f:
    data = json.load(f)
    
frame = np.full([len(data), 3], np.nan)
df = pd.DataFrame(frame, columns=['sentence1', 'sentence2', 'label'])
for idx, el in enumerate(data):
    df.loc[idx] = [el['sentence1'], 
                   el['sentence2'], 
                   el['labels']['real-label']]
del data

df[['sentence1', 'sentence2']] = df[['sentence1', 'sentence2']].applymap(data_qc)

In [9]:
def process(df, num):
  sent_1_list=df.sentence1.to_list()
  sent_2_list=df.sentence2.to_list()
  label_list=df.label.to_list()

  random_swap_result = list()
  random_delete_result = list()
  translate_result = list()

  for idx, sent1 in enumerate(sent_1_list):
    sent2 = sent_2_list[idx]
    s1_rs, s1_rd = EDA(sent1)
    s2_rs, s2_rd = EDA(sent2)

    random_swap_result.append((s1_rs, s2_rs, label_list[idx]))
    random_delete_result.append((s1_rd, s2_rd, label_list[idx]))

    try:
      translate_result.append((back_translattion_using_google(sent1), back_translattion_using_google(sent2), label_list[idx]))
    except:
      translate_result=list()
  return random_swap_result, random_delete_result, translate_result

In [10]:
os.makedirs("./data", exist_ok=True)

for num in range(1, 51):
  if num < 10:
    str_num = "0"+str(num)
  else:
    str_num = str(num)
  output_file_name = './data/bt_'+str_num+'.pickle'
  if os.path.isfile(output_file_name):
    continue

  df_len = len(df)//(50)
  l_idx = df_len*(num-1)
  r_idx = df_len*num

  if num == 50:
    r_idx = None

  new_df = df[l_idx:r_idx]

  try:
    result = process(new_df, num)

    with open('./data/rs_'+str(str_num)+'.pickle', 'wb') as f:
        pickle.dump(result[0], f, pickle.HIGHEST_PROTOCOL)
              
    with open('./data/rd_'+str(str_num)+'.pickle', 'wb') as f:
          pickle.dump(result[1], f, pickle.HIGHEST_PROTOCOL)

    if not result[2]:
      continue
    else: 
      with open('./data/bt_'+str(str_num)+'.pickle', 'wb') as f:
            pickle.dump(result[2], f, pickle.HIGHEST_PROTOCOL)
  except:
    continue

In [15]:
bt_total_list = list()
rs_total_list = list()
rd_total_list = list()

file_list = sorted(os.listdir("./data/"))
for file in file_list:
  print(file)
  with open("./data/"+file, 'rb') as f:
    data = pickle.load(f)
    if "bt" in file:
      bt_total_list.extend(data)
    elif "rs" in file:
      rs_total_list.extend(data)
    elif "rd" in file:
      rd_total_list.extend(data)

with open('./bt.pickle', 'wb') as f:
    pickle.dump(bt_total_list, f, pickle.HIGHEST_PROTOCOL)

with open('./rs.pickle', 'wb') as f:
    pickle.dump(rs_total_list, f, pickle.HIGHEST_PROTOCOL)

with open('./rd.pickle', 'wb') as f:
    pickle.dump(rd_total_list, f, pickle.HIGHEST_PROTOCOL)

bt_01.pickle
bt_02.pickle
bt_03.pickle
bt_04.pickle
bt_05.pickle
bt_06.pickle
bt_07.pickle
bt_08.pickle
bt_09.pickle
bt_10.pickle
bt_11.pickle
bt_12.pickle
bt_13.pickle
bt_14.pickle
bt_15.pickle
bt_16.pickle
bt_17.pickle
bt_19.pickle
bt_20.pickle
bt_21.pickle
bt_22.pickle
bt_23.pickle
bt_24.pickle
bt_25.pickle
bt_26.pickle
bt_27.pickle
bt_28.pickle
bt_29.pickle
bt_30.pickle
bt_31.pickle
bt_32.pickle
bt_33.pickle
bt_34.pickle
bt_35.pickle
bt_36.pickle
bt_37.pickle
bt_38.pickle
bt_39.pickle
bt_40.pickle
bt_41.pickle
bt_42.pickle
bt_43.pickle
bt_44.pickle
bt_45.pickle
bt_46.pickle
bt_47.pickle
bt_48.pickle
bt_49.pickle
bt_50.pickle
rd_01.pickle
rd_02.pickle
rd_03.pickle
rd_04.pickle
rd_05.pickle
rd_06.pickle
rd_07.pickle
rd_08.pickle
rd_09.pickle
rd_10.pickle
rd_11.pickle
rd_12.pickle
rd_13.pickle
rd_14.pickle
rd_15.pickle
rd_16.pickle
rd_17.pickle
rd_19.pickle
rd_20.pickle
rd_21.pickle
rd_22.pickle
rd_23.pickle
rd_24.pickle
rd_25.pickle
rd_26.pickle
rd_27.pickle
rd_28.pickle
rd_29.pickle

In [46]:
random_swap_result

[('숙소 위치는 쉽고 찾기 일반적인 한국의 반지하 숙소입니다.',
  '숙박시설의 위치는 쉽게 찾을 반지하 있고 한국의 대표적인 수 숙박시설입니다.',
  3.714285714285714),
 ('위반행위 조사 대상이다. 거부·방해·기피한 자는 500만원 이하 과태료 부과 등을',
  '시민들 것은 자발적인 예방 노력을 한 스스로 아산 뿐만이 아니었다.',
  0.0),
 ('회사가 보낸 메일은 이 지메일이 아니라 다른 지메일 전달해줘. 계정으로',
  '알려줘 주로 네이버 메일을 쓰는 이유를 사람들이',
  0.3333333333333333),
 ('긴급 고용안정지원금은 불가능하다. 등 특별지원금, 지자체별 소상공인 지원사업, 취업성공패키지, 청년구직활동지원금, 긴급복지지원제도 지원금과는 중복 수급이 지역고용대응',
  '고용보험이 고용안전망입니다. 고용안전망이라면, 국민취업지원제도는 2차 1차',
  0.5714285714285714),
 ('개선될 답장이 늦으나, 호스트의 것으로 보입니다.',
  '호스트 보입니다. 늦었지만 개선될 것으로 응답이',
  

In [47]:
random_delete_result

[('숙소 위치는 찾기 쉽고 일반적인 한국의 숙소입니다.',
  '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
  3.714285714285714),
 ('위반행위 조사 등을 거부·방해·기피한 자는 500만원 이하 과태료 부과 대상이다.',
  '시민들 스스로 자발적인 예방 노력을 한 것은 아산 뿐만이',
  0.0),
 ('회사가 메일은 이 아니라 다른 지메일 계정으로 전달해줘.',
  '사람들이 주로 메일을 쓰는 이유를 알려줘',
  0.3333333333333333),
 ('고용안정지원금은 지역고용대응 등 특별지원금, 지자체별 소상공인 지원사업, 취업성공패키지, 청년구직활동지원금, 긴급복지지원제도 지원금과는 중복 수급이 불가능하다.',
  '고용보험이 1차 고용안전망이라면, 국민취업지원제도는 2차 고용안전망입니다.',
  0.5714285714285714),
 ('호스트의 답장이 늦으나, 개선될 것으로 보입니다.',
  '호스트 응답이 늦었지만 개선될 것으로 보입니다.',
  4.714285714285714)]

In [48]:
translate_result

[('호스텔 위치는 쉽게 찾을 수 있으며 한국의 일반 거주지입니다.',
  '숙박 시설의 위치는 쉽게 찾을 수 있으며 한국 대표적인 반지가 있습니다.',
  3.714285714285714),
 ('부정, 방해, 위반을 피하고 피하는자는 5 백만원의 적용을받습니다.',
  '시민의 자주 자발적 노력은 Acan뿐만 아니라뿐만 아니라뿐입니다.',
  0.0),
 ('회사가 보낸 메일은 GMILE이 아니지만 다른 Gmail 계정으로 전달합니다.',
  '사람들이 왜 사람들이 네이버 메일을 쓰는지 알게하십시오.',
  0.3333333333333333),
 ('긴급 고용 안정화 혜택은 지역 고용 대응, 고용 특정 지원 사업, 고용 성공 패키지, 청소년 직업, 고용 성공 패키지, 청소년 직업 활동 보조금 및 비상 복지 지원 시스템 보조금과 같은 특별 보조금을 복제하는 것이 불가능합니다.',
  '고용 보험이 주요 고용 안전망이면 국가 고용 지원 시스템은 두 번째 고용 안전망입니다.',
  0.5714285714285714),
 ('호스트의 회신은 늦었지만 개선 된 것 같습니다.',
  '호스트 응답은 늦었지만 개선 된 것으로 보입니다.',
  4.714285714285714)]

In [18]:
with open("./bt.pickle", 'rb') as f:
  data = pickle.load(f)
len(data)

11435