In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at the path below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import os
from pathlib import Path

In [3]:
# Empty placeholder frame with one column per language ('ko', 'en').
PAIR_COLUMNS = ['ko', 'en']
df = pd.DataFrame(columns=PAIR_COLUMNS)

In [4]:
# 1. Folder that holds the Excel corpus files.
folder_path = Path("/content/drive/MyDrive/서울대/snuann/Transform/data/en-ko_corpus")

# 2. Collect every .xlsx file in the folder and in all of its sub-folders.
#    - sorted(): glob yields paths in whatever order the filesystem returns them.
#      A fixed order keeps the row order of the concatenated frame stable, which
#      the seeded shuffle and the train/test split later in the notebook rely on
#      to be reproducible.
#    - "~$" prefix: lock files that Excel creates next to an open workbook match
#      *.xlsx but are not readable workbooks, so they are skipped.
excel_files = sorted(
    p for p in folder_path.glob("**/*.xlsx") if not p.name.startswith("~$")
)
len(excel_files)

10

In [5]:
import pandas as pd
from tqdm import tqdm

def _read_pair_sheet(xlsx_path):
    """Read one Excel file and return its sentence pairs as 'ko' / 'en' columns."""
    sheet = pd.read_excel(xlsx_path)

    # Header cells may carry stray surrounding whitespace; strip it before selecting.
    sheet.columns = sheet.columns.str.strip()

    # Keep only the two text columns and give them their language codes.
    pair_columns = ['원문', '번역문']
    return sheet.loc[:, pair_columns].rename(columns={"원문": "ko", "번역문": "en"})


# 1. Build one frame per Excel file (tqdm shows progress over the files).
df_list = [_read_pair_sheet(xlsx_path) for xlsx_path in tqdm(excel_files)]

# 2. Concatenate all frames in a single call once every file has been read.
#    ignore_index=True discards the per-file indexes and renumbers the rows.
df = pd.concat(df_list, axis=0, ignore_index=True)

# 3. Inspect the combined result.
df.info()

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
100%|██████████| 10/10 [03:44<00:00, 22.42s/it]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1602418 entries, 0 to 1602417
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   ko      1602418 non-null  object
 1   en      1602418 non-null  object
dtypes: object(2)
memory usage: 24.5+ MB


In [6]:
import re, unicodedata
def filter(x):
    """Decide whether a sentence pair should be kept.

    NOTE: the name shadows the built-in ``filter``; it is kept unchanged because
    the rest of the notebook refers to this function by that name.

    Args:
        x: Mapping-like row (e.g. a DataFrame row) with the keys ``'ko'`` and
            ``'en'``.

    Returns:
        ``True`` when both sides have between 1 and 200 whitespace-separated
        tokens and the longer side has at most 3 times as many tokens as the
        shorter side; ``False`` otherwise. Missing values (``None`` / float NaN)
        are rejected instead of raising.
    """
    def _tokens(value):
        # Empty spreadsheet cells arrive as None / float NaN: treat as "no tokens".
        if value is None or (isinstance(value, float) and value != value):
            return []
        # Other non-string cells (e.g. numbers) are coerced to text before splitting.
        return str(value).split()

    # Tokenize each side exactly once.
    n_ko, n_en = len(_tokens(x['ko'])), len(_tokens(x['en']))

    if not (1 <= n_en <= 200 and 1 <= n_ko <= 200):
        return False

    # Both counts are >= 1 at this point, so the division is always safe.
    return max(n_en, n_ko) / min(n_en, n_ko) <= 3.0

ZWSP_RE = re.compile(r'[\u200B-\u200D\uFEFF]')   # zero-width space family
CTRL_RE = re.compile(r'[\u0000-\u001F\u007F]')   # control characters
_WS_RE = re.compile(r'\s+')                      # any run of whitespace

def normalize(s: str) -> str:
    """Return a cleaned, single-line version of ``s``.

    Steps, in order:
      1. Coerce to ``str`` and apply Unicode NFKC normalization.
      2. Delete zero-width characters.
      3. Turn every whitespace run (tabs, newlines, ...) into one space.
      4. Delete the remaining control characters.
      5. Collapse whitespace again and strip both ends.

    Step 3 must run before step 4: ``CTRL_RE`` also covers tab, newline and
    carriage return, so deleting control characters first would glue the words
    on either side of them together ("a\\tb" -> "ab").

    Args:
        s: Value to clean; non-string values are converted with ``str()``.

    Returns:
        The normalized string (possibly empty).
    """
    s = unicodedata.normalize('NFKC', str(s))
    s = ZWSP_RE.sub('', s)
    s = _WS_RE.sub(' ', s)
    s = CTRL_RE.sub('', s)
    # Removing a control character that sat between two spaces leaves a double
    # space behind, hence the second collapse.
    s = _WS_RE.sub(' ', s).strip()
    return s

# Keep only the pairs accepted by filter(). The explicit .copy() detaches the
# result from the original frame, so the column assignments below write to an
# independent frame and do not trigger pandas' SettingWithCopyWarning.
df = df[df.apply(filter, axis=1)].copy()
df['ko'] = df['ko'].apply(normalize)
df['en'] = df['en'].apply(normalize)

# Drop pairs where either side became an empty string after normalization.
df = df[(df['en'] != '') & (df['ko'] != '')]

# (Optional) shuffle to reduce ordering bias; the fixed seed makes it repeatable.
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [7]:
# Print the row count, dtypes and memory usage of the current `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1592039 entries, 0 to 1592038
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   ko      1592039 non-null  object
 1   en      1592039 non-null  object
dtypes: object(2)
memory usage: 24.3+ MB


In [8]:
RAW_DIR = Path('/content/drive/MyDrive/서울대/snuann/Transform/data/raw')
OUT_TRAIN_EN = RAW_DIR/'train.en'
OUT_TRAIN_KO = RAW_DIR/'train.ko'
OUT_TEST_EN = RAW_DIR/'test.en'
OUT_TEST_KO = RAW_DIR/'test.ko'

TRAIN_RATIO = 0.8


def _write_lines(path, lines):
    """Write every string in ``lines`` to ``path`` as UTF-8, one per line."""
    # Appending '\n' per line (instead of join + '\n') writes an empty file,
    # not a single blank line, when ``lines`` is empty.
    with open(path, 'w', encoding='utf-8') as fh:
        fh.writelines(f"{line}\n" for line in lines)


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at ``path``."""
    # The context manager closes the handle as soon as counting is done.
    with open(path, encoding='utf-8') as fh:
        return sum(1 for _ in fh)


# Compute the split point once; the first TRAIN_RATIO of the rows is the train
# set and the remainder is the test set.
split_idx = int(len(df) * TRAIN_RATIO)
train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

# Make sure the output folder exists before writing into it.
RAW_DIR.mkdir(parents=True, exist_ok=True)

_write_lines(OUT_TRAIN_EN, train_df['en'].tolist())
_write_lines(OUT_TRAIN_KO, train_df['ko'].tolist())

print(f"Saved: {len(train_df)} pairs")
print("EN:", OUT_TRAIN_EN)
print("KO:", OUT_TRAIN_KO)

_write_lines(OUT_TEST_EN, test_df['en'].tolist())
_write_lines(OUT_TEST_KO, test_df['ko'].tolist())

print(f"Saved: {len(test_df)} pairs")
print("EN:", OUT_TEST_EN)
print("KO:", OUT_TEST_KO)

# 5) Verification + preview ------------------------------------------------
# Both sides of a split must have the same number of lines, and that number
# must equal the number of pairs written (a mismatch would mean a sentence
# contained a line break and the files are no longer aligned).
n_train_en = _count_lines(OUT_TRAIN_EN)
n_train_ko = _count_lines(OUT_TRAIN_KO)
print("Train Line counts — en:", n_train_en, "| ko:", n_train_ko)
assert n_train_en == n_train_ko, "en/ko line counts must match!"
assert n_train_en == len(train_df), "train files must contain one line per pair!"

n_test_en = _count_lines(OUT_TEST_EN)
n_test_ko = _count_lines(OUT_TEST_KO)
print("Test Line counts — en:", n_test_en, "| ko:", n_test_ko)
assert n_test_en == n_test_ko, "en/ko line counts must match!"
assert n_test_en == len(test_df), "test files must contain one line per pair!"

print("\n[Preview]")
with open(OUT_TRAIN_EN, encoding='utf-8') as fe, open(OUT_TRAIN_KO, encoding='utf-8') as fk:
    # The preview reads the train files, so it is bounded by the train count.
    for i in range(min(3, n_train_en)):
        print(f"EN[{i}]:", fe.readline().strip())
        print(f"KO[{i}]:", fk.readline().strip())

Saved: 1273631 pairs
EN: /content/drive/MyDrive/서울대/snuann/Transform/data/raw/train.en
KO: /content/drive/MyDrive/서울대/snuann/Transform/data/raw/train.ko
Saved: 318408 pairs
EN: /content/drive/MyDrive/서울대/snuann/Transform/data/raw/test.en
KO: /content/drive/MyDrive/서울대/snuann/Transform/data/raw/test.ko
Train Line counts — en: 1273631 | ko: 1273631
Test Line counts — en: 318408 | ko: 318408

[Preview]
EN[0]: The robot automatically controls itself so that the procedure can be performed according to the plan.
KO[0]: 해당 계획에 따라 시술이 이뤄지도록 로봇은 스스로 자동제어하게 된다.
EN[1]: Cho Hyun-woo started to show off his skills in the K-League in the 2015 season, when he was the only player to play in all games and recorded 49 goals conceded in 41 games in the Second league, K-league Challenge, and bring his team to the 2nd place of the regular season.
KO[1]: 조현우가 K리그에서 발군의 기량을 선보이기 시작한 건 2부 리그 격인 K리그 챌린지에서 유일하게 전 경기에 출전해 41경기 49실점을 기록하면서 팀을 정규리그 2위로 올려둔 2015년 시즌이다.
EN[2]: Can you show me the pas