# v10 Data Preparation: Large-Scale (10M+ samples)

v8/v9의 데이터 부족 문제를 해결하기 위해 대규모 데이터셋을 구축합니다.

## 핵심 변경사항
1. **직접 KO-EN 쌍 사용**: 클러스터링에 의존하지 않음
2. **다중 데이터셋**: OPUS-100 + CCMatrix + Tatoeba 등
3. **10M+ 샘플 목표**: 대규모 학습 데이터
4. **필터링 완화**: 더 많은 데이터 활용

## 1. Setup

In [None]:
import sys
import json
import os
from pathlib import Path
from collections import defaultdict

def find_project_root():
    """Return the first candidate directory that looks like the project root.

    The current working directory, its parent, its grandparent and a fixed
    workstation path are probed in that order. A directory qualifies when it
    contains either a ``CLAUDE.md`` file or a ``.git`` entry. When nothing
    qualifies, the fixed workstation path is returned unchanged.
    """
    fallback = Path("/home/west/Documents/cursor-workspace/opensearch-neural-pre-train")
    here = Path.cwd()
    root_markers = ("CLAUDE.md", ".git")

    for directory in (here, here.parent, here.parent.parent, fallback):
        if any((directory / marker).exists() for marker in root_markers):
            return directory
    return fallback


PROJECT_ROOT = find_project_root()
# Insert at index 0 so the project directory is searched first on import.
sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

In [None]:
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer
import random

# Report the framework version and whether a CUDA device is visible.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Set random seed
# Both the stdlib RNG and NumPy's global RNG receive the same fixed seed.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(42)

## 2. Load Multiple Datasets

In [None]:
# Configuration
# Length bounds are in characters (they are compared against len() of the
# stripped text); output_dir is resolved relative to PROJECT_ROOT.
CONFIG = dict(
    target_samples=10_000_000,  # 10M target
    min_ko_len=2,
    max_ko_len=100,
    min_en_len=2,
    max_en_len=150,
    output_dir=PROJECT_ROOT / 'dataset' / 'v10_large_scale',
)

print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

In [None]:
def extract_pairs_from_dataset(dataset, ko_key, en_key, max_samples=None, desc="Extracting"):
    """
    Extract length-filtered KO-EN pairs from a dataset.

    Each sample is read from a nested ``sample['translation']`` mapping
    (keys ``ko``/``kor`` and ``en``/``eng``) when that key is present, and
    otherwise from the flat ``ko_key`` / ``en_key`` fields. Both texts are
    stripped and kept only when their character lengths fall inside the
    bounds configured in the module-level ``CONFIG``.

    Args:
        dataset: Iterable of dict-like samples.
        ko_key: Field holding the Korean text in flat samples.
        en_key: Field holding the English text in flat samples.
        max_samples: Maximum number of samples to scan (samples read, not
            pairs kept). ``None`` scans the whole dataset.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A list of ``{'ko': str, 'en': str}`` dictionaries.

    Raises:
        KeyError: If ``CONFIG`` lacks one of the length-bound keys.
    """
    # Read the bounds once, outside the per-sample try block, so that a
    # misconfigured CONFIG fails loudly instead of dropping every sample.
    min_ko, max_ko = CONFIG['min_ko_len'], CONFIG['max_ko_len']
    min_en, max_en = CONFIG['min_en_len'], CONFIG['max_en_len']

    pairs = []
    malformed = 0

    for i, sample in enumerate(tqdm(dataset, desc=desc)):
        # `is not None` so that max_samples=0 means "scan nothing", not "no limit".
        if max_samples is not None and i >= max_samples:
            break

        try:
            # Handle nested structure
            if 'translation' in sample:
                translation = sample['translation']
                ko_text = translation.get('ko', translation.get('kor', ''))
                en_text = translation.get('en', translation.get('eng', ''))
            else:
                ko_text = sample.get(ko_key, '')
                en_text = sample.get(en_key, '')

            if not ko_text or not en_text:
                continue

            ko_text = ko_text.strip()
            en_text = en_text.strip()
        except (AttributeError, KeyError, TypeError):
            # Malformed record (wrong container type or non-string text):
            # skip it, but keep count so the loss is visible.
            malformed += 1
            continue

        # Basic filtering
        if min_ko <= len(ko_text) <= max_ko and min_en <= len(en_text) <= max_en:
            pairs.append({
                'ko': ko_text,
                'en': en_text,
            })

    if malformed:
        print(f"{desc}: skipped {malformed:,} malformed samples")

    return pairs

print("Helper function defined")

In [None]:
# Load OPUS-100 (Full dataset - 1M samples)
print("="*70, "Loading OPUS-100 Korean-English...", "="*70, sep="\n")

# First attempt uses the short dataset id; on any failure the error is
# printed and the Helsinki-NLP id is tried instead (a failure there propagates).
try:
    opus_dataset = load_dataset(
        "opus100",
        "en-ko",
        split="train",
        trust_remote_code=True,
    )
    print(f"OPUS-100 loaded: {len(opus_dataset):,} samples")
except Exception as e:
    print(f"Error: {e}")
    opus_dataset = load_dataset(
        "Helsinki-NLP/opus-100",
        "en-ko",
        split="train",
    )
    print(f"OPUS-100 (Helsinki) loaded: {len(opus_dataset):,} samples")

In [None]:
# Extract all pairs from OPUS-100
# max_samples is left at its default (None), so the whole split is scanned.
opus_pairs = extract_pairs_from_dataset(opus_dataset, 'ko', 'en', desc="OPUS-100")
print(f"\nExtracted {len(opus_pairs):,} pairs from OPUS-100")

In [None]:
# Load Tatoeba (High quality sentence pairs)
print("\n" + "="*70)
print("Loading Tatoeba Korean-English...")
print("="*70)

# Tatoeba is optional: any failure to load or extract leaves an empty list
# so the rest of the notebook can continue with the other sources.
try:
    tatoeba_dataset = load_dataset("tatoeba", lang1="en", lang2="ko", split="train")
    print(f"Tatoeba loaded: {len(tatoeba_dataset):,} samples")

    # Reuse the shared helper rather than a hand-written loop with a bare
    # `except:`; it applies the same CONFIG length bounds to both languages
    # and only skips malformed samples instead of swallowing every exception.
    tatoeba_pairs = extract_pairs_from_dataset(
        tatoeba_dataset,
        ko_key='ko',
        en_key='en',
        desc="Tatoeba",
    )

    print(f"Extracted {len(tatoeba_pairs):,} pairs from Tatoeba")
except Exception as e:
    print(f"Tatoeba not available: {e}")
    tatoeba_pairs = []

In [None]:
# Try to load CCMatrix (Large-scale mining dataset)
print("\n" + "="*70)
print("Loading CCMatrix Korean-English...")
print("="*70)

# CCMatrix is optional: on failure the pairs collected so far are kept and
# the notebook continues with the other sources.
ccmatrix_pairs = []
try:
    # CCMatrix can be very large, so we limit
    ccmatrix_dataset = load_dataset(
        "yhavinga/ccmatrix",
        "en-ko",
        split="train",
        streaming=True  # Use streaming for large dataset
    )

    print("CCMatrix loading in streaming mode...")

    max_ccmatrix = 5_000_000  # Limit to 5M from CCMatrix (samples scanned, not pairs kept)
    for i, sample in enumerate(tqdm(ccmatrix_dataset, desc="CCMatrix", total=max_ccmatrix)):
        if i >= max_ccmatrix:
            break
        try:
            en_text = sample['translation']['en'].strip()
            ko_text = sample['translation']['ko'].strip()
        except (KeyError, TypeError, AttributeError):
            # Malformed record (missing field or non-string value): skip it.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # and genuine bugs are no longer swallowed.
            continue

        if not (ko_text and en_text):
            continue
        if (CONFIG['min_ko_len'] <= len(ko_text) <= CONFIG['max_ko_len']
                and CONFIG['min_en_len'] <= len(en_text) <= CONFIG['max_en_len']):
            ccmatrix_pairs.append({'ko': ko_text, 'en': en_text})

    print(f"Extracted {len(ccmatrix_pairs):,} pairs from CCMatrix")

except Exception as e:
    print(f"CCMatrix not available: {e}")
    print("Continuing with other datasets...")

In [None]:
# Try additional datasets
print("\n" + "="*70)
print("Trying additional datasets...")
print("="*70)

# Both sources below are optional and append into the same list.
additional_pairs = []

# Try OPUS Books
try:
    print("\nLoading OPUS Books...")
    opus_books = load_dataset("opus_books", "en-ko", split="train")
    # The shared helper applies the Korean AND English length bounds; the
    # previous hand-written loop checked only the Korean side.
    additional_pairs.extend(
        extract_pairs_from_dataset(opus_books, ko_key='ko', en_key='en', desc="OPUS Books")
    )
    print(f"OPUS Books: {len(additional_pairs):,} pairs so far")
except Exception as e:
    print(f"OPUS Books not available: {e}")

# Try OpenSubtitles
try:
    print("\nLoading OpenSubtitles...")
    opensubtitles = load_dataset("open_subtitles", lang1="en", lang2="ko", split="train", streaming=True)

    count = 0  # number of pairs accepted from OpenSubtitles
    max_subs = 3_000_000  # cap on accepted pairs, not on samples scanned
    for sample in tqdm(opensubtitles, desc="OpenSubtitles", total=max_subs):
        if count >= max_subs:
            break
        try:
            en_text = sample['translation']['en'].strip()
            ko_text = sample['translation']['ko'].strip()
        except (KeyError, TypeError, AttributeError):
            # Malformed record: skip it without swallowing KeyboardInterrupt
            # or unrelated errors (previously a bare `except:`).
            continue

        if not (ko_text and en_text):
            continue
        # Apply the same KO and EN length bounds as every other source.
        if (CONFIG['min_ko_len'] <= len(ko_text) <= CONFIG['max_ko_len']
                and CONFIG['min_en_len'] <= len(en_text) <= CONFIG['max_en_len']):
            additional_pairs.append({'ko': ko_text, 'en': en_text})
            count += 1
    print(f"After OpenSubtitles: {len(additional_pairs):,} additional pairs")
except Exception as e:
    print(f"OpenSubtitles not available: {e}")

In [None]:
# Combine all pairs
print("\n" + "="*70, "COMBINING ALL DATASETS", "="*70, sep="\n")

# OPUS-100 is always included; start from a copy so opus_pairs is not mutated.
all_pairs = list(opus_pairs)
print(f"After OPUS-100: {len(all_pairs):,}")

# The remaining sources are appended (and reported) only when non-empty.
if tatoeba_pairs:
    all_pairs += tatoeba_pairs
    print(f"After Tatoeba: {len(all_pairs):,}")

if ccmatrix_pairs:
    all_pairs += ccmatrix_pairs
    print(f"After CCMatrix: {len(all_pairs):,}")

if additional_pairs:
    all_pairs += additional_pairs
    print(f"After additional: {len(all_pairs):,}")

print(f"\nTotal raw pairs: {len(all_pairs):,}")

## 3. Deduplication and Cleaning

In [None]:
# Remove duplicates based on Korean text
print("\n" + "="*70, "DEDUPLICATION", "="*70, sep="\n")

seen_ko = set()
unique_pairs = []

# The first pair seen for each normalized Korean string wins; later pairs
# with the same Korean text are dropped even if their English side differs.
for candidate in tqdm(all_pairs, desc="Deduplicating"):
    ko_normalized = candidate['ko'].lower().strip()
    if ko_normalized in seen_ko:
        continue
    seen_ko.add(ko_normalized)
    unique_pairs.append(candidate)

print(f"Before deduplication: {len(all_pairs):,}")
print(f"After deduplication: {len(unique_pairs):,}")
print(f"Removed: {len(all_pairs) - len(unique_pairs):,} duplicates")

In [None]:
def _special_char_ratio(text):
    """Return the fraction of characters in *text* that are neither alphanumeric nor a space."""
    special_count = sum(not ch.isalnum() and ch != ' ' for ch in text)
    return special_count / len(text)


def is_valid_pair(pair: dict) -> bool:
    """
    Decide whether a KO-EN pair passes the quality checks.

    A pair is rejected when the ``'ko'`` text contains no character in the
    range U+AC00..U+D7A3, when the ``'en'`` text contains no ASCII letter,
    or when more than half of the characters on either side are neither
    alphanumeric nor a space.
    """
    korean, english = pair['ko'], pair['en']

    # Check if Korean text has Korean characters
    if not any('\uac00' <= ch <= '\ud7a3' for ch in korean):
        return False

    # Check if English text has English characters
    if not any(ch.isascii() and ch.isalpha() for ch in english):
        return False

    # Skip if too many special characters. Both texts are non-empty here,
    # because an empty string would have failed one of the checks above.
    return _special_char_ratio(korean) <= 0.5 and _special_char_ratio(english) <= 0.5

# Apply validation
print("\nApplying quality validation...")
# Keep only the pairs for which is_valid_pair returns True, preserving order.
valid_pairs = list(filter(is_valid_pair, tqdm(unique_pairs, desc="Validating")))

print(f"After validation: {len(valid_pairs):,}")
print(f"Removed: {len(unique_pairs) - len(valid_pairs):,} invalid pairs")

In [None]:
# Shuffle
print("\nShuffling...")
# In-place shuffle using the stdlib RNG.
random.shuffle(valid_pairs)

# Preview
# Show at most the first ten pairs, each side cut to 50 characters.
print("\nSample pairs:")
for i, _ in enumerate(valid_pairs[:10]):
    print(f"  [{i}] {valid_pairs[i]['ko'][:50]}... → {valid_pairs[i]['en'][:50]}...")

## 4. Sample Count Check (no augmentation is performed)

In [None]:
# Check if we need augmentation
# Only reports the gap to the target; no samples are added or removed here.
current_count, target_count = len(valid_pairs), CONFIG['target_samples']

print(f"Current samples: {current_count:,}")
print(f"Target samples: {target_count:,}")

if current_count >= target_count:
    print(f"\nTarget reached! Using {current_count:,} samples")
else:
    print(f"\nNeed {target_count - current_count:,} more samples")
    print("Proceeding with current dataset...")

## 5. Create Training Dataset

In [None]:
# Prepare final training data
print("\n" + "="*70, "PREPARING TRAINING DATA", "="*70, sep="\n")

# Create output directory
CONFIG['output_dir'].mkdir(parents=True, exist_ok=True)

# For v10, we'll use direct KO-EN pairs without complex synonym structure
# This simplifies training and allows for more data
# Each pair is re-keyed from 'ko'/'en' to 'ko_term'/'en_term'.
training_data = [
    {'ko_term': pair['ko'], 'en_term': pair['en']}
    for pair in tqdm(valid_pairs, desc="Preparing training data")
]

print(f"\nTotal training samples: {len(training_data):,}")

In [None]:
# Split into train/val
# The validation set is the tail of the (already shuffled) list.
val_ratio = 0.01  # 1% validation
val_size = int(len(training_data) * val_ratio)
train_size = len(training_data) - val_size

train_data, val_data = training_data[:train_size], training_data[train_size:]

print(f"Train size: {len(train_data):,}")
print(f"Val size: {len(val_data):,}")

In [None]:
# Save training data
# Both splits are written as JSON Lines: one JSON object per line, UTF-8,
# with non-ASCII characters kept as-is (ensure_ascii=False).
train_path = CONFIG['output_dir'] / 'train.jsonl'
val_path = CONFIG['output_dir'] / 'val.jsonl'

print(f"\nSaving training data to {train_path}...")
with train_path.open('w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + '\n'
        for item in tqdm(train_data, desc="Writing train")
    )

print(f"Saving validation data to {val_path}...")
with val_path.open('w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + '\n'
        for item in tqdm(val_data, desc="Writing val")
    )

print("\nData saved!")

In [None]:
# Save metadata
# Raw (pre-deduplication) pair counts per source bucket. OPUS Books and
# OpenSubtitles share one list upstream, so they are reported together.
raw_pair_counts = {
    'OPUS-100': len(opus_pairs),
    'Tatoeba': len(tatoeba_pairs),
    'CCMatrix': len(ccmatrix_pairs),
    'OPUS Books/OpenSubtitles': len(additional_pairs),
}

metadata = {
    'total_samples': len(training_data),
    'train_samples': len(train_data),
    'val_samples': len(val_data),
    # List only sources that actually contributed pairs: each optional
    # loader falls back to an empty list when its dataset is unavailable,
    # so a fixed list would overstate what went into this dataset.
    'sources': [name for name, n_pairs in raw_pair_counts.items() if n_pairs > 0],
    'raw_pair_counts': raw_pair_counts,
    # Path values are not JSON-serializable, so they are stored as strings.
    'config': {k: str(v) if isinstance(v, Path) else v for k, v in CONFIG.items()},
}

with open(CONFIG['output_dir'] / 'metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print("Metadata saved!")

## 6. Dataset Statistics

In [None]:
# Calculate statistics
# Lengths are character counts of the stored 'ko_term' / 'en_term' strings.
ko_lengths = [len(sample['ko_term']) for sample in training_data]
en_lengths = [len(sample['en_term']) for sample in training_data]

print("\n" + "="*70, "DATASET STATISTICS", "="*70, sep="\n")

print(
    f"\nKorean text length:",
    f"  Min: {min(ko_lengths)}",
    f"  Max: {max(ko_lengths)}",
    f"  Mean: {np.mean(ko_lengths):.1f}",
    f"  Median: {np.median(ko_lengths):.1f}",
    sep="\n",
)

print(
    f"\nEnglish text length:",
    f"  Min: {min(en_lengths)}",
    f"  Max: {max(en_lengths)}",
    f"  Mean: {np.mean(en_lengths):.1f}",
    f"  Median: {np.median(en_lengths):.1f}",
    sep="\n",
)

In [None]:
# Length distribution
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One histogram per language, each with a dashed red line at the mean.
panels = (
    (
        axes[0],
        ko_lengths,
        'Korean Text Length (chars)',
        'Korean Text Length Distribution',
        f'Mean: {np.mean(ko_lengths):.1f}',
    ),
    (
        axes[1],
        en_lengths,
        'English Text Length (chars)',
        'English Text Length Distribution',
        f'Mean: {np.mean(en_lengths):.1f}',
    ),
)

for ax, lengths, x_label, title, mean_label in panels:
    ax.hist(lengths, bins=50, edgecolor='black', alpha=0.7)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.axvline(np.mean(lengths), color='red', linestyle='--', label=mean_label)
    ax.legend()

plt.tight_layout()
# Save before show() so the written PNG contains the rendered figure.
plt.savefig(CONFIG['output_dir'] / 'length_distribution.png', dpi=150)
plt.show()

## 7. Summary

In [None]:
# Final report: sample counts, then every entry in the output directory with its size.
print("\n" + "="*70, "v10 DATA PREPARATION COMPLETE", "="*70, sep="\n")

print(
    f"\nTotal samples: {len(training_data):,}",
    f"  - Train: {len(train_data):,}",
    f"  - Val: {len(val_data):,}",
    sep="\n",
)

print(f"\nOutput directory: {CONFIG['output_dir']}", f"Files:", sep="\n")
for f in CONFIG['output_dir'].iterdir():
    # st_size is in bytes; convert to mebibytes for display.
    size_mb = f.stat().st_size / (1024 * 1024)
    print(f"  - {f.name}: {size_mb:.1f} MB")

print(f"\nNext step: Run 02_training.ipynb for model training")