# v19 Data Preparation

This notebook processes term pairs collected from `00_data_ingestion.ipynb` and creates the training dataset.

## Input/Output

- **Input**: `term_pairs.jsonl` (1:1 Korean-English pairs)
- **Output**: `term_mappings.jsonl` (1:N Korean to multi-target mappings with similarity scores)

## Process

1. Load term pairs from `00_data_ingestion.ipynb` output
2. Group by Korean source term
3. Calculate similarity scores using multilingual embeddings
4. Save as 1:N format for training

In [None]:
import sys
from pathlib import Path

def find_project_root():
    """Return the nearest ancestor of the working directory that looks like the repo root.

    The working directory and then each of its parents are checked in order;
    the first one that contains a ``pyproject.toml`` or a ``src`` entry is
    returned. When nothing matches, the grandparent of the working directory
    is returned as a fallback.
    """
    start_dir = Path.cwd()
    for candidate in (start_dir, *start_dir.parents):
        markers = (candidate / "pyproject.toml", candidate / "src")
        if any(marker.exists() for marker in markers):
            return candidate
    # No marker found anywhere up the tree.
    return start_dir.parent.parent

# Resolve the repository root once and make it importable for the cells below.
PROJECT_ROOT = find_project_root()
# Guard the insert so that re-running this cell does not keep prepending
# duplicate entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
print(f"Project root: {PROJECT_ROOT}")

In [None]:
import json
import random
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
from sentence_transformers import SentenceTransformer

# Dataset folder for this version; term_pairs.jsonl is read from here and the
# notebook's outputs are written back to it.
DATA_DIR = PROJECT_ROOT.joinpath("dataset", "v19_high_quality")

# Echo the environment so the run is easy to audit later.
print(
    f"Data directory: {DATA_DIR}",
    f"PyTorch: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

## Configuration

In [None]:
# Tunable settings for this notebook, gathered in one place.
CONFIG = dict(
    # Sentence-embedding checkpoint used to score Korean/English pairs
    embedding_model="BAAI/bge-m3",
    batch_size=128,
    # Pairs whose similarity falls below this value are dropped
    min_similarity=0.3,
    # Upper bound on English targets retained for a single Korean term
    max_targets_per_source=15,
    # Per-source fallback scores, used when no embedding score is available
    default_sim_it=0.95,        # IT terminology
    default_sim_muse=0.90,      # MUSE dictionary
    default_sim_wikidata=0.85,  # Wikidata (also applied to any other source)
)

print("Configuration:")
print("\n".join(f"  {name}: {value}" for name, value in CONFIG.items()))

## 1. Load Term Pairs

In [None]:
term_pairs_path = DATA_DIR / "term_pairs.jsonl"

# Fail early with a pointer to the notebook that produces the input file.
if not term_pairs_path.exists():
    raise FileNotFoundError(
        f"term_pairs.jsonl not found at {term_pairs_path}\n"
        "Please run 00_data_ingestion.ipynb first."
    )

print(f"Loading term pairs from: {term_pairs_path}")

# One JSON object per line. Blank lines (for example stray newlines at the end
# of the file) are skipped instead of being handed to json.loads, which would
# raise on an empty string.
term_pairs = []
with open(term_pairs_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Loading"):
        line = line.strip()
        if not line:
            continue
        term_pairs.append(json.loads(line))

print(f"\nLoaded {len(term_pairs):,} term pairs")

# Statistics: how many pairs each upstream source contributed.
sources = defaultdict(int)
for p in term_pairs:
    sources[p.get("source", "unknown")] += 1

print("\nBy source:")
# Largest source first.
for src, cnt in sorted(sources.items(), key=lambda x: -x[1]):
    print(f"  {src}: {cnt:,} ({cnt/len(term_pairs)*100:.1f}%)")

## 2. Group by Korean Term

In [None]:
# Group English terms by Korean source
ko_to_en: Dict[str, List[Tuple[str, str]]] = defaultdict(list)  # ko -> [(en, source), ...]

# (ko, en, source) triples already recorded. A set gives constant-time
# duplicate checks instead of scanning the per-term list for every pair.
seen_pairs = set()

for pair in term_pairs:
    ko = pair["ko"]
    en = pair["en"].lower()  # English targets are compared case-insensitively
    source = pair.get("source", "unknown")

    # Deduplicate within same Korean term (first occurrence wins, order kept)
    pair_key = (ko, en, source)
    if pair_key not in seen_pairs:
        seen_pairs.add(pair_key)
        ko_to_en[ko].append((en, source))

print(f"Unique Korean terms: {len(ko_to_en):,}")

# Distribution of English targets per Korean
target_counts = [len(v) for v in ko_to_en.values()]
print(f"\nEnglish targets per Korean:")
if target_counts:
    print(f"  Min: {min(target_counts)}")
    print(f"  Max: {max(target_counts)}")
    print(f"  Mean: {np.mean(target_counts):.2f}")
    print(f"  Median: {np.median(target_counts):.1f}")
else:
    # min()/max() raise on an empty sequence; report the situation instead.
    print("  (no term pairs loaded)")

## 3. Load Embedding Model

In [None]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

print(f"\nLoading embedding model: {CONFIG['embedding_model']}...")
# Build the encoder named in CONFIG and move it to the chosen device.
embed_model = SentenceTransformer(CONFIG["embedding_model"]).to(device)

print("Model loaded successfully!")
print(f"Embedding dimension: {embed_model.get_sentence_embedding_dimension()}")

## 4. Calculate Similarity Scores

In [None]:
def get_default_similarity(source: str) -> float:
    """Return the configured fallback similarity for a data source.

    ``"it_terminology"`` and ``"muse"`` each have their own ``CONFIG`` entry;
    every other source name receives the Wikidata default.
    """
    dedicated_defaults = (
        ("it_terminology", "default_sim_it"),
        ("muse", "default_sim_muse"),
    )
    for source_name, config_key in dedicated_defaults:
        if source == source_name:
            return CONFIG[config_key]
    return CONFIG["default_sim_wikidata"]


def calculate_similarities_batch(
    ko_terms: List[str],
    en_targets_list: List[List[Tuple[str, str]]],
    model: SentenceTransformer,
    batch_size: int = 128,
) -> Dict[str, List[Tuple[str, float]]]:
    """Calculate cosine similarities between Korean terms and their English targets.

    All Korean terms and every distinct English target are encoded in a single
    ``model.encode`` call with normalized embeddings, so the dot product of two
    rows is their cosine similarity. For each Korean term, targets scoring
    below ``CONFIG["min_similarity"]`` are dropped, the rest are sorted by
    descending similarity, and at most ``CONFIG["max_targets_per_source"]`` are
    kept.

    An English target that appears more than once for the same Korean term
    (for example because it came from two sources) is scored and emitted only
    once, so duplicates cannot use up the per-term limit.

    Args:
        ko_terms: List of Korean source terms
        en_targets_list: List of [(en_term, source), ...] for each Korean term,
            index-aligned with ``ko_terms``
        model: SentenceTransformer model
        batch_size: Batch size for encoding

    Returns:
        Dict mapping Korean term to [(en_term, similarity), ...]. Empty when
        ``ko_terms`` is empty (the model is not called in that case).
    """
    results: Dict[str, List[Tuple[str, float]]] = {}

    # Nothing to score: skip encoding altogether.
    if not ko_terms:
        return results

    # Korean terms occupy the first rows of the embedding matrix.
    all_texts: List[str] = list(ko_terms)
    ko_indices = {ko: i for i, ko in enumerate(ko_terms)}

    # Each distinct English text is appended once. The dict lookup replaces
    # the repeated list scans that made this step quadratic in the number of
    # targets.
    en_text_index: Dict[str, int] = {}
    for _ko, en_list in zip(ko_terms, en_targets_list):
        for en, _source in en_list:
            if en not in en_text_index:
                en_text_index[en] = len(all_texts)
                all_texts.append(en)

    print(f"Encoding {len(all_texts):,} texts...")

    # Encode all texts
    embeddings = model.encode(
        all_texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )

    # Read the loop-invariant settings once.
    min_similarity = CONFIG["min_similarity"]
    max_targets = CONFIG["max_targets_per_source"]

    # Calculate similarities
    print("Calculating similarities...")
    for ko, en_list in tqdm(zip(ko_terms, en_targets_list), total=len(ko_terms)):
        ko_emb = embeddings[ko_indices[ko]]

        term_sims = []
        seen_en = set()  # English targets already handled for this Korean term
        for en, source in en_list:
            if en in seen_en:
                continue
            seen_en.add(en)

            en_idx = en_text_index.get(en)
            if en_idx is not None:
                # Cosine similarity (embeddings are normalized)
                sim = float(np.dot(ko_emb, embeddings[en_idx]))
            else:
                # Defensive fallback; every target was indexed above, so this
                # branch is not expected to run.
                sim = get_default_similarity(source)

            # Apply minimum threshold
            if sim >= min_similarity:
                term_sims.append((en, sim))

        # Sort by similarity (highest first; ties keep input order) and limit
        term_sims.sort(key=lambda x: -x[1])
        results[ko] = term_sims[:max_targets]

    return results

In [None]:
# Flatten the grouping into two index-aligned lists for the batch scorer:
# ko_list[i] is a Korean term and en_list[i] holds its (en, source) targets.
ko_list = list(ko_to_en)
en_list = list(ko_to_en.values())

print(f"Processing {len(ko_list):,} Korean terms...")

term_mappings = calculate_similarities_batch(
    ko_terms=ko_list,
    en_targets_list=en_list,
    model=embed_model,
    batch_size=CONFIG["batch_size"],
)

## 5. Analyze Results

In [None]:
# Drop Korean terms that were left with no targets after scoring.
term_mappings = {ko: targets for ko, targets in term_mappings.items() if targets}

print("=" * 70)
print("PROCESSING RESULTS")
print("=" * 70)

# One count per Korean term, and one score per kept (term, similarity) pair.
target_counts = list(map(len, term_mappings.values()))
all_sims = [score for targets in term_mappings.values() for _, score in targets]

print(f"\nKorean terms with targets: {len(term_mappings):,}")
total_targets = sum(target_counts)
print(f"Total target terms: {total_targets:,}")

print("\nTargets per Korean term:")
print(
    f"  Min: {min(target_counts)}",
    f"  Max: {max(target_counts)}",
    f"  Mean: {np.mean(target_counts):.2f}",
    f"  Median: {np.median(target_counts):.1f}",
    sep="\n",
)

print("\nSimilarity scores:")
print(
    f"  Min: {min(all_sims):.4f}",
    f"  Max: {max(all_sims):.4f}",
    f"  Mean: {np.mean(all_sims):.4f}",
    f"  Median: {np.median(all_sims):.4f}",
    sep="\n",
)

In [None]:
# Two side-by-side histograms: targets per Korean term, and similarity scores.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax1, ax2 = axes

# Left panel: one bar per integer target count, centred on the integer.
mean_targets = np.mean(target_counts)
count_bins = range(1, max(target_counts) + 2)
ax1.hist(target_counts, bins=count_bins, edgecolor='black', alpha=0.7, align='left')
ax1.set(
    xlabel='Number of targets',
    ylabel='Count of Korean terms',
    title='Targets per Korean Term',
)
ax1.axvline(mean_targets, color='red', linestyle='--', label=f'Mean: {mean_targets:.1f}')
ax1.legend()

# Right panel: distribution of the kept similarity scores.
mean_sim = np.mean(all_sims)
ax2.hist(all_sims, bins=30, edgecolor='black', alpha=0.7)
ax2.set(
    xlabel='Similarity Score',
    ylabel='Count',
    title='Similarity Score Distribution',
)
ax2.axvline(mean_sim, color='red', linestyle='--', label=f'Mean: {mean_sim:.3f}')
ax2.legend()

plt.tight_layout()
plt.show()

In [None]:
# Sample mappings
print("=" * 70)
print("SAMPLE MAPPINGS")
print("=" * 70)


def _format_targets(targets, limit):
    """Render the first ``limit`` (term, similarity) pairs as 'term(0.00), ...'."""
    return ", ".join(f"{t}({s:.2f})" for t, s in targets[:limit])


# Show key terms
key_terms = ['자연어처리', '인증', '인공지능', '검색', '추천', '신경망', '기계학습', '머신러닝', '딥러닝']

print("\nKey IT Terms:")
for ko in key_terms:
    if ko in term_mappings:
        print(f"  {ko} -> [{_format_targets(term_mappings[ko], 5)}]")
    else:
        print(f"  {ko} -> NOT FOUND")

# Random samples, drawn from a dedicated seeded generator so that the cell
# shows the same terms on every run and the global random state is untouched.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

print("\nRandom Samples:")
sample_keys = sample_rng.sample(list(term_mappings.keys()), min(10, len(term_mappings)))
for ko in sample_keys:
    print(f"  {ko} -> [{_format_targets(term_mappings[ko], 4)}]")

## 6. Save Dataset

In [None]:
# Write one JSON object per Korean term to term_mappings.jsonl, e.g.
# {"ko": "프로그램", "terms": [{"term": "program", "sim": 0.95}, ...]}

output_path = DATA_DIR / "term_mappings.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for ko, targets in tqdm(term_mappings.items(), desc="Saving"):
        # Similarities are rounded to four decimals in the saved file.
        terms = [{"term": term, "sim": round(score, 4)} for term, score in targets]
        record = json.dumps({"ko": ko, "terms": terms}, ensure_ascii=False)
        f.write(record + "\n")

size_kb = output_path.stat().st_size / 1024
print(f"\nSaved: {output_path}")
print(f"Size: {size_kb:.1f} KB")

In [None]:
# Record the settings and summary statistics alongside the dataset.
targets_per_term = [len(targets) for targets in term_mappings.values()]

# Each statistic is cast to a plain float so it serialises as a JSON number.
similarity_stats = {
    stat_name: float(stat_fn(all_sims))
    for stat_name, stat_fn in (
        ("min", min),
        ("max", max),
        ("mean", np.mean),
        ("median", np.median),
    )
}

metadata = {
    "version": "v19",
    "format": "1:N term mappings with similarity scores",
    "embedding_model": CONFIG["embedding_model"],
    "min_similarity": CONFIG["min_similarity"],
    "max_targets_per_source": CONFIG["max_targets_per_source"],
    "total_korean_terms": len(term_mappings),
    "total_targets": sum(targets_per_term),
    "avg_targets_per_korean": float(np.mean(targets_per_term)),
    "similarity_stats": similarity_stats,
}

metadata_path = DATA_DIR / "metadata.json"
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print(f"Metadata saved: {metadata_path}")

In [None]:
# Read back the first few saved lines as a sanity check.
print("\n" + "=" * 70)
print("VERIFICATION")
print("=" * 70)

print("\nSample saved entries:")
with open(output_path, "r", encoding="utf-8") as f:
    for i in range(5):
        line = f.readline()
        if not line:
            # Fewer than five entries in the file.
            break
        item = json.loads(line)
        preview = [f"{t['term']}({t['sim']:.2f})" for t in item['terms'][:3]]
        terms_str = ", ".join(preview)
        print(f"  {item['ko']} -> [{terms_str}...]")

In [None]:
# Closing banner: list what was produced and point to the next notebook.
print(
    "\n" + "=" * 70,
    "DATA PREPARATION COMPLETE",
    "=" * 70,
    sep="\n",
)

print(
    "\nOutput files:",
    f"  term_mappings.jsonl: {len(term_mappings):,} Korean terms",
    "  metadata.json: Configuration and statistics",
    sep="\n",
)

print(
    "\nDataset ready for training!",
    "Next: Run 02_training.ipynb",
    sep="\n",
)

## Summary

### Output Format

```json
{"ko": "머신러닝", "terms": [{"term": "machine", "sim": 0.92}, {"term": "learning", "sim": 0.88}]}
{"ko": "인증", "terms": [{"term": "authentication", "sim": 0.95}, {"term": "verification", "sim": 0.82}]}
```

### Key Features

- **1:N Mapping**: Each Korean term maps to multiple English targets
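- **Similarity Scores**: Cosine similarity between the BGE-M3 embeddings of the Korean term and each English target
- **Quality Filtering**: Targets scoring below `min_similarity` (0.3) are dropped
- **Limited Targets**: At most `max_targets_per_source` (15) targets per Korean term, highest similarity first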

### Next Steps

1. Run `02_training.ipynb` to train the SPLADE model
2. Run `03_inference_test.ipynb` to evaluate results