# v22.0 Data Preparation

Generate training triplets with curriculum learning phases optimized for InfoNCE loss.

## Key Features

1. **InfoNCE-optimized Curriculum**: Temperature annealing across phases
2. **Single-term Focus**: Phase 1 emphasizes single-term pairs (50%)
3. **Hard Negative Mining**: Difficulty scoring based on character overlap and length difference

## Curriculum Phases

| Phase | Epochs | λ_infonce | Temperature | Data Focus |
|-------|--------|-----------|-------------|------------|
| 1 | 1-10 | 1.0 | 0.07 | 50% single-term |
| 2 | 11-20 | 1.5 | 0.05 | 33% balanced |
| 3 | 21-30 | 2.0 | 0.03 | Full + hard negatives |

In [None]:
import sys
from pathlib import Path

def find_project_root():
    """Locate the project root by walking upward from the working directory.

    The first directory (starting with the working directory itself) that
    contains a ``pyproject.toml`` or ``src`` entry is returned.  If no
    directory up to the filesystem root qualifies, the grandparent of the
    working directory is returned instead.
    """
    start = Path.cwd()
    markers = ("pyproject.toml", "src")
    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    # No marker anywhere up the tree: fall back to two levels above the cwd.
    return start.parent.parent

# Resolve the project root once and put it at the front of sys.path so that
# it is searched first when modules are imported from this notebook.
PROJECT_ROOT = find_project_root()
sys.path.insert(0, str(PROJECT_ROOT))

import json
import random
import numpy as np
from collections import defaultdict
from typing import Dict, List, Set, Tuple
from dataclasses import dataclass, asdict
from tqdm import tqdm

# Seed both the stdlib and NumPy global generators with a fixed value so the
# random sampling and shuffling done later in this notebook is repeatable.
random.seed(42)
np.random.seed(42)

print(f"Project root: {PROJECT_ROOT}")

## 1. Load Augmented Data

In [None]:
# Paths
V22_DATA_DIR = PROJECT_ROOT / "data" / "v22.0"
CORPUS_DIR = PROJECT_ROOT / "dataset" / "v21.3_filtered_enhanced"

# Load augmented pairs (JSONL: one JSON object per line, each with at least
# "source" and "target" keys, as read below).
augmented_pairs_path = V22_DATA_DIR / "augmented_synonym_pairs.jsonl"

with open(augmented_pairs_path, "r", encoding="utf-8") as f:
    # Whitespace-only lines (e.g. a trailing blank line at EOF) are not valid
    # JSON documents; skip them instead of letting json.loads raise.
    pairs = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(pairs):,} augmented pairs")

# Build source -> targets mapping; it is used later to keep known positives
# out of the negative candidates.
source_to_targets: Dict[str, Set[str]] = defaultdict(set)
for pair in pairs:
    source_to_targets[pair["source"]].add(pair["target"])

print(f"Unique sources: {len(source_to_targets):,}")

In [None]:
# Load single-term expanded triplets (already have hard negatives)
single_term_path = V22_DATA_DIR / "single_term_expanded.jsonl"

# Stays empty when the file is missing so downstream cells still run.
single_term_triplets = []
if single_term_path.exists():
    with open(single_term_path, "r", encoding="utf-8") as f:
        # Skip whitespace-only lines: json.loads raises on them, and a
        # trailing blank line is common in JSONL files.
        single_term_triplets = [json.loads(line) for line in f if line.strip()]
    print(f"Loaded {len(single_term_triplets):,} single-term expanded triplets")
else:
    print(f"Warning: {single_term_path} not found")

## 2. Load Corpus for Negative Mining

In [None]:
# Load corpus vocabulary for negative mining
term_list_path = CORPUS_DIR / "term_list.json"

if term_list_path.exists():
    with open(term_list_path, "r", encoding="utf-8") as f:
        term_list = json.load(f)
    # A dict de-duplicates terms while keeping their order in the file.
    corpus_vocab = {term: 1 for term in term_list}
    print(f"Loaded term_list.json: {len(corpus_vocab):,} terms")
else:
    # Build vocabulary from pairs.
    # Insert terms in pair order rather than via an intermediate set: string
    # hashing is randomized per process, so iterating a set would give
    # `all_terms` a different order on every run and make the seeded
    # random.sample() calls in negative mining non-reproducible.
    corpus_vocab = dict.fromkeys(
        (term for pair in pairs for term in (pair["source"], pair["target"])),
        1,
    )
    print(f"Built vocabulary from pairs: {len(corpus_vocab):,} terms")

# Candidate pool for negative mining, in the vocabulary's insertion order.
all_terms = list(corpus_vocab.keys())
print(f"Total unique terms: {len(all_terms):,}")

## 3. Classify Pairs by Length

In [None]:
def classify_length(text: str) -> str:
    """Bucket *text* by its number of characters.

    Returns ``"single_term"`` for up to 3 characters, ``"short_phrase"`` for
    4 to 8 characters, and ``"sentence"`` for anything longer.
    """
    n_chars = len(text)
    if n_chars > 8:
        return "sentence"
    if n_chars > 3:
        return "short_phrase"
    return "single_term"


# Bucket every pair by the length class of its source text
pairs_by_length = defaultdict(list)
for pair in pairs:
    pairs_by_length[classify_length(pair["source"])].append(pair)

print("Pairs by length class:")
total_pairs = len(pairs)
for length_class, class_pairs in pairs_by_length.items():
    n_class = len(class_pairs)
    pct = n_class / total_pairs * 100
    print(f"  {length_class:<15}: {n_class:>8} ({pct:.1f}%)")

## 4. Hard Negative Mining

In [None]:
@dataclass
class TrainingTriplet:
    """One (anchor, positive, negative) training example plus metadata labels."""

    anchor: str  # source text of the pair; the train/val split groups on this field
    positive: str  # target text paired with the anchor
    negative: str  # mined or pre-supplied non-matching text
    difficulty: str  # "easy", "medium", "hard"
    length_class: str  # "single_term", "short_phrase", "sentence"
    pair_type: str  # provenance tag, e.g. "original" or "single_term_expanded"


def find_similar_negatives(source: str, all_terms: List[str],
                           positives: Set[str], n: int = 10,
                           max_candidates: int = 5000) -> List[Tuple[str, str]]:
    """
    Collect negative candidates for *source* and label each with a difficulty.

    A random sample of at most ``max_candidates`` terms is scanned in random
    order. Terms equal to *source* or contained in *positives* are skipped;
    every other term is labelled by surface similarity to *source*:

    - ``"hard"``: length differs by at most 2 AND at least one shared character
    - ``"medium"``: length differs by at most 3 OR at least one shared character
    - ``"easy"``: everything else (different length, no shared character)

    Args:
        source: Anchor text the negatives are mined for.
        all_terms: Pool of candidate terms.
        positives: Terms that must never be returned as negatives.
        n: Requested number of negatives; scanning stops once ``n * 3``
            candidates have been collected.
        max_candidates: Upper bound on how many terms are sampled from
            *all_terms* (defaults to the previously hard-coded 5000).

    Returns:
        List of ``(negative, difficulty)`` tuples in scan order.
    """
    negatives: List[Tuple[str, str]] = []
    source_len = len(source)
    # Loop-invariant: build the source's character set once, not per candidate.
    source_chars = set(source)
    limit = n * 3

    # Random subset in random order, so every call sees different candidates.
    candidates = random.sample(all_terms, min(len(all_terms), max_candidates))

    for term in candidates:
        if term == source or term in positives:
            continue

        # Similarity is judged on distinct shared characters and length gap.
        common_chars = len(source_chars & set(term))
        len_diff = abs(source_len - len(term))

        if len_diff <= 2 and common_chars >= 1:
            difficulty = "hard"
        elif len_diff <= 3 or common_chars >= 1:
            difficulty = "medium"
        else:
            difficulty = "easy"
        negatives.append((term, difficulty))

        # Over-collect (3x) so the caller can balance across difficulty levels.
        if len(negatives) >= limit:
            break

    return negatives


def generate_triplets_for_pair(pair: Dict, all_terms: List[str],
                               source_to_targets: Dict[str, Set[str]],
                               n_negatives: int = 3) -> List[TrainingTriplet]:
    """Generate training triplets for a synonym pair.

    Negatives are mined with ``find_similar_negatives`` and then sampled per
    difficulty level ("easy", "medium", "hard"), taking up to
    ``max(1, n_negatives // 3)`` from each level that has candidates.

    Args:
        pair: Mapping with "source" and "target" keys and an optional
            "pair_type" key (defaults to "original").
        all_terms: Pool of candidate terms for negative mining.
        source_to_targets: Known synonym targets for each source term.
        n_negatives: Controls how many negatives are mined and kept.

    Returns:
        One ``TrainingTriplet`` per selected negative; may be empty.
    """
    source = pair["source"]
    target = pair["target"]
    # Work on a copy so the caller's mapping is never mutated, and always
    # exclude this pair's own target even if the mapping does not list it.
    positives = set(source_to_targets.get(source, ())) | {target}
    length_class = classify_length(source)
    pair_type = pair.get("pair_type", "original")

    # Find negatives with difficulty labels
    negatives = find_similar_negatives(source, all_terms, positives, n_negatives * 2)

    # Balance by difficulty
    by_difficulty = defaultdict(list)
    for neg, diff in negatives:
        # A term that lists `source` as one of its own synonym targets is a
        # synonym in the reverse direction; using it as a negative would
        # teach the model to separate known synonyms.
        if source in source_to_targets.get(neg, ()):
            continue
        by_difficulty[diff].append(neg)

    per_level = max(1, n_negatives // 3)
    triplets = []
    for difficulty in ["easy", "medium", "hard"]:
        pool = by_difficulty[difficulty]
        if not pool:
            continue
        for neg in random.sample(pool, min(len(pool), per_level)):
            triplets.append(TrainingTriplet(
                anchor=source,
                positive=target,
                negative=neg,
                difficulty=difficulty,
                length_class=length_class,
                pair_type=pair_type,
            ))

    return triplets

In [None]:
# Mine triplets for every augmented pair, accumulating them in one flat list
augmented_triplets = []

for pair in tqdm(pairs, desc="Generating triplets from pairs"):
    augmented_triplets += generate_triplets_for_pair(
        pair, all_terms, source_to_targets, n_negatives=3
    )

print(f"\nGenerated {len(augmented_triplets):,} triplets from augmented pairs")

In [None]:
# Wrap the pre-built single-term records as TrainingTriplet objects.
# Records without a "difficulty" key are labelled "medium".
single_term_training = [
    TrainingTriplet(
        anchor=record["anchor"],
        positive=record["positive"],
        negative=record["negative"],
        difficulty=record.get("difficulty", "medium"),
        length_class="single_term",
        pair_type="single_term_expanded",
    )
    for record in single_term_triplets
]

print(f"Converted {len(single_term_training):,} single-term expanded triplets")

# Combined pool: mined triplets first, then the single-term expansions
all_triplets = augmented_triplets + single_term_training
print(f"\nTotal triplets: {len(all_triplets):,}")

## 5. Triplet Statistics

In [None]:
print("Triplet Statistics:")
print("=" * 50)

# Tally all three breakdowns in a single pass over the triplets
difficulty_counts = defaultdict(int)
length_counts = defaultdict(int)
type_counts = defaultdict(int)
for t in all_triplets:
    difficulty_counts[t.difficulty] += 1
    length_counts[t.length_class] += 1
    type_counts[t.pair_type] += 1

total_triplets = len(all_triplets)

# By difficulty (alphabetical)
print("\nBy Difficulty:")
for diff in sorted(difficulty_counts):
    count = difficulty_counts[diff]
    pct = count / total_triplets * 100
    print(f"  {diff:<10}: {count:>8,} ({pct:.1f}%)")

# By length class (alphabetical)
print("\nBy Length Class:")
for lc in sorted(length_counts):
    count = length_counts[lc]
    pct = count / total_triplets * 100
    print(f"  {lc:<15}: {count:>8,} ({pct:.1f}%)")

# By pair type: ten most frequent, largest first
print("\nBy Pair Type:")
top_types = sorted(type_counts.items(), key=lambda item: -item[1])[:10]
for pt, count in top_types:
    pct = count / total_triplets * 100
    print(f"  {pt:<20}: {count:>8,} ({pct:.1f}%)")

## 6. Create Curriculum Learning Splits

Optimized for InfoNCE loss with temperature annealing.

In [None]:
def create_curriculum_splits(triplets: List[TrainingTriplet]) -> Dict[str, List[TrainingTriplet]]:
    """
    Build the three curriculum phases from *triplets*.

    - ``phase1_single_term_focus``: every single-term triplet plus random
      short-phrase and sentence triplets sized at 30% / 20% of twice the
      single-term count (capped by what is available).
    - ``phase2_balanced``: the same number of triplets from each length
      class, limited by the smallest class.
    - ``phase3_full``: a shuffled copy of all triplets.

    Each phase list is shuffled. Prints the per-class availability.
    """
    # Bucket the triplets by length class
    buckets = defaultdict(list)
    for item in triplets:
        buckets[item.length_class].append(item)

    singles = buckets["single_term"]
    shorts = buckets["short_phrase"]
    sentences = buckets["sentence"]

    print(f"Available: single={len(singles):,}, short={len(shorts):,}, sentence={len(sentences):,}")

    # Phase 1: single-term triplets are meant to make up half of the phase
    target_total = len(singles) * 2
    n_short = min(len(shorts), int(target_total * 0.3))
    n_sentence = min(len(sentences), int(target_total * 0.2))
    phase1 = list(singles)
    phase1.extend(random.sample(shorts, n_short))
    phase1.extend(random.sample(sentences, n_sentence))
    random.shuffle(phase1)

    # Phase 2: equal share per class, bounded by the smallest class
    per_class = min(len(singles), len(shorts), len(sentences))
    phase2 = [
        item
        for bucket in (singles, shorts, sentences)
        for item in random.sample(bucket, per_class)
    ]
    random.shuffle(phase2)

    # Phase 3: everything, in random order
    phase3 = list(triplets)
    random.shuffle(phase3)

    return {
        "phase1_single_term_focus": phase1,
        "phase2_balanced": phase2,
        "phase3_full": phase3,
    }


# Build the phases from the complete triplet pool and report their sizes
curriculum_splits = create_curriculum_splits(all_triplets)

print("\nCurriculum Splits:")
for phase_name, phase_triplets in curriculum_splits.items():
    print(f"  {phase_name}: {len(phase_triplets):,} triplets")

In [None]:
# Report the length-class mix of each phase (Phase 1 should be single-term heavy)
for phase_name, phase_data in curriculum_splits.items():
    length_dist = defaultdict(int)
    for t in phase_data:
        length_dist[t.length_class] += 1
    phase_total = len(phase_data)

    print(f"\n{phase_name}:")
    for lc in ("single_term", "short_phrase", "sentence"):
        n_class = length_dist[lc]
        # An empty phase reports 0% rather than dividing by zero
        if phase_data:
            pct = n_class / phase_total * 100
        else:
            pct = 0
        print(f"  {lc}: {n_class:,} ({pct:.1f}%)")

## 7. Add MS MARCO Triplets to Phase 3

In [None]:
# Load MS MARCO direct triplets
msmarco_triplets_path = V22_DATA_DIR / "msmarco_direct_triplets.jsonl"

msmarco_triplets: List[TrainingTriplet] = []

if msmarco_triplets_path.exists():
    with open(msmarco_triplets_path, "r", encoding="utf-8") as f:
        for line in f:
            # Whitespace-only lines (e.g. a trailing blank line) are not
            # valid JSON; skip them instead of letting json.loads raise.
            if not line.strip():
                continue
            item = json.loads(line)
            # "anchor" and "positive" are required; the remaining fields
            # fall back to defaults when absent from the record.
            msmarco_triplets.append(TrainingTriplet(
                anchor=item["anchor"],
                positive=item["positive"],
                negative=item.get("negative", ""),
                difficulty=item.get("difficulty", "medium"),
                length_class=item.get("length_class", "sentence"),
                pair_type=item.get("pair_type", "msmarco_direct"),
            ))
    print(f"Loaded {len(msmarco_triplets):,} MS MARCO direct triplets")

    # Merge into Phase 3 only and reshuffle so the new triplets are
    # interleaved with the existing ones.
    original_phase3_size = len(curriculum_splits["phase3_full"])
    curriculum_splits["phase3_full"].extend(msmarco_triplets)
    random.shuffle(curriculum_splits["phase3_full"])

    print(f"Phase 3 size: {original_phase3_size:,} -> {len(curriculum_splits['phase3_full']):,}")
else:
    print(f"Warning: {msmarco_triplets_path} not found")

## 8. Train/Validation Split

In [None]:
def train_val_split(triplets: List[TrainingTriplet], 
                    val_ratio: float = 0.1) -> Tuple[List[TrainingTriplet], List[TrainingTriplet]]:
    """Partition *triplets* into (train, validation) lists, anchor by anchor.

    All triplets sharing an anchor land on the same side, so no anchor
    appears in both sets. ``int(len(anchors) * val_ratio)`` randomly chosen
    anchors form the validation side.
    """
    # Collect triplets per anchor, keeping first-seen anchor order
    groups = defaultdict(list)
    for item in triplets:
        groups[item.anchor].append(item)

    shuffled_anchors = list(groups)
    random.shuffle(shuffled_anchors)

    n_val = int(len(shuffled_anchors) * val_ratio)
    held_out = set(shuffled_anchors[:n_val])

    train_part: List[TrainingTriplet] = []
    val_part: List[TrainingTriplet] = []
    for anchor, members in groups.items():
        destination = val_part if anchor in held_out else train_part
        destination.extend(members)

    return train_part, val_part


# Split full dataset (grouped by anchor inside train_val_split)
train_triplets, val_triplets = train_val_split(all_triplets, val_ratio=0.1)

# Guard the percentage against an empty dataset, matching the `... else 0`
# guards used by the other percentage reports in this notebook.
n_split = len(train_triplets) + len(val_triplets)
val_pct = len(val_triplets) / n_split * 100 if n_split else 0.0

print(f"Train triplets: {len(train_triplets):,}")
print(f"Validation triplets: {len(val_triplets):,}")
print(f"Validation ratio: {val_pct:.1f}%")

## 9. Save Training Data

In [None]:
def save_triplets(triplets: List[TrainingTriplet], path: Path):
    """Write *triplets* to *path* as UTF-8 JSONL, one triplet dict per line.

    Non-ASCII text is written as-is (not escaped). Prints a confirmation
    with the number of triplets written.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(asdict(item), ensure_ascii=False) + "\n"
            for item in triplets
        )
    print(f"Saved {len(triplets):,} triplets to {path}")


# Save main splits
save_triplets(train_triplets, V22_DATA_DIR / "training_triplets.jsonl")
save_triplets(val_triplets, V22_DATA_DIR / "validation_triplets.jsonl")

# Save curriculum splits.
# The phases were built from all_triplets, i.e. before the train/validation
# split, so they still contain triplets whose anchor is held out for
# validation. Training on the phase files would therefore leak validation
# anchors. Drop those triplets (keeping curriculum_splits in sync so later
# reports describe what was actually written) before saving.
val_anchors = {t.anchor for t in val_triplets}
for phase, data in list(curriculum_splits.items()):
    kept = [t for t in data if t.anchor not in val_anchors]
    if len(kept) != len(data):
        print(f"  {phase}: removed {len(data) - len(kept):,} triplets with validation anchors")
    curriculum_splits[phase] = kept
    save_triplets(kept, V22_DATA_DIR / f"{phase}_triplets.jsonl")

## 10. Verify Problem Term Coverage

In [None]:
PROBLEM_TERMS = ["추천", "데이터베이스", "증상", "질환", "인슐린"]

print("Problem Term Coverage in Training Triplets:")
print("=" * 60)

# Count occurrences of the tracked terms in one pass over the training set
tracked = set(PROBLEM_TERMS)
anchor_hits = defaultdict(int)
positive_hits = defaultdict(int)
for t in train_triplets:
    if t.anchor in tracked:
        anchor_hits[t.anchor] += 1
    if t.positive in tracked:
        positive_hits[t.positive] += 1

for term in PROBLEM_TERMS:
    as_anchor = anchor_hits.get(term, 0)
    as_positive = positive_hits.get(term, 0)
    total = as_anchor + as_positive
    print(f"{term:<15}: anchor={as_anchor:>4}, positive={as_positive:>4}, total={total:>4}")

## 11. Summary

In [None]:
print("\n" + "=" * 60)
print("v22.0 Data Preparation Summary")
print("=" * 60)

print("\nInput:")
print(f"  Augmented pairs: {len(pairs):,}")
print(f"  Single-term expanded: {len(single_term_triplets):,}")

print("\nOutput:")
print(f"  Total triplets: {len(all_triplets):,}")
print(f"  Training triplets: {len(train_triplets):,}")
print(f"  Validation triplets: {len(val_triplets):,}")

# Length-class mix of each curriculum phase
print("\nCurriculum Phases (optimized for InfoNCE):")
for phase, data in curriculum_splits.items():
    length_dist = defaultdict(int)
    for t in data:
        length_dist[t.length_class] += 1
    print(f"  {phase}:")
    for lc in ["single_term", "short_phrase", "sentence"]:
        # An empty phase reports 0% rather than dividing by zero
        pct = length_dist[lc] / len(data) * 100 if data else 0
        print(f"    {lc}: {length_dist[lc]:,} ({pct:.1f}%)")

# Lists every *.jsonl file in the data directory with its size. glob()
# yields entries in arbitrary order, so sort for a stable listing.
print("\nOutput Files:")
for jsonl_path in sorted(V22_DATA_DIR.glob("*.jsonl")):
    size_mb = jsonl_path.stat().st_size / 1024 / 1024
    print(f"  {jsonl_path.name}: {size_mb:.2f} MB")

## Next Steps

1. Run `03_training.ipynb` with InfoNCE loss and temperature annealing
2. Use phase-specific data for curriculum learning:
   - Phase 1 (epochs 1-10): `phase1_single_term_focus_triplets.jsonl` with τ=0.07
   - Phase 2 (epochs 11-20): `phase2_balanced_triplets.jsonl` with τ=0.05
   - Phase 3 (epochs 21-30): `phase3_full_triplets.jsonl` with τ=0.03