# v22.0 HuggingFace Korean Dataset Loading

Load and process Korean datasets from HuggingFace for training data augmentation.

## Features

- **Sequential Loading**: Memory-safe sequential loading with garbage collection
- **Progress Tracking**: Individual progress bars for each dataset
- **Error Handling**: Graceful failure handling — if one dataset fails, the others still load
- **Type Safety**: Full type hints throughout

## Datasets

| Dataset | Type | Max Samples | Use Case |
|---------|------|------|----------|
| williamjeong2/msmarco-triplets-ko-v1 | Query-Doc Triplets | 50K | Direct triplet training |
| klue (nli, sts) | NLI, STS | 45K | Semantic similarity pairs |
| squad_kor_v1 | QA | 30K | Question-answer and question-context pairs |
| nsmc | Sentiment | 50K | Text corpus for negatives |
| skt/kobest_v1 (copa) | COPA | 5K | Premise-alternative pairs |
| daekeun-ml/naver-news-summarization-ko | News | 10K | Title-summary and content-summary pairs |

In [None]:
import sys
from pathlib import Path


def find_project_root() -> Path:
    """Locate the project root by walking upward from the working directory.

    A directory qualifies when it contains a ``pyproject.toml`` or a ``src``
    entry. The working directory itself is checked first, then each ancestor
    in turn, and the nearest match wins.

    Returns:
        The nearest qualifying directory, or the grandparent of the working
        directory when nothing qualifies.
    """
    start = Path.cwd()
    markers = ("pyproject.toml", "src")

    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate

    # No marker anywhere up the tree: fall back to two levels above the cwd.
    return start.parent.parent


# Resolve the project root once and place it first on the import search path
# (presumably so project-local packages can be imported from this notebook).
PROJECT_ROOT = find_project_root()
sys.path.insert(0, str(PROJECT_ROOT))

# Echo the resolved root so a wrong guess is visible immediately.
print(f"Project root: {PROJECT_ROOT}")

In [None]:
import json
import random
import gc
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

from tqdm.auto import tqdm

# Seed the stdlib `random` generator with a fixed value so anything drawn from
# it in this session is repeatable.
random.seed(42)

In [None]:
# Import the Hugging Face `datasets` library, installing it on first use.
try:
    from datasets import load_dataset, Dataset
    print("datasets library available")
except ImportError:
    print("Installing datasets...")
    # `%pip` is an IPython magic, so this fallback only works inside a
    # notebook/IPython kernel, not under plain `python`.
    %pip install datasets
    from datasets import load_dataset, Dataset

In [None]:
# Output directories - v22.0 specific.
# Both live under <project root>/data and are created up front (including any
# missing parents); an already existing directory is left untouched.
HF_DATA_DIR = PROJECT_ROOT / "data" / "huggingface_korean"
V22_DATA_DIR = PROJECT_ROOT / "data" / "v22.0"
HF_DATA_DIR.mkdir(parents=True, exist_ok=True)
V22_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Show the resolved locations so the output destination is explicit.
print(f"HuggingFace data directory: {HF_DATA_DIR}")
print(f"v22.0 data directory: {V22_DATA_DIR}")

## Data Classes and Type Definitions

In [None]:
@dataclass
class DatasetResult:
    """Outcome of a single dataset loading attempt.

    A loader fills either ``data`` (structured records) or ``corpus`` (plain
    texts); a failed attempt leaves both empty and sets ``error_message``.
    """

    # Identifier of the dataset this result belongs to.
    name: str
    # Whether the loading attempt succeeded.
    success: bool
    # Structured records produced by the loader.
    data: List[Dict[str, Any]] = field(default_factory=list)
    # Plain-text entries produced by the loader.
    corpus: List[str] = field(default_factory=list)
    # Failure description, if any.
    error_message: Optional[str] = None
    # Number of samples; recomputed in __post_init__ when a payload is present.
    sample_count: int = 0

    def __post_init__(self) -> None:
        """Derive ``sample_count`` from the first non-empty payload.

        ``data`` takes precedence over ``corpus``. When both are empty the
        value supplied to the constructor (default 0) is kept.
        """
        for payload in (self.data, self.corpus):
            if payload:
                self.sample_count = len(payload)
                break


@dataclass
class DatasetConfig:
    """Configuration for a dataset loading task.

    Bundles a loader callable with the sample cap it is invoked with and the
    labels used when reporting on it.
    """

    # Key under which the loader's result is stored and reported.
    name: str
    # Callable invoked as ``loader_fn(max_samples=...)``; expected to return a DatasetResult.
    loader_fn: Callable[..., DatasetResult]
    # Value passed to ``loader_fn`` as its ``max_samples`` keyword argument.
    max_samples: int
    # Human-readable label shown in the configuration listing and progress output.
    description: str = ""

## Dataset Loading Functions

In [None]:
def load_msmarco_korean(max_samples: int = 50000) -> DatasetResult:
    """Build anchor/positive/negative triplets from the Korean MS MARCO set.

    Reads the ``train`` split of ``williamjeong2/msmarco-triplets-ko-v1`` and
    keeps, per row, the query together with its first positive and first
    negative passage. Rows without a query or without a usable first positive
    are dropped; a missing negative is stored as an empty string.

    Args:
        max_samples: Upper bound on the number of rows inspected (not on the
            number of triplets kept).

    Returns:
        A successful ``DatasetResult`` whose ``data`` holds the triplets, or a
        failed one carrying the error text when the dataset cannot be loaded.
    """
    name = "msmarco_ko"

    try:
        raw = load_dataset("williamjeong2/msmarco-triplets-ko-v1", split="train")
    except Exception as exc:
        return DatasetResult(name=name, success=False, error_message=f"Failed to load: {exc}")

    # The progress bar is sized to the number of rows that will be inspected.
    row_budget = min(len(raw), max_samples)
    collected: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(raw, total=row_budget, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        query = row.get("query", "")
        positives = row.get("pos", [])
        negatives = row.get("neg", [])

        if not (query and positives):
            continue

        first_positive = positives[0]
        first_negative = negatives[0] if negatives else ""

        if not first_positive:
            continue

        collected.append({
            "anchor": query,
            "positive": first_positive,
            # Any falsy negative is normalised to an empty string.
            "negative": first_negative or "",
            "source": name,
        })

    return DatasetResult(name=name, success=True, data=collected)

In [None]:
def load_klue_nli(max_samples: int = 30000) -> DatasetResult:
    """Turn KLUE NLI rows into scored premise/hypothesis pairs.

    Label 0 (entailment) yields a pair with similarity 0.9; label 2
    (contradiction) yields a pair with similarity 0.1 tagged ``klue_nli_neg``.
    Rows with any other label, or with an empty premise or hypothesis, are
    skipped.

    Args:
        max_samples: Upper bound on the number of rows inspected.

    Returns:
        A successful ``DatasetResult`` whose ``data`` holds the pairs, or a
        failed one carrying the error text when the dataset cannot be loaded.
    """
    name = "klue_nli"

    try:
        raw = load_dataset("klue", "nli", split="train")
    except Exception as exc:
        return DatasetResult(name=name, success=False, error_message=f"Failed to load: {exc}")

    row_budget = min(len(raw), max_samples)
    collected: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(raw, total=row_budget, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        premise = row.get("premise", "")
        hypothesis = row.get("hypothesis", "")
        label = row.get("label", -1)

        if not (premise and hypothesis):
            continue

        if label == 0:  # Entailment
            similarity, category, pair_type = 0.9, "nli_entailment", "klue_nli"
        elif label == 2:  # Contradiction (hard negative)
            similarity, category, pair_type = 0.1, "nli_contradiction", "klue_nli_neg"
        else:
            # Every other label contributes nothing.
            continue

        collected.append({
            "source": premise,
            "target": hypothesis,
            "similarity": similarity,
            "category": category,
            "pair_type": pair_type,
        })

    return DatasetResult(name=name, success=True, data=collected)

In [None]:
def load_klue_sts(max_samples: int = 15000) -> DatasetResult:
    """Collect KLUE STS sentence pairs whose scaled score is at least 0.5.

    The ``real-label`` score found under ``labels`` is divided by 5; pairs
    whose scaled score is below 0.5, or that have an empty sentence, are
    skipped.

    Args:
        max_samples: Upper bound on the number of rows inspected.

    Returns:
        A successful ``DatasetResult`` whose ``data`` holds the pairs, or a
        failed one carrying the error text when the dataset cannot be loaded.
    """
    name = "klue_sts"

    try:
        raw = load_dataset("klue", "sts", split="train")
    except Exception as exc:
        return DatasetResult(name=name, success=False, error_message=f"Failed to load: {exc}")

    row_budget = min(len(raw), max_samples)
    collected: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(raw, total=row_budget, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        first = row.get("sentence1", "")
        second = row.get("sentence2", "")

        # A non-dict ``labels`` value is treated as "no score".
        labels = row.get("labels", {})
        raw_score = 0
        if isinstance(labels, dict):
            raw_score = labels.get("real-label", 0)

        if not (first and second):
            continue

        if raw_score > 0:
            similarity = raw_score / 5.0
        else:
            similarity = 0

        if similarity >= 0.5:
            collected.append({
                "source": first,
                "target": second,
                "similarity": similarity,
                "category": "sts",
                "pair_type": "klue_sts",
            })

    return DatasetResult(name=name, success=True, data=collected)

In [None]:
def load_korquad(max_samples: int = 30000) -> DatasetResult:
    """Derive question/answer and question/context pairs from KorQuAD.

    Each row with a question can contribute up to two pairs: the question
    with its first answer text, and the question with the first 200
    characters of the context (only when the context is longer than 20
    characters).

    Args:
        max_samples: Upper bound on the number of rows inspected; because a
            row can yield two pairs, the result may hold more pairs than this.

    Returns:
        A successful ``DatasetResult`` whose ``data`` holds the pairs, or a
        failed one carrying the error text when the dataset cannot be loaded.
    """
    name = "korquad"

    try:
        raw = load_dataset("squad_kor_v1", split="train")
    except Exception as exc:
        return DatasetResult(name=name, success=False, error_message=f"Failed to load: {exc}")

    row_budget = min(len(raw), max_samples)
    collected: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(raw, total=row_budget, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        question = row.get("question", "")
        context = row.get("context", "")
        answers = row.get("answers", {})

        if not question:
            continue

        # Only a dict-shaped ``answers`` field is consulted for answer texts.
        candidate_answers = []
        if isinstance(answers, dict):
            candidate_answers = answers.get("text", [])
        first_answer = candidate_answers[0] if candidate_answers else ""

        if first_answer:
            collected.append({
                "source": question,
                "target": first_answer,
                "similarity": 0.85,
                "category": "qa",
                "pair_type": "korquad",
            })

        if context and len(context) > 20:
            # Keep only the leading 200 characters, trimmed of outer whitespace.
            context_head = context[:200].strip()
            collected.append({
                "source": question,
                "target": context_head,
                "similarity": 0.7,
                "category": "qa_context",
                "pair_type": "korquad_context",
            })

    return DatasetResult(name=name, success=True, data=collected)

In [None]:
def load_kobest_copa(max_samples: int = 5000) -> DatasetResult:
    """Pair each KoBEST COPA premise with the alternative its label selects.

    Label 0 selects ``alternative_1``; any other label selects
    ``alternative_2``. Rows with an empty premise or an empty selected
    alternative are skipped.

    Args:
        max_samples: Upper bound on the number of rows inspected.

    Returns:
        A successful ``DatasetResult`` whose ``data`` holds the pairs, or a
        failed one carrying the error text when the dataset cannot be loaded.
    """
    name = "kobest_copa"

    try:
        raw = load_dataset("skt/kobest_v1", "copa", split="train")
    except Exception as exc:
        return DatasetResult(name=name, success=False, error_message=f"Failed to load: {exc}")

    row_budget = min(len(raw), max_samples)
    collected: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(raw, total=row_budget, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        premise = row.get("premise", "")
        first_option = row.get("alternative_1", "")
        second_option = row.get("alternative_2", "")
        label = row.get("label", 0)

        if not premise:
            continue

        if label == 0:
            chosen = first_option
        else:
            chosen = second_option

        if not chosen:
            continue

        collected.append({
            "source": premise,
            "target": chosen,
            "similarity": 0.8,
            "category": "copa",
            "pair_type": "kobest_copa",
        })

    return DatasetResult(name=name, success=True, data=collected)

In [None]:
def load_naver_news(max_samples: int = 10000) -> DatasetResult:
    """Derive title/summary and content/summary pairs from Naver News.

    Each field is read from the first non-empty of several candidate keys.
    A row with a summary can contribute up to two pairs: title with summary,
    and the first 200 characters of the content with summary. Summaries are
    cut to 200 characters.

    Args:
        max_samples: Upper bound on the number of rows inspected; because a
            row can yield two pairs, the result may hold more pairs than this.

    Returns:
        A successful ``DatasetResult`` whose ``data`` holds the pairs, or a
        failed one carrying the error text when the dataset cannot be loaded.
    """
    name = "naver_news"

    try:
        raw = load_dataset("daekeun-ml/naver-news-summarization-ko", split="train")
    except Exception as exc:
        return DatasetResult(name=name, success=False, error_message=f"Failed to load: {exc}")

    row_budget = min(len(raw), max_samples)
    collected: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(raw, total=row_budget, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        title = row.get("title", "") or row.get("document_title", "")
        body = row.get("document", "") or row.get("content", "") or row.get("text", "")
        summary = row.get("summary", "") or row.get("abstractive", "")

        # Both pair kinds need a summary, so a row without one yields nothing.
        if not summary:
            continue

        if title:
            collected.append({
                "source": title,
                "target": summary[:200],
                "similarity": 0.75,
                "category": "news_summary",
                "pair_type": "naver_news",
            })

        if body:
            collected.append({
                "source": body[:200],
                "target": summary[:200],
                "similarity": 0.8,
                "category": "news_summary",
                "pair_type": "naver_news",
            })

    return DatasetResult(name=name, success=True, data=collected)

In [None]:
def load_nsmc_corpus(max_samples: int = 50000) -> DatasetResult:
    """Collect NSMC ``document`` texts as a plain corpus for negative mining.

    Texts of five characters or fewer (or empty/missing ones) are skipped.

    Args:
        max_samples: Upper bound on the number of rows inspected.

    Returns:
        A successful ``DatasetResult`` whose ``corpus`` holds the texts, or a
        failed one carrying the error text when the dataset cannot be loaded.
    """
    name = "nsmc"

    try:
        raw = load_dataset("nsmc", split="train")
    except Exception as exc:
        return DatasetResult(name=name, success=False, error_message=f"Failed to load: {exc}")

    row_budget = min(len(raw), max_samples)
    kept: List[str] = []

    for index, row in enumerate(tqdm(raw, total=row_budget, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        review = row.get("document", "")
        if not review or len(review) <= 5:
            continue
        kept.append(review)

    return DatasetResult(name=name, success=True, corpus=kept)

## Configure and Execute Loading

In [None]:
# Loading plan: one entry per dataset, in the order the datasets are loaded.
# Each `max_samples` here is passed to the loader explicitly and mirrors that
# loader's own default.
DATASET_CONFIGS: List[DatasetConfig] = [
    DatasetConfig(
        name="msmarco_ko",
        loader_fn=load_msmarco_korean,
        max_samples=50000,
        description="Korean MS MARCO triplets",
    ),
    DatasetConfig(
        name="klue_nli",
        loader_fn=load_klue_nli,
        max_samples=30000,
        description="KLUE Natural Language Inference",
    ),
    DatasetConfig(
        name="klue_sts",
        loader_fn=load_klue_sts,
        max_samples=15000,
        description="KLUE Semantic Textual Similarity",
    ),
    DatasetConfig(
        name="korquad",
        loader_fn=load_korquad,
        max_samples=30000,
        description="Korean Question Answering",
    ),
    DatasetConfig(
        name="kobest_copa",
        loader_fn=load_kobest_copa,
        max_samples=5000,
        description="KoBEST COPA reasoning",
    ),
    DatasetConfig(
        name="naver_news",
        loader_fn=load_naver_news,
        max_samples=10000,
        description="Naver News summarization",
    ),
    DatasetConfig(
        name="nsmc",
        loader_fn=load_nsmc_corpus,
        max_samples=50000,
        description="NSMC movie review corpus",
    ),
]

# Echo the plan before anything is downloaded.
print("Dataset configurations:")
for config in DATASET_CONFIGS:
    print(f"  - {config.name}: {config.description} (max: {config.max_samples})")

In [None]:
def load_datasets_sequential(configs: List[DatasetConfig]) -> Dict[str, DatasetResult]:
    """Run each configured loader in turn and collect the outcomes by name.

    Loaders run one after another, with a garbage-collection pass after each.
    A loader that raises does not stop the run: the exception is printed and
    recorded as a failed ``DatasetResult`` for that dataset.

    Args:
        configs: Loading plan; each entry's ``loader_fn`` is called with
            ``max_samples=<entry.max_samples>``.

    Returns:
        Mapping from each config's ``name`` to the result of its loader.
    """
    outcomes: Dict[str, DatasetResult] = {}
    rule = "=" * 60

    print(f"\nLoading {len(configs)} datasets sequentially...")
    print(rule)

    for cfg in configs:
        print(f"\n[{cfg.name}] Loading {cfg.description}...")
        try:
            outcome = cfg.loader_fn(max_samples=cfg.max_samples)
            outcomes[cfg.name] = outcome

            status = "FAILED" if not outcome.success else "SUCCESS"
            print(f"  [{status}] {outcome.sample_count:,} samples")

            if not outcome.success:
                print(f"    Error: {outcome.error_message}")
        except Exception as exc:
            # Anything raised above is recorded as a failure for this dataset
            # only, so the remaining datasets still get their turn.
            print(f"  [ERROR] {exc}")
            outcomes[cfg.name] = DatasetResult(
                name=cfg.name,
                success=False,
                error_message=str(exc),
            )

        # Collect garbage before moving on to the next dataset.
        gc.collect()

    print("\n" + rule)
    return outcomes


# Run every configured loader; `results` maps each config name to its DatasetResult.
results = load_datasets_sequential(DATASET_CONFIGS)

# Split the outcomes by their `success` flag and total the sample counts of
# the successful ones.
successful = [r for r in results.values() if r.success]
failed = [r for r in results.values() if not r.success]
total_samples = sum(r.sample_count for r in successful)

print(f"\nLoading Summary:")
print(f"  Successful: {len(successful)}/{len(results)}")
print(f"  Total samples: {total_samples:,}")

# Name the failed datasets only when there are any.
if failed:
    print(f"  Failed datasets: {', '.join(r.name for r in failed)}")

## Extract and Merge Results

In [None]:
# Extract individual results.
# A dataset missing from `results` falls back to an empty failed placeholder,
# so each name below is always bound to a list (possibly empty).
msmarco_triplets = results.get("msmarco_ko", DatasetResult(name="msmarco_ko", success=False)).data
klue_nli_pairs = results.get("klue_nli", DatasetResult(name="klue_nli", success=False)).data
klue_sts_pairs = results.get("klue_sts", DatasetResult(name="klue_sts", success=False)).data
korquad_pairs = results.get("korquad", DatasetResult(name="korquad", success=False)).data
kobest_pairs = results.get("kobest_copa", DatasetResult(name="kobest_copa", success=False)).data
naver_news_pairs = results.get("naver_news", DatasetResult(name="naver_news", success=False)).data
# NSMC is a plain-text corpus, so it is read from `.corpus` rather than `.data`.
nsmc_texts = results.get("nsmc", DatasetResult(name="nsmc", success=False)).corpus

# Report how many items each dataset contributed.
print("Extracted results:")
print(f"  MS MARCO triplets: {len(msmarco_triplets)}")
print(f"  KLUE NLI pairs: {len(klue_nli_pairs)}")
print(f"  KLUE STS pairs: {len(klue_sts_pairs)}")
print(f"  KorQuAD pairs: {len(korquad_pairs)}")
print(f"  KoBEST COPA pairs: {len(kobest_pairs)}")
print(f"  Naver News pairs: {len(naver_news_pairs)}")
print(f"  NSMC texts: {len(nsmc_texts)}")

In [None]:
def merge_all_pairs(
    klue_nli: List[Dict],
    klue_sts: List[Dict],
    korquad: List[Dict],
    kobest: List[Dict],
    naver_news: List[Dict],
    msmarco: List[Dict],
) -> List[Dict[str, Any]]:
    """Combine every pair source into one list with a common record shape.

    Output order is: KLUE NLI (without contradiction pairs), KLUE STS,
    KorQuAD, KoBEST, Naver News, then MS MARCO. Pair records are carried over
    as-is (the same dict objects); MS MARCO triplets are converted to
    anchor/positive pairs with a fixed similarity of 0.85.

    Args:
        klue_nli: NLI pairs; those tagged ``klue_nli_neg`` are left out.
        klue_sts: STS pairs.
        korquad: KorQuAD pairs.
        kobest: KoBEST COPA pairs.
        naver_news: Naver News pairs.
        msmarco: Triplets with ``anchor``/``positive`` keys; triplets with an
            empty or missing positive are left out.

    Returns:
        A new list containing all retained pairs.
    """
    # Contradiction pairs are negatives, so they are excluded from this
    # positive-pair collection.
    merged: List[Dict[str, Any]] = [
        record for record in klue_nli if record.get("pair_type") != "klue_nli_neg"
    ]

    for group in (klue_sts, korquad, kobest, naver_news):
        merged.extend(group)

    # Re-express each usable triplet in the pair schema.
    merged.extend(
        {
            "source": triplet["anchor"],
            "target": triplet["positive"],
            "similarity": 0.85,
            "category": "retrieval",
            "pair_type": "msmarco_ko",
        }
        for triplet in msmarco
        if triplet.get("positive")
    )

    return merged


def deduplicate_pairs(pairs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop pairs whose (source, target) combination was already seen.

    The first occurrence of each combination is kept and the original order
    is preserved. A missing ``source`` or ``target`` counts as an empty
    string.

    Args:
        pairs: Pair records to filter.

    Returns:
        A new list holding the first record for each distinct combination.
    """
    # Dicts keep insertion order, so the values come back in first-seen order.
    first_seen: Dict[tuple, Dict[str, Any]] = {}

    for record in pairs:
        identity = (record.get("source", ""), record.get("target", ""))
        first_seen.setdefault(identity, record)

    return list(first_seen.values())


# Merge every pair source (MS MARCO triplets are converted to pairs on the way).
all_pairs = merge_all_pairs(
    klue_nli=klue_nli_pairs,
    klue_sts=klue_sts_pairs,
    korquad=korquad_pairs,
    kobest=kobest_pairs,
    naver_news=naver_news_pairs,
    msmarco=msmarco_triplets,
)

print(f"Total pairs collected: {len(all_pairs)}")

# Keep only the first occurrence of each (source, target) combination.
unique_pairs = deduplicate_pairs(all_pairs)
print(f"Unique pairs after deduplication: {len(unique_pairs)}")

In [None]:
# Statistics by source: count the deduplicated pairs per `pair_type` tag
# ("unknown" when a pair carries no tag).
source_counts: Dict[str, int] = defaultdict(int)
for pair in unique_pairs:
    source_counts[pair.get("pair_type", "unknown")] += 1

# Report the tallies, largest first.
print("\nPairs by source:")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {source}: {count:,}")

## Save Processed Data

In [None]:
def save_jsonl(data: List[Dict], output_path: Path) -> int:
    """Write *data* to *output_path* as JSON Lines (UTF-8).

    Each record becomes one JSON object on its own line. Non-ASCII characters
    are written as-is rather than escaped.

    Args:
        data: Records to serialise.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        The number of records in *data*.
    """
    serialised = (json.dumps(record, ensure_ascii=False) + "\n" for record in data)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.writelines(serialised)
    return len(data)


def save_text_corpus(texts: List[str], output_path: Path) -> int:
    """Write a text corpus to *output_path*, one text per line (UTF-8).

    Line breaks inside a text are replaced by single spaces so that every
    entry occupies exactly one line and the number of lines in the file
    matches the returned count. Texts without line breaks are written
    unchanged.

    Args:
        texts: Corpus entries to write.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        The number of texts written.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        for text in texts:
            # "\r\n" goes first so a Windows line break becomes one space, not two.
            single_line = text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
            f.write(single_line + "\n")
    return len(texts)

In [None]:
# Save the merged, deduplicated pairs (the output file name calls them
# "synonym pairs").
pairs_output_path = HF_DATA_DIR / "huggingface_synonym_pairs.jsonl"
saved_pairs = save_jsonl(unique_pairs, pairs_output_path)
print(f"Saved {saved_pairs:,} pairs to {pairs_output_path}")

# Save MS MARCO triplets separately, keeping only those that have both a
# non-empty positive and a non-empty negative.
triplets_with_negatives = [
    t for t in msmarco_triplets
    if t.get("positive") and t.get("negative")
]

triplets_output_path = HF_DATA_DIR / "msmarco_triplets.jsonl"
saved_triplets = save_jsonl(triplets_with_negatives, triplets_output_path)
print(f"Saved {saved_triplets:,} MS MARCO triplets to {triplets_output_path}")

# Save the NSMC texts as a plain-text corpus for negative mining.
corpus_output_path = HF_DATA_DIR / "nsmc_corpus.txt"
saved_texts = save_text_corpus(nsmc_texts, corpus_output_path)
print(f"Saved {saved_texts:,} texts to {corpus_output_path}")

## Summary

In [None]:
# Final report: loader outcomes, per-dataset counts, and the files written.
print("\n" + "=" * 60)
print("v22.0 HuggingFace Korean Data Loading Summary")
print("=" * 60)

print(f"\nLoading Performance:")
print(f"  Mode: Sequential (memory-safe)")
print(f"  Datasets attempted: {len(results)}")
print(f"  Successful: {len(successful)}")
print(f"  Failed: {len(failed)}")

# These are the counts before merging and deduplication.
print(f"\nDatasets Loaded:")
print(f"  MS MARCO Korean Triplets: {len(msmarco_triplets):,}")
print(f"  KLUE NLI Pairs: {len(klue_nli_pairs):,}")
print(f"  KLUE STS Pairs: {len(klue_sts_pairs):,}")
print(f"  KorQuAD Pairs: {len(korquad_pairs):,}")
print(f"  KoBEST COPA Pairs: {len(kobest_pairs):,}")
print(f"  Naver News Pairs: {len(naver_news_pairs):,}")
print(f"  NSMC Corpus: {len(nsmc_texts):,} texts")

print(f"\nTotal Unique Pairs: {len(unique_pairs):,}")

# List every entry in the output directory with its size in MiB; this also
# shows anything that was already there before this run.
print(f"\nOutput Files:")
for f in sorted(HF_DATA_DIR.glob("*")):
    size_mb = f.stat().st_size / 1024 / 1024
    print(f"  {f.name}: {size_mb:.2f} MB")

## Next Steps

1. Run `01_data_augmentation.ipynb` to merge with single-term expanded data
2. The HuggingFace data will be automatically incorporated into training