# v21.4 HuggingFace Korean Dataset Loading

Load and process Korean datasets from HuggingFace for training data augmentation.

## Features

- **Sequential Loading**: Memory-safe sequential loading with garbage collection
- **Progress Tracking**: Individual progress bars for each dataset
- **Error Handling**: Graceful failure handling - if one dataset fails, others continue
- **Type Safety**: Full type hints throughout

## Datasets

| Dataset | Type | Size | Use Case |
|---------|------|------|----------|
| williamjeong2/msmarco-triplets-ko-v1 | Query-Doc Triplets | 50K | Direct triplet training |
| klue (nli, sts) | NLI, STS | 45K | Semantic similarity pairs |
| squad_kor_v1 | QA | 30K | Question-context pairs |
| nsmc | Sentiment | 50K | Text corpus for negatives |
| skt/kobest_v1 (copa) | COPA | 5K | Premise-alternative pairs |
| daekeun-ml/naver-news-summarization-ko | News | 10K | Title-summary pairs |

In [1]:
import sys
from pathlib import Path


def find_project_root() -> Path:
    """Locate the project root by walking upward from the working directory.

    The first directory (starting with the working directory itself) that
    contains a ``pyproject.toml`` entry or a ``src`` entry is returned. When
    no ancestor qualifies, the grandparent of the working directory is used.
    """
    start = Path.cwd()
    markers = ("pyproject.toml", "src")
    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    # Fallback when no marker was found anywhere up the tree.
    return Path.cwd().parent.parent


# Resolve the project root once and put it at the front of sys.path so that
# modules located under it become importable from this notebook.
PROJECT_ROOT = find_project_root()
sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

Project root: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train


In [2]:
import json
import random
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed, Future
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from tqdm.auto import tqdm

# Seed Python's global random generator with a fixed value.
random.seed(42)

# Module-level lock for progress updates.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
PROGRESS_LOCK = threading.Lock()

In [3]:
# Install datasets if needed
try:
    from datasets import load_dataset, Dataset
    print("datasets library available")
except ImportError:
    print("Installing datasets...")
    %pip install datasets
    from datasets import load_dataset, Dataset

datasets library available


In [4]:
# Output directory for everything this notebook writes; created (including
# missing parents) if it does not exist yet, and left alone if it does.
HF_DATA_DIR = PROJECT_ROOT / "data" / "huggingface_korean"
HF_DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"HuggingFace data directory: {HF_DATA_DIR}")

HuggingFace data directory: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/data/huggingface_korean


## Data Classes and Type Definitions

In [5]:
@dataclass
class DatasetResult:
    """Outcome of one dataset loading attempt.

    Attributes:
        name: Identifier of the dataset.
        success: Whether the loader reported success.
        data: Structured records (pairs or triplets) produced by the loader.
        corpus: Plain text entries produced by the loader.
        error_message: Description of the failure, if any.
        sample_count: Number of records; derived from ``data`` or ``corpus``
            when either is non-empty, otherwise left as supplied.
    """

    name: str
    success: bool
    data: List[Dict[str, Any]] = field(default_factory=list)
    corpus: List[str] = field(default_factory=list)
    error_message: Optional[str] = None
    sample_count: int = 0

    def __post_init__(self) -> None:
        """Derive ``sample_count`` from the first non-empty payload."""
        # ``data`` takes precedence over ``corpus``; when both are empty the
        # value supplied by the caller (default 0) is kept.
        for payload in (self.data, self.corpus):
            if payload:
                self.sample_count = len(payload)
                break


@dataclass
class LoadingProgress:
    """Progress record for a single dataset load.

    Attributes:
        name: Identifier of the dataset.
        status: One of the strings set by this class: "pending" (default),
            "loading", "completed" or "failed".
        current: Most recently reported position.
        total: Most recently reported total.
    """

    name: str
    status: str = "pending"
    current: int = 0
    total: int = 0

    def update(self, current: int, total: int) -> None:
        """Store the latest counters and flag the load as in progress."""
        self.status = "loading"
        self.current, self.total = current, total

    def complete(self, success: bool = True) -> None:
        """Set the final status: "completed" on success, "failed" otherwise."""
        if success:
            self.status = "completed"
        else:
            self.status = "failed"

## Dataset Loading Functions

Each loader function is designed to:
1. Load data from HuggingFace
2. Process into standardized format
3. Return a `DatasetResult` with success/failure status

In [6]:
def load_msmarco_korean(max_samples: int = 50000) -> DatasetResult:
    """
    Build query/positive/negative triplets from the Korean MS MARCO dataset.

    Only the first entry of each row's "pos" and "neg" lists is used. Rows
    without a query or without a usable positive are dropped; a missing
    negative is stored as an empty string.

    Args:
        max_samples: Upper bound on the number of rows examined.

    Returns:
        DatasetResult holding the triplets, or a failed result if the
        dataset could not be loaded.
    """
    name = "msmarco_ko"

    try:
        dataset = load_dataset(
            "williamjeong2/msmarco-triplets-ko-v1",
            split="train"
        )
    except Exception as exc:
        return DatasetResult(
            name=name,
            success=False,
            error_message=f"Failed to load dataset: {exc}"
        )

    row_limit = min(len(dataset), max_samples)
    triplets: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(dataset, total=row_limit, desc=f"Processing {name}")):
        # The cap applies to rows examined, not to triplets kept.
        if index >= max_samples:
            break

        query = row.get("query", "")
        positives = row.get("pos", [])
        negatives = row.get("neg", [])

        if not (query and positives):
            continue

        first_positive = positives[0]
        first_negative = negatives[0] if negatives else ""

        if first_positive:
            triplets.append({
                "anchor": query,
                "positive": first_positive,
                "negative": first_negative or "",
                "source": name,
            })

    return DatasetResult(name=name, success=True, data=triplets)

In [7]:
def load_klue_nli(max_samples: int = 30000) -> DatasetResult:
    """
    Convert KLUE NLI rows into scored premise/hypothesis pairs.

    Entailment rows become high-similarity pairs and contradiction rows
    become low-similarity (negative) pairs; rows with any other label are
    ignored.

    Args:
        max_samples: Upper bound on the number of rows examined.

    Returns:
        DatasetResult holding the pairs, or a failed result if the dataset
        could not be loaded.
    """
    name = "klue_nli"

    try:
        # The dataset id is "klue", not "klue/klue".
        dataset = load_dataset("klue", "nli", split="train")
    except Exception as exc:
        return DatasetResult(
            name=name,
            success=False,
            error_message=f"Failed to load dataset: {exc}"
        )

    # (label id, similarity, category, pair_type), checked in this order.
    # Label ids: 0=entailment, 1=neutral, 2=contradiction.
    label_rules = (
        (0, 0.9, "nli_entailment", "klue_nli"),
        (2, 0.1, "nli_contradiction", "klue_nli_neg"),
    )

    row_limit = min(len(dataset), max_samples)
    pairs: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(dataset, total=row_limit, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        premise = row.get("premise", "")
        hypothesis = row.get("hypothesis", "")
        label = row.get("label", -1)

        if not (premise and hypothesis):
            continue

        for expected_label, similarity, category, pair_type in label_rules:
            if label == expected_label:
                pairs.append({
                    "source": premise,
                    "target": hypothesis,
                    "similarity": similarity,
                    "category": category,
                    "pair_type": pair_type,
                })
                break

    return DatasetResult(name=name, success=True, data=pairs)

In [8]:
def load_klue_sts(max_samples: int = 15000) -> DatasetResult:
    """
    Collect high-similarity sentence pairs from KLUE STS.

    The "real-label" score is rescaled from 0-5 to 0-1 and only pairs whose
    rescaled score is at least 0.5 are kept.

    Args:
        max_samples: Upper bound on the number of rows examined.

    Returns:
        DatasetResult holding the pairs, or a failed result if the dataset
        could not be loaded.
    """
    name = "klue_sts"

    try:
        # The dataset id is "klue", not "klue/klue".
        dataset = load_dataset("klue", "sts", split="train")
    except Exception as exc:
        return DatasetResult(
            name=name,
            success=False,
            error_message=f"Failed to load dataset: {exc}"
        )

    row_limit = min(len(dataset), max_samples)
    pairs: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(dataset, total=row_limit, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        sentence1 = row.get("sentence1", "")
        sentence2 = row.get("sentence2", "")

        # "labels" is a dict carrying 'label' (binary) and 'real-label' (0-5 float).
        labels = row.get("labels", {})
        score = labels.get("real-label", 0) if isinstance(labels, dict) else 0.0

        if not (sentence1 and sentence2):
            continue

        # Rescale 0-5 to 0-1; non-positive scores collapse to 0.
        if score > 0:
            normalized_score = score / 5.0
        else:
            normalized_score = 0

        # Keep only pairs with similarity >= 0.5.
        if normalized_score >= 0.5:
            pairs.append({
                "source": sentence1,
                "target": sentence2,
                "similarity": normalized_score,
                "category": "sts",
                "pair_type": "klue_sts",
            })

    return DatasetResult(name=name, success=True, data=pairs)

In [9]:
def load_korquad(max_samples: int = 30000) -> DatasetResult:
    """
    Derive question-answer and question-context pairs from KorQuAD.

    Each row with a question can contribute up to two pairs: one with the
    first answer text, and one with the first 200 characters of the context
    (only when the context is longer than 20 characters).

    Args:
        max_samples: Upper bound on the number of rows examined.

    Returns:
        DatasetResult holding the pairs, or a failed result if the dataset
        could not be loaded.
    """
    name = "korquad"

    try:
        # KorQuAD v1 is loaded under the id "squad_kor_v1".
        dataset = load_dataset("squad_kor_v1", split="train")
    except Exception as exc:
        return DatasetResult(
            name=name,
            success=False,
            error_message=f"Failed to load dataset: {exc}"
        )

    row_limit = min(len(dataset), max_samples)
    pairs: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(dataset, total=row_limit, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        question = row.get("question", "")
        context = row.get("context", "")
        answers = row.get("answers", {})

        if not question:
            continue

        # Use the first answer text when "answers" is a dict with a "text" list.
        if isinstance(answers, dict):
            answer_texts = answers.get("text", [])
        else:
            answer_texts = []
        answer = answer_texts[0] if answer_texts else ""

        if answer:
            pairs.append({
                "source": question,
                "target": answer,
                "similarity": 0.85,
                "category": "qa",
                "pair_type": "korquad",
            })

        if context and len(context) > 20:
            # Only the leading 200 characters of the context are kept.
            pairs.append({
                "source": question,
                "target": context[:200].strip(),
                "similarity": 0.7,
                "category": "qa_context",
                "pair_type": "korquad_context",
            })

    return DatasetResult(name=name, success=True, data=pairs)

In [10]:
def load_kobest_copa(max_samples: int = 5000) -> DatasetResult:
    """
    Pair each KoBEST COPA premise with its correct alternative.

    Args:
        max_samples: Upper bound on the number of rows examined.

    Returns:
        DatasetResult holding the pairs, or a failed result if the dataset
        could not be loaded.
    """
    name = "kobest_copa"

    try:
        dataset = load_dataset("skt/kobest_v1", "copa", split="train")
    except Exception as exc:
        return DatasetResult(
            name=name,
            success=False,
            error_message=f"Failed to load dataset: {exc}"
        )

    row_limit = min(len(dataset), max_samples)
    pairs: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(dataset, total=row_limit, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        # Field names use underscores: alternative_1, alternative_2.
        premise = row.get("premise", "")
        alternative1 = row.get("alternative_1", "")
        alternative2 = row.get("alternative_2", "")
        label = row.get("label", 0)

        if not premise:
            continue

        # label 0 selects alternative_1; any other label selects alternative_2.
        if label == 0:
            chosen = alternative1
        else:
            chosen = alternative2

        if chosen:
            pairs.append({
                "source": premise,
                "target": chosen,
                "similarity": 0.8,
                "category": "copa",
                "pair_type": "kobest_copa",
            })

    return DatasetResult(name=name, success=True, data=pairs)

In [11]:
def load_naver_news(max_samples: int = 10000) -> DatasetResult:
    """
    Build title-summary and content-summary pairs from Naver News.

    Rows without a summary contribute nothing. Summaries and content are
    truncated to their first 200 characters.

    Args:
        max_samples: Upper bound on the number of rows examined.

    Returns:
        DatasetResult holding the pairs, or a failed result if the dataset
        could not be loaded.
    """
    name = "naver_news"

    try:
        dataset = load_dataset(
            "daekeun-ml/naver-news-summarization-ko",
            split="train"
        )
    except Exception as exc:
        return DatasetResult(
            name=name,
            success=False,
            error_message=f"Failed to load dataset: {exc}"
        )

    row_limit = min(len(dataset), max_samples)
    pairs: List[Dict[str, Any]] = []

    for index, row in enumerate(tqdm(dataset, total=row_limit, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        # Each value is taken from the first non-empty candidate field.
        title = row.get("title", "") or row.get("document_title", "")
        content = (
            row.get("document", "")
            or row.get("content", "")
            or row.get("text", "")
        )
        summary = row.get("summary", "") or row.get("abstractive", "")

        # Both pair kinds need a summary.
        if not summary:
            continue
        summary_snippet = summary[:200]

        if title:
            pairs.append({
                "source": title,
                "target": summary_snippet,
                "similarity": 0.75,
                "category": "news_summary",
                "pair_type": "naver_news",
            })

        if content:
            pairs.append({
                "source": content[:200],
                "target": summary_snippet,
                "similarity": 0.8,
                "category": "news_summary",
                "pair_type": "naver_news",
            })

    return DatasetResult(name=name, success=True, data=pairs)

In [12]:
# KorMedLawQA dataset is currently unavailable (causes crashes)
# Placeholder function that returns empty result
def load_kormedlaw(max_samples: int = 5000) -> DatasetResult:
    """
    Placeholder loader for the Korean Medical Law QA dataset.

    The dataset is currently unavailable, so nothing is downloaded: a skip
    notice is printed and an empty, successful result is returned.

    Args:
        max_samples: Accepted for signature parity with the other loaders;
            not used.

    Returns:
        DatasetResult with ``success=True`` and no data.
    """
    dataset_name = "kormedlaw"
    print(f"  [SKIP] {dataset_name}: Dataset currently unavailable")
    placeholder = DatasetResult(name=dataset_name, success=True, data=[])
    return placeholder

In [13]:
def load_nsmc_corpus(max_samples: int = 50000) -> DatasetResult:
    """
    Collect NSMC review texts as a plain corpus for negative mining.

    Only "document" values longer than 5 characters are kept.

    Args:
        max_samples: Upper bound on the number of rows examined.

    Returns:
        DatasetResult whose ``corpus`` holds the texts, or a failed result
        if the dataset could not be loaded.
    """
    name = "nsmc"

    try:
        # The dataset id is simply "nsmc".
        dataset = load_dataset("nsmc", split="train")
    except Exception as exc:
        return DatasetResult(
            name=name,
            success=False,
            error_message=f"Failed to load dataset: {exc}"
        )

    row_limit = min(len(dataset), max_samples)
    texts: List[str] = []

    for index, row in enumerate(tqdm(dataset, total=row_limit, desc=f"Processing {name}")):
        if index >= max_samples:
            break

        document = row.get("document", "")
        # Skip empty and very short reviews.
        if not document or len(document) <= 5:
            continue
        texts.append(document)

    return DatasetResult(name=name, success=True, corpus=texts)

## Multi-threaded Dataset Loader

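Class that manages parallel dataset loading with progress tracking. It is kept for reference: the execution cell further down loads the datasets sequentially instead, because multi-threaded loading can crash the kernel under memory pressure.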

In [14]:
@dataclass
class DatasetConfig:
    """Configuration for a dataset loading task.

    Bundles a loader callable with the argument and labels used when
    running it.
    """
    
    # Key under which the result is stored and reported.
    name: str
    # Loader callable; invoked as ``loader_fn(max_samples=...)``.
    loader_fn: Callable[..., DatasetResult]
    # Value forwarded to the loader as its ``max_samples`` keyword argument.
    max_samples: int
    # Human-readable label shown in printed output.
    description: str = ""


class MultiThreadedDataLoader:
    """
    Run several dataset loaders concurrently on a thread pool.

    Every loader runs as its own task. An exception raised by one task is
    converted into a failed ``DatasetResult``, so the remaining tasks keep
    running.

    Attributes:
        max_workers: Size of the thread pool.
        results: Loader results keyed by dataset name.
        progress: Progress records keyed by dataset name.
    """

    def __init__(self, max_workers: int = 4) -> None:
        """
        Create a loader with empty result and progress maps.

        Args:
            max_workers: Maximum number of concurrent threads.
        """
        # Guards updates to ``self.progress`` made from worker threads.
        self._lock = threading.Lock()
        self.max_workers = max_workers
        self.results: Dict[str, DatasetResult] = {}
        self.progress: Dict[str, LoadingProgress] = {}

    def _execute_loader(
        self,
        config: DatasetConfig
    ) -> DatasetResult:
        """
        Run one loader and keep its progress record up to date.

        Args:
            config: Dataset configuration to execute.

        Returns:
            The loader's DatasetResult, or a failed DatasetResult when the
            loader raised.
        """
        with self._lock:
            self.progress[config.name] = LoadingProgress(
                name=config.name,
                status="loading"
            )

        try:
            outcome = config.loader_fn(max_samples=config.max_samples)
            succeeded = outcome.success
            with self._lock:
                self.progress[config.name].complete(success=succeeded)
        except Exception as exc:
            with self._lock:
                self.progress[config.name].complete(success=False)

            return DatasetResult(
                name=config.name,
                success=False,
                error_message=f"Unexpected error: {exc}"
            )
        else:
            return outcome

    def load_all(
        self,
        configs: List[DatasetConfig]
    ) -> Dict[str, DatasetResult]:
        """
        Submit every configuration to the pool and collect the results.

        Results are reported in completion order, which may differ from the
        order of ``configs``.

        Args:
            configs: List of dataset configurations.

        Returns:
            Dictionary mapping dataset names to results.
        """
        divider = "=" * 60
        print(f"\nLoading {len(configs)} datasets with {self.max_workers} workers...")
        print(divider)

        # Every dataset starts out as "pending" until a worker picks it up.
        for cfg in configs:
            self.progress[cfg.name] = LoadingProgress(
                name=cfg.name,
                status="pending"
            )

        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            pending: Dict[Future, DatasetConfig] = {}
            for cfg in configs:
                pending[pool.submit(self._execute_loader, cfg)] = cfg

            for finished in as_completed(pending):
                cfg = pending[finished]

                try:
                    outcome = finished.result()
                    self.results[cfg.name] = outcome

                    label = "SUCCESS" if outcome.success else "FAILED"
                    print(f"  [{label}] {cfg.name}: {outcome.sample_count} samples")

                    if not outcome.success:
                        print(f"    Error: {outcome.error_message}")

                except Exception as exc:
                    print(f"  [ERROR] {cfg.name}: {exc}")
                    self.results[cfg.name] = DatasetResult(
                        name=cfg.name,
                        success=False,
                        error_message=str(exc)
                    )

        print(divider)
        return self.results

    def get_summary(self) -> Dict[str, Any]:
        """
        Summarise the results collected so far.

        Returns:
            Dictionary with the number of datasets, the number of successes
            and failures, the total sample count over successful datasets,
            and the names of the failed datasets.
        """
        succeeded: List[DatasetResult] = []
        failed: List[DatasetResult] = []
        for outcome in self.results.values():
            bucket = succeeded if outcome.success else failed
            bucket.append(outcome)

        return {
            "total_datasets": len(self.results),
            "successful": len(succeeded),
            "failed": len(failed),
            "total_samples": sum(outcome.sample_count for outcome in succeeded),
            "failed_datasets": [outcome.name for outcome in failed],
        }

## Configure Dataset Loading Tasks

In [15]:
# All dataset loading tasks, in the order they will be processed.
# Each max_samples value repeats the default of the corresponding loader.
# Note: load_kormedlaw is intentionally not listed (dataset issues).
DATASET_CONFIGS: List[DatasetConfig] = [
    DatasetConfig(
        name="msmarco_ko",
        loader_fn=load_msmarco_korean,
        max_samples=50000,
        description="Korean MS MARCO triplets",
    ),
    DatasetConfig(
        name="klue_nli",
        loader_fn=load_klue_nli,
        max_samples=30000,
        description="KLUE Natural Language Inference",
    ),
    DatasetConfig(
        name="klue_sts",
        loader_fn=load_klue_sts,
        max_samples=15000,
        description="KLUE Semantic Textual Similarity",
    ),
    DatasetConfig(
        name="korquad",
        loader_fn=load_korquad,
        max_samples=30000,
        description="Korean Question Answering",
    ),
    DatasetConfig(
        name="kobest_copa",
        loader_fn=load_kobest_copa,
        max_samples=5000,
        description="KoBEST COPA reasoning",
    ),
    DatasetConfig(
        name="naver_news",
        loader_fn=load_naver_news,
        max_samples=10000,
        description="Naver News summarization",
    ),
    DatasetConfig(
        name="nsmc",
        loader_fn=load_nsmc_corpus,
        max_samples=50000,
        description="NSMC movie review corpus",
    ),
]

# Echo the configuration so the run log records what was requested.
print("Dataset configurations:")
for config in DATASET_CONFIGS:
    print(f"  - {config.name}: {config.description} (max: {config.max_samples})")

Dataset configurations:
  - msmarco_ko: Korean MS MARCO triplets (max: 50000)
  - klue_nli: KLUE Natural Language Inference (max: 30000)
  - klue_sts: KLUE Semantic Textual Similarity (max: 15000)
  - korquad: Korean Question Answering (max: 30000)
  - kobest_copa: KoBEST COPA reasoning (max: 5000)
  - naver_news: Naver News summarization (max: 10000)
  - nsmc: NSMC movie review corpus (max: 50000)


## Execute Sequential Loading

In [16]:
# Load datasets SEQUENTIALLY to avoid memory issues
# Multi-threading with HuggingFace can cause kernel crashes due to memory pressure

import gc

def load_datasets_sequential(configs: List[DatasetConfig]) -> Dict[str, DatasetResult]:
    """Run each configured loader in turn and collect the results by name.

    A loader that raises is recorded as a failed DatasetResult so the
    remaining datasets are still processed. Garbage collection is forced
    after every dataset to keep memory usage down.

    Args:
        configs: Dataset configurations, processed in the given order.

    Returns:
        Dictionary mapping dataset names to their results.
    """
    divider = "=" * 60
    loaded = {}

    print(f"\nLoading {len(configs)} datasets sequentially...")
    print(divider)

    for cfg in configs:
        print(f"\n[{cfg.name}] Loading {cfg.description}...")
        try:
            outcome = cfg.loader_fn(max_samples=cfg.max_samples)
            loaded[cfg.name] = outcome

            label = "SUCCESS" if outcome.success else "FAILED"
            print(f"  [{label}] {outcome.sample_count:,} samples")

            if not outcome.success:
                print(f"    Error: {outcome.error_message}")
        except Exception as exc:
            print(f"  [ERROR] {exc}")
            loaded[cfg.name] = DatasetResult(
                name=cfg.name,
                success=False,
                error_message=str(exc)
            )

        # Release whatever the finished loader left behind before the next one starts.
        gc.collect()

    print("\n" + divider)
    return loaded


# Load all datasets one after another; `results` maps dataset name -> DatasetResult.
results = load_datasets_sequential(DATASET_CONFIGS)

# Split the results by outcome. `successful` and `failed` are read again by
# the final summary cell of this notebook.
successful = [r for r in results.values() if r.success]
failed = [r for r in results.values() if not r.success]
# Only successful datasets contribute to the sample total.
total_samples = sum(r.sample_count for r in successful)

print(f"\nLoading Summary:")
print(f"  Successful: {len(successful)}/{len(results)}")
print(f"  Total samples: {total_samples:,}")

# The failure line is printed only when at least one dataset failed.
if failed:
    print(f"  Failed datasets: {', '.join(r.name for r in failed)}")


Loading 7 datasets sequentially...

[msmarco_ko] Loading Korean MS MARCO triplets...


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

Processing msmarco_ko:   0%|          | 0/50000 [00:00<?, ?it/s]

  [SUCCESS] 50,000 samples

[klue_nli] Loading KLUE Natural Language Inference...


Processing klue_nli:   0%|          | 0/24998 [00:00<?, ?it/s]

  [SUCCESS] 17,050 samples

[klue_sts] Loading KLUE Semantic Textual Similarity...


Processing klue_sts:   0%|          | 0/11668 [00:00<?, ?it/s]

  [SUCCESS] 6,016 samples

[korquad] Loading Korean Question Answering...


Processing korquad:   0%|          | 0/30000 [00:00<?, ?it/s]

  [SUCCESS] 60,000 samples

[kobest_copa] Loading KoBEST COPA reasoning...


Processing kobest_copa:   0%|          | 0/3076 [00:00<?, ?it/s]

  [SUCCESS] 3,076 samples

[naver_news] Loading Naver News summarization...


Processing naver_news:   0%|          | 0/10000 [00:00<?, ?it/s]

  [SUCCESS] 20,000 samples

[nsmc] Loading NSMC movie review corpus...


Processing nsmc:   0%|          | 0/50000 [00:00<?, ?it/s]

  [SUCCESS] 47,976 samples


Loading Summary:
  Successful: 7/7
  Total samples: 204,118


## Extract Results

In [17]:
# Pull each dataset's payload out of `results`. When a name is missing, an
# empty failed DatasetResult is used as the default, so the extracted value is
# an empty list instead of a KeyError.
msmarco_triplets = results.get("msmarco_ko", DatasetResult(name="msmarco_ko", success=False)).data
klue_nli_pairs = results.get("klue_nli", DatasetResult(name="klue_nli", success=False)).data
klue_sts_pairs = results.get("klue_sts", DatasetResult(name="klue_sts", success=False)).data
korquad_pairs = results.get("korquad", DatasetResult(name="korquad", success=False)).data
kobest_pairs = results.get("kobest_copa", DatasetResult(name="kobest_copa", success=False)).data
naver_news_pairs = results.get("naver_news", DatasetResult(name="naver_news", success=False)).data
# NSMC is a plain-text corpus, so it is read from `.corpus` rather than `.data`.
nsmc_texts = results.get("nsmc", DatasetResult(name="nsmc", success=False)).corpus

print("Extracted results:")
print(f"  MS MARCO triplets: {len(msmarco_triplets)}")
print(f"  KLUE NLI pairs: {len(klue_nli_pairs)}")
print(f"  KLUE STS pairs: {len(klue_sts_pairs)}")
print(f"  KorQuAD pairs: {len(korquad_pairs)}")
print(f"  KoBEST COPA pairs: {len(kobest_pairs)}")
print(f"  Naver News pairs: {len(naver_news_pairs)}")
print(f"  NSMC texts: {len(nsmc_texts)}")

Extracted results:
  MS MARCO triplets: 50000
  KLUE NLI pairs: 17050
  KLUE STS pairs: 6016
  KorQuAD pairs: 60000
  KoBEST COPA pairs: 3076
  Naver News pairs: 20000
  NSMC texts: 47976


In [18]:
# Print the first record of three datasets (MS MARCO, KLUE NLI, KorQuAD) as a
# quick sanity check. A dataset whose list is empty is skipped.
# ensure_ascii=False keeps non-ASCII (Korean) text readable instead of \u escapes.
print("\nSample data from each dataset:")
print("=" * 60)

if msmarco_triplets:
    print("\nMS MARCO triplet:")
    print(json.dumps(msmarco_triplets[0], ensure_ascii=False, indent=2))

if klue_nli_pairs:
    print("\nKLUE NLI pair:")
    print(json.dumps(klue_nli_pairs[0], ensure_ascii=False, indent=2))

if korquad_pairs:
    print("\nKorQuAD pair:")
    print(json.dumps(korquad_pairs[0], ensure_ascii=False, indent=2))


Sample data from each dataset:

MS MARCO triplet:
{
  "anchor": "인문학이란 무엇인가요?",
  "positive": "자유 예술. 1. 전문 또는 기술 과목과 달리 일반적인 지식을 제공하기 위한 대학의 학술 과정을 의미하며 예술, 인문학, 자연 과학 및 사회 과학을 포함합니다.",
  "negative": "자유 교육: 개인에게 힘을 실어주고 복잡성, 다양성 및 변화를 다룰 수 있도록 준비시키는 대학 학습에 대한 접근 방식입니다. 이 접근 방식은 특정 관심 분야의 심층적인 성취와 함께 더 넓은 세계(예: 과학, 문화, 사회)에 대한 폭넓은 지식을 강조합니다.",
  "source": "msmarco_ko"
}

KLUE NLI pair:
{
  "source": "힛걸 진심 최고다 그 어떤 히어로보다 멋지다",
  "target": "힛걸 진심 최고로 멋지다.",
  "similarity": 0.9,
  "category": "nli_entailment",
  "pair_type": "klue_nli"
}

KorQuAD pair:
{
  "source": "바그너는 괴테의 파우스트를 읽고 무엇을 쓰고자 했는가?",
  "target": "교향곡",
  "similarity": 0.85,
  "category": "qa",
  "pair_type": "korquad"
}


## Merge All Pairs

In [19]:
def merge_all_pairs(
    klue_nli: List[Dict],
    klue_sts: List[Dict],
    korquad: List[Dict],
    kobest: List[Dict],
    naver_news: List[Dict],
    msmarco: List[Dict],
) -> List[Dict[str, Any]]:
    """
    Combine every pair source into one list of positive pairs.

    The output order is: KLUE NLI (without contradiction pairs), KLUE STS,
    KorQuAD, KoBEST COPA, Naver News, then MS MARCO triplets converted to
    anchor/positive pairs.

    Args:
        klue_nli: KLUE NLI pairs.
        klue_sts: KLUE STS pairs.
        korquad: KorQuAD pairs.
        kobest: KoBEST COPA pairs.
        naver_news: Naver News pairs.
        msmarco: MS MARCO triplets.

    Returns:
        List of merged pairs.
    """
    # Contradiction pairs are negatives and do not belong in the positive set.
    merged: List[Dict[str, Any]] = [
        pair for pair in klue_nli
        if pair.get("pair_type") != "klue_nli_neg"
    ]

    for group in (klue_sts, korquad, kobest, naver_news):
        merged.extend(group)

    # A triplet becomes a pair by dropping its negative passage.
    merged.extend(
        {
            "source": triplet["anchor"],
            "target": triplet["positive"],
            "similarity": 0.85,
            "category": "retrieval",
            "pair_type": "msmarco_ko",
        }
        for triplet in msmarco
        if triplet.get("positive")
    )

    return merged


def deduplicate_pairs(pairs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Drop pairs whose (source, target) combination was already seen.

    The first occurrence of each combination is kept and the original order
    is preserved. A missing "source" or "target" counts as an empty string.

    Args:
        pairs: List of pairs to deduplicate.

    Returns:
        Deduplicated list of pairs.
    """
    # Dicts keep insertion order, and setdefault never overwrites, so each key
    # ends up mapped to its first pair.
    first_by_key: Dict[Tuple[str, str], Dict[str, Any]] = {}
    for pair in pairs:
        key = (pair.get("source", ""), pair.get("target", ""))
        first_by_key.setdefault(key, pair)

    return list(first_by_key.values())

In [20]:
# Merge every pair source (and MS MARCO triplets, converted to pairs) into one list.
all_pairs = merge_all_pairs(
    klue_nli=klue_nli_pairs,
    klue_sts=klue_sts_pairs,
    korquad=korquad_pairs,
    kobest=kobest_pairs,
    naver_news=naver_news_pairs,
    msmarco=msmarco_triplets,
)

print(f"Total pairs collected: {len(all_pairs)}")

# Remove repeated (source, target) combinations, keeping the first occurrence.
unique_pairs = deduplicate_pairs(all_pairs)
print(f"Unique pairs after deduplication: {len(unique_pairs)}")

Total pairs collected: 147653
Unique pairs after deduplication: 146680


In [21]:
# Count the deduplicated pairs per "pair_type"; pairs without one are counted as "unknown".
source_counts: Dict[str, int] = defaultdict(int)
for pair in unique_pairs:
    source_counts[pair.get("pair_type", "unknown")] += 1

# Largest source first.
print("\nPairs by source:")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {source}: {count:,}")


Pairs by source:
  msmarco_ko: 49,998
  korquad: 29,926
  korquad_context: 29,924
  naver_news: 19,195
  klue_nli: 8,560
  klue_sts: 6,009
  kobest_copa: 3,068


## Save Processed Data

In [22]:
def save_jsonl(data: List[Dict], output_path: Path) -> int:
    """
    Write records to a UTF-8 JSON Lines file, one JSON object per line.

    An existing file at ``output_path`` is overwritten. Non-ASCII characters
    are written as-is rather than as escape sequences.

    Args:
        data: List of dictionaries to save.
        output_path: Path to output file.

    Returns:
        Number of records saved.
    """
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n"
            for record in data
        )
    return len(data)


def save_text_corpus(texts: List[str], output_path: Path) -> int:
    """
    Save a text corpus to a UTF-8 file with exactly one text per line.

    Line breaks inside a text (``\\r\\n``, ``\\n`` or ``\\r``) are replaced by a
    single space before writing. Without this, a text containing a line
    break would occupy several lines and the file's line count would no
    longer match the number of texts. An existing file at ``output_path``
    is overwritten.

    Args:
        texts: List of text strings.
        output_path: Path to output file.

    Returns:
        Number of texts saved (equal to the number of lines written).
    """
    with open(output_path, "w", encoding="utf-8") as f:
        for text in texts:
            # "\r\n" is handled first so it becomes one space, not two.
            single_line = (
                text.replace("\r\n", " ")
                .replace("\n", " ")
                .replace("\r", " ")
            )
            f.write(single_line + "\n")
    return len(texts)

In [23]:
# Save the deduplicated pairs (all sources merged) as JSONL.
pairs_output_path = HF_DATA_DIR / "huggingface_synonym_pairs.jsonl"
saved_pairs = save_jsonl(unique_pairs, pairs_output_path)
print(f"Saved {saved_pairs:,} pairs to {pairs_output_path}")

Saved 146,680 pairs to /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/data/huggingface_korean/huggingface_synonym_pairs.jsonl


In [24]:
# Save MS MARCO triplets separately (for direct triplet training).
# Only triplets that have both a non-empty positive and a non-empty negative
# are written; the loader stores a missing negative as "".
triplets_with_negatives = [
    t for t in msmarco_triplets
    if t.get("positive") and t.get("negative")
]

triplets_output_path = HF_DATA_DIR / "msmarco_triplets.jsonl"
saved_triplets = save_jsonl(triplets_with_negatives, triplets_output_path)
print(f"Saved {saved_triplets:,} MS MARCO triplets to {triplets_output_path}")

Saved 50,000 MS MARCO triplets to /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/data/huggingface_korean/msmarco_triplets.jsonl


In [25]:
# Save the NSMC review texts as a plain-text corpus for negative mining.
corpus_output_path = HF_DATA_DIR / "nsmc_corpus.txt"
saved_texts = save_text_corpus(nsmc_texts, corpus_output_path)
print(f"Saved {saved_texts:,} texts to {corpus_output_path}")

Saved 47,976 texts to /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/data/huggingface_korean/nsmc_corpus.txt


## Summary Statistics

In [26]:
# Final report: loading outcome, per-dataset counts, and the files written.
print("\n" + "=" * 60)
print("HuggingFace Korean Data Loading Summary")
print("=" * 60)

# `results`, `successful` and `failed` come from the sequential loading cell.
print(f"\nLoading Performance:")
print(f"  Mode: Sequential (memory-safe)")
print(f"  Datasets attempted: {len(results)}")
print(f"  Successful: {len(successful)}")
print(f"  Failed: {len(failed)}")

# Counts before merging/deduplication.
print(f"\nDatasets Loaded:")
print(f"  MS MARCO Korean Triplets: {len(msmarco_triplets):,}")
print(f"  KLUE NLI Pairs: {len(klue_nli_pairs):,}")
print(f"  KLUE STS Pairs: {len(klue_sts_pairs):,}")
print(f"  KorQuAD Pairs: {len(korquad_pairs):,}")
print(f"  KoBEST COPA Pairs: {len(kobest_pairs):,}")
print(f"  Naver News Pairs: {len(naver_news_pairs):,}")
print(f"  NSMC Corpus: {len(nsmc_texts):,} texts")

print(f"\nTotal Unique Pairs: {len(unique_pairs):,}")

# List every entry directly under the output directory with its size
# (bytes / 1024 / 1024), sorted by path.
print(f"\nOutput Files:")
for f in sorted(HF_DATA_DIR.glob("*")):
    size_mb = f.stat().st_size / 1024 / 1024
    print(f"  {f.name}: {size_mb:.2f} MB")


HuggingFace Korean Data Loading Summary

Loading Performance:
  Mode: Sequential (memory-safe)
  Datasets attempted: 7
  Successful: 7
  Failed: 0

Datasets Loaded:
  MS MARCO Korean Triplets: 50,000
  KLUE NLI Pairs: 17,050
  KLUE STS Pairs: 6,016
  KorQuAD Pairs: 60,000
  KoBEST COPA Pairs: 3,076
  Naver News Pairs: 20,000
  NSMC Corpus: 47,976 texts

Total Unique Pairs: 146,680

Output Files:
  huggingface_synonym_pairs.jsonl: 68.12 MB
  msmarco_triplets.jsonl: 43.01 MB
  nsmc_corpus.txt: 4.18 MB


## Next Steps

1. Run `01_data_augmentation.ipynb` to merge with existing synonym pairs
2. The HuggingFace data will be automatically incorporated into training