# v19 Data Ingestion

This notebook collects Korean-English term pairs from multiple sources for training the cross-lingual SPLADE model.

## Data Sources

| Source | Description | Expected Pairs |
|--------|-------------|----------------|
| MUSE | Facebook's bilingual dictionary | ~20,000 |
| Wikidata | Entity labels with Korean/English | ~50,000 |
| IT Terminology | Technical terms | ~200 |

## Output

- `dataset/v19_high_quality/term_pairs.jsonl` - Raw term pairs for preprocessing

In [1]:
import sys
from pathlib import Path

# Find project root
def find_project_root():
    """Locate the project root by walking up from the current directory.

    The working directory and then each of its ancestors (nearest first) is
    checked for a ``pyproject.toml`` file or a ``src`` entry; the first
    directory that has either is returned.

    Returns:
        The matching directory, or the grandparent of the working directory
        when no directory on the way up contains a marker.
    """
    marker_names = ("pyproject.toml", "src")
    start_dir = Path.cwd()
    for candidate in (start_dir, *start_dir.parents):
        if any((candidate / marker).exists() for marker in marker_names):
            return candidate
    # No marker anywhere up the tree: fall back to two levels above the cwd.
    return Path.cwd().parent.parent

# Resolve the repository root once; later cells build their paths from it.
PROJECT_ROOT = find_project_root()
# Make project modules importable. The membership check keeps this cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

Project root: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train


In [2]:
import json
import re
import time
from collections import defaultdict
from typing import List, Dict, Set, Tuple

import requests
from tqdm.notebook import tqdm

# Destination folder for everything this notebook writes; created on demand
# (including missing parents) and left untouched when it already exists.
OUTPUT_DIR = PROJECT_ROOT.joinpath("dataset", "v19_high_quality")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

print(f"Output directory: {OUTPUT_DIR}")

Output directory: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/dataset/v19_high_quality


## Configuration

In [3]:
# Tunable settings read by the collection and filtering cells.
CONFIG = dict(
    # Quality filters: accepted length bounds (in characters) per language
    min_ko_length=2,
    max_ko_length=20,
    min_en_length=2,
    max_en_length=30,

    # Request settings
    request_timeout=120,  # Passed to requests.get as the timeout
    wikidata_delay=2.0,  # Delay between Wikidata queries
)

print("Configuration:")
print("\n".join(f"  {name}: {value}" for name, value in CONFIG.items()))

Configuration:
  min_ko_length: 2
  max_ko_length: 20
  min_en_length: 2
  max_en_length: 30
  request_timeout: 120
  wikidata_delay: 2.0


## Helper Functions

In [4]:
def is_valid_korean(text: str) -> bool:
    """Return True when ``text`` has at least one character in U+AC00..U+D7A3.

    Only that single code-point range is tested, so characters outside it
    (for example standalone jamo such as ``ㅋ``) do not count.
    """
    range_start, range_end = '\uac00', '\ud7a3'
    for ch in text:
        if range_start <= ch <= range_end:
            return True
    return False


def is_valid_english(text: str) -> bool:
    """Return True when ``text`` is acceptable as an English term.

    A term is accepted when it contains at least one ASCII letter and is not
    an all-uppercase string longer than 5 characters. Digits, spaces and
    punctuation next to the letters do not cause a rejection.
    """
    if not text:
        return False
    # Long all-caps strings are treated as abbreviations and rejected.
    if len(text) > 5 and text.isupper():
        return False
    for ch in text:
        if ch.isascii() and ch.isalpha():
            return True
    return False


def clean_text(text: str) -> str:
    """Normalise a label: drop ``(...)`` groups and collapse whitespace.

    Each parenthesised group is removed together with the whitespace directly
    before it, every run of whitespace becomes a single space, and the result
    is stripped at both ends.
    """
    trimmed = text.strip()
    # Remove parenthetical content
    without_parens = re.sub(r'\s*\([^)]*\)', '', trimmed)
    # Remove extra whitespace
    collapsed = re.sub(r'\s+', ' ', without_parens)
    return collapsed.strip()


# Smoke-check both validators on one known-good sample each, so a broken
# definition shows up immediately in this cell's output.
print("Helper functions defined.")
print(f"  is_valid_korean('안녕'): {is_valid_korean('안녕')}")
print(f"  is_valid_english('hello'): {is_valid_english('hello')}")

Helper functions defined.
  is_valid_korean('안녕'): True
  is_valid_english('hello'): True


## 1. MUSE Bilingual Dictionary

Facebook's MUSE project provides high-quality bilingual dictionaries.
- KO -> EN: https://dl.fbaipublicfiles.com/arrival/dictionaries/ko-en.txt
- EN -> KO: https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ko.txt

In [5]:
def collect_muse_dictionary() -> List[Dict]:
    """Download the MUSE ko-en and en-ko dictionaries and extract term pairs.

    Every dictionary line is split on whitespace and its first two tokens are
    taken as source and target word. A pair is kept when the Korean side
    passes ``is_valid_korean``, the English side passes ``is_valid_english``
    and both meet the minimum lengths from ``CONFIG``.

    Download problems (non-200 status, empty body, any exception) are printed
    and the function carries on with the next URL.

    Returns:
        List of dicts with keys ``"ko"``, ``"en"`` (lower-cased) and
        ``"source"`` (always ``"muse"``). Pairs found in both files are not
        merged here.
    """
    banner = "=" * 70
    print(banner)
    print("1. COLLECTING MUSE DICTIONARY")
    print(banner)

    collected = []

    # (url, language of the first column, language of the second column)
    dictionary_sources = [
        ("https://dl.fbaipublicfiles.com/arrival/dictionaries/ko-en.txt", "ko", "en"),
        ("https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ko.txt", "en", "ko"),
    ]

    for url, src_lang, tgt_lang in dictionary_sources:
        print(f"\nDownloading from {url}...")
        try:
            browser_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
            response = requests.get(
                url,
                timeout=CONFIG["request_timeout"],
                headers=browser_headers,
            )
            print(f"Status: {response.status_code}")

            if response.status_code != 200:
                print(f"Failed: {response.status_code}")
                continue

            # Force UTF-8 encoding for Korean characters
            response.encoding = 'utf-8'
            body = response.text.strip()
            if not body:
                print(f"Empty response from {url}")
                continue

            rows = body.split('\n')
            print(f"Got {len(rows):,} lines")

            for row in tqdm(rows, desc=f"MUSE ({src_lang}->{tgt_lang})"):
                tokens = row.strip().split()  # Split by whitespace
                if len(tokens) < 2:
                    continue

                # Column order depends on the direction of the file.
                if src_lang == "ko":
                    ko_word, en_word = tokens[0], tokens[1]
                else:
                    en_word, ko_word = tokens[0], tokens[1]

                if not (is_valid_korean(ko_word) and is_valid_english(en_word)):
                    continue
                if (len(ko_word) >= CONFIG["min_ko_length"] and
                        len(en_word) >= CONFIG["min_en_length"]):
                    collected.append({
                        "ko": ko_word,
                        "en": en_word.lower(),
                        "source": "muse"
                    })
        except Exception as e:
            print(f"Error: {e}")

    print(f"\nCollected {len(collected):,} pairs from MUSE")
    return collected


# Download and parse both MUSE dictionaries now (needs network access).
muse_pairs = collect_muse_dictionary()

1. COLLECTING MUSE DICTIONARY

Downloading from https://dl.fbaipublicfiles.com/arrival/dictionaries/ko-en.txt...
Status: 200
Got 20,549 lines


MUSE (ko->en):   0%|          | 0/20549 [00:00<?, ?it/s]


Downloading from https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ko.txt...
Status: 200
Got 22,357 lines


MUSE (en->ko):   0%|          | 0/22357 [00:00<?, ?it/s]


Collected 41,032 pairs from MUSE


## 2. Wikidata Labels

Wikidata provides entity labels in multiple languages. We query for entities with both Korean and English labels.

In [None]:
def collect_wikidata_labels() -> List[Dict]:
    """Collect KO-EN pairs from Wikidata entity labels.

    Sends a fixed set of SPARQL queries to the Wikidata query endpoint, runs
    each returned Korean/English label through ``clean_text`` and keeps the
    pairs that pass the validity and minimum-length checks.

    Failures are best-effort: a non-200 response or any exception is printed
    and the remaining queries still run. Consecutive queries are always
    separated by ``CONFIG["wikidata_delay"]`` seconds, also after a failure.

    Returns:
        List of dicts with keys ``"ko"``, ``"en"`` (lower-cased) and
        ``"source"`` (always ``"wikidata"``). Duplicates are not removed here.
    """
    print("\n" + "=" * 70)
    print("2. COLLECTING WIKIDATA LABELS")
    print("=" * 70)

    pairs = []
    endpoint = "https://query.wikidata.org/sparql"
    headers = {
        "User-Agent": "KoEnTermCollector/1.0",
        "Accept": "application/json"
    }

    # Multiple queries to get diverse data
    queries = [
        # Query 1: Common entities (people, places, things)
        """
        SELECT ?item ?koLabel ?enLabel WHERE {
          ?item wdt:P31 ?type .
          ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
          ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
          FILTER(STRLEN(?koLabel) >= 2 && STRLEN(?koLabel) <= 20)
          FILTER(STRLEN(?enLabel) >= 2 && STRLEN(?enLabel) <= 30)
        }
        LIMIT 30000
        """,

        # Query 2: Scientific and technical terms
        """
        SELECT ?item ?koLabel ?enLabel WHERE {
          { ?item wdt:P31 wd:Q11862829 } UNION  # academic discipline
          { ?item wdt:P31 wd:Q5633421 } UNION   # scientific journal
          { ?item wdt:P31 wd:Q7397 } UNION      # software
          { ?item wdt:P31 wd:Q28243 }           # programming language
          ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
          ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
        }
        LIMIT 20000
        """,

        # Query 3: Organizations and companies
        """
        SELECT ?item ?koLabel ?enLabel WHERE {
          { ?item wdt:P31 wd:Q4830453 } UNION   # business
          { ?item wdt:P31 wd:Q43229 }           # organization
          ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
          ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
          FILTER(STRLEN(?koLabel) >= 2 && STRLEN(?koLabel) <= 15)
        }
        LIMIT 15000
        """,

        # Query 4: Concepts and abstract terms
        """
        SELECT ?item ?koLabel ?enLabel WHERE {
          { ?item wdt:P31 wd:Q35120 } UNION     # entity
          { ?item wdt:P31 wd:Q151885 }          # concept
          ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
          ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
          FILTER(STRLEN(?koLabel) >= 2 && STRLEN(?koLabel) <= 10)
          FILTER(STRLEN(?enLabel) >= 3 && STRLEN(?enLabel) <= 20)
        }
        LIMIT 10000
        """
    ]

    for i, query in enumerate(queries):
        # Rate limiting: pause before every query except the first. Placing
        # the pause here means it also happens after a failed or timed-out
        # request, and no time is wasted sleeping after the last query.
        if i > 0:
            time.sleep(CONFIG["wikidata_delay"])

        print(f"\nExecuting Wikidata query {i+1}/{len(queries)}...")
        try:
            response = requests.get(
                endpoint,
                params={"query": query, "format": "json"},
                headers=headers,
                # Shared setting, so the timeout is tuned in one place for
                # both the MUSE and Wikidata downloads.
                timeout=CONFIG["request_timeout"]
            )

            if response.status_code == 200:
                data = response.json()
                results = data.get("results", {}).get("bindings", [])

                for item in tqdm(results, desc=f"Wikidata Q{i+1}"):
                    ko_label = item.get("koLabel", {}).get("value", "")
                    en_label = item.get("enLabel", {}).get("value", "")

                    ko_label = clean_text(ko_label)
                    en_label = clean_text(en_label)

                    if (is_valid_korean(ko_label) and
                        is_valid_english(en_label) and
                        len(ko_label) >= CONFIG["min_ko_length"] and
                        len(en_label) >= CONFIG["min_en_length"]):
                        pairs.append({
                            "ko": ko_label,
                            "en": en_label.lower(),
                            "source": "wikidata"
                        })
            else:
                print(f"Query {i+1} failed: {response.status_code}")

        except Exception as e:
            # Best effort: report the problem and continue with the next query.
            print(f"Query {i+1} error: {e}")

    print(f"\nCollected {len(pairs):,} pairs from Wikidata")
    return pairs


# Run the SPARQL queries now (needs network access to query.wikidata.org).
wikidata_pairs = collect_wikidata_labels()


2. COLLECTING WIKIDATA LABELS

Executing Wikidata query 1/4...


## 3. IT/Tech Terminology

Curated list of common IT and technical terms.

In [None]:
def collect_it_terminology() -> List[Dict]:
    """Collect IT/Tech terminology pairs from the curated list below.

    Exact repeats of a ``(korean, english)`` tuple are dropped (first
    occurrence wins) so the printed count reflects distinct terms. No length
    filtering happens here.

    Returns:
        List of dicts with keys ``"ko"``, ``"en"`` (lower-cased) and
        ``"source"`` (always ``"it_terminology"``), in list order.
    """
    print("\n" + "=" * 70)
    print("3. COLLECTING IT/TECH TERMINOLOGY")
    print("=" * 70)

    # Extended IT terminology list
    it_terms = [
        # Programming
        ("프로그램", "program"), ("프로그래밍", "programming"), ("코드", "code"),
        ("코딩", "coding"), ("소프트웨어", "software"), ("하드웨어", "hardware"),
        ("알고리즘", "algorithm"), ("함수", "function"), ("변수", "variable"),
        ("클래스", "class"), ("객체", "object"), ("메서드", "method"),
        ("인터페이스", "interface"), ("모듈", "module"), ("라이브러리", "library"),
        ("프레임워크", "framework"), ("패키지", "package"), ("컴파일러", "compiler"),
        ("인터프리터", "interpreter"), ("디버깅", "debugging"), ("테스트", "test"),
        ("배포", "deployment"), ("버전", "version"), ("업데이트", "update"),

        # Web/Network
        ("네트워크", "network"), ("서버", "server"), ("클라이언트", "client"),
        ("데이터베이스", "database"), ("쿼리", "query"), ("인덱스", "index"),
        ("캐시", "cache"), ("프록시", "proxy"), ("방화벽", "firewall"),
        ("프로토콜", "protocol"), ("도메인", "domain"), ("호스트", "host"),
        ("라우터", "router"), ("스위치", "switch"), ("게이트웨이", "gateway"),

        # Data/AI
        ("데이터", "data"), ("정보", "information"), ("분석", "analysis"),
        ("머신러닝", "machine learning"), ("딥러닝", "deep learning"),
        ("인공지능", "artificial intelligence"), ("신경망", "neural network"),
        ("모델", "model"), ("학습", "training"), ("추론", "inference"),
        ("예측", "prediction"), ("분류", "classification"), ("회귀", "regression"),
        ("클러스터링", "clustering"), ("임베딩", "embedding"), ("벡터", "vector"),
        ("텐서", "tensor"), ("가중치", "weight"), ("편향", "bias"),
        ("손실함수", "loss function"), ("최적화", "optimization"),

        # Cloud/DevOps
        ("클라우드", "cloud"), ("컨테이너", "container"), ("도커", "docker"),
        ("쿠버네티스", "kubernetes"), ("마이크로서비스", "microservice"),
        ("오케스트레이션", "orchestration"), ("스케일링", "scaling"),
        ("로드밸런서", "load balancer"), ("모니터링", "monitoring"),
        ("로깅", "logging"), ("파이프라인", "pipeline"), ("자동화", "automation"),

        # Security
        ("보안", "security"), ("인증", "authentication"), ("권한", "authorization"),
        ("암호화", "encryption"), ("복호화", "decryption"), ("해시", "hash"),
        ("토큰", "token"), ("세션", "session"), ("쿠키", "cookie"),

        # UI/UX
        ("사용자", "user"), ("인터페이스", "interface"), ("디자인", "design"),
        ("레이아웃", "layout"), ("컴포넌트", "component"), ("위젯", "widget"),
        ("버튼", "button"), ("메뉴", "menu"), ("네비게이션", "navigation"),
        ("폼", "form"), ("입력", "input"), ("출력", "output"),

        # General tech
        ("시스템", "system"), ("플랫폼", "platform"), ("애플리케이션", "application"),
        ("서비스", "service"), ("솔루션", "solution"), ("아키텍처", "architecture"),
        ("인프라", "infrastructure"), ("리소스", "resource"), ("환경", "environment"),
        ("설정", "configuration"), ("옵션", "option"), ("파라미터", "parameter"),
        ("프로세스", "process"), ("스레드", "thread"), ("메모리", "memory"),
        ("스토리지", "storage"), ("파일", "file"), ("폴더", "folder"),
        ("디렉토리", "directory"), ("경로", "path"), ("확장자", "extension"),

        # Search/NLP
        ("검색", "search"), ("색인", "indexing"), ("랭킹", "ranking"),
        ("토큰화", "tokenization"), ("형태소", "morpheme"), ("어휘", "vocabulary"),
        ("말뭉치", "corpus"), ("문서", "document"), ("텍스트", "text"),
        ("자연어처리", "natural language processing"), ("번역", "translation"),
        ("요약", "summarization"), ("질의응답", "question answering"),

        # Additional common terms
        ("기능", "feature"), ("성능", "performance"), ("효율", "efficiency"),
        ("정확도", "accuracy"), ("정밀도", "precision"), ("재현율", "recall"),
        ("오류", "error"), ("예외", "exception"), ("버그", "bug"),
        ("이슈", "issue"), ("태스크", "task"), ("작업", "job"),
        ("요청", "request"), ("응답", "response"), ("상태", "status"),
        ("이벤트", "event"), ("핸들러", "handler"), ("콜백", "callback"),
        ("비동기", "asynchronous"), ("동기", "synchronous"), ("병렬", "parallel"),
        ("순차", "sequential"), ("반복", "iteration"), ("재귀", "recursion"),

        # More IT terms
        ("아이디", "id"), ("비밀번호", "password"), ("로그인", "login"),
        ("로그아웃", "logout"), ("계정", "account"), ("프로필", "profile"),
        ("설치", "installation"), ("다운로드", "download"), ("업로드", "upload"),
        ("동기화", "synchronization"), ("백업", "backup"), ("복원", "restore"),
        ("삭제", "delete"), ("수정", "edit"), ("생성", "create"),
        ("조회", "read"), ("갱신", "update"), ("추가", "add"),
        ("제거", "remove"), ("복사", "copy"), ("붙여넣기", "paste"),
        ("실행", "execute"), ("중지", "stop"), ("재시작", "restart"),
        ("초기화", "initialize"), ("종료", "terminate"), ("시작", "start"),

        # Hardware/OS
        ("운영체제", "operating system"), ("커널", "kernel"), ("드라이버", "driver"),
        ("프로세서", "processor"), ("그래픽카드", "graphics card"),
        ("마더보드", "motherboard"), ("전원공급장치", "power supply"),
        ("랜카드", "network card"), ("사운드카드", "sound card"),

        # Mobile
        ("앱", "app"), ("모바일", "mobile"), ("스마트폰", "smartphone"),
        ("태블릿", "tablet"), ("터치스크린", "touchscreen"), ("제스처", "gesture"),
        ("알림", "notification"), ("푸시", "push"), ("위치정보", "location"),
    ]

    # The list repeats at least one tuple: ("인터페이스", "interface") sits under
    # both "Programming" and "UI/UX". dict.fromkeys drops exact repeats while
    # keeping first-seen order, so the count printed below is not inflated.
    distinct_terms = list(dict.fromkeys(it_terms))

    pairs = [
        {"ko": ko, "en": en.lower(), "source": "it_terminology"}
        for ko, en in distinct_terms
        if is_valid_korean(ko) and is_valid_english(en)
    ]

    print(f"Collected {len(pairs):,} IT/Tech terms")
    return pairs


# Build the curated IT/Tech pairs; this step is purely local (no network).
it_pairs = collect_it_terminology()

## 4. Combine and Filter Data

In [None]:
# Combine all pairs
print("\n" + "=" * 70)
print("COMBINING ALL DATA")
print("=" * 70)

# Concatenation order matters: filter_and_deduplicate keeps the first
# occurrence of each (ko, en) key, so a pair present in several sources is
# attributed to the earliest list here (MUSE, then Wikidata, then IT).
all_pairs = muse_pairs + wikidata_pairs + it_pairs
print(f"\nTotal raw pairs: {len(all_pairs):,}")
print(f"  MUSE: {len(muse_pairs):,}")
print(f"  Wikidata: {len(wikidata_pairs):,}")
print(f"  IT Terminology: {len(it_pairs):,}")

In [None]:
def filter_and_deduplicate(pairs: List[Dict]) -> List[Dict]:
    """Apply the quality filters from ``CONFIG`` and drop repeated pairs.

    A pair is rejected on the first failing check, tested in this order:
    Korean validity, Korean min/max length, English validity, English min/max
    length. Rejection counts per reason are printed, most frequent first.
    Surviving pairs are then deduplicated on ``(ko, en)``; the first
    occurrence is kept.

    Args:
        pairs: Dicts expected to carry ``"ko"`` and ``"en"``; a missing key is
            treated as an empty string and therefore rejected.

    Returns:
        The unique surviving pair dicts, in input order.
    """
    print("\n" + "=" * 70)
    print("FILTERING AND DEDUPLICATION")
    print("=" * 70)

    def _rejection_reason(ko_text, en_text):
        """Name of the first failed check, or None when the pair is acceptable."""
        # Check Korean
        if not is_valid_korean(ko_text):
            return "no_korean"
        if len(ko_text) < CONFIG["min_ko_length"]:
            return "ko_too_short"
        if len(ko_text) > CONFIG["max_ko_length"]:
            return "ko_too_long"
        # Check English
        if not is_valid_english(en_text):
            return "en_no_letters"
        if len(en_text) < CONFIG["min_en_length"]:
            return "en_too_short"
        if len(en_text) > CONFIG["max_en_length"]:
            return "en_too_long"
        return None

    # Quality filtering
    kept = []
    reject_tally = defaultdict(int)

    for entry in tqdm(pairs, desc="Filtering"):
        reason = _rejection_reason(entry.get("ko", ""), entry.get("en", ""))
        if reason is None:
            kept.append(entry)
        else:
            reject_tally[reason] += 1

    print(f"Filtered: {len(pairs):,} -> {len(kept):,}")
    if reject_tally:
        print("Rejection reasons:")
        by_frequency = sorted(reject_tally.items(), key=lambda item: -item[1])
        for reason, count in by_frequency:
            print(f"  {reason}: {count:,}")

    # Deduplicate by (ko, en) pair
    seen_keys = set()
    unique_pairs = []

    for entry in tqdm(kept, desc="Deduplicating"):
        pair_key = (entry["ko"], entry["en"])
        if pair_key in seen_keys:
            continue
        seen_keys.add(pair_key)
        unique_pairs.append(entry)

    print(f"After deduplication: {len(unique_pairs):,}")
    return unique_pairs


# Apply the CONFIG length bounds and drop repeated (ko, en) pairs.
final_pairs = filter_and_deduplicate(all_pairs)

## 5. Statistics and Save

In [None]:
# Final statistics
import random

# Fixed seed for the preview below, so the sampled rows are identical on every
# Restart & Run All instead of changing from run to run.
SAMPLE_SEED = 42

print("\n" + "=" * 70)
print("FINAL STATISTICS")
print("=" * 70)

print(f"\nTotal unique pairs: {len(final_pairs):,}")

# Count by source
source_counts = defaultdict(int)
for pair in final_pairs:
    source_counts[pair["source"]] += 1

print("\nPairs by source:")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    # This loop only runs when final_pairs is non-empty, so the division is safe.
    pct = count / len(final_pairs) * 100
    print(f"  {source}: {count:,} ({pct:.1f}%)")

# Sample data: at most 10 pairs, drawn with a dedicated seeded generator so
# the global random state is left alone.
print("\nSample pairs:")
sample_rng = random.Random(SAMPLE_SEED)
for pair in sample_rng.sample(final_pairs, min(10, len(final_pairs))):
    print(f"  {pair['ko']} -> {pair['en']} ({pair['source']})")

In [None]:
# Save to JSONL: one JSON object per line; ensure_ascii=False keeps the Korean
# text as literal UTF-8 instead of \u escapes.
output_path = OUTPUT_DIR / "term_pairs.jsonl"

with output_path.open("w", encoding="utf-8") as jsonl_file:
    for record in tqdm(final_pairs, desc="Saving"):
        serialized = json.dumps(record, ensure_ascii=False)
        jsonl_file.write(serialized + "\n")

print(f"\nSaved to: {output_path}")
size_in_kb = output_path.stat().st_size / 1024
print(f"File size: {size_in_kb:.1f} KB")

## Summary

Data collection complete! The term pairs have been saved to `dataset/v19_high_quality/term_pairs.jsonl`.

### Next Steps

1. **Run `01_data_preparation.ipynb`** - Process and cluster terms
2. **Run `02_training.ipynb`** - Train the SPLADE model
3. **Run `03_inference_test.ipynb`** - Test the trained model

In [None]:
# Closing banner. Reads output_path and final_pairs, so the save and filtering
# cells must already have run in this kernel session.
print("=" * 70)
print("DATA COLLECTION COMPLETE")
print("=" * 70)
print(f"\nOutput: {output_path}")
print(f"Total pairs: {len(final_pairs):,}")
print("\nNext: Run 01_data_preparation.ipynb to process this data")