# v19 Data Ingestion

This notebook collects Korean-English term pairs for training the cross-lingual SPLADE model.

## Data Sources

| Source | Description | Expected Pairs |
|--------|-------------|----------------|
| MUSE | Facebook's bilingual dictionary (ko-en, en-ko) | ~40K |
| Wikidata | Entity labels from knowledge graph | ~50K |
| IT Terminology | Technical terms (curated, plus single-word expansions of multi-word terms) | ~200 |

## Target: 70K+ High-Quality Term Pairs

In [None]:
import sys
from pathlib import Path

def find_project_root():
    """Locate the project root by walking upward from the working directory.

    Returns:
        The nearest directory, starting with the current working directory
        and moving up through its parents, that contains an entry named
        ``pyproject.toml`` or ``src``. If no such directory exists, the
        directory two levels above the current working directory is returned.
    """
    start = Path.cwd()
    markers = ("pyproject.toml", "src")
    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    # No marker found anywhere up the tree: fall back to a fixed relative guess.
    return Path.cwd().parent.parent

# Resolve the project root once and prepend it to sys.path (the module search
# path) so it takes precedence over later entries when importing.
PROJECT_ROOT = find_project_root()
sys.path.insert(0, str(PROJECT_ROOT))
print(f"Project root: {PROJECT_ROOT}")

In [None]:
import json
import re
import time
from collections import defaultdict
from typing import List, Dict

import requests
from tqdm.notebook import tqdm

# Everything this notebook writes goes under <project root>/dataset/v19_high_quality;
# the directory (and any missing parents) is created up front.
OUTPUT_DIR = PROJECT_ROOT.joinpath("dataset", "v19_high_quality")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
print(f"Output directory: {OUTPUT_DIR}")

## Configuration

In [None]:
# Shared settings for the collection and filtering steps.
CONFIG = dict(
    # Inclusive character-length bounds applied to each side of a pair.
    min_ko_length=2,
    max_ko_length=30,
    min_en_length=2,
    max_en_length=50,
    # HTTP timeout (seconds) and the pause (seconds) between Wikidata queries.
    request_timeout=120,
    wikidata_delay=2.0,
)

# Echo the effective settings so they are recorded in the notebook output.
print("Configuration:")
for k, v in CONFIG.items():
    print(f"  {k}: {v}")

## Helper Functions

In [None]:
def is_valid_korean(text: str) -> bool:
    """Return True if *text* has at least one character in U+AC00..U+D7A3.

    That range holds the precomposed Hangul syllables; standalone jamo and
    other scripts do not count. An empty string yields False.
    """
    for ch in text:
        if 0xAC00 <= ord(ch) <= 0xD7A3:
            return True
    return False


def is_valid_english(text: str) -> bool:
    """Return True if *text* looks like a usable English term.

    The text must be non-empty and contain at least one ASCII letter; other
    characters (spaces, digits, punctuation) are tolerated. Strings longer
    than five characters whose cased letters are all upper case are rejected.
    """
    if not text:
        return False
    # Long all-caps strings are rejected outright.
    if len(text) > 5 and text.isupper():
        return False
    return any(ch.isascii() and ch.isalpha() for ch in text)


def clean_text(text: str) -> str:
    """Strip *text* and cut it at the first "(" when it has parentheses.

    If the stripped text contains both "(" and ")", everything from the first
    "(" onward is dropped and the remainder is stripped again. When nothing
    would remain (the text starts with "("), or when either parenthesis is
    missing, the stripped text is returned unchanged.
    """
    stripped = text.strip()
    if '(' not in stripped or ')' not in stripped:
        return stripped
    head = stripped.partition('(')[0].strip()
    return head or stripped


def extract_english_words(text: str) -> List[str]:
    """Split *text* on whitespace and return its plain English words.

    Each token is lower-cased and kept only if it is made up entirely of
    ASCII letters and is at least two characters long. Order and duplicates
    are preserved.
    """
    lowered = (token.lower() for token in text.split())
    return [
        token
        for token in lowered
        if len(token) >= 2 and token.isascii() and token.isalpha()
    ]


# Smoke-check the helpers above on known inputs; the results are printed for
# visual inspection only (nothing is asserted).
print("Validation Tests:")
print(f"  is_valid_korean('프로그램'): {is_valid_korean('프로그램')}")
print(f"  is_valid_english('program'): {is_valid_english('program')}")
print(f"  extract_english_words('machine learning'): {extract_english_words('machine learning')}")

## 1. MUSE Bilingual Dictionary

MUSE contains high-quality bilingual word pairs from Facebook Research.

In [None]:
def collect_muse_dictionary() -> List[Dict]:
    """Download the MUSE ko-en and en-ko dictionaries and return KO-EN pairs.

    Each non-empty response is read as UTF-8 text, one entry per line; the
    first two whitespace-separated fields of a line are taken as the source
    and target words. A pair is kept when the Korean side passes
    ``is_valid_korean``, the English side passes ``is_valid_english``, and
    both meet the minimum lengths in ``CONFIG``.

    Download problems (non-200 status, empty body, any exception) are printed
    and that file is skipped, so the result may be partial or empty.

    Returns:
        A list of ``{"ko", "en", "source"}`` dicts with ``source`` set to
        ``"muse"`` and ``en`` lower-cased. Duplicates are not removed.
    """
    banner = "=" * 70
    print(banner)
    print("1. COLLECTING MUSE DICTIONARY")
    print(banner)

    # (url, language of the first column, language of the second column)
    dictionaries = [
        ("https://dl.fbaipublicfiles.com/arrival/dictionaries/ko-en.txt", "ko", "en"),
        ("https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ko.txt", "en", "ko"),
    ]
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    collected = []

    for url, src_lang, tgt_lang in dictionaries:
        print(f"\nDownloading from {url}...")
        try:
            resp = requests.get(
                url,
                timeout=CONFIG["request_timeout"],
                headers=request_headers,
            )
            print(f"Status: {resp.status_code}")

            if resp.status_code != 200:
                print(f"Failed: {resp.status_code}")
                continue

            # Force UTF-8 decoding instead of relying on the detected charset.
            resp.encoding = 'utf-8'
            body = resp.text.strip()
            if not body:
                print(f"Empty response from {url}")
                continue

            rows = body.split('\n')
            print(f"Got {len(rows):,} lines")

            for row in tqdm(rows, desc=f"MUSE ({src_lang}->{tgt_lang})"):
                fields = row.strip().split()
                if len(fields) < 2:
                    continue

                # Column order depends on which direction this file covers.
                if src_lang == "ko":
                    ko_word, en_word = fields[0], fields[1]
                else:
                    en_word, ko_word = fields[0], fields[1]

                if not (is_valid_korean(ko_word) and is_valid_english(en_word)):
                    continue
                if len(ko_word) < CONFIG["min_ko_length"]:
                    continue
                if len(en_word) < CONFIG["min_en_length"]:
                    continue

                collected.append({
                    "ko": ko_word,
                    "en": en_word.lower(),
                    "source": "muse"
                })
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(f"Error: {e}")

    print(f"\nCollected {len(collected):,} pairs from MUSE")
    return collected


# Run the MUSE download now; the combine step further down reads `muse_pairs`.
muse_pairs = collect_muse_dictionary()

## 2. Wikidata Labels

Query Wikidata for entity labels in Korean and English.

In [None]:
def collect_wikidata_labels() -> List[Dict]:
    """Collect KO-EN term pairs from Wikidata with multiple SPARQL queries.

    Each query is sent to the public Wikidata SPARQL endpoint. Every returned
    Korean/English label pair is passed through ``clean_text`` and kept when
    the Korean side passes ``is_valid_korean``, the English side passes
    ``is_valid_english``, and both meet the minimum lengths in ``CONFIG``.

    A query that fails (non-200 status or any exception) is reported and
    skipped, so the result may be partial or empty. Consecutive queries are
    separated by ``CONFIG["wikidata_delay"]`` seconds regardless of whether
    the previous query succeeded.

    Returns:
        A list of ``{"ko", "en", "source"}`` dicts with ``source`` set to
        ``"wikidata"`` and ``en`` lower-cased. Duplicates are not removed.
    """
    print("\n" + "=" * 70)
    print("2. COLLECTING WIKIDATA LABELS")
    print("=" * 70)

    pairs = []
    sparql_url = "https://query.wikidata.org/sparql"

    queries = [
        # General entities
        """
        SELECT ?koLabel ?enLabel WHERE {
            ?item wdt:P31 ?type .
            ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
            ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
            FILTER(STRLEN(?koLabel) >= 2 && STRLEN(?koLabel) <= 20)
            FILTER(STRLEN(?enLabel) >= 2 && STRLEN(?enLabel) <= 30)
        } LIMIT 30000
        """,
        # Organizations
        """
        SELECT ?koLabel ?enLabel WHERE {
            ?item wdt:P31/wdt:P279* wd:Q43229 .
            ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
            ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
            FILTER(STRLEN(?koLabel) >= 2)
        } LIMIT 20000
        """,
        # Scientific concepts
        # NOTE(review): Q35120 appears to be Wikidata's root "entity" class
        # rather than a science-specific one - confirm the intended class.
        """
        SELECT ?koLabel ?enLabel WHERE {
            ?item wdt:P31/wdt:P279* wd:Q35120 .
            ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
            ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
            FILTER(STRLEN(?koLabel) >= 2)
        } LIMIT 20000
        """,
        # Software/Technology
        """
        SELECT ?koLabel ?enLabel WHERE {
            ?item wdt:P31/wdt:P279* wd:Q7397 .
            ?item rdfs:label ?koLabel . FILTER(LANG(?koLabel) = "ko")
            ?item rdfs:label ?enLabel . FILTER(LANG(?enLabel) = "en")
            FILTER(STRLEN(?koLabel) >= 2)
        } LIMIT 10000
        """,
    ]

    delay = CONFIG["wikidata_delay"]

    for i, query in enumerate(queries):
        # Pause before every query except the first. Doing it here, outside
        # the try block, keeps the gap between requests even when the
        # previous query raised, and avoids a useless sleep after the last one.
        if i > 0:
            time.sleep(delay)

        print(f"\nExecuting Wikidata query {i + 1}/{len(queries)}...")
        try:
            response = requests.get(
                sparql_url,
                params={"query": query, "format": "json"},
                headers={"User-Agent": "TermCollector/2.0 (v19 data collection)"},
                # Fixed 300 s timeout; CONFIG["request_timeout"] is not used here.
                timeout=300
            )

            if response.status_code != 200:
                print(f"Query {i + 1} failed: {response.status_code}")
                continue

            data = response.json()
            results = data.get("results", {}).get("bindings", [])
            print(f"Got {len(results):,} results")

            for item in tqdm(results, desc=f"Wikidata Q{i + 1}"):
                ko_label = item.get("koLabel", {}).get("value", "")
                en_label = item.get("enLabel", {}).get("value", "")

                if not (ko_label and en_label):
                    continue

                ko_clean = clean_text(ko_label)
                en_clean = clean_text(en_label)

                if (is_valid_korean(ko_clean) and
                    is_valid_english(en_clean) and
                    len(ko_clean) >= CONFIG["min_ko_length"] and
                    len(en_clean) >= CONFIG["min_en_length"]):
                    pairs.append({
                        "ko": ko_clean,
                        "en": en_clean.lower(),
                        "source": "wikidata"
                    })

        except Exception as e:
            # Best effort: report the problem and continue with the next query.
            print(f"Wikidata error: {e}")

    print(f"\nCollected {len(pairs):,} pairs from Wikidata")
    return pairs


# Run the Wikidata queries now; the combine step further down reads `wikidata_pairs`.
wikidata_pairs = collect_wikidata_labels()

## 3. IT/Tech Terminology (Extended)

Curated IT and technical terminology for better domain coverage.

In [None]:
def collect_it_terminology() -> List[Dict]:
    """Build KO-EN pairs from the hand-curated IT/tech term list below.

    Every curated (Korean, English) entry yields one pair. When the English
    side is a multi-word phrase, one extra pair is added per constituent word
    of three or more letters (as returned by ``extract_english_words``), each
    mapped to the same Korean term.

    Returns:
        A list of ``{"ko", "en", "source"}`` dicts with ``source`` set to
        ``"it_terminology"`` and ``en`` lower-cased. Duplicates are not removed.
    """
    print("\n" + "=" * 70)
    print("3. COLLECTING IT/TECH TERMINOLOGY")
    print("=" * 70)

    it_terms = [
        # Machine Learning / AI
        ("머신러닝", "machine learning"), ("기계학습", "machine learning"),
        ("딥러닝", "deep learning"), ("심층학습", "deep learning"),
        ("인공지능", "artificial intelligence"), ("자연어처리", "natural language processing"),
        ("신경망", "neural network"), ("컴퓨터비전", "computer vision"),
        ("강화학습", "reinforcement learning"), ("지도학습", "supervised learning"),
        ("비지도학습", "unsupervised learning"), ("전이학습", "transfer learning"),
        ("트랜스포머", "transformer"), ("어텐션", "attention"),
        ("임베딩", "embedding"), ("벡터", "vector"), ("텐서", "tensor"),
        ("그래디언트", "gradient"), ("역전파", "backpropagation"),
        ("손실함수", "loss function"), ("최적화", "optimization"),
        ("정규화", "regularization"), ("드롭아웃", "dropout"),
        ("배치", "batch"), ("에폭", "epoch"), ("학습률", "learning rate"),
        ("하이퍼파라미터", "hyperparameter"), ("오버피팅", "overfitting"),
        ("분류", "classification"), ("회귀", "regression"),
        ("클러스터링", "clustering"), ("군집화", "clustering"),
        ("파인튜닝", "fine tuning"), ("사전학습", "pretraining"),
        ("토크나이저", "tokenizer"), ("토큰화", "tokenization"),

        # Programming
        ("프로그래밍", "programming"), ("코딩", "coding"),
        ("알고리즘", "algorithm"), ("자료구조", "data structure"),
        ("함수", "function"), ("변수", "variable"), ("클래스", "class"),
        ("객체", "object"), ("메서드", "method"), ("인스턴스", "instance"),
        ("상속", "inheritance"), ("캡슐화", "encapsulation"),
        ("인터페이스", "interface"), ("모듈", "module"), ("패키지", "package"),
        ("라이브러리", "library"), ("프레임워크", "framework"),
        ("컴파일러", "compiler"), ("인터프리터", "interpreter"),
        ("디버깅", "debugging"), ("테스트", "test"),
        ("배포", "deployment"), ("버전관리", "version control"),
        ("리팩토링", "refactoring"), ("코드리뷰", "code review"),

        # Database
        ("데이터베이스", "database"), ("쿼리", "query"),
        ("테이블", "table"), ("인덱스", "index"), ("키", "key"),
        ("조인", "join"), ("트랜잭션", "transaction"),
        ("스키마", "schema"), ("정규화", "normalization"),
        ("샤딩", "sharding"), ("파티셔닝", "partitioning"),
        ("캐싱", "caching"), ("레디스", "redis"),

        # Web / Network
        ("웹", "web"), ("웹사이트", "website"),
        ("서버", "server"), ("클라이언트", "client"),
        ("프론트엔드", "frontend"), ("백엔드", "backend"),
        ("네트워크", "network"), ("프로토콜", "protocol"),
        ("라우터", "router"), ("방화벽", "firewall"),
        ("로드밸런서", "load balancer"), ("프록시", "proxy"),

        # Cloud
        ("클라우드", "cloud"), ("클라우드컴퓨팅", "cloud computing"),
        ("가상화", "virtualization"), ("가상머신", "virtual machine"),
        ("컨테이너", "container"), ("도커", "docker"),
        ("쿠버네티스", "kubernetes"), ("마이크로서비스", "microservice"),
        ("서버리스", "serverless"), ("스케일링", "scaling"),

        # Security
        ("보안", "security"), ("암호화", "encryption"),
        ("복호화", "decryption"), ("해시", "hash"),
        ("인증", "authentication"), ("권한", "authorization"),
        ("인증서", "certificate"), ("취약점", "vulnerability"),

        # Search / IR
        ("검색", "search"), ("검색엔진", "search engine"),
        ("정보검색", "information retrieval"), ("인덱싱", "indexing"),
        ("랭킹", "ranking"), ("문서", "document"),
        ("시맨틱검색", "semantic search"), ("벡터검색", "vector search"),
        ("유사도", "similarity"), ("코사인유사도", "cosine similarity"),
        ("추천", "recommendation"), ("추천시스템", "recommendation system"),
        ("리랭킹", "reranking"), ("쿼리확장", "query expansion"),

        # Data Science
        ("데이터사이언스", "data science"), ("데이터분석", "data analysis"),
        ("데이터마이닝", "data mining"), ("빅데이터", "big data"),
        ("데이터시각화", "data visualization"), ("대시보드", "dashboard"),
        ("통계", "statistics"), ("확률", "probability"),
        ("분석", "analysis"), ("모니터링", "monitoring"),
    ]

    def _record(ko_term, en_term):
        # Single place that fixes the key order and the source tag.
        return {
            "ko": ko_term,
            "en": en_term,
            "source": "it_terminology"
        }

    pairs = []
    for ko, en in it_terms:
        pairs.append(_record(ko, en.lower()))
        if ' ' not in en:
            continue
        # Multi-word phrase: also map the Korean term to each component word.
        pairs.extend(
            _record(ko, word)
            for word in extract_english_words(en)
            if len(word) >= 3
        )

    print(f"Collected {len(pairs):,} IT/Tech terms")
    return pairs


# Build the curated IT pairs now; the combine step further down reads `it_pairs`.
it_pairs = collect_it_terminology()

## 4. Combine and Filter

In [None]:
# Concatenate the three collections (MUSE first, then Wikidata, then the
# curated IT list) and report how many raw pairs each one contributed.
print("\n" + "=" * 70)
print("COMBINING ALL DATA")
print("=" * 70)

all_pairs = muse_pairs + wikidata_pairs + it_pairs
print(f"\nTotal raw: {len(all_pairs):,}")
print(f"  MUSE: {len(muse_pairs):,}")
print(f"  Wikidata: {len(wikidata_pairs):,}")
print(f"  IT: {len(it_pairs):,}")

In [None]:
def filter_and_deduplicate(pairs: List[Dict]) -> List[Dict]:
    """Drop pairs that fail the quality checks, then remove duplicates.

    A pair is rejected on the first check it fails, in this order: Korean too
    short, English too short, Korean too long, English too long (bounds from
    ``CONFIG``), no Korean character, invalid English, and fewer than two
    ASCII letters on the English side. Rejection counts are printed.

    Duplicates are detected on ``(ko.strip(), en.strip().lower())``; the first
    occurrence is kept, so input order decides which source wins.

    Args:
        pairs: Dicts with at least ``"ko"`` and ``"en"`` string entries.

    Returns:
        The surviving pair dicts (the same objects, not copies), in input order.
    """
    print("\n" + "=" * 70)
    print("FILTERING AND DEDUPLICATION")
    print("=" * 70)

    def rejection_reason(ko, en):
        # Return the label of the first failed check, or None if the pair passes.
        if len(ko) < CONFIG["min_ko_length"]:
            return "ko_too_short"
        if len(en) < CONFIG["min_en_length"]:
            return "en_too_short"
        if len(ko) > CONFIG["max_ko_length"]:
            return "ko_too_long"
        if len(en) > CONFIG["max_en_length"]:
            return "en_too_long"
        if not is_valid_korean(ko):
            return "no_korean"
        if not is_valid_english(en):
            return "invalid_english"
        # Require at least two ASCII letters once everything else is removed.
        if len(re.sub(r'[^a-zA-Z]', '', en)) < 2:
            return "en_no_letters"
        return None

    filtered = []
    rejected = defaultdict(int)

    for pair in tqdm(pairs, desc="Filtering"):
        reason = rejection_reason(pair["ko"], pair["en"])
        if reason is None:
            filtered.append(pair)
        else:
            rejected[reason] += 1

    print(f"Filtered: {len(pairs):,} -> {len(filtered):,}")
    print("Rejection reasons:")
    most_common = sorted(rejected.items(), key=lambda item: item[1], reverse=True)
    for reason, count in most_common[:10]:
        print(f"  {reason}: {count:,}")

    # Deduplication: the first pair seen for a normalized (ko, en) key wins.
    seen = set()
    unique = []

    for pair in tqdm(filtered, desc="Deduplicating"):
        key = (pair["ko"].strip(), pair["en"].strip().lower())
        if key in seen:
            continue
        seen.add(key)
        unique.append(pair)

    print(f"After deduplication: {len(unique):,}")
    return unique


# Filter and deduplicate the combined list; the statistics and save cells read `final_pairs`.
final_pairs = filter_and_deduplicate(all_pairs)

## 5. Statistics and Save

In [None]:
print("\n" + "=" * 70)
print("FINAL STATISTICS")
print("=" * 70)

print(f"\nTotal pairs: {len(final_pairs):,}")

# Count how many surviving pairs each source contributed.
sources = defaultdict(int)
for p in final_pairs:
    sources[p["source"]] += 1

# Largest contributor first, with its share of the final set.
print("\nBy source:")
for src, cnt in sorted(sources.items(), key=lambda kv: kv[1], reverse=True):
    print(f"  {src}: {cnt:,} ({cnt/len(final_pairs)*100:.1f}%)")

# Distinct terms on each side of the pairs.
unique_ko = len({p["ko"] for p in final_pairs})
unique_en = len({p["en"] for p in final_pairs})
print(f"\nUnique Korean terms: {unique_ko:,}")
print(f"Unique English terms: {unique_en:,}")

# Print up to 15 randomly chosen pairs (unseeded, so they differ between runs).
import random
print("\nSample pairs:")
for p in random.sample(final_pairs, min(15, len(final_pairs))):
    print(f"  {p['ko']} -> {p['en']} ({p['source']})")

In [None]:
# Spot-check: for a handful of Korean terms, show how many pairs survived and
# up to five of their distinct English translations.
print("\n" + "=" * 70)
print("KEY TERMS CHECK")
print("=" * 70)

key_terms = ['자연어처리', '인증', '인공지능', '검색', '추천', '신경망', '기계학습']

for term in key_terms:
    matches = [p for p in final_pairs if p['ko'] == term]
    if not matches:
        print(f"{term}: NOT FOUND")
        continue
    # Distinct translations; set order is arbitrary, so is the choice of five.
    en_terms = list({m['en'] for m in matches})[:5]
    print(f"{term}: {len(matches)}건 -> {en_terms}")

In [None]:
# Write the final pairs as JSON Lines: one JSON object per line, UTF-8, with
# non-ASCII characters (the Korean text) written as-is rather than escaped.
output_path = OUTPUT_DIR / "term_pairs.jsonl"

with output_path.open("w", encoding="utf-8") as f:
    for p in tqdm(final_pairs, desc="Saving"):
        record = json.dumps(p, ensure_ascii=False)
        f.write(record + "\n")

print(f"\nSaved: {output_path}")
print(f"Size: {output_path.stat().st_size / 1024:.1f} KB")

In [None]:
# Closing summary: where the data was written, how many pairs it holds, and
# which notebook to run next.
print("\n" + "=" * 70)
print("DATA COLLECTION COMPLETE")
print("=" * 70)
print(f"\nOutput: {output_path}")
print(f"Total: {len(final_pairs):,} term pairs")
print("\nNext: Run 01_data_preparation.ipynb")