# Cross-lingual Synonym Data Synthesis

Ollama gpt-oss:20b를 활용하여 한-영 동의어 데이터를 합성 생성합니다.

## 목표
- IT/ML 기술 용어 한-영 동의어 쌍 생성
- 병렬 문장 데이터 생성
- Cross-lingual KD 학습용 데이터셋 구축

## 1. Setup

In [1]:
import sys
import os
import json
import time
import requests
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from tqdm.auto import tqdm
import random

# Project root
project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")

Project root: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train


## 2. Ollama Client

In [2]:
@dataclass
class OllamaConfig:
    """Ollama API configuration."""
    base_url: str = "http://localhost:11434"
    model: str = "gpt-oss:20b"
    temperature: float = 0.7
    max_retries: int = 3
    timeout: int = 120


class OllamaClient:
    """Ollama API client for synonym generation."""
    
    def __init__(self, config: OllamaConfig):
        self.config = config
        self.api_url = f"{config.base_url}/api/generate"
    
    def generate(self, prompt: str, temperature: Optional[float] = None) -> str:
        """Generate text using Ollama API."""
        temp = temperature if temperature is not None else self.config.temperature
        
        payload = {
            "model": self.config.model,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": temp}
        }
        
        for attempt in range(self.config.max_retries):
            try:
                response = requests.post(
                    self.api_url,
                    json=payload,
                    timeout=self.config.timeout
                )
                response.raise_for_status()
                return response.json().get("response", "").strip()
            except Exception as e:
                if attempt < self.config.max_retries - 1:
                    time.sleep(2 ** attempt)
                else:
                    raise RuntimeError(f"Ollama API failed: {e}")
        return ""
    
    def health_check(self) -> bool:
        """Check if Ollama is running and model is available."""
        try:
            response = requests.get(f"{self.config.base_url}/api/tags", timeout=5)
            models = [m["name"] for m in response.json().get("models", [])]
            return self.config.model in models or any(self.config.model in m for m in models)
        except Exception:
            return False


# Initialize client
config = OllamaConfig()
client = OllamaClient(config)

# Health check
print(f"Ollama model: {config.model}")
print(f"Health check: {'OK' if client.health_check() else 'FAILED'}")

Ollama model: gpt-oss:20b
Health check: OK


## 3. Seed Terms (카테고리별 시드 용어)

각 카테고리에서 시드 용어를 제공하고, LLM이 관련 용어들을 확장합니다.

In [3]:
# 카테고리별 시드 용어 (LLM이 확장할 기준점)
SEED_CATEGORIES = {
    "machine_learning": [
        "머신러닝", "딥러닝", "신경망", "학습", "모델", "예측", "분류", "회귀",
        "과적합", "정규화", "손실함수", "최적화", "경사하강법", "역전파"
    ],
    "natural_language_processing": [
        "자연어처리", "토큰화", "임베딩", "트랜스포머", "어텐션", "언어모델",
        "텍스트분류", "개체명인식", "기계번역", "질의응답", "요약"
    ],
    "computer_vision": [
        "컴퓨터비전", "이미지분류", "객체탐지", "영상분할", "특징추출",
        "합성곱", "풀링", "데이터증강", "전이학습"
    ],
    "data_science": [
        "데이터분석", "데이터전처리", "특성공학", "차원축소", "군집화",
        "이상탐지", "시각화", "통계분석"
    ],
    "software_engineering": [
        "알고리즘", "자료구조", "프로그래밍", "소프트웨어", "데이터베이스",
        "서버", "클라이언트", "API", "마이크로서비스", "컨테이너"
    ],
    "cloud_infrastructure": [
        "클라우드", "가상화", "컨테이너", "쿠버네티스", "도커",
        "로드밸런싱", "스케일링", "모니터링"
    ],
    "search_retrieval": [
        "검색엔진", "정보검색", "인덱싱", "쿼리", "랭킹", "관련성",
        "벡터검색", "시맨틱검색", "역색인"
    ]
}

print(f"Total seed categories: {len(SEED_CATEGORIES)}")
print(f"Total seed terms: {sum(len(v) for v in SEED_CATEGORIES.values())}")

Total seed categories: 7
Total seed terms: 69


## 4. Synonym Generation Prompts

In [4]:
def create_translation_prompt(korean_term: str) -> str:
    """Create prompt for Korean to English translation."""
    return f"""Translate the Korean technical term to English.

Korean term: {korean_term}

Provide the English translation(s) as a JSON object with the following format:
{{
    "primary": "main English translation",
    "alternatives": ["alternative 1", "alternative 2"],
    "abbreviation": "abbrev or null"
}}

Only output the JSON, nothing else."""


def create_expansion_prompt(category: str, seed_terms: List[str]) -> str:
    """Create prompt to expand seed terms with related terms."""
    seeds = ", ".join(seed_terms[:5])
    return f"""Generate 20 additional Korean technical terms related to the category "{category}".

Example terms in this category: {seeds}

Requirements:
- Terms should be commonly used in Korean IT/tech context
- Include both pure Korean terms and Konglish (외래어)
- Terms should be specific enough to have clear English translations

Output as JSON array of Korean terms only:
["용어1", "용어2", ...]

Only output the JSON array, nothing else."""


def create_parallel_sentence_prompt(korean_term: str, english_term: str) -> str:
    """Create prompt for generating parallel sentences."""
    return f"""Generate 3 pairs of parallel sentences (Korean and English) using the terms:
- Korean term: {korean_term}
- English term: {english_term}

Requirements:
- Sentences should be technical/educational context
- Korean and English sentences should have the same meaning
- Use natural language in both languages

Output as JSON array:
[
    {{"ko": "한국어 문장", "en": "English sentence"}},
    ...
]

Only output the JSON array, nothing else."""


print("Prompt templates defined")

Prompt templates defined


## 5. Data Generation Functions

In [5]:
def parse_json_response(response: str) -> Optional[dict | list]:
    """Parse JSON from LLM response, handling common issues."""
    # Try direct parse
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass
    
    # Try to extract JSON from response
    import re
    
    # Look for JSON object
    obj_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
    if obj_match:
        try:
            return json.loads(obj_match.group())
        except json.JSONDecodeError:
            pass
    
    # Look for JSON array
    arr_match = re.search(r'\[[^\[\]]*\]', response, re.DOTALL)
    if arr_match:
        try:
            return json.loads(arr_match.group())
        except json.JSONDecodeError:
            pass
    
    return None


def translate_term(client: OllamaClient, korean_term: str) -> Optional[Dict]:
    """Translate a Korean term to English using LLM."""
    prompt = create_translation_prompt(korean_term)
    response = client.generate(prompt, temperature=0.3)
    
    parsed = parse_json_response(response)
    if parsed and isinstance(parsed, dict):
        return {
            "ko": korean_term,
            "en_primary": parsed.get("primary", ""),
            "en_alternatives": parsed.get("alternatives", []),
            "abbreviation": parsed.get("abbreviation")
        }
    return None


def expand_category(client: OllamaClient, category: str, seed_terms: List[str]) -> List[str]:
    """Expand a category with additional terms using LLM."""
    prompt = create_expansion_prompt(category, seed_terms)
    response = client.generate(prompt, temperature=0.8)
    
    parsed = parse_json_response(response)
    if parsed and isinstance(parsed, list):
        return [t for t in parsed if isinstance(t, str) and len(t) > 1]
    return []


def generate_parallel_sentences(
    client: OllamaClient, 
    korean_term: str, 
    english_term: str
) -> List[Dict[str, str]]:
    """Generate parallel sentences for a term pair."""
    prompt = create_parallel_sentence_prompt(korean_term, english_term)
    response = client.generate(prompt, temperature=0.7)
    
    parsed = parse_json_response(response)
    if parsed and isinstance(parsed, list):
        return [
            {"ko": item.get("ko", ""), "en": item.get("en", "")}
            for item in parsed
            if isinstance(item, dict) and item.get("ko") and item.get("en")
        ]
    return []


print("Generation functions defined")

Generation functions defined


## 6. Test Generation

In [6]:
# Test translation
print("Testing translation...")
test_result = translate_term(client, "머신러닝")
print(f"Translation result: {json.dumps(test_result, ensure_ascii=False, indent=2)}")

print("\nTesting category expansion...")
test_expansion = expand_category(client, "machine_learning", SEED_CATEGORIES["machine_learning"][:5])
print(f"Expanded terms ({len(test_expansion)}): {test_expansion[:10]}")

print("\nTesting parallel sentence generation...")
if test_result:
    test_sentences = generate_parallel_sentences(client, "머신러닝", test_result["en_primary"])
    for sent in test_sentences:
        print(f"  KO: {sent['ko']}")
        print(f"  EN: {sent['en']}")
        print()

Testing translation...
Translation result: {
  "ko": "머신러닝",
  "en_primary": "machine learning",
  "en_alternatives": [
    "ML"
  ],
  "abbreviation": "ML"
}

Testing category expansion...
Expanded terms (20): ['인공신경망', '컨볼루션', '전이학습', '과적합', '정규화', '하이퍼파라미터', '배치정규화', '피처엔지니어링', '특성추출', '분류기']

Testing parallel sentence generation...
  KO: 머신러닝은 대규모 데이터에서 패턴을 학습하여 예측 모델을 생성하는 기계 학습 기술이다.
  EN: Machine learning is a technology that learns patterns from large-scale data to create predictive models.

  KO: 머신러닝 알고리즘은 입력 데이터와 목표 변수 간의 관계를 자동으로 최적화한다.
  EN: Machine learning algorithms automatically optimize the relationship between input data and target variables.

  KO: 머신러닝 프로젝트를 성공적으로 수행하려면 데이터 전처리, 모델 선택, 하이퍼파라미터 튜닝이 필수적이다.
  EN: To successfully execute a machine learning project, data preprocessing, model selection, and hyperparameter tuning are essential.



## 7. Full Dataset Generation

In [7]:
# Configuration
OUTPUT_DIR = project_root / "dataset" / "synonyms"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Generation settings
EXPAND_CATEGORIES = True  # Whether to expand categories with LLM
GENERATE_SENTENCES = True  # Whether to generate parallel sentences
SAMPLE_RATE_SENTENCES = 0.3  # Fraction of terms to generate sentences for

print(f"Output directory: {OUTPUT_DIR}")

Output directory: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/dataset/synonyms


In [8]:
def generate_full_dataset(
    client: OllamaClient,
    seed_categories: Dict[str, List[str]],
    expand_categories: bool = True,
    generate_sentences: bool = True,
    sentence_sample_rate: float = 0.3,
) -> Tuple[List[Dict], List[Dict]]:
    """
    Generate full cross-lingual dataset.
    
    Returns:
        synonyms: List of synonym entries
        parallel_sentences: List of parallel sentence pairs
    """
    all_terms = []
    synonyms = []
    parallel_sentences = []
    
    # Step 1: Collect all terms (seed + expanded)
    print("Step 1: Collecting terms...")
    for category, seeds in tqdm(seed_categories.items(), desc="Categories"):
        category_terms = list(seeds)  # Start with seeds
        
        if expand_categories:
            try:
                expanded = expand_category(client, category, seeds)
                # Filter duplicates
                new_terms = [t for t in expanded if t not in category_terms]
                category_terms.extend(new_terms[:15])  # Limit expansion
                print(f"  {category}: {len(seeds)} seeds + {len(new_terms[:15])} expanded")
            except Exception as e:
                print(f"  {category}: expansion failed - {e}")
        
        for term in category_terms:
            all_terms.append({"term": term, "category": category})
    
    print(f"\nTotal terms to translate: {len(all_terms)}")
    
    # Step 2: Translate all terms
    print("\nStep 2: Translating terms...")
    for item in tqdm(all_terms, desc="Translating"):
        try:
            result = translate_term(client, item["term"])
            if result and result.get("en_primary"):
                result["category"] = item["category"]
                synonyms.append(result)
        except Exception as e:
            print(f"  Failed to translate '{item['term']}': {e}")
        
        # Rate limiting
        time.sleep(0.1)
    
    print(f"\nSuccessfully translated: {len(synonyms)} terms")
    
    # Step 3: Generate parallel sentences (sampled)
    if generate_sentences:
        print("\nStep 3: Generating parallel sentences...")
        sampled = random.sample(synonyms, int(len(synonyms) * sentence_sample_rate))
        
        for entry in tqdm(sampled, desc="Generating sentences"):
            try:
                sentences = generate_parallel_sentences(
                    client, 
                    entry["ko"], 
                    entry["en_primary"]
                )
                for sent in sentences:
                    sent["term_ko"] = entry["ko"]
                    sent["term_en"] = entry["en_primary"]
                    sent["category"] = entry["category"]
                parallel_sentences.extend(sentences)
            except Exception as e:
                print(f"  Failed for '{entry['ko']}': {e}")
            
            time.sleep(0.1)
        
        print(f"\nGenerated {len(parallel_sentences)} parallel sentences")
    
    return synonyms, parallel_sentences


print("Dataset generation function defined")

Dataset generation function defined


In [9]:
# Generate dataset
print("="*60)
print("STARTING FULL DATASET GENERATION")
print("="*60)

synonyms, parallel_sentences = generate_full_dataset(
    client=client,
    seed_categories=SEED_CATEGORIES,
    expand_categories=EXPAND_CATEGORIES,
    generate_sentences=GENERATE_SENTENCES,
    sentence_sample_rate=SAMPLE_RATE_SENTENCES,
)

print("\n" + "="*60)
print("GENERATION COMPLETE")
print("="*60)
print(f"Synonym entries: {len(synonyms)}")
print(f"Parallel sentences: {len(parallel_sentences)}")

STARTING FULL DATASET GENERATION
Step 1: Collecting terms...


Categories:   0%|          | 0/7 [00:00<?, ?it/s]

  machine_learning: 14 seeds + 14 expanded
  natural_language_processing: 11 seeds + 15 expanded
  computer_vision: 9 seeds + 15 expanded
  data_science: 8 seeds + 15 expanded
  software_engineering: 10 seeds + 15 expanded
  cloud_infrastructure: 8 seeds + 15 expanded
  search_retrieval: 9 seeds + 15 expanded

Total terms to translate: 173

Step 2: Translating terms...


Translating:   0%|          | 0/173 [00:00<?, ?it/s]


Successfully translated: 173 terms

Step 3: Generating parallel sentences...


Generating sentences:   0%|          | 0/51 [00:00<?, ?it/s]


Generated 153 parallel sentences

GENERATION COMPLETE
Synonym entries: 173
Parallel sentences: 153


## 8. Save Dataset

In [10]:
# Save synonyms
synonyms_path = OUTPUT_DIR / "ko_en_terms.jsonl"
with open(synonyms_path, "w", encoding="utf-8") as f:
    for entry in synonyms:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
print(f"Saved synonyms to: {synonyms_path}")

# Save parallel sentences
sentences_path = OUTPUT_DIR / "ko_en_parallel.jsonl"
with open(sentences_path, "w", encoding="utf-8") as f:
    for entry in parallel_sentences:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
print(f"Saved parallel sentences to: {sentences_path}")

# Save metadata
metadata = {
    "total_synonyms": len(synonyms),
    "total_parallel_sentences": len(parallel_sentences),
    "categories": list(SEED_CATEGORIES.keys()),
    "generation_model": config.model,
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}
metadata_path = OUTPUT_DIR / "metadata.json"
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)
print(f"Saved metadata to: {metadata_path}")

Saved synonyms to: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/dataset/synonyms/ko_en_terms.jsonl
Saved parallel sentences to: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/dataset/synonyms/ko_en_parallel.jsonl
Saved metadata to: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/dataset/synonyms/metadata.json


## 9. Dataset Analysis

In [11]:
import pandas as pd

# Load and analyze
df_synonyms = pd.DataFrame(synonyms)

print("=" * 60)
print("SYNONYM DATASET ANALYSIS")
print("=" * 60)

print(f"\nTotal entries: {len(df_synonyms)}")
print(f"\nBy category:")
print(df_synonyms["category"].value_counts())

print(f"\nSample entries:")
for _, row in df_synonyms.sample(min(5, len(df_synonyms))).iterrows():
    print(f"  {row['ko']} → {row['en_primary']}")
    if row.get('en_alternatives'):
        print(f"    alternatives: {row['en_alternatives']}")
    if row.get('abbreviation'):
        print(f"    abbreviation: {row['abbreviation']}")

SYNONYM DATASET ANALYSIS

Total entries: 173

By category:
category
machine_learning               28
natural_language_processing    26
software_engineering           25
computer_vision                24
search_retrieval               24
data_science                   23
cloud_infrastructure           23
Name: count, dtype: int64

Sample entries:
  어휘 정규화 → lexical normalization
    alternatives: ['vocabulary normalization', 'lexicon normalization']
  활성화함수 → activation function
    alternatives: ['activation function']
    abbreviation: AF
  가중치 → weight
    alternatives: ['weighted coefficient', 'weight factor']
    abbreviation: wt
  언어모델 → language model
    alternatives: ['linguistic model', 'language modeling']
    abbreviation: LM
  회귀분석 → regression analysis
    alternatives: ['regression analysis']


In [12]:
if parallel_sentences:
    df_sentences = pd.DataFrame(parallel_sentences)
    
    print("=" * 60)
    print("PARALLEL SENTENCES ANALYSIS")
    print("=" * 60)
    
    print(f"\nTotal sentences: {len(df_sentences)}")
    print(f"\nBy category:")
    print(df_sentences["category"].value_counts())
    
    print(f"\nSample sentences:")
    for _, row in df_sentences.sample(min(3, len(df_sentences))).iterrows():
        print(f"  KO: {row['ko']}")
        print(f"  EN: {row['en']}")
        print(f"  Term: {row['term_ko']} / {row['term_en']}")
        print()

PARALLEL SENTENCES ANALYSIS

Total sentences: 153

By category:
category
computer_vision                30
data_science                   27
natural_language_processing    24
search_retrieval               24
cloud_infrastructure           18
machine_learning               18
software_engineering           12
Name: count, dtype: int64

Sample sentences:
  KO: 검색엔진스파이더는 웹 페이지를 탐색하여 정보를 수집하고 색인에 저장합니다.
  EN: The search engine spider navigates web pages, collects information, and stores it in the index.
  Term: 검색엔진스파이더 / search engine spider

  KO: 배치 정규화는 딥러닝 모델의 학습을 안정화시키기 위해 각 미니배치의 평균과 분산을 정규화하는 기법이다.
  EN: Batch Normalization is a technique that normalizes the mean and variance of each mini-batch to stabilize training in deep learning models.
  Term: 배치 정규화 / Batch Normalization

  KO: 컴퓨터비전 응용 예로는 자율주행 차량의 차선 인식, 의료 영상에서 종양 검출 등이 있다.
  EN: Examples of Computer Vision applications include lane detection in autonomous vehicles and tumor detection in medical imaging.
  Term: 컴퓨터비전 / C

## 10. Create Training Format

Cross-lingual KD 학습에 사용할 형식으로 변환합니다.

In [13]:
def create_training_pairs(synonyms: List[Dict]) -> List[Dict]:
    """
    Create training pairs for cross-lingual alignment.
    
    Format:
    {
        "ko_term": "머신러닝",
        "en_terms": ["machine learning", "ML"],
        "category": "machine_learning"
    }
    """
    pairs = []
    for entry in synonyms:
        en_terms = [entry["en_primary"]]
        if entry.get("en_alternatives"):
            en_terms.extend(entry["en_alternatives"])
        if entry.get("abbreviation"):
            en_terms.append(entry["abbreviation"])
        
        # Filter empty/None
        en_terms = [t for t in en_terms if t and isinstance(t, str)]
        
        if en_terms:
            pairs.append({
                "ko_term": entry["ko"],
                "en_terms": en_terms,
                "category": entry.get("category", "unknown")
            })
    
    return pairs


training_pairs = create_training_pairs(synonyms)

# Save training format
training_path = OUTPUT_DIR / "cross_lingual_pairs.jsonl"
with open(training_path, "w", encoding="utf-8") as f:
    for pair in training_pairs:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")

print(f"Created {len(training_pairs)} training pairs")
print(f"Saved to: {training_path}")

# Show samples
print("\nSample training pairs:")
for pair in random.sample(training_pairs, min(5, len(training_pairs))):
    print(f"  {pair['ko_term']} → {pair['en_terms']}")

Created 173 training pairs
Saved to: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/dataset/synonyms/cross_lingual_pairs.jsonl

Sample training pairs:
  오버피팅 → ['overfitting', 'overfit', 'over-fitting']
  학습 → ['learning', 'study', 'training']
  정규화 → ['normalization', 'standardization', 'regularization', 'norm.']
  모듈화 → ['modularization', 'modularization']
  검색결과 → ['search results', 'search result', 'search outcome']


## 11. Summary

In [14]:
print("=" * 60)
print("CROSS-LINGUAL DATA SYNTHESIS COMPLETE")
print("=" * 60)

print(f"\nGenerated files:")
print(f"  - {synonyms_path.name}: {len(synonyms)} synonym entries")
print(f"  - {sentences_path.name}: {len(parallel_sentences)} parallel sentences")
print(f"  - {training_path.name}: {len(training_pairs)} training pairs")
print(f"  - {metadata_path.name}: metadata")

print(f"\nOutput directory: {OUTPUT_DIR}")

print(f"\nNext steps:")
print(f"  1. Review generated data quality")
print(f"  2. Implement CrossLingualKDLoss in src/training/losses.py")
print(f"  3. Update training notebook to use cross-lingual data")
print(f"  4. Run cross-lingual training")

CROSS-LINGUAL DATA SYNTHESIS COMPLETE

Generated files:
  - ko_en_terms.jsonl: 173 synonym entries
  - ko_en_parallel.jsonl: 153 parallel sentences
  - cross_lingual_pairs.jsonl: 173 training pairs
  - metadata.json: metadata

Output directory: /home/west/Documents/cursor-workspace/opensearch-neural-pre-train/dataset/synonyms

Next steps:
  1. Review generated data quality
  2. Implement CrossLingualKDLoss in src/training/losses.py
  3. Update training notebook to use cross-lingual data
  4. Run cross-lingual training
