# v13 Inference Test: Noun-Focused KO-EN Model

Test the trained v13 model using the `TermExpander` module.

**Key Features:**
- `TermExpander.expand_to_list()`: Get the expanded terms as a flat list
- `TermExpander.expand()`: Get the detailed expansion result
- `TermExpander.get_sparse_vector()`: Get an OpenSearch-compatible sparse vector

In [None]:
import sys
from pathlib import Path

def find_project_root():
    """Locate the repository root used for imports and output paths.

    Checks, in order, the current working directory, its parent, its
    grandparent, and a fixed workstation checkout. The first of these that
    contains a ``CLAUDE.md`` or ``.git`` entry is returned. When none of them
    qualifies, the fixed checkout path is returned anyway.
    """
    markers = ("CLAUDE.md", ".git")
    default_root = Path("/home/west/Documents/cursor-workspace/opensearch-neural-pre-train")
    here = Path.cwd()
    search_order = (here, here.parent, here.parent.parent, default_root)
    return next(
        (
            root
            for root in search_order
            if any((root / marker).exists() for marker in markers)
        ),
        default_root,
    )

# Resolve the project root once and put it at the front of sys.path so that
# project-local packages such as `src.inference` can be imported.
PROJECT_ROOT = find_project_root()
sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

## Load TermExpander

In [None]:
from src.inference import TermExpander

# Build the expander from the v13 checkpoint stored under the project's
# outputs directory.
checkpoint_path = PROJECT_ROOT.joinpath('outputs', 'v13_nouns', 'best_model.pt')
expander = TermExpander.from_checkpoint(checkpoint_path)

# Show the model name recorded in the expander config and the expander's device.
print(f"Model: {expander.config['model_name']}")
print(f"Device: {expander.device}")

## Basic Usage: expand_to_list()

Get a flat list of expanded terms: `[원본, 서브워드들, 영어번역들]` (original term, subwords, English translations).

In [None]:
# Korean sample queries fed to expand_to_list() one at a time.
test_queries = [
    '머신러닝', '딥러닝', '자연어처리', '검색엔진', '추천시스템',
    '데이터베이스', '인공지능', '클라우드', '네트워크', '알고리즘',
]

# Header banner: rule, title, rule - one per line.
print("=" * 70, "expand_to_list() 결과", "=" * 70, sep="\n")

# One output line per query: the query followed by its expansion list.
for query in test_queries:
    result = expander.expand_to_list(query)
    print(f"{query} → {result}")

## Detailed Usage: expand()

Get the detailed expansion result, including a score for each English token.

In [None]:
# Run the detailed expansion for a single query.
result = expander.expand('머신러닝')

print(f"Original: {result.original}")
print(f"Subwords: {result.subwords}")

# List at most the first ten (token, score) pairs of the English expansion,
# with the token padded to 15 columns and the score shown to 4 decimals.
print("\nEnglish tokens (with scores):")
for token, score in result.english_tokens[:10]:
    print(f"  {token:15s}: {score:.4f}")

In [None]:
# Convert the expansion result from the previous cell to a dictionary and
# pretty-print it as JSON. ensure_ascii=False keeps the Korean text readable
# instead of emitting \uXXXX escapes.
import json

result_dict = result.to_dict()
print(json.dumps(result_dict, ensure_ascii=False, indent=2))

## OpenSearch Integration: get_sparse_vector()

Get a sparse vector for use in OpenSearch `neural_sparse` queries.

In [None]:
# Token -> weight mapping for one query.
# NOTE(review): top_k=20 presumably caps the vector at 20 tokens - confirm
# against TermExpander.get_sparse_vector.
sparse_vec = expander.get_sparse_vector('머신러닝', top_k=20)

print("Sparse Vector (top 15):")
# Highest weight first; the sort is stable, so tied tokens keep mapping order.
for token, score in sorted(
    sparse_vec.items(), key=lambda pair: pair[1], reverse=True
)[:15]:
    print(f"  {token:15s}: {score:.4f}")

In [None]:
# Example OpenSearch query format: wrap the sparse vector in a `neural_sparse`
# query against the `content_embedding` field, passing the weights directly
# as `query_tokens`.
import json  # local import keeps this cell independent of the cell that first imports json

opensearch_query = {
    "query": {
        "neural_sparse": {
            "content_embedding": {
                "query_tokens": sparse_vec
            }
        }
    }
}

print("OpenSearch Query Example:")
# Preview at most 500 characters, adding the ellipsis only when the text was
# actually cut so a short query is not shown as truncated.
query_json = json.dumps(opensearch_query, ensure_ascii=False, indent=2)
print(query_json if len(query_json) <= 500 else query_json[:500] + "...")

## Batch Processing

In [None]:
# Batch expansion: expand several queries with a single expand_batch() call.
queries = ['머신러닝', '딥러닝', '자연어처리']
results = expander.expand_batch(queries)

# Print one line per returned result: its original query and flattened terms.
for r in results:
    print(f"{r.original}: {r.to_list()}")

## Performance Summary

In [None]:
import torch

# Load the checkpoint on CPU to read the metrics stored alongside the weights.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects,
# so only point this at checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)


def _format_rate(value):
    """Render a percentage metric, or plain 'N/A' when it is absent."""
    return "N/A" if value is None else f"{value}%"


print("=" * 50)
print("v13 Model Performance")
print("=" * 50)
# The percent sign is attached only to real values, so a missing metric
# reads "N/A" rather than "N/A%".
print(f"English Activation Rate: {_format_rate(checkpoint.get('en_rate'))}")
print(f"Korean Preservation Rate: {_format_rate(checkpoint.get('ko_rate'))}")
print(f"Training Epoch: {checkpoint.get('epoch', 'N/A')}")

In [None]:
# Closing banner: a blank line, then rule / message / rule.
print("", "=" * 50, "INFERENCE TEST COMPLETE", "=" * 50, sep="\n")