# v11 Inference Test: Term-Level KO-EN Model

Run the trained v11 model on a set of Korean queries and inspect which Korean and English vocabulary tokens it activates.

In [None]:
import sys
import json
from pathlib import Path

def find_project_root():
    """Locate the repository root directory.

    Probes, in order, the current working directory, its parent, its
    grandparent, and a fixed workstation checkout path. The first of these
    that contains a ``CLAUDE.md`` entry or a ``.git`` entry is returned.

    Returns:
        Path: The first candidate carrying one of the markers, or the fixed
        checkout path when no candidate carries one.
    """
    fallback = Path("/home/west/Documents/cursor-workspace/opensearch-neural-pre-train")
    here = Path.cwd()
    markers = ("CLAUDE.md", ".git")

    for directory in (here, here.parent, here.parent.parent, fallback):
        if any((directory / marker).exists() for marker in markers):
            return directory
    return fallback

# Resolve the repository root once; later cells build paths from PROJECT_ROOT.
PROJECT_ROOT = find_project_root()
# Put the root at the front of sys.path so the `src.` imports in later cells
# are looked up there first.
sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

In [None]:
import torch
from transformers import AutoTokenizer
from src.model.splade_model import create_splade_model

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Load Model

In [None]:
# Load checkpoint
checkpoint_path = PROJECT_ROOT / 'outputs' / 'v11_term_level' / 'final_model.pt'

if checkpoint_path.exists():
    # NOTE(review): torch.load deserializes via pickle (see its documentation);
    # only point this at checkpoint files you trust.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    config = checkpoint['config']

    print(f"Loaded checkpoint from: {checkpoint_path}")
    print(f"Model: {config['model_name']}")
else:
    # Nothing is loaded on this path, so `checkpoint` and `config` are not
    # assigned by this cell.
    print(f"Checkpoint not found at {checkpoint_path}")
    print("Please run 01_training_term_level.ipynb first!")

In [None]:
# Create and load model
# The tokenizer and the model are both built from the model name stored in
# the checkpoint's config, so `checkpoint` and `config` must already be
# defined by the checkpoint-loading cell.
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

# NOTE(review): these flags presumably have to match the settings used in
# training for the saved weights to load — confirm against the training notebook.
model = create_splade_model(
    model_name=config['model_name'],
    use_idf=False,
    use_expansion=True,
    expansion_mode='mlm',
)
# Restore the weights saved in the checkpoint, move the model to the selected
# device, and call eval() before running inference.
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

print("Model loaded successfully!")

## Inference Function

In [None]:
def is_korean_char(c: str) -> bool:
    """Return True when ``c`` lies inside one of three Hangul code-point ranges.

    The ranges checked are U+AC00..U+D7A3 (Hangul Syllables),
    U+1100..U+11FF (Hangul Jamo) and U+3130..U+318F (Hangul Compatibility
    Jamo). The comparison is a plain string comparison against the range
    bounds.
    """
    hangul_ranges = (
        ('\uac00', '\ud7a3'),
        ('\u1100', '\u11ff'),
        ('\u3130', '\u318f'),
    )
    return any(low <= c <= high for low, high in hangul_ranges)

def is_english_char(c: str) -> bool:
    """Return True when ``c`` is alphabetic and consists only of ASCII characters."""
    if not c.isascii():
        return False
    return c.isalpha()

def classify_token(token: str) -> str:
    """Classify a vocabulary token as 'korean', 'english', or 'other'.

    Args:
        token: A token string as returned by the tokenizer. Every ``##``
            marker is removed before the characters are inspected.

    Returns:
        'korean' if any character is Hangul, otherwise 'english' if any
        character is an ASCII letter, otherwise 'other'. Korean takes
        precedence for tokens that mix both scripts. Tokens wrapped in
        square or angle brackets (for example ``[CLS]`` or ``<pad>``) are
        always 'other'.
    """
    # Bracket-wrapped entries are tokenizer control symbols rather than words.
    # Their ASCII letters would otherwise count them as English and skew the
    # language statistics.
    if len(token) > 2 and (
        (token[0] == '[' and token[-1] == ']')
        or (token[0] == '<' and token[-1] == '>')
    ):
        return 'other'

    clean = token.replace('##', '')

    # An empty `clean` makes both any() calls False, so it falls to 'other'.
    if any(is_korean_char(c) for c in clean):
        return 'korean'
    if any(is_english_char(c) for c in clean):
        return 'english'
    return 'other'

def analyze_query(text: str, top_k: int = 50) -> dict:
    """Encode ``text`` with the loaded model and bucket its strongest activations.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Query string to encode.
        top_k: Maximum number of highest-weighted vocabulary entries to
            inspect. Values larger than the sparse vector are clamped to its
            length.

    Returns:
        A dict with:
            'input':   the original ``text``;
            'korean':  ``(token, weight)`` pairs classified as Korean;
            'english': ``(token, weight)`` pairs classified as English;
            'other':   the remaining ``(token, weight)`` pairs;
            'top_10':  the first ten ``(token, weight)`` pairs overall.
        All lists are ordered by descending weight and only contain entries
        whose weight is greater than zero.
    """
    encoding = tokenizer(
        text,
        max_length=64,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        sparse_rep, _ = model(
            encoding['input_ids'].to(device),
            encoding['attention_mask'].to(device),
        )

    # A single query was encoded, so take the first row of the output.
    sparse_rep = sparse_rep[0].cpu()

    # torch.topk raises if k is larger than the dimension it selects from.
    k = min(top_k, sparse_rep.shape[-1])
    top_values, top_indices = torch.topk(sparse_rep, k=k)
    top_tokens = tokenizer.convert_ids_to_tokens(top_indices.tolist())

    # When fewer than k entries are active, topk pads the result with
    # zero-weight entries; those are not activated tokens and must not be
    # reported or counted. Assumes activations are non-negative — TODO confirm
    # against the model's output activation.
    scored = [
        (token, value)
        for token, value in zip(top_tokens, top_values.tolist())
        if value > 0
    ]

    buckets = {'korean': [], 'english': [], 'other': []}
    for token, value in scored:
        # Any unexpected label falls back to the 'other' bucket.
        buckets.get(classify_token(token), buckets['other']).append((token, value))

    return {
        'input': text,
        'korean': buckets['korean'],
        'english': buckets['english'],
        'other': buckets['other'],
        'top_10': scored[:10],
    }

## Test Queries

In [None]:
# Korean queries used by this cell and by the summary statistics cell.
test_queries = [
    # IT/Tech
    "머신러닝",
    "딥러닝",
    "자연어처리",
    "인공지능",
    "데이터베이스",
    "추천시스템",
    "검색엔진",
    "클라우드",
    "서버",
    "네트워크",

    # General
    "컴퓨터",
    "인터넷",
    "프로그래밍",
    "알고리즘",
    "데이터",
]

# Header banner: rule, title, rule — one per line.
print("=" * 80, "v11 INFERENCE TEST RESULTS", "=" * 80, sep="\n")

for query in test_queries:
    result = analyze_query(query)

    print(f"\n입력: {result['input']}")

    # First five Korean tokens (token text only).
    ko_tokens = [pair[0] for pair in result['korean'][:5]]
    print(f"  한글 토큰: {ko_tokens}")

    # First five English tokens (token text only).
    en_tokens = [pair[0] for pair in result['english'][:5]]
    print(f"  영어 토큰: {en_tokens}")

    # First five entries of the overall top list, rendered as token(weight).
    # The variable keeps its historical name although only five are shown.
    top_10 = [f"{t}({v:.2f})" for t, v in result['top_10'][:5]]
    print(f"  Top-5: {top_10}")

## Detailed Analysis

In [None]:
# Detailed analysis for key queries
# Each entry: (query, expected Korean terms, expected English terms).
key_queries = [
    ("추천시스템", ["추천", "시스템"], ["recommend", "system", "recommendation"]),
    ("검색엔진", ["검색", "엔진"], ["search", "engine"]),
    ("머신러닝", ["머신", "러닝"], ["machine", "learning"]),
    ("데이터베이스", ["데이터", "베이스"], ["database", "data"]),
]


def _match_expected(expected_terms, activated, lowercase=False):
    """Return the activated ``(token, weight)`` pairs that match expected terms.

    Each expected term is split with the notebook's ``tokenizer`` and every
    resulting sub-token is looked up among ``activated``. Matches keep the
    order in which the expected sub-tokens are visited. A sub-token shared by
    several expected terms (e.g. "recommend" / "recommendation") is reported
    only once.

    Args:
        expected_terms: Words expected to be activated.
        activated: ``(token, weight)`` pairs from ``analyze_query``.
        lowercase: Lower-case each term before tokenizing it.
    """
    weights = {}
    for token, weight in activated:
        # Keep the first (highest-ranked) weight if a token string repeats.
        weights.setdefault(token, weight)

    matches = []
    seen = set()
    for term in expected_terms:
        if lowercase:
            term = term.lower()
        for piece in tokenizer.tokenize(term):
            if piece in weights and piece not in seen:
                seen.add(piece)
                matches.append((piece, weights[piece]))
    return matches


print("\n" + "=" * 80)
print("DETAILED ANALYSIS")
print("=" * 80)

for query, expected_ko, expected_en in key_queries:
    # Look at a wider window (top 100) than the quick test above.
    result = analyze_query(query, top_k=100)

    print(f"\n입력: {query}")
    print(f"  기대 한글: {expected_ko}")
    print(f"  기대 영어: {expected_en}")

    # Check Korean
    found_ko = _match_expected(expected_ko, result['korean'])

    # Check English (expected terms are lower-cased before tokenizing)
    found_en = _match_expected(expected_en, result['english'], lowercase=True)

    print(f"  발견된 한글: {found_ko}")
    print(f"  발견된 영어: {found_en}")
    print(f"  전체 한글 ({len(result['korean'])}): {[t for t, v in result['korean'][:10]]}")
    print(f"  전체 영어 ({len(result['english'])}): {[t for t, v in result['english'][:10]]}")

## Summary Statistics

In [None]:
# Calculate statistics
# Count, over all test queries, how many of the inspected top-50 tokens fall
# into each language bucket.
total_ko_activated = 0
total_en_activated = 0
total_other = 0

for query in test_queries:
    result = analyze_query(query, top_k=50)
    total_ko_activated += len(result['korean'])
    total_en_activated += len(result['english'])
    total_other += len(result['other'])

n_queries = len(test_queries)
total_tokens = total_ko_activated + total_en_activated + total_other

# Guard against division by zero (no queries, or no tokens counted). Every
# numerator is 0 whenever its denominator is 0, so dividing by 1 instead
# reports 0.0 rather than raising.
query_divisor = max(n_queries, 1)
token_divisor = max(total_tokens, 1)

print("\n" + "=" * 80)
print("SUMMARY STATISTICS (Top-50 tokens per query)")
print("=" * 80)
print(f"\n  Queries tested: {n_queries}")
print(f"  Avg Korean tokens: {total_ko_activated / query_divisor:.1f}")
print(f"  Avg English tokens: {total_en_activated / query_divisor:.1f}")
print(f"  Avg Other tokens: {total_other / query_divisor:.1f}")
print(f"\n  Korean ratio: {total_ko_activated / token_divisor * 100:.1f}%")
print(f"  English ratio: {total_en_activated / token_divisor * 100:.1f}%")

In [None]:
# Closing banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "v11 INFERENCE TEST COMPLETE", "=" * 80, sep="\n")