# v10 Inference Test

v10 대규모 학습 모델의 성능을 평가합니다.

In [1]:
import sys
from pathlib import Path

def find_project_root():
    """Locate the repository root so the project-local ``src`` package imports.

    The search starts at the current working directory and climbs through
    every ancestor up to the filesystem root, so the notebook works no matter
    how deeply nested the kernel's start directory is. A directory qualifies
    when it contains a ``CLAUDE.md`` file or a ``.git`` entry. The original
    author's checkout location is probed last and is also what gets returned
    when nothing qualifies.

    Returns:
        Path: the first qualifying directory, or the hard-coded fallback
        checkout path when no marker is found anywhere.
    """
    # Machine-specific last resort, kept for backward compatibility.
    fallback = Path("/home/west/Documents/cursor-workspace/opensearch-neural-pre-train")
    cwd = Path.cwd()
    # Nearest directory wins: cwd first, then parents from closest to farthest.
    for candidate in (cwd, *cwd.parents, fallback):
        if (candidate / "CLAUDE.md").exists() or (candidate / ".git").exists():
            return candidate
    return fallback

# Resolve the repository root and put it at the front of the import path so
# the project-local `src` package is importable from this notebook.
project_root = find_project_root()
# Only insert when the root is not already the first entry; this keeps the
# cell idempotent, so re-running it does not stack duplicate sys.path entries.
if not sys.path or sys.path[0] != str(project_root):
    sys.path.insert(0, str(project_root))

import torch
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from src.model.splade_model import create_splade_model

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

Device: cuda


In [2]:
# Load model
# Base Hugging Face checkpoint name; used for both the tokenizer and the
# SPLADE model constructor below.
MODEL_NAME = "bert-base-multilingual-cased"
# Trained v10 weights, resolved relative to the detected repository root.
CHECKPOINT_PATH = project_root / "outputs/v10_large_scale/final_model.pt"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the model skeleton before loading weights into it.
# NOTE(review): these flags presumably have to match the configuration used
# when the checkpoint was trained — confirm against the v10 training config.
model = create_splade_model(
    model_name=MODEL_NAME,
    use_idf=False,
    use_expansion=True,
    expansion_mode="mlm",
)

# weights_only=True restricts unpickling to tensors and plain containers;
# map_location places the loaded tensors directly on the selected device.
checkpoint = torch.load(CHECKPOINT_PATH, map_location=device, weights_only=True)
# The checkpoint is a dict; the weights live under 'model_state_dict'.
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
# Switch to evaluation mode before running inference.
model.eval()

print("v10 Model loaded!")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    
  queued_call()


v10 Model loaded!


In [3]:
def get_sparse_representation(text: str, top_k: int = 50):
    """Encode ``text`` with the global model and return its strongest terms.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Query string to encode (padded/truncated to 64 tokens).
        top_k: Maximum number of terms to return. The value is clamped to the
            length of the model's output vector, so an oversized request
            returns every term instead of making ``torch.topk`` raise.

    Returns:
        tuple[list[str], list[float]]: vocabulary tokens and their scores as
        two parallel lists, ordered by descending score.
    """
    encoding = tokenizer(text, max_length=64, padding='max_length', truncation=True, return_tensors='pt')
    with torch.no_grad():
        # The model returns a pair; only the first element is used here.
        sparse_rep, _ = model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device))
    # Single-text batch: take row 0 and move it to the CPU for ranking.
    # assumes the row is a 1-D vector indexed by vocabulary id — TODO confirm
    sparse_rep = sparse_rep[0].cpu()
    # torch.topk raises when k exceeds the dimension size, so clamp it.
    k = max(0, min(top_k, sparse_rep.shape[-1]))
    top_scores, top_indices = torch.topk(sparse_rep, k=k)
    return tokenizer.convert_ids_to_tokens(top_indices.tolist()), top_scores.tolist()

def is_korean(token):
    """Return True when ``token`` contains at least one Hangul syllable.

    WordPiece continuation markers (``##``) are removed first. Only the
    precomposed syllable range U+AC00..U+D7A3 counts as Korean here.
    """
    stripped = token.replace('##', '')
    for ch in stripped:
        if '\uac00' <= ch <= '\ud7a3':
            return True
    return False

def is_english(token):
    """Return True when ``token`` consists solely of ASCII letters.

    WordPiece continuation markers (``##``) are removed before the check, so
    ``'##ing'`` counts as English while punctuation, digits, accented letters
    and the empty string do not.
    """
    stripped = token.replace('##', '')
    if not stripped.isascii():
        return False
    return stripped.isalpha()

In [4]:
# Test queries
# Each entry pairs a Korean technical term with the English word(s) it is
# expected to activate: (korean_term, [expected_english_words]).
# The evaluation loop tokenizes every expected English word and counts the
# pair as a hit when any resulting subword appears among the top-ranked terms.
TEST_PAIRS = [
    ("머신러닝", ["machine", "learning"]),
    ("딥러닝", ["deep", "learning"]),
    ("자연어처리", ["natural", "language", "processing"]),
    ("인공지능", ["artificial", "intelligence"]),
    ("신경망", ["neural", "network"]),
    ("알고리즘", ["algorithm"]),
    ("데이터베이스", ["database"]),
    ("프로그래밍", ["programming"]),
    ("소프트웨어", ["software"]),
    ("하드웨어", ["hardware"]),
    ("학습", ["training", "learning"]),
    ("모델", ["model"]),
    ("데이터", ["data"]),
    ("컴퓨터", ["computer"]),
    ("네트워크", ["network"]),
]

In [5]:
# Evaluate
# For every (Korean term, expected English words) pair, check two things in
# the model's top-ranked terms:
#   * KO: does at least one subword of the input itself survive?
#   * EN: does at least one subword of an expected English word appear?
results = []

for ko_term, en_expected in TEST_PAIRS:
    tokens, scores = get_sparse_representation(ko_term)
    # Set for O(1) case-insensitive membership tests in the inner loop.
    tokens_lower = {t.lower() for t in tokens}

    # Korean preservation
    # dict.fromkeys de-duplicates while keeping tokenizer order; iterating a
    # plain set would make the 'KO Preserved' column order vary between runs.
    input_tokens = list(dict.fromkeys(tokenizer.tokenize(ko_term)))
    preserved = [t for t in input_tokens if t in tokens]

    # English activation
    activated_en = []
    for en in en_expected:
        for tok in tokenizer.tokenize(en.lower()):
            # Report each activated subword once, even when several expected
            # words share it.
            if tok.lower() in tokens_lower and tok not in activated_en:
                activated_en.append(tok)

    results.append({
        'Korean': ko_term,
        'KO Preserved': ', '.join(preserved) if preserved else '-',
        'EN Expected': ', '.join(en_expected),
        'EN Activated': ', '.join(activated_en) if activated_en else '-',
        'Top-5': ', '.join(tokens[:5]),
        'KO': '✅' if preserved else '❌',
        'EN': '✅' if activated_en else '❌',
    })

# One row per test pair; the bare `df` on the last line renders it as a table.
df = pd.DataFrame(results)
print("v10 Evaluation Results")
print("="*80)
df

v10 Evaluation Results


Unnamed: 0,Korean,KO Preserved,EN Expected,EN Activated,Top-5,KO,EN
0,머신러닝,-,"machine, learning",-,"the, that, it, this, .",❌,❌
1,딥러닝,-,"deep, learning",-,"the, ., that, it, this",❌,❌
2,자연어처리,-,"natural, language, processing",-,"the, that, it, to, this",❌,❌
3,인공지능,-,"artificial, intelligence",-,"the, that, it, this, you",❌,❌
4,신경망,-,"neural, network",-,"the, that, it, this, you",❌,❌
5,알고리즘,-,algorithm,-,"the, that, it, this, you",❌,❌
6,데이터베이스,-,database,-,"the, that, ., it, this",❌,❌
7,프로그래밍,-,programming,-,"the, that, it, this, a",❌,❌
8,소프트웨어,-,software,-,"., the, to, and, that",❌,❌
9,하드웨어,-,hardware,-,"the, ., that, it, this",❌,❌


In [6]:
# Summary
# Count how many test pairs passed each check in the evaluation results.
ko_success = sum(r['KO'] == '✅' for r in results)
en_success = sum(r['EN'] == '✅' for r in results)
both_success = sum(r['KO'] == '✅' and r['EN'] == '✅' for r in results)

# Guard the percentage against an empty result list, which would otherwise
# raise ZeroDivisionError; with no results every rate is reported as 0.0%.
denominator = max(len(results), 1)

print("\nSummary:")
print(f"  Korean preserved: {ko_success}/{len(results)} ({ko_success/denominator*100:.1f}%)")
print(f"  English activated: {en_success}/{len(results)} ({en_success/denominator*100:.1f}%)")
print(f"  Both succeeded: {both_success}/{len(results)} ({both_success/denominator*100:.1f}%)")


Summary:
  Korean preserved: 0/15 (0.0%)
  English activated: 0/15 (0.0%)
  Both succeeded: 0/15 (0.0%)


In [7]:
# Custom queries
# Free-form multi-word Korean queries, inspected by eye rather than scored.
custom_queries = [
    "파이썬 프로그래밍",
    "웹 개발",
    "클라우드 컴퓨팅",
    "빅데이터 분석",
    "검색 엔진",
    "추천 시스템",
]

# Function words left out of the English-token listing. Built once here
# instead of being re-created for every token inside the loop.
EN_STOPWORDS = frozenset(['the', 'a', 'an', 'in', 'of', 'to'])

print("\nCustom Query Results")
print("="*80)

for query in custom_queries:
    tokens, scores = get_sparse_representation(query, top_k=20)
    # De-duplicate in tokenizer order; iterating a plain set would make the
    # printed 'KO preserved' list order differ from one kernel run to the next.
    input_tokens = list(dict.fromkeys(tokenizer.tokenize(query)))
    preserved = [t for t in input_tokens if t in tokens]
    en_tokens = [t for t in tokens if is_english(t) and t not in EN_STOPWORDS]

    print(f"\n{query}:")
    print(f"  KO preserved: {preserved if preserved else 'None'}")
    print(f"  EN tokens: {', '.join(en_tokens[:5]) if en_tokens else 'None'}")
    print(f"  Top-5: {', '.join(tokens[:5])}")


Custom Query Results

파이썬 프로그래밍:
  KO preserved: None
  EN tokens: that, it, this, you, they
  Top-5: the, that, it, this, you

웹 개발:
  KO preserved: None
  EN tokens: time, like, then, just, more
  Top-5: ., ', ,, time, like

클라우드 컴퓨팅:
  KO preserved: None
  EN tokens: that, it, this, you, they
  Top-5: the, that, it, this, to

빅데이터 분석:
  KO preserved: None
  EN tokens: that, it, this, they, you
  Top-5: the, ., that, it, this

검색 엔진:
  KO preserved: None
  EN tokens: that, it, this, they, you
  Top-5: the, ., that, it, this

추천 시스템:
  KO preserved: None
  EN tokens: that, it, this, you, they
  Top-5: the, that, it, this, a


In [8]:
# Print a completion marker as the notebook's final output.
print("\nv10 Inference Test Complete!")


v10 Inference Test Complete!
