# v21.4 HuggingFace Upload Preparation

This notebook prepares the trained v21.4 model for HuggingFace Hub upload.

## v21.4 Improvements

- **Curriculum Learning**: 3-phase training (single-term → balanced → full)
- **Dynamic Lambda Self**: Self-reconstruction weight of 8.0 for single terms, lowered to 4.0 for sentences
- **Minimum Activation Loss**: Prevents garbage outputs
- **Enhanced Training Data**: Added explicit single-term synonym pairs

In [None]:
import sys
from pathlib import Path

def find_project_root():
    """Locate the project root by walking upward from the working directory.

    The current working directory is checked first, then each of its
    ancestors in order. The first directory that contains either a
    ``pyproject.toml`` entry or a ``src`` entry is returned.

    Returns:
        Path: the first matching directory, or the directory two levels
        above the working directory when no marker is found.
    """
    start = Path.cwd()
    markers = ("pyproject.toml", "src")
    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    # No marker anywhere up the tree: fall back to the grandparent directory.
    return start.parent.parent

# Resolve the project root once and prepend it to sys.path (index 0), so it
# is the first location searched by the imports that follow.
PROJECT_ROOT = find_project_root()
sys.path.insert(0, str(PROJECT_ROOT))

import json
import shutil
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Echo the resolved root and the installed PyTorch version.
print(f"Project root: {PROJECT_ROOT}")
print(f"PyTorch version: {torch.__version__}")

## 1. Load Trained Model

In [None]:
# Where the trained checkpoint lives and where the export will be written
CHECKPOINT_PATH = PROJECT_ROOT.joinpath("outputs", "v21.4_korean_enhanced", "best_model.pt")
OUTPUT_DIR = PROJECT_ROOT.joinpath("huggingface", "v21.4")

# Fail fast with a clear message when the checkpoint is missing
if not CHECKPOINT_PATH.exists():
    raise FileNotFoundError(f"Checkpoint not found: {CHECKPOINT_PATH}")

print(f"Loading checkpoint from: {CHECKPOINT_PATH}")

# weights_only=False unpickles arbitrary objects; this is acceptable only
# because the checkpoint is our own, trusted file. Tensors are mapped to CPU.
checkpoint = torch.load(
    CHECKPOINT_PATH,
    map_location="cpu",
    weights_only=False,
)

# Summarise the metadata stored next to the weights (missing keys print None)
print(f"Checkpoint keys: {checkpoint.keys()}")
print(f"Training epoch: {checkpoint.get('epoch')}")
print(f"Training phase: {checkpoint.get('phase')}")
print(f"Eval results: {checkpoint.get('eval_results')}")
print(f"Config: {checkpoint.get('config')}")

In [None]:
# Load base model and tokenizer
# Read the training config once. `or {}` also covers a checkpoint that stores
# an explicit `config: None`, which would otherwise make a chained `.get`
# raise AttributeError.
train_config = checkpoint.get('config') or {}
model_name = train_config.get('model_name', 'skt/A.X-Encoder-base')
max_length = train_config.get('max_length', 64)

print(f"Base model: {model_name}")
print(f"Max length: {max_length}")

# Initialize base model (architecture + pretrained weights) and its tokenizer
base_model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"\nModel parameters: {sum(p.numel() for p in base_model.parameters()):,}")
print(f"Vocab size: {tokenizer.vocab_size}")

In [None]:
# Pull the fine-tuned weights out of the checkpoint
state_dict = checkpoint['model_state_dict']

# Keys saved by the training model carry a 'model.' prefix on the transformer
# layers; drop that prefix (6 characters) so the names line up with
# AutoModelForMaskedLM. Keys without the prefix pass through unchanged.
new_state_dict = {
    (key[6:] if key.startswith('model.') else key): value
    for key, value in state_dict.items()
}

# strict=True: any missing or unexpected key raises instead of being ignored
base_model.load_state_dict(new_state_dict, strict=True)
print("Weights loaded successfully!")

## 2. Quick Validation

In [None]:
import torch.nn as nn

class SPLADEInference:
    """SPLADE inference wrapper.

    Wraps a masked-LM style model (its output must expose ``.logits``) and
    its tokenizer, and turns a text into a list of its highest-weighted
    vocabulary tokens.

    Args:
        model: Model to run; ``model.to(device)`` and ``model.eval()`` are
            called on it.
        tokenizer: Callable tokenizer that also provides ``decode``.
        device: Device the model and the tokenized inputs are moved to.
        max_length: Truncation length passed to the tokenizer. Defaults to
            64, the value that was previously hard-coded in ``encode``.
    """

    def __init__(self, model, tokenizer, device='cpu', max_length: int = 64):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device
        self.max_length = max_length
        self.relu = nn.ReLU()
        self.model.eval()

    @torch.no_grad()
    def encode(self, text: str, top_k: int = 20):
        """Encode text to sparse representation.

        Args:
            text: Input text to encode.
            top_k: Maximum number of tokens to return. Values larger than
                the vocabulary dimension are clamped to it.

        Returns:
            list[tuple[str, float]]: ``(token, weight)`` pairs ordered by
            decreasing weight, weights rounded to 4 decimals. Only strictly
            positive weights are kept, so fewer than ``top_k`` pairs may be
            returned.
        """
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_length,
        ).to(self.device)

        logits = self.model(**inputs).logits

        # SPLADE: log(1 + ReLU(x))
        token_scores = torch.log1p(self.relu(logits))

        # Zero out padded positions. Scores are non-negative, so a zeroed
        # position can never win the max below.
        mask = inputs['attention_mask'].unsqueeze(-1).float()
        token_scores = token_scores * mask

        # Max pooling over dim 1, then take the first item of the batch
        sparse_repr = token_scores.max(dim=1).values[0]  # [vocab_size]

        # topk raises when k exceeds the dimension size, so clamp it
        k = min(top_k, sparse_repr.numel())
        top_values, top_indices = sparse_repr.topk(k)

        return [
            (self.tokenizer.decode([idx]).strip(), round(val, 4))
            for idx, val in zip(top_indices.tolist(), top_values.tolist())
            if val > 0
        ]

# Build the inference wrapper, using the GPU when one is available
# NOTE(review): the `max_length` read from the checkpoint config is not
# passed to the wrapper here; confirm the wrapper's truncation length
# matches it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
splade = SPLADEInference(base_model, tokenizer, device)
print(f"Inference device: {device}")

In [None]:
# Terms that were problematic in v21.3
problem_terms = ['추천', '데이터베이스', '증상', '질환', '인슐린']

print("=" * 60)
print("Problem Terms Evaluation (v21.4)")
print("=" * 60)

for term in problem_terms:
    results = splade.encode(term, top_k=10)
    print(f"\n{term}:")

    # Single pass over the five strongest tokens: print each one and record
    # whether any of them contains the queried term itself.
    has_self = False
    for token, weight in results[:5]:
        if term in token:
            marker = "←"
            has_self = True
        else:
            marker = ""
        print(f"  {token}: {weight:.4f} {marker}")

    status = "✓" if has_self else "✗"
    print(f"  Status: {status} Self-reconstruction")

In [None]:
# Test general terms
general_terms = ['검색', '법률', '의료', '암', '약물']

print("\n" + "=" * 60)
print("General Terms Evaluation")
print("=" * 60)

# Request up to 10 tokens per term but display only the first 5.
for term in general_terms:
    results = splade.encode(term, top_k=10)
    print(f"\n{term}:")
    for token, weight in results[:5]:
        print(f"  {token}: {weight:.4f}")

In [None]:
# Full-sentence queries to exercise the encoder
nl_queries = [
    '당뇨병 환자의 인슐린 치료 방법',
    '부동산 계약 해지 조건',
    '암 환자를 위한 식이요법 추천',
]

print("\n" + "=" * 60)
print("Natural Language Queries")
print("=" * 60)

for query in nl_queries:
    results = splade.encode(query, top_k=15)
    # Join the first 10 "token(weight)" pairs; the printed line keeps at
    # most 80 characters of that text and always ends with "...".
    token_summary = ', '.join(f'{t}({w:.2f})' for t, w in results[:10])
    print(f"\nQuery: {query}")
    print(f"Top tokens: {token_summary[:80]}...")

## 3. Save for HuggingFace

In [None]:
# Make sure the export directory exists (missing parents are created too)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Write model (with safe_serialization enabled) and tokenizer files
base_model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model saved to: {OUTPUT_DIR}")
print(f"\nSaved files:")

# List every entry of the export directory, sorted, with its size in MiB
bytes_per_mb = 1024 * 1024
for saved_path in sorted(OUTPUT_DIR.iterdir()):
    size_mb = saved_path.stat().st_size / bytes_per_mb
    print(f"  {saved_path.name}: {size_mb:.2f} MB")

In [None]:
# Ship modeling_splade.py alongside the weights for custom model loading
modeling_src = PROJECT_ROOT / "huggingface" / "modeling_splade.py"
modeling_dst = OUTPUT_DIR / "modeling_splade.py"

# A missing source file only produces a warning; the notebook keeps going
if not modeling_src.exists():
    print(f"Warning: {modeling_src} not found")
else:
    shutil.copy(modeling_src, modeling_dst)
    print(f"Copied modeling_splade.py")

In [None]:
# Create README
def _format_metric(value, spec, suffix=""):
    """Format a numeric metric for the README, or return "N/A".

    Args:
        value: Metric value; may be None or non-numeric when the checkpoint
            does not carry it.
        spec: Format spec applied to the value as a float (e.g. ".1f").
        suffix: Text appended after a successfully formatted number.

    Returns:
        str: The formatted number plus suffix, or "N/A" when the value
        cannot be converted to a float.
    """
    try:
        return format(float(value), spec) + suffix
    except (TypeError, ValueError):
        return "N/A"


# Metrics are formatted up front rather than inside the f-string:
# - within a replacement field `{{}}` is NOT an escaped brace pair, it is a
#   set literal containing a dict and raises TypeError (unhashable dict);
# - a 'N/A' fallback string cannot be rendered with a float format spec.
eval_results = checkpoint.get('eval_results') or {}
recall_text = _format_metric(eval_results.get('recall'), '.1f', '%')
mrr_text = _format_metric(eval_results.get('mrr'), '.4f')

readme_content = f'''---
language:
- ko
license: apache-2.0
library_name: transformers
tags:
- sparse-retrieval
- splade
- opensearch
- korean
- neural-sparse
pipeline_tag: feature-extraction
---

# Korean Neural Sparse Encoder v21.4

한국어 신경망 희소 인코더 - OpenSearch Neural Sparse 검색을 위한 SPLADE 기반 모델

## Model Description

This model is a SPLADE-based sparse encoder fine-tuned for Korean text, specifically optimized for:
- Legal domain terminology
- Medical domain terminology  
- General Korean synonym expansion

### v21.4 Improvements

- **Curriculum Learning**: 3-phase training focusing on single-terms → balanced → full coverage
- **Dynamic Lambda Self**: Higher weight (8.0) for single-term self-reconstruction, lower (4.0) for sentences
- **Minimum Activation Loss**: Prevents garbage outputs by ensuring meaningful top-k activations
- **Enhanced Training Data**: Added explicit single-term synonym pairs for problem terms

### Training Results

- **Best Epoch**: {checkpoint.get('epoch', 'N/A')}
- **Recall@1**: {recall_text}
- **MRR**: {mrr_text}

## Usage

```python
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import torch.nn as nn

# Load model
tokenizer = AutoTokenizer.from_pretrained("sewoong/korean-neural-sparse-encoder-v21.4")
model = AutoModelForMaskedLM.from_pretrained("sewoong/korean-neural-sparse-encoder-v21.4")

# Encode text
def encode(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        relu = nn.ReLU()
        token_scores = torch.log1p(relu(logits))
        mask = inputs["attention_mask"].unsqueeze(-1).float()
        sparse_repr = (token_scores * mask).max(dim=1).values[0]
    return sparse_repr

# Example
sparse = encode("당뇨병 치료 방법")
top_values, top_indices = sparse.topk(10)
for idx, val in zip(top_indices, top_values):
    print(f"{{tokenizer.decode([idx])}}: {{val:.4f}}")
```

## OpenSearch Integration

This model is designed to work with OpenSearch Neural Sparse Search. See the [OpenSearch documentation](https://opensearch.org/docs/latest/search-plugins/neural-sparse-search/) for integration details.

## Base Model

- **Base**: skt/A.X-Encoder-base
- **Parameters**: {sum(p.numel() for p in base_model.parameters()):,}
- **Vocabulary**: {tokenizer.vocab_size:,} tokens

## Citation

```bibtex
@misc{{korean-neural-sparse-v21.4,
  author = {{Sewoong Lee}},
  title = {{Korean Neural Sparse Encoder v21.4}},
  year = {{2025}},
  publisher = {{HuggingFace}},
  url = {{https://huggingface.co/sewoong/korean-neural-sparse-encoder-v21.4}}
}}
```
'''

# Written as UTF-8 explicitly because the README contains Korean text
with open(OUTPUT_DIR / "README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

print("README.md created")

## 4. Upload to HuggingFace Hub

In [None]:
# This cell uploads nothing; it only prints the commands to run manually.
# To upload, run in terminal:
# huggingface-cli login
# huggingface-cli upload sewoong/korean-neural-sparse-encoder-v21.4 ./huggingface/v21.4

print("Upload Instructions:")
print("="*60)
print("1. Login to HuggingFace:")
print("   huggingface-cli login")
print("\n2. Upload the model:")
# The printed command embeds the resolved OUTPUT_DIR path.
print(f"   huggingface-cli upload sewoong/korean-neural-sparse-encoder-v21.4 {OUTPUT_DIR}")
print("\n3. Or use Python API:")
print("   from huggingface_hub import HfApi")
print("   api = HfApi()")
print("   api.upload_folder(")
print(f"       folder_path='{OUTPUT_DIR}',")
print("       repo_id='sewoong/korean-neural-sparse-encoder-v21.4',")
print("       repo_type='model'")
print("   )")

In [None]:
# Closing banner: print the export directory and the remaining manual steps.
print("\n" + "="*60)
print("v21.4 HuggingFace Preparation Complete!")
print("="*60)
print(f"\nModel directory: {OUTPUT_DIR}")
print(f"\nNext steps:")
print("  1. Review the generated README.md")
print("  2. Upload to HuggingFace Hub")
print("  3. Run OpenSearch integration tests")