# SCALE-Align: Quick Start in Google Colab

This notebook demonstrates how to use the `scale-align` package for classification alignment in Google Colab.

## 1. Installation

In [None]:
# Install the package
!pip install scale-align

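**⚠️ Important:** After installation, you may need to restart the runtime so that the newly installed packages are picked up:
- Click `Runtime` → `Restart session` in the menu (labelled `Restart runtime` in older Colab versions)
- Then continue with the cells below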

## 2. Verify Installation & GPU

In [None]:
import torch
from scale_align import E5Embedder

# Check GPU availability
# Prints the PyTorch version and whether CUDA is usable. When it is, also
# prints the name and total memory (decimal GB) of device 0; when it is not,
# prints a hint on how to enable a GPU in Colab.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    # The embedder cell below falls back to 'cpu' in this case; point the
    # reader at the Colab setting in case a GPU was expected.
    print("No GPU detected. To enable one in Colab: "
          "Runtime → Change runtime type → Hardware accelerator.")

## 3. Initialize the Embedder

In [None]:
# Build the E5 embedder (the model is downloaded on first run).
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

embedder = E5Embedder(batch_size=32, device=device)

print(f"\n✅ Embedder initialized successfully on {device}!")

## 4. Quick Test: Encode Sample Texts

In [None]:
# Sample classification descriptions
# Three short activity descriptions used as a quick smoke test of the embedder.
sample_texts = [
    "Growing of cereals (except rice), leguminous crops and oil seeds",
    "Growing of rice",
    "Growing of vegetables and melons, roots and tubers"
]

# Encode as passages
# The whole list is passed in a single call. The result is expected to expose
# a `.shape` with at least two entries, since `shape[1]` is read below.
embeddings = embedder.encode_passages(sample_texts)

# Report the number of inputs, the full result shape, and the second shape
# entry, which this notebook labels as the embedding dimension.
print(f"Encoded {len(sample_texts)} texts")
print(f"Embedding shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")

## 5. Full Example: Classification Alignment

In [None]:
from scale_align import BidirectionalRetriever, CompetitiveSelector
import numpy as np

# Example: ISIC (source) vs NACE (target) alignment
# Toy descriptions for both sides; list positions are the indices shown in
# the report printed at the end of this cell.
source_texts = [
    "Growing of cereals",
    "Growing of rice",
    "Growing of wheat and barley",
    "Growing of vegetables"
]

target_texts = [
    "Growing of cereals (except rice)",
    "Growing of rice",
    "Growing of other cereals",
    "Growing of vegetables and melons"
]

# Embed both sides with the embedder created earlier in the notebook.
print("Encoding source texts...")
source_embeddings = embedder.encode_passages(source_texts)

print("Encoding target texts...")
target_embeddings = embedder.encode_passages(target_texts)

# Score source/target pairs from the two embedding sets.
print("\nPerforming bidirectional retrieval...")
retriever = BidirectionalRetriever()
scores = retriever.retrieve(source_embeddings, target_embeddings)

# Reduce the scores to the selected targets per source entry.
print("\nApplying competitive selection...")
selector = CompetitiveSelector(margin=0.05, threshold=0.7)
selected = selector.select(scores)

# Report: each entry of `selected` is unpacked as (target indices, score);
# a source with no selected targets is reported as NO MATCH.
banner = "=" * 60
print("\n" + banner)
print("ALIGNMENT RESULTS")
print(banner)
for src_idx, (tgt_indices, score) in enumerate(selected):
    print(f"\n[{src_idx}] {source_texts[src_idx]}")
    if not tgt_indices:
        print(f"  → [NO MATCH] (score: {score:.4f})")
        continue
    for tgt_idx in tgt_indices:
        print(f"  → [{tgt_idx}] {target_texts[tgt_idx]} (score: {score:.4f})")
print(banner)

## 6. Using CLI with Sample Data

In [None]:
import os
import json

# Create sample directory structure
os.makedirs('data/isic', exist_ok=True)
os.makedirs('data/nace', exist_ok=True)

# Create sample ISIC file
with open('data/isic/A0111.txt', 'w') as f:
    f.write('Growing of cereals\n')
    f.write('Growing of rice\n')
    f.write('Growing of wheat\n')

# Create sample NACE file
with open('data/nace/A01.11.txt', 'w') as f:
    f.write('Growing of cereals (except rice)\n')
    f.write('Growing of rice\n')
    f.write('Growing of other cereals\n')

# Create correspondence file
correspondence = [["A0111", "A01.11"]]
with open('correspondence.json', 'w') as f:
    json.dump(correspondence, f)

print("✅ Sample data created!")
print("\nDirectory structure:")
!tree data 2>/dev/null || find data -type f

In [None]:
# Run the CLI tool
!scale-align \
    --source-dir data/isic \
    --target-dir data/nace \
    --correspondence correspondence.json \
    --output-dir output \
    --threshold 0.7 \
    --margin 0.05 \
    --device cuda

In [None]:
# View results
# List everything in the directory passed to the CLI as --output-dir.
print("Output files:")
!ls -la output/

# Print the alignment file for the sample pair.
# NOTE(review): the file name presumably follows a <source>_<target> pattern
# built from the correspondence entry ["A0111", "A01.11"]; confirm against
# the CLI's output naming.
print("\nAlignment results:")
!cat output/A0111_A01.11.txt