# Multilingual Symmetry Evaluation with Sliced KS

This notebook implements the core pieces of a **multilingual symmetry** evaluation pipeline:

1. Generate multiple answers per prompt, per `(model, language)` condition.
2. Encode answers with a **shared multilingual embedding model**.
3. Compute a **Sliced Kolmogorov–Smirnov (sKS)** distance between the two embedding distributions.
4. Aggregate sKS across random projection directions to get a **mean symmetry score with uncertainty**.

In [1]:

import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
from tqdm.auto import tqdm

from scipy.stats import ks_2samp

# Seed NumPy's legacy global RNG. This only affects code that draws from
# np.random.* directly; sliced_ks_distance builds its own generator from its
# random_state argument.
np.random.seed(12345)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# For multilingual embeddings
from sentence_transformers import SentenceTransformer

In [3]:
import torch
# Report the installed PyTorch build and whether the Apple MPS backend is usable.
print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"MPS built: {torch.backends.mps.is_built()}")

PyTorch version: 2.9.1
MPS available: True
MPS built: True


In [4]:

# ---------------------------------------------------------
# Configuration
# ---------------------------------------------------------

# Example prompts for quick testing (replace with dataset-driven prompts later).
# Each entry carries a unique "id", the same question in English ("en") and
# French ("fr"), and a "type" label that the aggregation step groups by.
PROMPTS: List[Dict[str, str]] = [
    # Factual
    {
        "id": "fact_1",
        "en": "Who discovered penicillin?",
        "fr": "Qui a découvert la pénicilline ?",
        "type": "factual",
    },
    {
        "id": "fact_2",
        "en": "What is the capital of Japan?",
        "fr": "Quelle est la capitale du Japon ?",
        "type": "factual",
    },
    # Open-ended / intent-like
    {
        "id": "open_1",
        "en": "How would you resolve a conflict between two colleagues at work?",
        "fr": "Comment résoudriez-vous un conflit entre deux collègues au travail ?",
        "type": "open",
    },
]


In [5]:
# Define conditions you want to compare.
# Each condition has a unique "name" (referenced by CONDITION_PAIRS), a "model"
# key understood by the active generation backend, and the prompt language "lang".
# For example, same model but different languages:
CONDITIONS: List[Dict[str, str]] = [
    {
        "name": "llama31_gguf_en",
        "model": "llama31_8b_instruct_gguf",  # internal key, not HF id
        "lang": "en",
    },
    {
        "name": "llama31_gguf_fr",
        "model": "llama31_8b_instruct_gguf",
        "lang": "fr",
    },
    #{"name": "aya23_en", "model": "CohereForAI/aya-23-8B", "lang": "en"}, # too slow on my mac!
    #{"name": "aya23_fr", "model": "CohereForAI/aya-23-8B", "lang": "fr"}, # too slow on my mac!
    #{"name": "llama31_en", "model": "meta-llama/Llama-3.1-8B-Instruct", "lang": "en"},
    #{"name": "llama31_fr", "model": "meta-llama/Llama-3.1-8B-Instruct", "lang": "fr"},
    # You can add more, e.g. another model:
    # {"name": "modelB_en", "model": "MODEL_B_NAME", "lang": "en"},
    # {"name": "modelB_fr", "model": "MODEL_B_NAME", "lang": "fr"},
]

# Pairs of conditions to compare with the symmetry metric.
# Both members of a pair must be "name" values present in CONDITIONS, because the
# evaluation loop looks their embeddings up by name.
CONDITION_PAIRS: List[Tuple[str, str]] = [
    ("llama31_gguf_en", "llama31_gguf_fr"),
    # e.g. ("modelA_en", "modelB_en"),
    # e.g. ("modelA_fr", "modelB_fr"),
]

In [7]:
# Number of stochastic samples per (prompt, condition).
# Passed as n_samples to the generation function in the evaluation loop.
N_SAMPLES_PER_CONDITION: int = 16

# Number of random projection directions for sliced KS.
# Passed as n_directions to sliced_ks_distance.
N_DIRECTIONS: int = 64

# Random seed for reproducibility.
# Passed as random_state to sliced_ks_distance, so every prompt is scored with
# the same set of projection directions.
RANDOM_STATE: int = 123

## Answer generation

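In this section, we define the functions that generate multiple answers from each model.
Two local backends are provided: a HuggingFace `transformers` pipeline and a GGUF Llama model run through `llama-cpp-python`.
You can plug in your own LLM backend (OpenAI, Cohere, Anthropic, other local models, etc.) by writing a function with the same signature.

The notebook uses whichever backend is assigned to `generate_answers_for_condition` (the `llama-cpp-python` one by default).
There is no dummy generator, so running the rest of the pipeline requires a model to be available locally.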

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Cache so we don't reload the same model repeatedly.
# load_hf_model itself never touches this dict; callers store the pipelines it
# returns here, keyed by model name.
HF_MODEL_CACHE = {}

def load_hf_model(model_name: str):
    """
    Load a HuggingFace causal LM + tokenizer locally and return a generation pipeline.

    The model is moved to the first available accelerator, checked in the order
    CUDA, then Apple MPS, with the CPU as fallback. Weights are loaded in float16
    on an accelerator and float32 on the CPU.

    Parameters
    ----------
    model_name : str
        HuggingFace model id or local path, forwarded to ``from_pretrained``.

    Returns
    -------
    A ``text-generation`` pipeline wrapping the loaded model and tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(model_name)

    # Decide device: prefer CUDA, then MPS, otherwise stay on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print("Using device:", device)

    # Choose dtype (float16 on GPU, float32 on CPU)
    dtype = torch.float16 if device != "cpu" else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        dtype=dtype,
    )

    # Move model to the chosen device
    model.to(device)

    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tok,
    )

In [15]:
# Download the quantized (Q4_K_M) GGUF build of Llama 3.1 8B Instruct from the HuggingFace Hub.
# This variant is small enough to run locally through llama.cpp, which uses Metal (not PyTorch MPS) on a Mac.

import os
from huggingface_hub import hf_hub_download

# HuggingFace repo + filename of the GGUF weights to fetch
REPO_ID = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
FILENAME = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"

# Local directory that receives the file (created on first run)
GGUF_DIR = "./models"
os.makedirs(GGUF_DIR, exist_ok=True)

print(f"Downloading {FILENAME} ...")
local_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=GGUF_DIR)
print("Downloaded GGUF model to:", local_path)

Downloading Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf ...
Downloaded GGUF model to: models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf


In [16]:
from llama_cpp import Llama

# Internal model key (the "model" field of a condition) -> GGUF file on disk.
GGUF_PATHS = {
    "llama31_8b_instruct_gguf": "models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
}

# Llama instances that have already been loaded, keyed by model key.
LLAMA_CACHE = {}

def load_llama_gguf(model_key: str) -> Llama:
    """
    Return the llama-cpp model registered under ``model_key``.

    The model is loaded from the path in ``GGUF_PATHS`` on first use and stored in
    ``LLAMA_CACHE``; later calls with the same key return the cached instance.
    An unknown key raises ``KeyError`` from the ``GGUF_PATHS`` lookup.
    """
    if model_key not in LLAMA_CACHE:
        model_file = GGUF_PATHS[model_key]
        print(f"Loading GGUF model from: {model_file}")

        LLAMA_CACHE[model_key] = Llama(
            model_path=model_file,
            n_ctx=4096,
            n_threads=8,      # tune for your CPU
            n_gpu_layers=50,  # offload layers to GPU via Metal
        )

    return LLAMA_CACHE[model_key]

In [17]:

def generate_answers_for_condition_hf(
    prompt_en: str,
    prompt_fr: str,
    condition: Dict,
    n_samples: int = 8,
) -> List[str]:
    """
    Generate multiple answers using a local HuggingFace model.

    Parameters
    ----------
    prompt_en, prompt_fr : str
        English and French versions of the same semantic prompt.
    condition : dict
        Contains "model", "name", and "lang". "lang" must be "en" or "fr";
        "model" is the HuggingFace id passed to ``load_hf_model``.
    n_samples : int
        Number of stochastic generations.

    Returns
    -------
    List[str]
        Generated answers, one per sample, containing only the newly generated
        text (the prompt is not echoed back).

    Raises
    ------
    ValueError
        If ``condition["lang"]`` is neither "en" nor "fr".
    """
    lang = condition["lang"]
    model_name = condition["model"]

    # Choose the appropriate language prompt. An unknown language is rejected
    # rather than silently answered with the French prompt. This check runs
    # before any model is loaded so a misconfigured condition fails cheaply.
    prompts_by_lang = {"en": prompt_en, "fr": prompt_fr}
    if lang not in prompts_by_lang:
        raise ValueError(
            f"Unsupported language {lang!r}; expected one of {sorted(prompts_by_lang)}."
        )
    prompt = prompts_by_lang[lang]

    # Load model if not already cached
    if model_name not in HF_MODEL_CACHE:
        HF_MODEL_CACHE[model_name] = load_hf_model(model_name)

    pipe = HF_MODEL_CACHE[model_name]

    outputs = []
    for _ in range(n_samples):
        resp = pipe(
            prompt,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            # By default the pipeline returns prompt + completion. Keep only the
            # completion so the prompt does not leak into the answer embeddings.
            return_full_text=False,
        )[0]["generated_text"]
        outputs.append(resp)

    return outputs


def generate_answers_for_condition_llama(
    prompt_en: str,
    prompt_fr: str,
    condition: Dict,
    n_samples: int = 8,
) -> List[str]:
    """
    Generate multiple answers using a local GGUF Llama model via llama-cpp-python.

    Parameters
    ----------
    prompt_en, prompt_fr : str
        English and French versions of the same semantic prompt.
    condition : dict
        Contains "model", "name", and "lang". "lang" must be "en" or "fr";
        "model" is the key passed to ``load_llama_gguf``.
    n_samples : int
        Number of stochastic generations.

    Returns
    -------
    List[str]
        Generated answers, one per sample.

    Raises
    ------
    ValueError
        If ``condition["lang"]`` is neither "en" nor "fr".
    """
    lang = condition["lang"]
    model_key = condition["model"]

    # Select the correct language prompt. An unknown language is rejected rather
    # than silently answered with the French prompt. This check runs before the
    # model is loaded so a misconfigured condition fails cheaply.
    prompts_by_lang = {"en": prompt_en, "fr": prompt_fr}
    if lang not in prompts_by_lang:
        raise ValueError(
            f"Unsupported language {lang!r}; expected one of {sorted(prompts_by_lang)}."
        )
    prompt = prompts_by_lang[lang]

    # Load GGUF model (cached after first use)
    llm = load_llama_gguf(model_key)

    # NOTE(review): the prompt is sent as a plain completion request, with no chat
    # template applied. Confirm that is intended for an instruction-tuned model.
    outputs = []
    for _ in range(n_samples):
        resp = llm(
            prompt,
            max_tokens=128,
            temperature=0.7,
            top_p=0.9,
        )
        # llama_cpp returns a dict with 'choices'
        outputs.append(resp["choices"][0]["text"])

    return outputs


In [18]:
# Select the generation backend used by the sanity check and the evaluation loop.
# Both backends take (prompt_en, prompt_fr, condition, n_samples), so assigning
# generate_answers_for_condition_hf here switches to the HuggingFace path.
generate_answers_for_condition = generate_answers_for_condition_llama

## Multilingual embedding model

We now load a single **multilingual sentence encoder** that will embed **all** answers
into a shared semantic vector space. This decouples the evaluation geometry from any particular LLM.

In [19]:
import torch
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL_NAME = "sentence-transformers/LaBSE"

def load_embedding_model(model_name: str = EMBEDDING_MODEL_NAME):
    """
    Load the shared multilingual sentence encoder on the best available device.

    The device is chosen in the order CUDA, then Apple MPS, then CPU, and is
    passed to the constructor so the model is created on it directly.

    Parameters
    ----------
    model_name : str
        sentence-transformers model id or local path.

    Returns
    -------
    The loaded ``SentenceTransformer`` instance.

    Raises
    ------
    ImportError
        If ``SentenceTransformer`` is ``None`` (package not installed).
    """
    if SentenceTransformer is None:
        raise ImportError(
            "sentence-transformers is not installed. "
            "Install it with `pip install sentence-transformers`."
        )

    print(f"Loading embedding model: {model_name}")

    # Prefer CUDA, then MPS (Mac GPU), otherwise stay on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    print("Embedding model using device:", device)

    return SentenceTransformer(model_name, device=device)

embedding_model = load_embedding_model()

Loading embedding model: sentence-transformers/LaBSE
Embedding model using device: mps


In [20]:

def embed_texts(texts: List[str]):
    """Embed ``texts`` with the shared ``embedding_model``.

    Returns an array with one row per input text. For an empty (falsy) input the
    result is a zero-row array whose width is the model's embedding dimension.
    """
    if texts:
        return embedding_model.encode(
            texts,
            convert_to_numpy=True,
            show_progress_bar=False,
        )

    # Nothing to encode: keep the column count so callers can still read shape[1].
    dim = embedding_model.get_sentence_embedding_dimension()
    return np.zeros((0, dim))


In [21]:
# ---------------------------------------------------------
# Sanity check: make sure embedding model and LLMs work
# ---------------------------------------------------------

def sanity_check_models(
    n_samples_per_condition: int = 2,
):
    """
    Smoke-test the embedding model and every entry of CONDITIONS.

    Steps:
    1. Embed two fixed sentences and check the result is a 2D array with one
       row per sentence.
    2. For each condition, generate ``n_samples_per_condition`` answers to a
       short bilingual test prompt.
    3. Embed those answers and check one row comes back per answer.

    Progress is printed along the way. The first failure is reported and then
    re-raised, so nothing after it is checked.
    """
    print("=== Sanity check: embedding model ===")
    try:
        probe_texts = ["Hello world", "Bonjour le monde"]
        probe_embs = embed_texts(probe_texts)
        assert probe_embs.shape[0] == len(probe_texts), "Unexpected embedding batch size."
        assert probe_embs.ndim == 2, "Embeddings should be 2D (batch, dim)."
        print(f"Embeddings OK. Shape: {probe_embs.shape}")
    except Exception as err:
        print("❌ Embedding model failed:")
        raise err

    print("\n=== Sanity check: LLM conditions ===")
    # Same short request in both languages; the generator picks one per condition.
    intro_en = "Briefly introduce yourself in one sentence."
    intro_fr = "Présente-toi en une phrase."

    for cond in CONDITIONS:
        name, model, lang = cond["name"], cond["model"], cond["lang"]
        print(f"\n-- Testing condition: {name} ({model} / {lang})")
        try:
            answers = generate_answers_for_condition(
                prompt_en=intro_en,
                prompt_fr=intro_fr,
                condition=cond,
                n_samples=n_samples_per_condition,
            )
            if not answers:
                raise ValueError("No answers returned.")
            print(f"  Generated {len(answers)} answers.")

            # Show a single-line preview of the first answer, capped at 120 chars.
            first = answers[0]
            preview = first[:120].replace("\n", " ")
            if len(first) > 120:
                preview += "..."
            print("  Sample answer:", preview)

            # The answers must also be embeddable, one row each.
            answer_embs = embed_texts(answers)
            assert answer_embs.shape[0] == len(answers), "Embedding batch size mismatch."
            print(f"  Embeddings OK. Shape: {answer_embs.shape}")
        except Exception as err:
            print(f"❌ Condition {name} failed:")
            raise err

    print("\n✅ Sanity check passed: embedding model and all CONDITIONS are usable.")


# Run this once before the evaluation loop
sanity_check_models()

llama_model_load_from_file_impl: using device Metal (Apple M4) - 7283 MiB free
llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llama_model_loader: - kv   6:                     

=== Sanity check: embedding model ===
Embeddings OK. Shape: (2, 768)

=== Sanity check: LLM conditions ===

-- Testing condition: llama31_gguf_en (llama31_8b_instruct_gguf / en)
Loading GGUF model from: models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf


init_tokenizer: initializing tokenizer for type 2
load: control token: 128254 '<|reserved_special_token_246|>' is not marked as EOG
load: control token: 128252 '<|reserved_special_token_244|>' is not marked as EOG
load: control token: 128251 '<|reserved_special_token_243|>' is not marked as EOG
load: control token: 128250 '<|reserved_special_token_242|>' is not marked as EOG
load: control token: 128248 '<|reserved_special_token_240|>' is not marked as EOG
load: control token: 128247 '<|reserved_special_token_239|>' is not marked as EOG
load: control token: 128245 '<|reserved_special_token_237|>' is not marked as EOG
load: control token: 128244 '<|reserved_special_token_236|>' is not marked as EOG
load: control token: 128243 '<|reserved_special_token_235|>' is not marked as EOG
load: control token: 128240 '<|reserved_special_token_232|>' is not marked as EOG
load: control token: 128238 '<|reserved_special_token_230|>' is not marked as EOG
load: control token: 128235 '<|reserved_special_

  Generated 2 answers.
  Sample answer:  My name is Kaitlin, and I am a senior at the University of Nebraska-Lincoln studying journalism and public relations. W...
  Embeddings OK. Shape: (2, 768)

-- Testing condition: llama31_gguf_fr (llama31_8b_instruct_gguf / fr)


Llama.generate: 1 prefix-match hit, remaining 9 prompt tokens to eval
llama_perf_context_print:        load time =     192.83 ms
llama_perf_context_print: prompt eval time =     172.01 ms /     9 tokens (   19.11 ms per token,    52.32 tokens per second)
llama_perf_context_print:        eval time =    6294.47 ms /   127 runs   (   49.56 ms per token,    20.18 tokens per second)
llama_perf_context_print:       total time =    6543.23 ms /   136 tokens
llama_perf_context_print:    graphs reused =        122
Llama.generate: 9 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =     192.83 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    6595.77 ms /   128 runs   (   51.53 ms per token,    19.41 tokens per second)
llama_perf_context_print:       total time =    6698.04 ms /   129 tokens
llama_perf_context_print:    graph

  Generated 2 answers.
  Sample answer:  Je m'appelle Yacine, je suis un étudiant algérien qui étudie la philosophie à l'Université d'Alger. Sais-tu ce que sign...
  Embeddings OK. Shape: (2, 768)

✅ Sanity check passed: embedding model and all CONDITIONS are usable.


## Sliced Kolmogorov–Smirnov (sKS) distance

We now implement the **sliced KS** metric:

1. Draw `L` random unit vectors in embedding space (`L` is `n_directions` in the code).
2. Project the two sets of embeddings onto each vector.
3. Compute the 1D KS distance per direction.
4. Aggregate over directions to get a mean, standard deviation, and confidence interval.

In [29]:

@dataclass
class SlicedKSMetrics:
    """Summary of the per-direction KS statistics produced by sliced_ks_distance."""

    # Mean KS statistic over all projection directions.
    mean: float
    # Sample standard deviation (ddof=1) of the per-direction KS statistics.
    std: float
    # Standard error of the mean: std / sqrt(number of directions).
    sem: float
    # Lower bound of the approximate 95% interval (mean - 1.96 * sem), clipped at 0.
    ci_low: float
    # Upper bound of the approximate 95% interval (mean + 1.96 * sem), clipped at 1.
    ci_high: float
    # KS statistic for each individual projection direction.
    per_direction: np.ndarray


def _ks_1d(a: np.ndarray, b: np.ndarray) -> float:
    """Two-sample Kolmogorov–Smirnov statistic for a pair of 1D samples.

    Returns 0.0 when either sample is empty. Otherwise the statistic comes from
    scipy's ``ks_2samp`` when that name is not ``None``, and from a direct
    comparison of the two empirical CDFs when it is.
    """
    if min(a.size, b.size) == 0:
        return 0.0

    if ks_2samp is None:
        # Manual KS: evaluate both empirical CDFs at every observed value and
        # take the largest absolute gap.
        xs = np.sort(a)
        ys = np.sort(b)
        grid = np.concatenate([xs, ys])
        ecdf_x = np.searchsorted(xs, grid, side="right") / xs.size
        ecdf_y = np.searchsorted(ys, grid, side="right") / ys.size
        return np.max(np.abs(ecdf_x - ecdf_y))

    result = ks_2samp(a, b, alternative="two-sided", mode="auto")
    return result.statistic


def sliced_ks_distance(
    emb_A: np.ndarray,
    emb_B: np.ndarray,
    n_directions: int = 64,
    random_state: Optional[int] = None,
) -> SlicedKSMetrics:
    """Compute Sliced Kolmogorov–Smirnov distance between two embedding sets.

    Both sets are projected onto ``n_directions`` random unit vectors; a 1D KS
    statistic is computed per direction and then summarised.

    Parameters
    ----------
    emb_A : np.ndarray
        Embeddings of set A, shape (N_A, D).
    emb_B : np.ndarray
        Embeddings of set B, shape (N_B, D).
    n_directions : int
        Number of random projection directions. Must be at least 1.
    random_state : int, optional
        Seed for reproducibility. The same seed yields the same directions.

    Returns
    -------
    SlicedKSMetrics
        Container with mean, std, sem, CI, and per-direction KS values.
        The std, sem and CI describe variability across projection directions
        only; they do not account for the finite number of samples in each set.

    Raises
    ------
    ValueError
        If ``n_directions`` is smaller than 1.
    AssertionError
        If the two embedding sets have different dimensionality.
    """
    # With no directions the mean is undefined; previously this returned
    # mean=NaN with a CI of [0, 1] (or an opaque NumPy error for negative
    # values), so reject it explicitly instead.
    if n_directions < 1:
        raise ValueError(
            f"n_directions must be a positive integer, got {n_directions}."
        )

    # NOTE(review): an empty set is reported as distance 0 (i.e. perfect
    # symmetry downstream). Confirm that is the intended convention.
    if emb_A.size == 0 or emb_B.size == 0:
        return SlicedKSMetrics(
            mean=0.0, std=0.0, sem=0.0, ci_low=0.0, ci_high=0.0,
            per_direction=np.zeros(n_directions)
        )

    assert emb_A.shape[1] == emb_B.shape[1], "Embedding dimensions must match."
    d = emb_A.shape[1]

    rng = np.random.default_rng(random_state)

    ks_values = np.empty(n_directions)
    for i in range(n_directions):
        # Sample a random direction on the unit sphere: a normalised Gaussian
        # vector. The epsilon guards against division by a zero norm.
        v = rng.normal(size=d)
        v /= np.linalg.norm(v) + 1e-12

        ks_values[i] = _ks_1d(emb_A @ v, emb_B @ v)

    mean = float(ks_values.mean())
    # The sample std needs at least two directions.
    std = float(ks_values.std(ddof=1)) if n_directions > 1 else 0.0
    sem = float(std / np.sqrt(n_directions))
    # 95% CI (approx), clipped to the valid KS range [0, 1]
    ci_low = max(0.0, mean - 1.96 * sem)
    ci_high = min(1.0, mean + 1.96 * sem)

    return SlicedKSMetrics(
        mean=mean,
        std=std,
        sem=sem,
        ci_low=ci_low,
        ci_high=ci_high,
        per_direction=ks_values,
    )


In [30]:

def symmetry_from_sks(metrics: SlicedKSMetrics) -> Dict[str, float]:
    """Re-express sliced KS metrics as a symmetry score.

    Symmetry is ``1 - mean KS``. The spread (std, sem) carries over unchanged,
    while the confidence bounds are mirrored: the upper KS bound becomes the
    lower symmetry bound, and vice versa.
    """
    return {
        "sym_mean": 1.0 - metrics.mean,
        "sym_std": metrics.std,
        "sym_sem": metrics.sem,
        "sym_ci_low": 1.0 - metrics.ci_high,
        "sym_ci_high": 1.0 - metrics.ci_low,
    }


## Evaluation loop

This cell ties everything together:

1. Iterate over prompts.
2. For each `(condition, prompt)` pair, generate multiple answers and embed them.
3. For each **condition pair**, compute the sliced KS metrics and corresponding symmetry scores.
4. Collect results in a DataFrame for further analysis.

In [None]:

def build_condition_lookup(conditions: List[Dict]) -> Dict[str, Dict]:
    """Index condition dicts by their "name" field."""
    return {c["name"]: c for c in conditions}


condition_lookup = build_condition_lookup(CONDITIONS)

# Fail fast on misconfigured pairs. Without this check, a bad name only surfaces
# as a bare KeyError after all generations for the first prompt have run.
unknown_condition_names = sorted(
    {name for pair in CONDITION_PAIRS for name in pair} - set(condition_lookup)
)
if unknown_condition_names:
    raise KeyError(
        "CONDITION_PAIRS references conditions not defined in CONDITIONS: "
        f"{unknown_condition_names}"
    )

# One row per (prompt, condition pair); turned into a DataFrame at the end.
results = []

for prompt in tqdm(PROMPTS, desc="Prompts"):
    prompt_id = prompt["id"]
    prompt_type = prompt.get("type", "unknown")
    en_text = prompt["en"]
    fr_text = prompt["fr"]

    # Cache answers and embeddings per condition (rebuilt for every prompt)
    answers_by_condition: Dict[str, List[str]] = {}
    emb_by_condition: Dict[str, np.ndarray] = {}

    # Generate answers and embeddings for each condition
    for cond in CONDITIONS:
        cond_name = cond["name"]
        answers = generate_answers_for_condition(
            prompt_en=en_text,
            prompt_fr=fr_text,
            condition=cond,
            n_samples=N_SAMPLES_PER_CONDITION,
        )
        answers_by_condition[cond_name] = answers
        emb_by_condition[cond_name] = embed_texts(answers)

    # Now compute symmetry metrics for each condition pair
    for cond_a, cond_b in CONDITION_PAIRS:
        emb_A = emb_by_condition[cond_a]
        emb_B = emb_by_condition[cond_b]

        # The same RANDOM_STATE is used for every prompt, so all prompts are
        # scored against the same projection directions.
        sks_metrics = sliced_ks_distance(
            emb_A,
            emb_B,
            n_directions=N_DIRECTIONS,
            random_state=RANDOM_STATE,
        )
        sym = symmetry_from_sks(sks_metrics)

        results.append(
            {
                "prompt_id": prompt_id,
                "prompt_type": prompt_type,
                "condition_A": cond_a,
                "condition_B": cond_b,
                "ks_mean": sks_metrics.mean,
                "ks_std": sks_metrics.std,
                "ks_sem": sks_metrics.sem,
                "ks_ci_low": sks_metrics.ci_low,
                "ks_ci_high": sks_metrics.ci_high,
                "sym_mean": sym["sym_mean"],
                "sym_std": sym["sym_std"],
                "sym_sem": sym["sym_sem"],
                "sym_ci_low": sym["sym_ci_low"],
                "sym_ci_high": sym["sym_ci_high"],
            }
        )

results_df = pd.DataFrame(results)
results_df


Prompts:   0%|          | 0/3 [00:00<?, ?it/s]`torch_dtype` is deprecated! Use `dtype` instead!


Using device: mps


Fetching 4 files:   0%|          | 0/4 [02:02<?, ?it/s]


## Aggregate symmetry scores

We can now aggregate symmetry metrics across prompts, or restrict to factual vs open-ended prompts.

In [None]:

# Summarise per-prompt symmetry for every (condition pair, prompt type) group:
# the mean and spread of the per-prompt sym_mean, plus how many distinct prompts
# contributed. The group keys stay as ordinary columns.
agg = results_df.groupby(
    ["condition_A", "condition_B", "prompt_type"],
    as_index=False,
).agg(
    sym_mean=("sym_mean", "mean"),
    sym_std=("sym_mean", "std"),
    n_prompts=("prompt_id", "nunique"),
)

agg


You can extend this section with:

- Boxplots or violin plots of `sym_mean` per prompt,
- Separate reporting for factual vs open-ended prompts,
- Per-prompt inspection of the most asymmetric cases (lowest `sym_mean`).