# Language Response Asymmetry (Cohere)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from dotenv import load_dotenv
import cohere

# Load variables from a local .env file (if one exists) before reading the key.
load_dotenv()

# Stop immediately with a clear message when the key is unset or empty,
# rather than failing later inside the first API call.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
if COHERE_API_KEY in (None, ""):
    raise RuntimeError("Missing COHERE_API_KEY in environment/.env")

In [4]:
# Shared Cohere v2 client; the generation and embedding helpers below use this global.
co = cohere.ClientV2(api_key=COHERE_API_KEY)

In [5]:
# Generation models (examples; adjust to what your grant enables).
MODEL_AYA = "c4ai-aya-expanse-8b"  # main model: used by the aya_* entries in CONDITIONS
MODEL_COMMAND = "command-r"   # optional baseline; no condition defined here references it yet

# Embedding model: the default `model` argument of cohere_embed_texts.
MODEL_EMBED = "embed-multilingual-v3.0"

In [6]:
# Experimental conditions. Each entry carries:
#   "name"  - key used to store per-condition results and to refer to the
#             condition from CONDITION_PAIRS
#   "model" - generation model passed to the chat call
#   "lang"  - language code that decides which prompt text is sent
# NOTE: cohere_generate_answers only receives English and French prompt text,
# so enabling the commented-out "it"/"sw" entries also requires extending the
# prompt handling there.
CONDITIONS = [
    {"name": "aya_en", "model": MODEL_AYA, "lang": "en"},
    {"name": "aya_fr", "model": MODEL_AYA, "lang": "fr"},
    # later:
    # {"name": "aya_it", "model": MODEL_AYA, "lang": "it"},
    # {"name": "aya_sw", "model": MODEL_AYA, "lang": "sw"},
]

# Pairs of condition names (A, B) whose answer embeddings are compared.
# Every name listed here must match the "name" of an entry in CONDITIONS.
CONDITION_PAIRS = [
    ("aya_en", "aya_fr"),
    # later:
    # ("aya_en", "aya_it"),
    # ("aya_en", "aya_sw"),
]

In [7]:
# Run parameters; each one can be overridden through an environment variable
# of the same name.

# Stochastic generations drawn for every (prompt, condition) combination.
N_SAMPLES_PER_CONDITION = int(os.getenv("N_SAMPLES_PER_CONDITION", "2"))

# Count of random projection directions used by the sliced KS statistic.
N_DIRECTIONS = int(os.getenv("N_DIRECTIONS", "64"))

# Seed passed to the sliced KS computation for reproducibility.
RANDOM_STATE = int(os.getenv("RANDOM_STATE", "12345"))

In [8]:
from typing import Dict, List

def cohere_generate_answers(
    prompt_en: str,
    prompt_fr: str,
    condition: Dict,
    n_samples: int = 8,
    max_tokens: int = 64,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> List[str]:
    """Sample ``n_samples`` chat completions for one prompt under one condition.

    Args:
        prompt_en: English version of the prompt.
        prompt_fr: French version of the prompt.
        condition: Mapping with at least ``"lang"`` (``"en"`` or ``"fr"``) and
            ``"model"`` (name of the generation model).
        n_samples: Number of independent chat calls to make.
        max_tokens: Token cap forwarded to each chat call.
        temperature: Sampling temperature forwarded to each chat call.
        top_p: Nucleus-sampling threshold, forwarded to the API as ``p``.

    Returns:
        The generated texts, one per chat call, in call order.

    Raises:
        KeyError: If ``condition`` lacks ``"lang"`` or ``"model"``.
        ValueError: If ``condition["lang"]`` is neither ``"en"`` nor ``"fr"``.
    """
    lang = condition["lang"]
    model = condition["model"]

    # Select the prompt explicitly. A bare "else French" fallback would send the
    # French prompt for any other language code (e.g. "it", "sw") and silently
    # mislabel those runs.
    prompts_by_lang = {"en": prompt_en, "fr": prompt_fr}
    if lang not in prompts_by_lang:
        raise ValueError(
            f"Unsupported condition language {lang!r}; "
            f"expected one of {sorted(prompts_by_lang)}"
        )
    prompt = prompts_by_lang[lang]

    outputs = []
    for _ in range(n_samples):
        resp = co.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            p=top_p,  # the chat endpoint names the top-p parameter `p`
            max_tokens=max_tokens,
        )
        # v2 responses expose the text under message.content; fall back to
        # `.text` for response objects that have no `message` attribute.
        text = resp.message.content[0].text if hasattr(resp, "message") else resp.text
        outputs.append(text)

    return outputs

In [9]:
import numpy as np

def cohere_embed_texts(
    texts: List[str],
    model: str = MODEL_EMBED,
    input_type: str = "search_document",
) -> np.ndarray:
    """Embed a list of texts with a single Cohere embed call.

    Args:
        texts: Strings to embed.
        model: Embedding model name.
        input_type: Value forwarded as the embed call's ``input_type``.

    Returns:
        The float embeddings from the response as a ``float32`` array
        (presumably one row per input text; confirm against the API).
    """

    def _as_text_input(text: str) -> dict:
        # Wrap a plain string in the content-block structure sent as `inputs`.
        return {"content": [{"type": "text", "text": text}]}

    payload = list(map(_as_text_input, texts))

    response = co.embed(
        embedding_types=["float"],
        input_type=input_type,
        inputs=payload,
        model=model,
    )

    # Only the "float" embedding type was requested, so read that field.
    vectors = response.embeddings.float
    return np.asarray(vectors, dtype=np.float32)

In [15]:
# Smoke test: generate two answers for one factual prompt and embed them, to
# confirm the generation and embedding helpers work before the full run.
test_prompt_en = "Who discovered penicillin?"
test_prompt_fr = "Qui a découvert la pénicilline ?"

# CONDITIONS[0] is the English Aya condition, so only the English prompt is sent.
answers = cohere_generate_answers(test_prompt_en, test_prompt_fr, CONDITIONS[0], n_samples=2)
print("Sample answers:", answers)

# Expect one embedding row per answer; check this against the printed shape.
E = cohere_embed_texts(answers)
print("Embeddings shape:", E.shape)

Sample answers: ['Penicillin was discovered by Alexander Fleming, a Scottish biologist, and pharmacist, in 1928. Fleming noticed that mold, specifically Penicillium notatum, had grown in a culture dish of Staphylococcus bacteria, causing the bacteria to deteriorate. This observation led him to conclude that the mold produced a substance that could kill or inhibit the growth of bacteria.\n\nFleming\'s discovery was a significant milestone in the field of medicine and marked the beginning of the antibiotic era. He named the substance "penicillin" and shared his findings with other scientists, including Howard Florey and Ernst Chain, who later played crucial roles', "Penicillin was discovered by Alexander Fleming, a Scottish bacteriologist, in 1928. Fleming noticed that a mold, later identified as Penicillium notatum, had grown in a petri dish containing Staphylococcus bacteria, causing the bacteria to deteriorate and die. This observation led to the understanding that the mold produced a

In [10]:
from stats_helpers import sliced_ks_distance, symmetry_from_sks

# Aliases used by the experiment loop; presumably kept provider-neutral so a
# different backend can be swapped in by rebinding these two names.
generate_answers_for_condition = cohere_generate_answers
embed_texts = cohere_embed_texts

In [11]:

from tqdm.auto import tqdm
import pandas as pd

from prompts import PROMPTS


def build_condition_lookup(conditions: List[Dict]) -> Dict[str, Dict]:
    """Index condition dicts by their ``"name"`` value.

    If two conditions share a name, the later one replaces the earlier one.
    """
    lookup: Dict[str, Dict] = {}
    for entry in conditions:
        lookup[entry["name"]] = entry
    return lookup



# Index conditions by name so the pair definitions can be checked up front.
condition_lookup = build_condition_lookup(CONDITIONS)

# Fail fast: a pair that names an undefined condition would otherwise surface
# only as a KeyError after the generation and embedding calls for the first
# prompt have already been made.
unknown_conditions = sorted(
    {name for pair in CONDITION_PAIRS for name in pair if name not in condition_lookup}
)
if unknown_conditions:
    raise ValueError(
        f"CONDITION_PAIRS references undefined conditions: {unknown_conditions}"
    )

# One record per (prompt, condition pair); turned into a DataFrame at the end.
results = []

for prompt in tqdm(PROMPTS, desc="Prompts"):
    prompt_id = prompt["id"]
    prompt_type = prompt.get("type", "unknown")
    en_text = prompt["en"]
    fr_text = prompt["fr"]

    # Cache answers and embeddings per condition so each condition is
    # generated and embedded once per prompt, however many pairs use it.
    answers_by_condition: Dict[str, List[str]] = {}
    emb_by_condition: Dict[str, np.ndarray] = {}

    # Generate answers and embeddings for each condition
    for cond in CONDITIONS:
        cond_name = cond["name"]
        answers = generate_answers_for_condition(
            prompt_en=en_text,
            prompt_fr=fr_text,
            condition=cond,
            n_samples=N_SAMPLES_PER_CONDITION,
        )
        answers_by_condition[cond_name] = answers
        emb_by_condition[cond_name] = embed_texts(answers)

    # Now compute symmetry metrics for each condition pair
    for cond_a, cond_b in CONDITION_PAIRS:
        emb_A = emb_by_condition[cond_a]
        emb_B = emb_by_condition[cond_b]

        # The same seed is passed for every prompt and pair.
        sks_metrics = sliced_ks_distance(
            emb_A,
            emb_B,
            n_directions=N_DIRECTIONS,
            random_state=RANDOM_STATE,
        )
        sym = symmetry_from_sks(sks_metrics)

        results.append(
            {
                "prompt_id": prompt_id,
                "prompt_type": prompt_type,
                "condition_A": cond_a,
                "condition_B": cond_b,
                "ks_mean": sks_metrics.mean,
                "ks_std": sks_metrics.std,
                "ks_sem": sks_metrics.sem,
                "ks_ci_low": sks_metrics.ci_low,
                "ks_ci_high": sks_metrics.ci_high,
                "sym_mean": sym["sym_mean"],
                "sym_std": sym["sym_std"],
                "sym_sem": sym["sym_sem"],
                "sym_ci_low": sym["sym_ci_low"],
                "sym_ci_high": sym["sym_ci_high"],
            }
        )

results_df = pd.DataFrame(results)
results_df


  from .autonotebook import tqdm as notebook_tqdm
Prompts: 100%|██████████| 6/6 [00:26<00:00,  4.38s/it]


Unnamed: 0,prompt_id,prompt_type,condition_A,condition_B,ks_mean,ks_std,ks_sem,ks_ci_low,ks_ci_high,sym_mean,sym_std,sym_sem,sym_ci_low,sym_ci_high
0,fact_1,factual,aya_en,aya_fr,0.84375,0.233588,0.029199,0.786521,0.900979,0.15625,0.233588,0.029199,0.099021,0.213479
1,fact_2,factual,aya_en,aya_fr,0.796875,0.247507,0.030938,0.736236,0.857514,0.203125,0.247507,0.030938,0.142486,0.263764
2,fact_3,factual,aya_en,aya_fr,0.914062,0.190127,0.023766,0.867481,0.960644,0.085938,0.190127,0.023766,0.039356,0.132519
3,open_1,open,aya_en,aya_fr,0.820312,0.241805,0.030226,0.76107,0.879555,0.179688,0.241805,0.030226,0.120445,0.23893
4,open_2,open,aya_en,aya_fr,0.773438,0.250867,0.031358,0.711975,0.8349,0.226562,0.250867,0.031358,0.1651,0.288025
5,open_3,open,aya_en,aya_fr,0.90625,0.196699,0.024587,0.858059,0.954441,0.09375,0.196699,0.024587,0.045559,0.141941


In [12]:
# Summarise the per-prompt symmetry scores within each
# (condition pair, prompt type) group: the mean and standard deviation of the
# per-prompt sym_mean values, plus the number of distinct prompts in the group.
agg = results_df.groupby(
    ["condition_A", "condition_B", "prompt_type"]
).agg(
    sym_mean=pd.NamedAgg(column="sym_mean", aggfunc="mean"),
    sym_std=pd.NamedAgg(column="sym_mean", aggfunc="std"),
    n_prompts=pd.NamedAgg(column="prompt_id", aggfunc="nunique"),
).reset_index()

agg


Unnamed: 0,condition_A,condition_B,prompt_type,sym_mean,sym_std,n_prompts
0,aya_en,aya_fr,factual,0.148438,0.058983,3
1,aya_en,aya_fr,open,0.166667,0.067357,3


### Note on Inuktitut (IU) as an Out-of-Distribution Language

Inuktitut is not part of the declared language coverage of the models evaluated here. Nevertheless, the models can still be prompted with Inuktitut text, and will produce outputs conditioned on that input, even if the tokenizer, internal representations, or generation behavior are not optimized for this language. In this setting, we do not assume fluent or correct Inuktitut generation. Instead, we treat Inuktitut as an **out-of-distribution stress test** for multilingual robustness. Model outputs generated from Inuktitut prompts are translated back to English using a fixed translation system, and the resulting English texts are used for embedding and distributional comparison. This translation step is treated as a measurement instrument rather than ground truth, allowing us to assess how much semantic signal survives when the model is driven through an extreme low-resource linguistic channel.
Because Inuktitut is not among the languages the model declares support for, the model will probably fall back to answering in English. To counter this, we can add an explicit instruction to the prompt, for example:

"Please answer in Inuktitut. Do not switch languages. If unsure, still respond in Inuktitut."