# **Merging Goldfish: An Initial Baseline**



A simple baseline – implements a pipeline for creating and fine-tuning a bilingual language model by merging two monolingual models (English and Spanish), leveraging a pretrained bilingual tokenizer from B-GPT (Arnett et al 2025). It begins by loading the monolingual models and their tokenizers, along with a bilingual tokenizer for the target merged model. Next, it performs a Procrustes alignment to map embeddings from the two monolingual vocabularies into a shared space using shared anchor tokens. The script then constructs a merged embedding matrix by combining static subword embeddings with contextual prototypes for high-frequency tokens. Transformer layers from both models are merged using a layer-wise weighted average controlled by an α schedule. After merging, LoRA adapters are added to enable efficient fine-tuning, which is performed on a small bilingual dataset using Hugging Face’s Trainer. Finally, the merged and fine-tuned model, along with the bilingual tokenizer, is saved to disk for downstream use.

In [None]:
import os
import torch
import random
import numpy as np
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset

# ---------------------------------------------------------------------
# STEP 0 — SETTINGS
# ---------------------------------------------------------------------

MODEL_L1 = "goldfish-models/eng_latn_100mb"
MODEL_L2 = "goldfish-models/spa_latn_100mb"
OUT_DIR  = "goldfish-bilingual"

# load tokenizer from pretrained B‑GPT bilingual model
BILINGUAL_TOKENIZER = "catherinearnett/B-GPT_en_es_simultaneous"  # pretrained bilingual tokenizer

# how many tokens get full contextual estimates
CONTEXT_TOPK = 2000
DEVICE = "cuda"
LAYERWISE_ALPHA = 0.5

# ---------------------------------------------------------------------
# STEP 1 — LOAD MODELS & TOKENIZERS
# ---------------------------------------------------------------------

print("Loading monolingual models...")
tok_L1 = AutoTokenizer.from_pretrained(MODEL_L1)
tok_L2 = AutoTokenizer.from_pretrained(MODEL_L2)

model_L1 = AutoModelForCausalLM.from_pretrained(MODEL_L1).to(DEVICE)
model_L2 = AutoModelForCausalLM.from_pretrained(MODEL_L2).to(DEVICE)

print("Loading pretrained bilingual tokenizer...")
tok_new = AutoTokenizer.from_pretrained(BILINGUAL_TOKENIZER)
V_new = len(tok_new)
d_model = model_L1.config.hidden_size

# ---------------------------------------------------------------------
# STEP 2 — PROCUSTES ALIGNMENT
# ---------------------------------------------------------------------

def compute_procrustes(E1, E2, anchor_tokens):
    X = E2[anchor_tokens]
    Y = E1[anchor_tokens]
    U, _, Vt = torch.linalg.svd(Y.T @ X)
    return U @ Vt

def get_anchor_indices(tok1, tok2, top_k=5000):
    vocab1 = tok1.get_vocab()
    vocab2 = tok2.get_vocab()
    shared = list(set(vocab1.keys()) & set(vocab2.keys()))[:top_k]
    return [(vocab1[t], vocab2[t]) for t in shared]

print("Computing Procrustes alignment...")
E1 = model_L1.get_input_embeddings().weight.data
E2 = model_L2.get_input_embeddings().weight.data

anchor_pairs = get_anchor_indices(tok_L1, tok_L2)
idx2 = torch.tensor([p[1] for p in anchor_pairs], device=DEVICE)
R = compute_procrustes(E1, E2, idx2)

# ---------------------------------------------------------------------
# STEP 3 — EMBEDDING UTILITIES
# ---------------------------------------------------------------------

def token_embedding_from_subwords(model, tokenizer, new_token_str):
    old_ids = tokenizer.encode(new_token_str, add_special_tokens=False)
    emb_layer = model.get_input_embeddings().weight
    return emb_layer[old_ids].mean(dim=0)

def contextual_prototype(model, tokenizer, token_str, n=8):
    """
    Compute a cheap contextual prototype on a few synthetic contexts.
    """
    texts = [
        f"This example contains {token_str} in an interesting setting."
        for _ in range(n)
    ]
    vecs = []
    for text in texts:
        ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            outputs = model(input_ids=ids, output_hidden_states=True)
        # first transformer layer output
        hidden = outputs.hidden_states[1][:, -1, :]
        vecs.append(hidden.squeeze(0))
    return torch.stack(vecs).mean(dim=0)

# ---------------------------------------------------------------------
# STEP 4 — BUILD MERGED EMBEDDING MATRIX (FAST)
# ---------------------------------------------------------------------

print("Building merged embedding matrix (fast mode)...")
E_new = torch.zeros((V_new, d_model), device=DEVICE)

# pick a subset of tokens to run cheap contextual estimates
# (e.g., high-frequency or shared with monolingual vocabs)
freq_tokens = list(set(tok_L1.get_vocab().keys()) & set(tok_L2.get_vocab().keys()))
freq_tokens = freq_tokens[:CONTEXT_TOPK]

for i in tqdm(range(V_new)):
    token_str = tok_new.convert_ids_to_tokens(i)

    # static subword based embeddings
    e1_sub = token_embedding_from_subwords(model_L1, tok_L1, token_str)
    e2_sub = token_embedding_from_subwords(model_L2, tok_L2, token_str)
    e2_sub = R @ e2_sub

    # if the token is in our selected high‑priority set,
    # compute contextual prototype; else skip to save time
    if token_str in freq_tokens:
        try:
            e1_ctx = contextual_prototype(model_L1, tok_L1, token_str, n=4)
            e2_ctx = contextual_prototype(model_L2, tok_L2, token_str, n=4)
            e2_ctx = R @ e2_ctx
            e1 = 0.5 * e1_sub + 0.5 * e1_ctx
            e2 = 0.5 * e2_sub + 0.5 * e2_ctx
        except Exception:
            e1, e2 = e1_sub, e2_sub
    else:
        e1, e2 = e1_sub, e2_sub

    # merge
    E_new[i] = 0.5 * e1 + 0.5 * e2

# ---------------------------------------------------------------------
# STEP 5 — MERGE TRANSFORMER BLOCKS (LAYERWISE α)
# ---------------------------------------------------------------------

def layer_alpha(layer_id, total_layers, base=0.5):
    t = layer_id / total_layers
    return base * (1 - t) + 0.3 * t

def merge_models(model1, model2):
    out = AutoModelForCausalLM.from_pretrained(MODEL_L1)
    out.get_input_embeddings().weight.data = E_new.clone()
    out.lm_head.weight.data = E_new.clone()

    for i in range(model1.config.num_hidden_layers):
        a = layer_alpha(i, model1.config.num_hidden_layers, base=LAYERWISE_ALPHA)

        # attention
        for k, v in model1.transformer.h[i].attn.state_dict().items():
            merged = a * v + (1 - a) * model2.transformer.h[i].attn.state_dict()[k]
            out.transformer.h[i].attn.state_dict()[k].copy_(merged)

        # mlp
        for k, v in model1.transformer.h[i].mlp.state_dict().items():
            merged = a * v + (1 - a) * model2.transformer.h[i].mlp.state_dict()[k]
            out.transformer.h[i].mlp.state_dict()[k].copy_(merged)

        # layernorm
        ln1 = model1.transformer.h[i].ln_1
        ln2 = model2.transformer.h[i].ln_1
        ln_out = out.transformer.h[i].ln_1
        ln_out.weight.data = a * ln1.weight.data + (1 - a) * ln2.weight.data
        ln_out.bias.data   = a * ln1.bias.data   + (1 - a) * ln2.bias.data

    return out

print("Merging transformer weights...")
model_merged = merge_models(model_L1, model_L2).to(DEVICE)

# ---------------------------------------------------------------------
# STEP 6 — ADD LoRA + FINETUNE
# ---------------------------------------------------------------------

print("Adding LoRA adapters...")
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"],
    lora_dropout=0.05,
    bias="none",
)

model_lora = get_peft_model(model_merged, config)

# dummy bilingual dataset
texts = ["This is English.", "Esto es español."]
dataset = Dataset.from_dict({"text": texts})

def tokenize_fn(batch):
    return tok_new(batch["text"], truncation=True)

dataset = dataset.map(tokenize_fn, batched=True)
data_collator = DataCollatorForLanguageModeling(tok_new, mlm=False)

args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=200,
)

trainer = Trainer(
    model=model_lora,
    args=args,
    train_dataset=dataset,
    data_collator=data_collator,
)

print("Starting bilingual finetune...")
trainer.train()

model_lora.save_pretrained(OUT_DIR)
tok_new.save_pretrained(OUT_DIR)

print("Merged bilingual Goldfish saved:", OUT_DIR)


Loading monolingual models...


tokenizer_config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/502M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/502M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/502M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/502M [00:00<?, ?B/s]

Loading pretrained bilingual tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Computing Procrustes alignment...
Building merged embedding matrix (fast mode)...


100%|██████████| 51200/51200 [02:39<00:00, 320.15it/s]


Merging transformer weights...
Adding LoRA adapters...




Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Starting bilingual finetune...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Merged bilingual Goldfish saved: goldfish-bilingual


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

OUT_DIR = "goldfish-bilingual"
DEVICE = "cuda"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(OUT_DIR)
model = AutoModelForCausalLM.from_pretrained(OUT_DIR).to(DEVICE)
model.eval()

# Generation function
def generate_text(prompt, max_new_tokens=100, temperature=1.0, top_p=0.95, repetition_penalty=1.2):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=None  # ignore EOS to force longer generation
        )
    # return only the newly generated tokens
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

# Prompts for English and Spanish
prompts = [
    "Once upon a time, in a small village,",
    "Había una vez, en un pequeño pueblo,",
    "The quick brown fox",
    "El rápido zorro marrón",
    "In a distant future, humans",
    "En un futuro lejano, los humanos",
]

print("\n--- Bilingual Generation ---\n")
for i, prompt in enumerate(prompts[:10]):  # generate up to 10 prompts
    continuation = generate_text(prompt)
    print(f"Prompt {i+1}: {prompt}")
    print(f"Continuation: {continuation}\n{'-'*50}")




--- Bilingual Generation ---

Prompt 1: Once upon a time, in a small village,
Continuation: po starting al.t steps very). of time programaába visit students about lowcommerce considercom nothing habíaand several
--------------------------------------------------
Prompt 2: Había una vez, en un pequeño pueblo,
Continuation: y of poical; starting. y very nuevo).  habíacom our y same that time y).).
--------------------------------------------------
Prompt 3: The quick brown fox
Continuation: po y of minister. but lascom time our ourtcom, same very our había several about us consider low). se
--------------------------------------------------
Prompt 4: El rápido zorro marrón
Continuation: y therefore our. startingcom).
--------------------------------------------------
Prompt 5: In a distant future, humans
Continuation: very lot ydora starting for little time studentsand therefore same little se).ical. programacomcom of nuevo).)..,
--------------------------------------------------
Prompt

# **Improvement 1: Frequency-based Anchor Selection**

Uses frequency-based anchor selection for Procrustes. Expands contextual embedding prototypes (can increase CONTEXT_N for better accuracy).
Layerwise merging with α-decay.


Adds LoRA adapter fine-tuning on a small bilingual corpus.
Final text generation function handles English & Spanish prompts.

In [None]:
import os
import torch
import random
import numpy as np
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset

# ---------------------------------------------------------------------
# SETTINGS
# ---------------------------------------------------------------------
MODEL_L1 = "goldfish-models/eng_latn_100mb"
MODEL_L2 = "goldfish-models/spa_latn_100mb"
OUT_DIR = "goldfish-bilingual"
BILINGUAL_TOKENIZER = "catherinearnett/B-GPT_en_es_simultaneous"
CONTEXT_TOPK = 2000   # tokens to get full contextual embeddings
DEVICE = "cuda"
LAYERWISE_ALPHA = 0.5
CONTEXT_N = 20        # contexts per token

# ---------------------------------------------------------------------
# LOAD MODELS & TOKENIZERS
# ---------------------------------------------------------------------
print("Loading monolingual models...")
tok_L1 = AutoTokenizer.from_pretrained(MODEL_L1)
tok_L2 = AutoTokenizer.from_pretrained(MODEL_L2)

model_L1 = AutoModelForCausalLM.from_pretrained(MODEL_L1).to(DEVICE)
model_L2 = AutoModelForCausalLM.from_pretrained(MODEL_L2).to(DEVICE)

print("Loading bilingual tokenizer...")
tok_new = AutoTokenizer.from_pretrained(BILINGUAL_TOKENIZER)
V_new = len(tok_new)
d_model = model_L1.config.hidden_size

# ---------------------------------------------------------------------
# PROCRUSTES ALIGNMENT
# ---------------------------------------------------------------------
def compute_procrustes(E1, E2, anchor_indices):
    X = E2[anchor_indices]
    Y = E1[anchor_indices]
    U, _, Vt = torch.linalg.svd(Y.T @ X)
    return U @ Vt

def get_anchor_indices(tok1, tok2, top_k=5000):
    # Combine shared tokens and high-frequency tokens
    vocab1 = tok1.get_vocab()
    vocab2 = tok2.get_vocab()
    shared = list(set(vocab1.keys()) & set(vocab2.keys()))
    # Sort by frequency (descending)
    shared = sorted(shared, key=lambda t: vocab1[t] + vocab2.get(t, 0), reverse=True)
    shared = shared[:top_k]
    return [(vocab1[t], vocab2[t]) for t in shared]

print("Computing Procrustes alignment...")
E1 = model_L1.get_input_embeddings().weight.data
E2 = model_L2.get_input_embeddings().weight.data

anchor_pairs = get_anchor_indices(tok_L1, tok_L2)
idx2 = torch.tensor([p[1] for p in anchor_pairs], device=DEVICE)
R = compute_procrustes(E1, E2, idx2)

# ---------------------------------------------------------------------
# EMBEDDING UTILITIES
# ---------------------------------------------------------------------
def token_embedding_from_subwords(model, tokenizer, token_str):
    old_ids = tokenizer.encode(token_str, add_special_tokens=False)
    emb_layer = model.get_input_embeddings().weight
    return emb_layer[old_ids].mean(dim=0)

def contextual_prototype(model, tokenizer, token_str, n=CONTEXT_N):
    """Compute contextual embedding using multiple synthetic/realistic contexts."""
    texts = [f"This sentence contains the token '{token_str}'." for _ in range(n)]
    vecs = []
    for text in texts:
        ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            outputs = model(input_ids=ids, output_hidden_states=True)
        hidden = outputs.hidden_states[1][:, -1, :]
        vecs.append(hidden.squeeze(0))
    return torch.stack(vecs).mean(dim=0)

# ---------------------------------------------------------------------
# BUILD MERGED EMBEDDINGS
# ---------------------------------------------------------------------
print("Building merged embedding matrix...")
E_new = torch.zeros((V_new, d_model), device=DEVICE)

# pick high-frequency tokens for full contextual embeddings
freq_tokens = list(set(tok_L1.get_vocab().keys()) & set(tok_L2.get_vocab().keys()))
freq_tokens = freq_tokens[:CONTEXT_TOPK]

for i in tqdm(range(V_new)):
    token_str = tok_new.convert_ids_to_tokens(i)
    e1_sub = token_embedding_from_subwords(model_L1, tok_L1, token_str)
    e2_sub = R @ token_embedding_from_subwords(model_L2, tok_L2, token_str)

    if token_str in freq_tokens:
        try:
            e1_ctx = contextual_prototype(model_L1, tok_L1, token_str)
            e2_ctx = R @ contextual_prototype(model_L2, tok_L2, token_str)
            e1 = 0.5 * e1_sub + 0.5 * e1_ctx
            e2 = 0.5 * e2_sub + 0.5 * e2_ctx
        except Exception:
            e1, e2 = e1_sub, e2_sub
    else:
        e1, e2 = e1_sub, e2_sub

    # final merge
    E_new[i] = 0.5 * e1 + 0.5 * e2

# ---------------------------------------------------------------------
# MERGE TRANSFORMER BLOCKS
# ---------------------------------------------------------------------
def layer_alpha(layer_id, total_layers, base=0.5):
    t = layer_id / total_layers
    return base * (1 - t) + 0.3 * t

def merge_models(model1, model2):
    out = AutoModelForCausalLM.from_pretrained(MODEL_L1)
    out.get_input_embeddings().weight.data = E_new.clone()
    out.lm_head.weight.data = E_new.clone()

    for i in range(model1.config.num_hidden_layers):
        a = layer_alpha(i, model1.config.num_hidden_layers, base=LAYERWISE_ALPHA)

        # attention
        for k, v in model1.transformer.h[i].attn.state_dict().items():
            merged = a * v + (1 - a) * model2.transformer.h[i].attn.state_dict()[k]
            out.transformer.h[i].attn.state_dict()[k].copy_(merged)

        # mlp
        for k, v in model1.transformer.h[i].mlp.state_dict().items():
            merged = a * v + (1 - a) * model2.transformer.h[i].mlp.state_dict()[k]
            out.transformer.h[i].mlp.state_dict()[k].copy_(merged)

        # layernorm
        ln1, ln2, ln_out = model1.transformer.h[i].ln_1, model2.transformer.h[i].ln_1, out.transformer.h[i].ln_1
        ln_out.weight.data = a * ln1.weight.data + (1 - a) * ln2.weight.data
        ln_out.bias.data = a * ln1.bias.data + (1 - a) * ln2.bias.data

    return out

print("Merging transformer weights...")
model_merged = merge_models(model_L1, model_L2).to(DEVICE)

# ---------------------------------------------------------------------
# ADD LoRA + FINETUNE
# ---------------------------------------------------------------------
print("Adding LoRA adapters...")
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"],
    lora_dropout=0.05,
    bias="none",
)
model_lora = get_peft_model(model_merged, config)

# Dummy bilingual dataset (replace with real bilingual corpus)
texts = ["This is English.", "Esto es español."]
dataset = Dataset.from_dict({"text": texts})

def tokenize_fn(batch):
    return tok_new(batch["text"], truncation=True)

dataset = dataset.map(tokenize_fn, batched=True)
data_collator = DataCollatorForLanguageModeling(tok_new, mlm=False)

args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=200,
)

trainer = Trainer(
    model=model_lora,
    args=args,
    train_dataset=dataset,
    data_collator=data_collator,
)

print("Starting bilingual finetune...")
trainer.train()

model_lora.save_pretrained(OUT_DIR)
tok_new.save_pretrained(OUT_DIR)
print("Merged bilingual Goldfish saved:", OUT_DIR)

# ---------------------------------------------------------------------
# EVALUATION / GENERATION
# ---------------------------------------------------------------------
model = AutoModelForCausalLM.from_pretrained(OUT_DIR).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(OUT_DIR)
model.eval()

def generate_text(prompt, max_new_tokens=100, temperature=1.0, top_p=0.95, repetition_penalty=1.2):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

prompts = [
    "Once upon a time, in a small village,",
    "Había una vez, en un pequeño pueblo,",
    "The quick brown fox",
    "El rápido zorro marrón",
    "In a distant future, humans",
    "En un futuro lejano, los humanos",
]

print("\n--- Bilingual Generation ---\n")
for i, prompt in enumerate(prompts):
    continuation = generate_text(prompt)
    print(f"Prompt {i+1}: {prompt}")
    print(f"Continuation: {continuation}\n{'-'*50}")


Loading monolingual models...
Loading bilingual tokenizer...
Computing Procrustes alignment...
Building merged embedding matrix...


100%|██████████| 51200/51200 [11:43<00:00, 72.76it/s]


Merging transformer weights...
Adding LoRA adapters...




Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Starting bilingual finetune...


Step,Training Loss


Merged bilingual Goldfish saved: goldfish-bilingual

--- Bilingual Generation ---

Prompt 1: Once upon a time, in a small village,
Continuation: 2014 students. to po same las very very prepareba little little al  of low nuevo ycom therefore consider same nothing).,ical very se había
--------------------------------------------------
Prompt 2: Había una vez, en un pequeño pueblo,
Continuation: . therefore po Two  al programa). y of lascom había
--------------------------------------------------
Prompt 3: The quick brown fox
Continuation: experience ourá y of starting. programa. considercomand., programa time
--------------------------------------------------
Prompt 4: El rápido zorro marrón
Continuation: starting. very little y of había había our y same of that time y).com
--------------------------------------------------
Prompt 5: In a distant future, humans
Continuation: ed ourt lotcom of time very However same programa lasical severaland including nuevo about prepare había us consid

These generations are pretty poor in general – need to try out some more principled improvements.

Key issues I have noticed:

**Sparse or misaligned anchors** Procrustes only aligns shared or high-frequency tokens. Rare Spanish words are often left “floating” in the semantic space.Example: nuevo, preparaba, había may not have enough context in the alignment step.

**Merging transformer layers with a fixed α:** A simple linear interpolation may smear features, causing the model to produce hybridized or nonsensical subword sequences.

**Bilingual tokenizer issues:** if the tokenizer merges English + Spanish subwords in weird ways, low-frequency Spanish words might be split incorrectly.Combined with embedding misalignment, this creates “garbled outputs.”

**Contextual prototypes too small / synthetic:** We currently generate 20 synthetic sentences per token. Synthetic contexts often fail to capture real usage nuances in Spanish. English contexts dominate the average if you use shared templates. But is this optimisation really what we want to be doing?

# **Improvement 2**

Loads two monolingual GPT-style models. Merges their tokenizers via concatenation. Builds a merged embedding matrix with Procrustes alignment + naive interpolation. Merges transformer layers using layer-wise decayed α, handling GPT-style multi-head attention safely. Adds LoRA adapters. Runs a tiny bilingual finetuning dataset and text generation for demo. For large vocabularies, computing a contextual prototype per token sequentially is extremely slow. We can batch tokens and process them in parallel through the model

In [1]:
import torch
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model

# ---------------------------------------------------------------------
# SETTINGS
# ---------------------------------------------------------------------
DEVICE = "cuda"
BATCH_SIZE = 64  # for batched contextual embeddings


MODEL_L1 = "goldfish-models/eng_latn_100mb"
MODEL_L2 = "goldfish-models/spa_latn_100mb"
BILINGUAL_TOKENIZER = "catherinearnett/B-GPT_en_es_simultaneous"
OUT_DIR = "goldfish-bilingual"

CONTEXT_TOPK = 2000  # tokens to compute contextual prototypes for
ALPHA_LOW = 0.7     # lower layers favor L1
ALPHA_HIGH = 0.3    # upper layers favor L2
N_CONTEXTS = 4      # contexts per token

# ---------------------------------------------------------------------
# LOAD MODELS AND TOKENIZERS
# ---------------------------------------------------------------------
print("Loading monolingual models...")
tok_L1 = AutoTokenizer.from_pretrained(MODEL_L1)
tok_L2 = AutoTokenizer.from_pretrained(MODEL_L2)
tok_new = AutoTokenizer.from_pretrained(BILINGUAL_TOKENIZER)

model_L1 = AutoModelForCausalLM.from_pretrained(MODEL_L1).to(DEVICE)
model_L2 = AutoModelForCausalLM.from_pretrained(MODEL_L2).to(DEVICE)

d_model = model_L1.config.hidden_size
V_new = len(tok_new)

# ---------------------------------------------------------------------
# PROCRUSTES ALIGNMENT
# ---------------------------------------------------------------------
def get_shared_tokens(tok1, tok2):
    vocab1 = set(tok1.get_vocab().keys())
    vocab2 = set(tok2.get_vocab().keys())
    return list(vocab1 & vocab2)

def compute_procrustes(E1, E2, idx1, idx2):
    X = E2[idx2]
    Y = E1[idx1]
    U, _, Vt = torch.linalg.svd(Y.T @ X)
    return U @ Vt

shared_tokens = get_shared_tokens(tok_L1, tok_L2)
shared_idx1 = torch.tensor([tok_L1.get_vocab()[t] for t in shared_tokens], device=DEVICE)
shared_idx2 = torch.tensor([tok_L2.get_vocab()[t] for t in shared_tokens], device=DEVICE)

E1 = model_L1.get_input_embeddings().weight.data
E2 = model_L2.get_input_embeddings().weight.data
R = compute_procrustes(E1, E2, shared_idx1, shared_idx2)

# ---------------------------------------------------------------------
# STATIC EMBEDDINGS
# ---------------------------------------------------------------------
def token_embedding_from_subwords(model, tokenizer, token_str):
    ids = tokenizer.encode(token_str, add_special_tokens=False)
    return model.get_input_embeddings().weight[ids].mean(dim=0)

# ---------------------------------------------------------------------
# BATCHED CONTEXTUAL EMBEDDINGS
# ---------------------------------------------------------------------
def batched_contextual_prototypes(model, tokenizer, tokens, n_contexts=N_CONTEXTS):
    model.eval()
    all_embeddings = []

    for i in tqdm(range(0, len(tokens), BATCH_SIZE)):
        batch_tokens = tokens[i:i+BATCH_SIZE]
        contexts = []

        # prepare synthetic contexts for each token
        for token in batch_tokens:
            for _ in range(n_contexts):
                contexts.append(f"This example contains {token} in an interesting setting.")

        # tokenize batch
        encodings = tokenizer(contexts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
        with torch.no_grad():
            outputs = model(**encodings, output_hidden_states=True)

        hidden = outputs.hidden_states[1]  # first transformer layer
        hidden_last = hidden[:, -1, :]

        # average contexts per token
        for j in range(0, len(hidden_last), n_contexts):
            token_emb = hidden_last[j:j+n_contexts].mean(dim=0)
            all_embeddings.append(token_emb)

    return torch.stack(all_embeddings, dim=0)

# ---------------------------------------------------------------------
# BUILD MERGED EMBEDDING MATRIX
# ---------------------------------------------------------------------
print("Building merged embedding matrix...")
E_new = torch.zeros((V_new, d_model), device=DEVICE)

# high-frequency/shared tokens for contextual embeddings
freq_tokens = list(set(tok_L1.get_vocab().keys()) & set(tok_L2.get_vocab().keys()))[:CONTEXT_TOPK]

# compute batched contextual embeddings
ctx_emb_L1 = batched_contextual_prototypes(model_L1, tok_L1, freq_tokens)
ctx_emb_L2 = batched_contextual_prototypes(model_L2, tok_L2, freq_tokens)
ctx_emb_L2_aligned = ctx_emb_L2 @ R.T
ctx_emb_merged = 0.5 * ctx_emb_L1 + 0.5 * ctx_emb_L2_aligned

# build embedding matrix
for i in tqdm(range(V_new)):
    token = tok_new.convert_ids_to_tokens(i)

    # static embeddings
    e1 = token_embedding_from_subwords(model_L1, tok_L1, token)
    e2 = token_embedding_from_subwords(model_L2, tok_L2, token)
    e2 = R @ e2
    E_static = 0.5 * e1 + 0.5 * e2

    # use contextual prototype if in freq_tokens
    if token in freq_tokens:
        idx = freq_tokens.index(token)
        E_new[i] = ctx_emb_merged[idx]
    else:
        E_new[i] = E_static

# ---------------------------------------------------------------------
# MERGE TRANSFORMER LAYERS WITH LAYERWISE DECAY
# ---------------------------------------------------------------------
def layerwise_alpha(layer_id, total_layers, alpha_low=ALPHA_LOW, alpha_high=ALPHA_HIGH):
    t = layer_id / (total_layers - 1)
    return alpha_low * (1 - t) + alpha_high * t

def merge_attention(attn1, attn2, alpha):
    merged = {}
    for name in attn1.state_dict().keys():
        merged[name] = alpha * attn1.state_dict()[name] + (1 - alpha) * attn2.state_dict()[name]
    return merged

def merge_mlp(mlp1, mlp2, alpha):
    merged = {}
    for name in mlp1.state_dict().keys():
        merged[name] = alpha * mlp1.state_dict()[name] + (1 - alpha) * mlp2.state_dict()[name]
    return merged

def merge_layernorm(ln1, ln2, alpha):
    ln_merged = type(ln1)(ln1.normalized_shape)
    ln_merged.weight.data = alpha * ln1.weight.data + (1 - alpha) * ln2.weight.data
    ln_merged.bias.data = alpha * ln1.bias.data + (1 - alpha) * ln2.bias.data
    return ln_merged

def merge_gpt_layers_layerwise(model1, model2, E_new):
    out_model = AutoModelForCausalLM.from_pretrained(MODEL_L1)
    out_model.get_input_embeddings().weight.data = E_new.clone()
    out_model.lm_head.weight.data = E_new.clone()

    num_layers = model1.config.num_hidden_layers
    for i in range(num_layers):
        alpha = layerwise_alpha(i, num_layers)

        # attention
        merged_attn = merge_attention(model1.transformer.h[i].attn,
                                      model2.transformer.h[i].attn, alpha)
        out_model.transformer.h[i].attn.load_state_dict(merged_attn)

        # mlp
        merged_mlp = merge_mlp(model1.transformer.h[i].mlp,
                               model2.transformer.h[i].mlp, alpha)
        out_model.transformer.h[i].mlp.load_state_dict(merged_mlp)

        # layernorm
        out_model.transformer.h[i].ln_1 = merge_layernorm(model1.transformer.h[i].ln_1,
                                                          model2.transformer.h[i].ln_1, alpha)
        out_model.transformer.h[i].ln_2 = merge_layernorm(model1.transformer.h[i].ln_2,
                                                          model2.transformer.h[i].ln_2, alpha)
    return out_model

print("Merging transformer layers with layer-wise decay...")
model_merged = merge_gpt_layers_layerwise(model_L1, model_L2, E_new).to(DEVICE)

# ---------------------------------------------------------------------
# ADD LORA + FINETUNE
# ---------------------------------------------------------------------
print("Adding LoRA adapters...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"],
    lora_dropout=0.05,
    bias="none",
)
model_lora = get_peft_model(model_merged, lora_config)

# tiny bilingual dataset
texts = [
    "This is an English sentence.",
    "Esto es una oración en español.",
    "Hello world!",
    "¡Hola mundo!"
]
dataset = Dataset.from_dict({"text": texts})
dataset = dataset.map(lambda batch: tok_new(batch["text"], truncation=True, padding="max_length", max_length=32), batched=True)
data_collator = DataCollatorForLanguageModeling(tok_new, mlm=False)

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=2,
    save_steps=100,
    save_total_limit=1,
)
trainer = Trainer(model=model_lora, args=training_args, train_dataset=dataset, data_collator=data_collator)

print("Starting bilingual LoRA fine-tuning...")
trainer.train()

model_lora.save_pretrained(OUT_DIR)
tok_new.save_pretrained(OUT_DIR)
print(f"Merged bilingual Goldfish saved to {OUT_DIR}")

# ---------------------------------------------------------------------
# TEST TEXT GENERATION
# ---------------------------------------------------------------------
print("Testing bilingual generation...")
prompts = ["Today I want to say", "Hoy quiero decir"]

for prompt in prompts:
    input_ids = tok_new(prompt, return_tensors="pt").input_ids.to(DEVICE)
    with torch.no_grad():
        outputs = model_lora.generate(input_ids, max_length=32, do_sample=True, temperature=0.8)
    print("Prompt:", prompt)
    print("Output:", " ".join(tok_new.batch_decode(outputs, skip_special_tokens=True)))
    print("-" * 50)


Loading monolingual models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/502M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/502M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/502M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/502M [00:00<?, ?B/s]

Building merged embedding matrix...


  0%|          | 0/32 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 32/32 [00:02<00:00, 14.90it/s]
  0%|          | 0/32 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 32/32 [00:02<00:00, 13.27it/s]
100%|██████████| 51200/51200 [00:31<00:00, 1610.89it/s]


Merging transformer layers with layer-wise decay...
Adding LoRA adapters...




Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Starting bilingual LoRA fine-tuning...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
2,22.1682


Merged bilingual Goldfish saved to goldfish-bilingual
Testing bilingual generation...
Prompt: Today I want to say
Output: Today I want to sayrata%20emanny
--------------------------------------------------
Prompt: Hoy quiero decir
Output: Hoy quiero decirema
--------------------------------------------------


We need to still adapt the merged embeddings, calibrate logits to the new bilingual tokenizer and remove garbage suffixes (%20, broken morphemes, etc.)

# A Third Pass

Two monolingual Goldfish models are first aligned at the embedding level using Procrustes rotation over shared vocabulary, augmented with contextual prototype embeddings for frequent tokens to improve cross-lingual semantic consistency. The transformer parameters are then merged layer-wise with a depth-dependent interpolation that biases lower layers toward English and higher layers toward Spanish. After merging, LoRA adapters are added and most lower transformer layers are frozen to preserve linguistic knowledge while allowing embeddings, the language-model head, and upper layers to adapt to a new bilingual tokenizer. A small, clean EN–ES corpus (Opus Books) is used for a short “stabilization” fine-tuning pass whose goal is not language learning but calibration of tokenization and logits. The notebook concludes with quantitative diagnostics—bilingual perplexity, token fragmentation rates, and continuation entropy—alongside generation checks, providing concrete evidence of tokenizer alignment and stability improvements suitable for fair comparison against alternative merging methods such as SLERP or DARE.

In [3]:
# =========================
# GOLD FISH EN–ES MERGE + STABILIZATION + METRICS (ONE CELL)
# =========================

!pip install -q transformers datasets peft accelerate sentencepiece tqdm

import math, torch, random
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_float32_matmul_precision("high")

# -------------------------
# SETTINGS
# -------------------------
MODEL_L1 = "goldfish-models/eng_latn_100mb"
MODEL_L2 = "goldfish-models/spa_latn_100mb"
BILINGUAL_TOKENIZER = "catherinearnett/B-GPT_en_es_simultaneous"
OUT_DIR = "goldfish-bilingual-stabilized"

CONTEXT_TOPK = 2000
N_CONTEXTS = 4
BATCH_SIZE = 64
ALPHA_LOW, ALPHA_HIGH = 0.7, 0.3

# -------------------------
# LOAD
# -------------------------
tok_L1 = AutoTokenizer.from_pretrained(MODEL_L1)
tok_L2 = AutoTokenizer.from_pretrained(MODEL_L2)
tok_new = AutoTokenizer.from_pretrained(BILINGUAL_TOKENIZER)

model_L1 = AutoModelForCausalLM.from_pretrained(MODEL_L1).to(DEVICE)
model_L2 = AutoModelForCausalLM.from_pretrained(MODEL_L2).to(DEVICE)

d_model = model_L1.config.hidden_size
V_new = len(tok_new)

# -------------------------
# PROCRUSTES
# -------------------------
def get_shared(tok1, tok2):
    return list(set(tok1.get_vocab()) & set(tok2.get_vocab()))

def procrustes(E1, E2, i1, i2):
    U, _, Vt = torch.linalg.svd(E1[i1].T @ E2[i2])
    return U @ Vt

shared = get_shared(tok_L1, tok_L2)
i1 = torch.tensor([tok_L1.get_vocab()[t] for t in shared], device=DEVICE)
i2 = torch.tensor([tok_L2.get_vocab()[t] for t in shared], device=DEVICE)
R = procrustes(model_L1.get_input_embeddings().weight.data,
               model_L2.get_input_embeddings().weight.data,
               i1, i2)

# -------------------------
# CONTEXTUAL EMBEDDINGS
# -------------------------
def contextual(model, tokenizer, tokens):
    model.eval()
    out = []
    for i in tqdm(range(0, len(tokens), BATCH_SIZE)):
        batch = tokens[i:i+BATCH_SIZE]
        ctx = [f"This contains {t}." for t in batch for _ in range(N_CONTEXTS)]
        enc = tokenizer(ctx, return_tensors="pt", padding=True).to(DEVICE)
        with torch.no_grad():
            h = model(**enc, output_hidden_states=True).hidden_states[1][:, -1, :]
        for j in range(0, len(h), N_CONTEXTS):
            out.append(h[j:j+N_CONTEXTS].mean(0))
    return torch.stack(out)

freq_tokens = shared[:CONTEXT_TOPK]
ctx1 = contextual(model_L1, tok_L1, freq_tokens)
ctx2 = contextual(model_L2, tok_L2, freq_tokens) @ R.T
ctx = 0.5 * (ctx1 + ctx2)

# -------------------------
# BUILD EMBEDDINGS
# -------------------------
E_new = torch.zeros((V_new, d_model), device=DEVICE)

def static_embed(model, tok, t):
    ids = tok.encode(t, add_special_tokens=False)
    return model.get_input_embeddings().weight[ids].mean(0)

for i in tqdm(range(V_new)):
    t = tok_new.convert_ids_to_tokens(i)
    if t in freq_tokens:
        E_new[i] = ctx[freq_tokens.index(t)]
    else:
        e1 = static_embed(model_L1, tok_L1, t)
        e2 = R @ static_embed(model_L2, tok_L2, t)
        E_new[i] = 0.5 * (e1 + e2)

# -------------------------
# MERGE TRANSFORMER
# -------------------------
def alpha(l, L): return ALPHA_LOW*(1-l/(L-1)) + ALPHA_HIGH*(l/(L-1))

out = AutoModelForCausalLM.from_pretrained(MODEL_L1).to(DEVICE)
out.get_input_embeddings().weight.data = E_new.clone()
out.lm_head.weight.data = E_new.clone()

L = out.config.num_hidden_layers
for i in range(L):
    a = alpha(i, L)
    for k in out.transformer.h[i].state_dict():
        out.transformer.h[i].state_dict()[k].copy_(
            a * model_L1.transformer.h[i].state_dict()[k] +
            (1-a) * model_L2.transformer.h[i].state_dict()[k]
        )

# -------------------------
# LORA
# -------------------------
lora = LoraConfig(r=16, lora_alpha=32,
                  target_modules=["c_attn","c_proj","mlp.c_fc","mlp.c_proj"])
model = get_peft_model(out, lora)

# -------------------------
# FREEZE
# -------------------------
def freeze(model, ratio=0.6):
    n = model.config.num_hidden_layers
    cut = int(n * ratio)
    for nme,p in model.named_parameters():
        p.requires_grad = True
        if nme.startswith("transformer.h."):
            if int(nme.split(".")[2]) < cut:
                p.requires_grad = False
    return model

model = freeze(model)

# -------------------------
# DATA (OPUS BOOKS)
# -------------------------
ds = load_dataset("opus_books", "en-es", split="train")
def flatten(x): return {"text":[x["translation"]["en"], x["translation"]["es"]]}
ds = ds.map(flatten, remove_columns=ds.column_names).flatten_indices()
ds = ds.shuffle(seed=0).select(range(50_000))

def tok(x): return tok_new(x["text"], truncation=True, max_length=64)
ds = ds.map(tok, batched=True, remove_columns=["text"])

eval_ds = ds.shuffle(seed=1).select(range(2_000))

# -------------------------
# METRICS
# -------------------------
def ppl(model, ds):
    model.eval()
    L=[]
    for b in ds:
        ids=torch.tensor(b["input_ids"]).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            L.append(model(ids,labels=ids).loss.item())
    return math.exp(sum(L)/len(L))

def fragmentation(tok, words):
    return sum(len(tok.tokenize(w)) for w in words)/len(words)

def entropy(model,p):
    ids=tok_new(p,return_tensors="pt").input_ids.to(DEVICE)
    with torch.no_grad():
        l=model(ids).logits[:,-1]
    p=torch.softmax(l,dim=-1)
    return -(p*p.log()).sum().item()

print("PPL before:", ppl(model, eval_ds))

# -------------------------
# TRAIN
# -------------------------
args = TrainingArguments(
    OUT_DIR, bf16=True, lr=1e-4, num_train_epochs=1,
    per_device_train_batch_size=16, gradient_accumulation_steps=2,
    logging_steps=100, save_steps=1000
)
trainer = Trainer(model=model, args=args,
                  train_dataset=ds,
                  data_collator=DataCollatorForLanguageModeling(tok_new,mlm=False))
trainer.train()

print("PPL after:", ppl(model, eval_ds))

# -------------------------
# TOKEN METRICS
# -------------------------
print("Fragmentation EN:",
      fragmentation(tok_new,["running","happiness","unbelievable"]))
print("Fragmentation ES:",
      fragmentation(tok_new,["corriendo","felicidad","increíble"]))

for p in ["I want to say","Quiero decir"]:
    print(p,"entropy:",entropy(model,p))

# -------------------------
# GENERATION
# -------------------------
for p in ["Today I want to say","Hoy quiero decir"]:
    ids=tok_new(p,return_tensors="pt").input_ids.to(DEVICE)
    with torch.no_grad():
        o=model.generate(ids,max_length=32,do_sample=True,temperature=0.8)
    print(p,"→",tok_new.decode(o[0],skip_special_tokens=True))


KeyboardInterrupt: 