# 02 - Análisis del Tokenizer para Guaraní

Analiza cómo el tokenizer de Qwen2.5 maneja texto en guaraní frente a texto en español.
La **fertilidad** (tokens por palabra) es una métrica clave: valores altos indican
que el tokenizer fragmenta las palabras en muchas piezas, señal de que su vocabulario no fue diseñado para ese idioma.

In [None]:
from transformers import AutoTokenizer
import numpy as np

# Checkpoint identifier handed to AutoTokenizer.from_pretrained.
model_id = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Report the vocabulary size with thousands separators.
vocab_size = tokenizer.vocab_size
print(f"Vocab size: {vocab_size:,}")

In [None]:
# Sample sentences used to compare tokenizer behaviour across the two languages.
# Both lists hold five sentences each.

# Guaraní samples; note the tilde-marked vowels (ĩ, ẽ, ã, õ) and the
# apostrophes that occur inside words.
guarani_samples = [
    "Paraguay ha'e peteĩ tetã oĩva América del Sur-pe.",
    "Guaraní ñe'ẽ ha'e peteĩ ñe'ẽ oñeñe'ẽva Paraguái-pe.",
    "Ko tetã oguereko mokõi ñe'ẽ teete: guaraní ha castellano.",
    "Ñandejára toikove opa ára, ha ñande retã toikove.",
    "Aipota amoñe'ẽ guaraníme ha castellano-pe avei.",
]

# Spanish samples, listed in the same order.
# NOTE(review): each one appears to be a rough Spanish rendering of the
# Guaraní sentence at the same index — confirm with a speaker.
spanish_samples = [
    "Paraguay es un país ubicado en América del Sur.",
    "El guaraní es un idioma hablado en Paraguay.",
    "Este país tiene dos idiomas oficiales: guaraní y castellano.",
    "Dios vive todos los días, y nuestra patria vive.",
    "Quiero hablar en guaraní y en castellano también.",
]

In [None]:
def compute_fertility(text, tokenizer):
    """Compute the tokens-per-word ratio (fertility) of *text*.

    Words are the whitespace-separated chunks returned by ``str.split()``,
    so punctuation stays attached to the neighbouring word. Tokens are
    counted with ``add_special_tokens=False`` so that any special tokens a
    tokenizer would add (e.g. BOS/EOS) do not inflate the ratio.

    Args:
        text: The string to measure.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a sized sequence of token ids.

    Returns:
        Number of tokens divided by number of words, or ``0.0`` when *text*
        contains no words (empty or whitespace-only).
    """
    words = text.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids) / len(words)

# Report banner.
banner = "=" * 60
print(banner)
print("FERTILITY ANALYSIS: Guaraní vs Español")
print(banner)

# Per-sentence fertility for each language; one pass per language, in the
# order Guaraní then Spanish, filling the matching result list.
gn_fertilities = []
es_fertilities = []
for heading, samples, ratios in (
    ("\nGuaraní:", guarani_samples, gn_fertilities),
    ("\nEspañol:", spanish_samples, es_fertilities),
):
    print(heading)
    for sentence in samples:
        ratio = compute_fertility(sentence, tokenizer)
        ratios.append(ratio)
        n_words = len(sentence.split())
        n_tokens = len(tokenizer.encode(sentence))
        # Only the first 50 characters of the sentence are shown.
        print(f"  {n_words}w → {n_tokens}t = {ratio:.2f} tok/word | {sentence[:50]}...")

# Aggregate table: mean (with Guaraní/Spanish ratio) and median per language.
print(f"\n{'Metric':<20} {'Guaraní':>10} {'Español':>10} {'Ratio':>10}")
print("-" * 50)
gn_mean = np.mean(gn_fertilities)
es_mean = np.mean(es_fertilities)
mean_ratio = gn_mean / es_mean
print(f"{'Mean fertility':<20} {gn_mean:>10.2f} {es_mean:>10.2f} {mean_ratio:>10.2f}x")
gn_median = np.median(gn_fertilities)
es_median = np.median(es_fertilities)
print(f"{'Median fertility':<20} {gn_median:>10.2f} {es_median:>10.2f}")

In [None]:
# Analyze how individual Guaraní words are split by the tokenizer.
guarani_words = [
    "ñe'ẽ", "oñeñe'ẽva", "Paraguái", "tetã", "guarã",
    "ha'e", "peteĩ", "mokõi", "ñandejára", "porã",
    "mba'e", "oĩva", "reñe'ẽ", "amoñe'ẽ", "ỹ",
]

print("\nWord-level tokenization (Guaraní):")
# The last column shows each token decoded back to text, not numeric ids,
# so it is labelled accordingly.
print(f"{'Word':<20} {'Tokens':>6} {'Decoded tokens'}")
print("-" * 60)
for word in guarani_words:
    # Exclude special tokens so the count reflects the word itself.
    token_ids = tokenizer.encode(word, add_special_tokens=False)
    # Decode each id on its own to expose the individual pieces.
    # NOTE(review): with a byte-level vocabulary a single id may cover only
    # part of a multi-byte character and presumably decodes to U+FFFD when
    # decoded alone — confirm against the tokenizer documentation.
    pieces = [tokenizer.decode([token_id]) for token_id in token_ids]
    print(f"{word:<20} {len(token_ids):>6} {pieces}")

In [None]:
# Final summary: mean fertility per language, relative overhead, and a
# verdict based on a 4.0 tokens/word threshold for Guaraní.
rule = "=" * 60
print("\n" + rule)
print("SUMMARY")
print(rule)
print(f"Qwen2.5 tokenizer fertility on Guaraní: {gn_mean:.2f} tokens/word")
print(f"Qwen2.5 tokenizer fertility on Spanish: {es_mean:.2f} tokens/word")
overhead = gn_mean / es_mean
print(f"Guaraní overhead: {overhead:.1f}x more tokens than Spanish")
print()
verdict = (
    "✓ Fertility < 4.0 — ACCEPTABLE for fine-tuning"
    if gn_mean < 4.0
    else "⚠ Fertility >= 4.0 — Consider tokenizer extension or vocabulary adaptation"
)
print(verdict)