# 01 - Exploración de Datos GuaraniLM

Este notebook explora las fuentes de datos disponibles para entrenar GuaraniLM:
- Wikipedia Guaraní
- CulturaX (subset grn)
- Jojajovai (corpus paralelo gn↔es)
- mmaguero datasets (sentiment, humor, hate speech)

In [None]:
import json
import sys
from collections import Counter
from pathlib import Path

import pandas as pd

# Put "<parent of the current working directory>/scripts" first on sys.path so the
# import below can find normalize_guarani.
# NOTE(review): assumes the notebook is launched from a directory one level below
# the repository root -- confirm.
sys.path.insert(0, str(Path(".").resolve().parent / "scripts"))
from normalize_guarani import normalize

# Root of the data tree, relative to the current working directory.
DATA_DIR = Path("..") / "data"

## 1. Wikipedia Guaraní

In [None]:
# Load the Wikipedia JSONL file (one JSON object with a 'text' field per line)
# and print article count, word totals and a short sample.
wiki_path = DATA_DIR / "interim" / "wikipedia_gn.jsonl"
if wiki_path.exists():
    with open(wiki_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) so json.loads never sees "".
        rows = (line.strip() for line in f)
        wiki_texts = [json.loads(row) for row in rows if row]
    print(f"Articles: {len(wiki_texts)}")
    if wiki_texts:
        lengths = [len(t['text'].split()) for t in wiki_texts]
        print(f"Total words: {sum(lengths):,}")
        print(f"Mean words/article: {sum(lengths)/len(lengths):.0f}")
        print("\nSample article (first 200 chars):")
        print(wiki_texts[0]['text'][:200])
    else:
        # An existing but empty file would otherwise raise ZeroDivisionError
        # on the mean and IndexError on the sample.
        print(f"File is empty: {wiki_path}")
else:
    print(f"File not found: {wiki_path}")
    print("Run: python scripts/download_data.py && python scripts/clean_wikipedia.py")

## 2. CulturaX Guaraní

In [None]:
# Load the CulturaX JSONL file (one JSON object with a 'text' field per line)
# and print document count, total words and a short sample.
culturax_path = DATA_DIR / "interim" / "culturax_gn.jsonl"
if culturax_path.exists():
    with open(culturax_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) so json.loads never sees "".
        rows = (line.strip() for line in f)
        cx_texts = [json.loads(row) for row in rows if row]
    print(f"Documents: {len(cx_texts)}")
    lengths = [len(t['text'].split()) for t in cx_texts]
    print(f"Total words: {sum(lengths):,}")
    if cx_texts:
        print("\nSample (first 200 chars):")
        print(cx_texts[0]['text'][:200])
    else:
        # Indexing cx_texts[0] on an empty file would raise IndexError.
        print(f"File is empty: {culturax_path}")
else:
    print(f"File not found: {culturax_path}")
    print("Run: python scripts/download_data.py && python scripts/clean_culturax.py")

## 3. Jojajovai (Parallel Corpus)

In [None]:
# Load the parallel corpus (one JSON object with 'gn' and 'es' fields per line)
# and print pair count, mean sentence lengths and the first few pairs.
parallel_path = DATA_DIR / "interim" / "parallel_gn_es.jsonl"
if parallel_path.exists():
    with open(parallel_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) so json.loads never sees "".
        rows = (line.strip() for line in f)
        pairs = [json.loads(row) for row in rows if row]
    print(f"Parallel pairs: {len(pairs)}")
    if pairs:
        gn_lengths = [len(p['gn'].split()) for p in pairs]
        es_lengths = [len(p['es'].split()) for p in pairs]
        print(f"GN mean words: {sum(gn_lengths)/len(gn_lengths):.1f}")
        print(f"ES mean words: {sum(es_lengths)/len(es_lengths):.1f}")
        print("\nSample pairs:")
        for p in pairs[:5]:
            print(f"  GN: {p['gn'][:80]}")
            print(f"  ES: {p['es'][:80]}")
            print()
    else:
        # The means divide by the number of pairs; guard against an empty file.
        print(f"File is empty: {parallel_path}")
else:
    print(f"File not found: {parallel_path}")
    print("Run: python scripts/download_data.py && python scripts/prepare_parallel.py")

## 4. Character Distribution

In [None]:
import unicodedata

# Analyze character distribution in Guaraní text.
# Collect every 'text' field from the Wikipedia and CulturaX files, then report
# how often each Guaraní-specific grapheme occurs.
text_chunks = []
for path in [wiki_path, culturax_path]:
    if path.exists():
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:  # tolerate blank lines in the JSONL file
                    text_chunks.append(json.loads(line)["text"])

# Join once instead of repeated `+=` (which is quadratic in the worst case).
# Each text is followed by a single space, as before.
all_text = "".join(chunk + " " for chunk in text_chunks)

if all_text:
    # Lowercase, then compose to NFC so that nasal vowels stored in decomposed
    # form (base letter + U+0303 combining tilde) are counted together with
    # their precomposed equivalents.
    normalized_text = unicodedata.normalize("NFC", all_text.lower())
    char_counts = Counter(normalized_text)
    # 'g' + U+0303 has no precomposed code point, so it stays a two-code-point
    # sequence even after NFC and must be counted as a substring.
    guarani_special = ['ã', 'ẽ', 'ĩ', 'õ', 'ũ', 'ỹ', 'g\u0303', "'"]
    print("Guaraní-specific character frequencies:")
    for ch in guarani_special:
        ch = unicodedata.normalize("NFC", ch)
        if len(ch) == 1:
            count = char_counts[ch]  # Counter returns 0 for missing keys
        else:
            count = normalized_text.count(ch)
        print(f"  '{ch}': {count:,} ({count/len(normalized_text)*100:.3f}%)")
else:
    print("No data loaded yet. Run the pipeline first.")

## 5. Dataset Statistics Summary

In [None]:
# Print, for every known intermediate dataset, how many records (non-blank
# JSONL lines) it contains, or flag it as not generated yet.
print("=" * 50)
print("DATASET SUMMARY")
print("=" * 50)
sources = {
    "Wikipedia GN": DATA_DIR / "interim" / "wikipedia_gn.jsonl",
    "CulturaX GN": DATA_DIR / "interim" / "culturax_gn.jsonl",
    "Parallel GN-ES": DATA_DIR / "interim" / "parallel_gn_es.jsonl",
    "NLLB Augmented": DATA_DIR / "interim" / "augmented_nllb.jsonl",
}
total_words = 0  # kept for compatibility; not updated in this cell
for name, path in sources.items():
    if path.exists():
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(path, encoding="utf-8") as f:
            # Blank lines are not records, so they are not counted.
            count = sum(1 for line in f if line.strip())
        print(f"  {name}: {count:,} records")
    else:
        print(f"  {name}: NOT YET GENERATED")