In [1]:

# ================================================
# SECTION 1 — SETUP & UTILS
# ================================================
import sys, os, random, math, json, string, warnings
warnings.filterwarnings("ignore")

def _ensure(pkgs):
    """Best-effort installs. Notebook still runs if offline; LLM tries fallbacks.

    For every requirement string in ``pkgs`` the bare distribution name is
    extracted (anything from the first version operator, extras bracket,
    environment marker or whitespace onwards is dropped) and imported. Only
    when that import fails is ``pip install <requirement>`` run with the
    current interpreter. Failures are reported on stdout and never raised.

    Args:
        pkgs: iterable of pip requirement strings, e.g. ``"torch"`` or
            ``"transformers>=4.41.0"``. The import name is assumed to equal
            the distribution name.
    """
    import importlib, re, subprocess, sys as _sys
    for p in pkgs:
        # "pkg>=1", "pkg==1", "pkg~=1", "pkg[extra]", "pkg ; marker" -> "pkg"
        base = re.split(r"[<>=!~\[;\s]", p.strip(), maxsplit=1)[0]
        try:
            importlib.import_module(base)
        except Exception:
            try:
                print(f"Installing {p} …")
                subprocess.check_call([_sys.executable, "-m", "pip", "install", p])
                # Let the running interpreter see the freshly installed package.
                importlib.invalidate_caches()
            except Exception as e:
                print(f"⚠️ Could not install {p}: {e}")

# Best-effort bootstrap of the notebook's third-party dependencies; each entry is
# probed by import first and only pip-installed when that import fails.
_ensure(["numpy", "pandas", "matplotlib", "wordfreq", "editdistance",
         "transformers>=4.41.0", "sentencepiece>=0.1.99", "torch", "ipywidgets", "datasets"])

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Levenshtein backend
# Preference order: `editdistance`, then rapidfuzz's `Levenshtein`, else "none".
# `_USE_EDITDIST` records which one was picked. `_editdistance` is always bound
# (module or None); `Levenshtein` is only bound when the `editdistance` import
# failed, which is why later code checks for the name before using it.
try:
    import editdistance as _editdistance
    _USE_EDITDIST = "editdistance"
except Exception:
    _editdistance = None
    try:
        from rapidfuzz.distance import Levenshtein
        _USE_EDITDIST = "rapidfuzz"
    except Exception:
        Levenshtein = None
        _USE_EDITDIST = "none"

# Wordfreq
# `_HAS_WORDFREQ` gates every later use of top_n_list / word_frequency; when it is
# False those names are undefined and callers must take their fallback path.
try:
    from wordfreq import top_n_list, word_frequency, zipf_frequency
    _HAS_WORDFREQ = True
except Exception:
    _HAS_WORDFREQ = False

# LLM libs
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM

# Repro
# Seed the global `random` and NumPy generators so the shuffles and synthetic
# noise drawn in later cells are repeatable on a fresh top-to-bottom run.
random.seed(13)
np.random.seed(13)

# Report which optional backends were detected above.
print("✅ Setup complete. Edit backend:", _USE_EDITDIST, "| wordfreq:", _HAS_WORDFREQ)


✅ Setup complete. Edit backend: editdistance | wordfreq: True


In [2]:

# ================================================
# SECTION 2 — DATA & NOISE
# ================================================
from collections import defaultdict, Counter
from typing import List, Tuple, Dict, Iterable, Set
# The 26 lowercase ASCII letters; used for random letters and edit generation.
ALPHABET = string.ascii_lowercase

# For each lowercase letter, the letters treated as "nearby" keys when
# simulating typos. Entries are grouped by keyboard row.
KEY_NEIGHBORS: Dict[str, str] = dict(
    # top row
    q='was', w='qesd', e='wsdr', r='edft', t='rfgy',
    y='tghu', u='yhji', i='ujko', o='iklp', p='ol',
    # home row
    a='qwsz', s='weadxz', d='ersfcx', f='rtdgcv', g='tyfhbv',
    h='yugjbn', j='uikhmn', k='ijolm', l='opk',
    # bottom row
    z='asx', x='zsdc', c='xdfv', v='cfgb', b='vghn', n='bhjm', m='njk',
)

def random_neighbor(c: str) -> str:
    """Pick a random keyboard neighbour of ``c``.

    ``c`` is lower-cased before the lookup in ``KEY_NEIGHBORS``. When the
    character has no entry, a uniformly random letter of ``ALPHABET`` is
    returned instead.
    """
    pool = KEY_NEIGHBORS.get(c.lower(), '')
    if not pool:
        return random.choice(ALPHABET)
    return random.choice(pool)

def noisy_word(word: str, p_del=0.05, p_ins=0.05, p_sub=0.06, p_trans=0.02, max_ops=2):
    """Corrupt ``word`` with up to ``max_ops`` random character edits.

    Each round draws one uniform number and maps it onto the cumulative
    probability bands delete / insert / substitute / transpose. A draw that
    lands outside every band ends the corruption early, so most calls with
    the default rates return the word unchanged. Deletions and transpositions
    need at least two characters; a delete draw on a one-character word falls
    through to the insert band.

    Returns:
        ``(noisy, counts)``: the corrupted string (or ``word`` itself if the
        result would be empty) and a ``defaultdict(int)`` keyed by
        ``'del'``, ``'ins'``, ``'sub'``, ``'trans'``.
    """
    chars = list(word)
    tally = defaultdict(int)

    # Cumulative band edges, accumulated left-to-right.
    cut_del = p_del
    cut_ins = cut_del + p_ins
    cut_sub = cut_ins + p_sub
    cut_trans = cut_sub + p_trans

    applied = 0
    while applied < max_ops and chars:
        draw = random.random()
        if draw < cut_del and len(chars) > 1:
            del chars[random.randrange(len(chars))]
            tally['del'] += 1
        elif draw < cut_ins:
            pos = random.randrange(len(chars) + 1)
            # Insert a neighbour of the preceding char (random letter at pos 0).
            anchor = chars[pos - 1] if pos > 0 else random.choice(ALPHABET)
            chars.insert(pos, random_neighbor(anchor))
            tally['ins'] += 1
        elif draw < cut_sub:
            pos = random.randrange(len(chars))
            chars[pos] = random_neighbor(chars[pos])
            tally['sub'] += 1
        elif draw < cut_trans and len(chars) > 1:
            pos = random.randrange(len(chars) - 1)
            chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
            tally['trans'] += 1
        else:
            break
        applied += 1

    return (''.join(chars) or word), tally

def edits1(word: str, alphabet: str = ALPHABET) -> Set[str]:
    """Return every string one edit away from ``word`` (lower-cased).

    Edits are a single deletion, adjacent transposition, replacement by a
    character of ``alphabet``, or insertion of a character of ``alphabet``.
    The result may contain the lower-cased word itself (via a replacement
    with the same letter).
    """
    w = word.lower()
    found: Set[str] = set()
    for cut in range(len(w) + 1):
        head, tail = w[:cut], w[cut:]
        # Insertions are possible at every split point, including the end.
        for ch in alphabet:
            found.add(head + ch + tail)
        if not tail:
            continue
        rest = tail[1:]
        found.add(head + rest)                      # delete tail[0]
        for ch in alphabet:
            found.add(head + ch + rest)             # replace tail[0]
        if len(tail) > 1:
            found.add(head + tail[1] + tail[0] + tail[2:])  # swap first two
    return found

def edits2(word: str) -> Set[str]:
    """Return every string reachable from ``word`` by two ``edits1`` steps."""
    two_away: Set[str] = set()
    for one_away in edits1(word):
        two_away.update(edits1(one_away))
    return two_away

def build_vocab(n=80000, min_len=3, max_len=18) -> List[str]:
    """Build the word list used for data synthesis and candidate lookup.

    With wordfreq available, takes ``top_n_list("en", n)``, keeps purely
    alphabetic entries whose length lies in ``[min_len, max_len]``,
    lower-cases them and drops duplicates while preserving order. Without
    wordfreq, a small hard-coded list is returned and all arguments are
    ignored.
    """
    if not _HAS_WORDFREQ:
        # Minimal fallback
        return ["the","and","there","their","cat","dog","apple","banana","orange","grape","house",
                "keyboard","screen","computer","program","python","model","language","learning",
                "school","teacher","student","book","library","garden","flower","river","mountain",
                "beautiful","strength","tomorrow","yesterday","because","receive","believe","friend"]

    lowered = (
        w.lower()
        for w in top_n_list("en", n)
        if w.isalpha() and min_len <= len(w) <= max_len
    )
    # dict.fromkeys keeps the first occurrence of each word in original order.
    return list(dict.fromkeys(lowered))

def synth_pairs(words: List[str], per_word: int = 3) -> pd.DataFrame:
    """Create ``per_word`` noisy variants of every word.

    Returns a DataFrame with one row per variant and columns ``noisy``
    (output of ``noisy_word``), ``clean`` (the source word) and ``types``
    (a plain dict of edit-type counts).
    """
    def _record(clean: str) -> dict:
        noisy, op_counts = noisy_word(clean)
        return {"noisy": noisy, "clean": clean, "types": dict(op_counts)}

    return pd.DataFrame([_record(w) for w in words for _ in range(per_word)])


In [3]:

# ================================================
# SECTION 3 — DATA SPLITS
# ================================================
# Re-seed here so that re-running only this cell reproduces the same shuffle and
# the same synthetic noise as a fresh "Restart & Run All": nothing between the
# notebook-level seed and this cell draws from `random`.
random.seed(13)

vocab = build_vocab(80000)
random.shuffle(vocab)
n = len(vocab)

# 80 / 10 / 10 split by word, so no clean word is shared between splits.
train_end, dev_end = int(n*0.8), int(n*0.9)
train_words = vocab[:train_end]
dev_words   = vocab[train_end:dev_end]
test_words  = vocab[dev_end:]

# Three noisy variants per word in every split.
train_df = synth_pairs(train_words, 3)
dev_df   = synth_pairs(dev_words,   3)
test_df  = synth_pairs(test_words,  3)

# Persist the splits as JSON Lines.
os.makedirs("data/splits", exist_ok=True)
train_df.to_json("data/splits/train.jsonl", lines=True, orient="records")
dev_df.to_json(  "data/splits/dev.jsonl",   lines=True, orient="records")
test_df.to_json( "data/splits/test.jsonl",  lines=True, orient="records")

print("✅ data/splits written")
train_df.head()


✅ data/splits written


Unnamed: 0,noisy,clean,types
0,wonderful,wonderful,{}
1,wonderful,wonderful,{}
2,wonderful,wonderful,{}
3,conditioning,conditioning,{}
4,conditioning,conditioning,{}


In [4]:

# ================================================
# SECTION 4 — N-GRAM LM (CHAR)
# ================================================
# Padding symbols: BOS is repeated n-1 times in front of a word, EOS closes it.
BOS="^"; EOS="$"
class CharNGramLM:
    """Character n-gram language model with additive (add-alpha) smoothing.

    Attributes:
        n: n-gram order; each character is conditioned on the previous n-1.
        alpha: additive smoothing constant.
        context_counts: context string -> Counter of next characters.
        context_totals: context string -> number of observations.
        vocab_chars: every character seen in training, including BOS/EOS.
    """

    def __init__(self, n=3, alpha=0.5):
        """Create an untrained model.

        Raises:
            ValueError: if ``n`` is smaller than 1 (no valid context window).
        """
        if n < 1:
            raise ValueError("n must be >= 1")
        self.n=n; self.alpha=alpha
        self.context_counts=defaultdict(Counter)
        self.context_totals=Counter()
        self.vocab_chars=set()

    def _padded(self, word: str) -> str:
        """Return ``word`` wrapped in n-1 BOS symbols and one EOS symbol."""
        return (BOS*(self.n-1))+word+EOS

    def fit(self, words: Iterable[str]):
        """Accumulate n-gram counts from ``words`` and return ``self``.

        Calling ``fit`` again adds to the existing counts.
        """
        width = self.n-1
        for w in words:
            s=self._padded(w)
            self.vocab_chars.update(s)
            for i in range(width, len(s)):
                ctx=s[i-width:i]; nxt=s[i]
                self.context_counts[ctx][nxt]+=1
                self.context_totals[ctx]+=1
        return self

    def log_prob(self, word: str)->float:
        """Return the smoothed natural-log probability of ``word``.

        Scoring is read-only: contexts never seen in training are looked up
        with ``.get`` so they are not inserted into ``context_counts``.
        """
        s=self._padded(word)
        width = self.n-1
        V=max(1,len(self.vocab_chars)); lp=0.0
        for i in range(width, len(s)):
            ctx=s[i-width:i]; nxt=s[i]
            seen=self.context_counts.get(ctx)   # no defaultdict insertion
            c=seen[nxt] if seen is not None else 0
            tot=self.context_totals[ctx]        # Counter: missing key reads as 0
            p=(c+self.alpha)/(tot+self.alpha*V)
            lp+=math.log(p)
        return lp

def log_unigram_prior(word: str, lang="en")->float:
    """Return a log word-level prior for ``word``.

    With wordfreq available this is the natural log of
    ``word_frequency(word, lang, minimum=1e-9)``, clamped to at least 1e-12.
    Otherwise it is a mild length penalty of ``-0.001`` per character.
    """
    if not _HAS_WORDFREQ:
        return -0.001*len(word)
    freq = word_frequency(word, lang, minimum=1e-9)
    return math.log(max(freq, 1e-12))

def build_vocab_set(n=100_000)->Set[str]:
    """Return ``build_vocab(n)`` as a set of lower-cased words."""
    return {w.lower() for w in build_vocab(n)}

def candidates(word: str, vocab: Set[str]):
    """Return dictionary words within edit distance 2 of ``word``.

    The lower-cased word itself (if known) and every known word one edit
    away are collected first. Distance-2 neighbours are only searched when
    that yields nothing. If no dictionary word is found at all, the
    lower-cased input is returned as the only candidate.

    The distance-2 search intersects each ``edits1`` expansion with ``vocab``
    as it goes rather than materialising the full two-edit set first. The
    result is sorted so that ordering (and therefore tie-breaking in a later
    stable sort) does not depend on set iteration order.

    Returns:
        A non-empty, alphabetically sorted list of candidate strings.
    """
    w = word.lower()
    one_away = edits1(w)
    found = one_away & vocab
    if w in vocab:
        found.add(w)
    if not found:
        for e1 in one_away:
            found |= (edits1(e1) & vocab)
    return sorted(found) or [w]

def score(word: str, lm: CharNGramLM, lambda_prior=0.4):
    """Rank score for ``word``: character-LM log-probability plus
    ``lambda_prior`` times the log unigram prior (higher is better)."""
    char_term = lm.log_prob(word)
    prior_term = log_unigram_prior(word)
    return char_term + lambda_prior*prior_term

def _levenshtein(a: str, b: str) -> int:
    """Pure-Python Levenshtein distance (two-row dynamic programme)."""
    if a == b:
        return 0
    if len(a) < len(b):
        a, b = b, a               # keep the inner row as short as possible
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                # deletion
                           cur[j - 1] + 1,             # insertion
                           prev[j - 1] + (ca != cb)))  # substitution
        prev = cur
    return prev[-1]

def cer(ref: str, hyp: str)->float:
    """Character error rate: edit distance of ``hyp`` to ``ref`` divided by
    ``max(1, len(ref))``.

    Uses the ``editdistance`` module when the notebook bound it to
    ``_editdistance``, else rapidfuzz's ``Levenshtein`` when bound, else a
    built-in Levenshtein implementation. Both backends are looked up by name
    so the function also works when the detection cell did not define them.
    """
    denom = max(1, len(ref))
    ed = globals().get("_editdistance")
    if ed is not None:
        return ed.eval(ref, hyp)/denom
    lev = globals().get("Levenshtein")
    if lev is not None:
        return lev.distance(ref, hyp)/denom
    return _levenshtein(ref, hyp)/denom


In [None]:

# ================================================
# SECTION 5 — TRAIN & EVAL N-GRAM
# ================================================
# Dictionary used for candidate lookup; the character trigram LM is trained on
# exactly the same words.
vocab_set = build_vocab_set(100_000)
lm = CharNGramLM(n=3, alpha=0.5)
lm.fit(vocab_set)

def eval_split(df, lm, vocab, topk=(1,5)):
    """Evaluate the n-gram ranker on a frame of noisy/clean pairs.

    For every row the candidates of ``row["noisy"]`` are ranked by ``score``
    (best first). The gold word counts as a hit at ``k`` when it is among the
    first ``k`` candidates; CER is measured between the gold word and the
    top candidate.

    Args:
        df: DataFrame with ``noisy`` and ``clean`` columns.
        lm: fitted character language model passed to ``score``.
        vocab: set of known words passed to ``candidates``.
        topk: cut-offs for top-k accuracy.

    Returns:
        Dict with ``top{k}_acc`` for each k, ``avg_cer`` and ``n`` (row
        count). An empty frame yields 0.0 for every rate and ``n == 0``
        instead of dividing by zero.
    """
    topk = tuple(topk)
    hits = dict.fromkeys(topk, 0)
    total = 0
    cer_total = 0.0
    # Iterating the two columns directly avoids building a Series per row.
    for noisy, gold in zip(df["noisy"], df["clean"]):
        ranked = sorted(candidates(noisy, vocab), key=lambda w: score(w, lm), reverse=True)
        total += 1
        for k in topk:
            if gold in ranked[:k]:
                hits[k] += 1
        cer_total += cer(gold, ranked[0])
    denom = max(1, total)
    out = {f"top{k}_acc": hits[k]/denom for k in topk}
    out.update({"avg_cer": cer_total/denom, "n": total})
    return out

# Score the held-out splits with the character n-gram ranker.
dev_m = eval_split(dev_df, lm, vocab_set)
test_m = eval_split(test_df, lm, vocab_set)

# Persist both metric dicts as JSON under reports/.
os.makedirs("reports", exist_ok=True)
with open("reports/ngram_metrics.json","w") as f: json.dump({"dev":dev_m,"test":test_m}, f, indent=2)

print("✅ n-gram metrics")
print(json.dumps({"dev":dev_m,"test":test_m}, indent=2))

# Bar chart of test-split top-1 / top-5 accuracy on a fixed 0..1 axis;
# written to reports/ngram_plots.png and then shown inline.
plt.figure()
plt.bar(["Top-1","Top-5"], [test_m.get("top1_acc",0.0), test_m.get("top5_acc",0.0)])
plt.ylim(0,1); plt.title("N-gram accuracy (test)"); plt.tight_layout()
plt.savefig("reports/ngram_plots.png"); plt.show()


In [None]:

# ================================================
# SECTION 5.1 — SENTENCE-LEVEL CORRECTION (N-GRAM)
# ================================================
import re
# Tokenizer: a token is either a run of word characters (letters, digits, "_")
# or one single character that is neither a word character nor whitespace.
# Whitespace itself is never part of a token.
TOKEN_PATTERN = re.compile(r"\w+|[^\w\s]", re.UNICODE)

def _match_case(src: str, tgt: str)->str:
    """Re-apply the capitalisation of ``src`` to ``tgt``.

    All-uppercase ``src`` gives an upper-cased ``tgt``; an initial capital in
    ``src`` capitalises only the first character of ``tgt``; anything else
    returns ``tgt`` untouched.
    """
    if src.isupper():
        return tgt.upper()
    leading_capital = src[:1].isupper()
    return (tgt[:1].upper() + tgt[1:]) if leading_capital else tgt

def correct_sentence_ngram(text: str, lm: CharNGramLM, vocab: Set[str], topk=5, min_len=3):
    """Spell-correct ``text`` token by token with the n-gram ranker.

    Only purely alphabetic tokens that look misspelled are touched:

    * tokens shorter than ``min_len`` are kept, because the dictionary built
      by ``build_vocab`` excludes words below its own ``min_len`` (3 by
      default) and every short word would otherwise be "corrected";
    * tokens whose lower-cased form is already in ``vocab`` are kept;
    * for the rest, the best-scoring candidate replaces the token, with the
      original capitalisation re-applied. If the best candidate is just the
      lower-cased token (nothing better was found), the token is kept as is.

    Replacements are spliced into the original string, so whitespace,
    punctuation, brackets and apostrophes stay exactly where they were.

    Args:
        text: input sentence(s).
        lm: fitted character language model passed to ``score``.
        vocab: set of known lower-case words.
        topk: unused; accepted for backward compatibility with callers.
        min_len: tokens shorter than this are never modified.

    Returns:
        The corrected text.
    """
    def _fix(match):
        tok = match.group(0)
        lowered = tok.lower()
        if not tok.isalpha() or len(tok) < min_len or lowered in vocab:
            return tok
        # max() returns the first best candidate, like a stable descending sort.
        best = max(candidates(tok, vocab), key=lambda w: score(w, lm))
        if best == lowered:
            return tok
        return _match_case(tok, best)

    return TOKEN_PATTERN.sub(_fix, text)


In [None]:

# ================================================
# SECTION 6 — OPTIONAL QUICK T5 FINE-TUNE
# ================================================
# You can skip this entirely. LLM fallback below works without training.
def t5_train_and_eval(train_df, dev_df, test_df, out_dir="models/t5_speller",
                      max_train=8000, max_dev=2000, epochs=1, batch_size=32):
    """Fine-tune ``t5-small`` on noisy->clean word pairs (best effort).

    Each input is formatted as ``"spell: <noisy>"`` with the clean word as the
    target. Training uses the seq2seq trainer so that evaluation decodes with
    ``generate`` (``predict_with_generate`` is not accepted by the plain
    ``TrainingArguments``). The dev set is evaluated once per epoch, the model
    and tokenizer are saved to ``out_dir``, and the test set is evaluated once
    at the end.

    Args:
        train_df, dev_df, test_df: DataFrames with ``noisy`` and ``clean``
            columns. ``test_df`` may be ``None`` or empty to skip the final
            evaluation.
        out_dir: directory for checkpoints and the final model.
        max_train: maximum number of training rows (sampled, seed 13).
        max_dev: maximum number of dev rows, also used as the test-row cap.
        epochs: number of training epochs.
        batch_size: per-device batch size for training and evaluation.

    Returns:
        Dict of test metrics (empty if the test set was skipped), or ``None``
        when any step failed. Failures are printed, never raised.
    """
    try:
        import inspect
        import numpy as np
        from datasets import Dataset
        from transformers import (T5ForConditionalGeneration, T5TokenizerFast,
                                  DataCollatorForSeq2Seq, Seq2SeqTrainer,
                                  Seq2SeqTrainingArguments)

        os.makedirs(out_dir, exist_ok=True)
        MAX_LEN=32; MODEL_NAME="t5-small"

        tok=T5TokenizerFast.from_pretrained(MODEL_NAME)
        model=T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

        def preprocess(ex):
            # `text_target` tokenizes the labels in the same call
            # (replaces the deprecated `as_target_tokenizer` context manager).
            inputs=[f"spell: {x}" for x in ex["noisy"]]
            return tok(inputs, text_target=list(ex["clean"]),
                       max_length=MAX_LEN, truncation=True)

        def tokenized(df, limit):
            # Sub-sample, then drop the pandas index so it is not carried along as a column.
            sub=df.sample(n=min(limit, len(df)), random_state=13)
            ds=Dataset.from_pandas(sub[["noisy","clean"]], preserve_index=False)
            return ds.map(preprocess, batched=True, remove_columns=ds.column_names)

        tr_tok=tokenized(train_df, max_train)
        dv_tok=tokenized(dev_df, max_dev)

        def exact_match(ps, gs): return sum(p==g for p,g in zip(ps,gs))/max(1,len(gs))
        def cer_metric(refs, hyps): return sum(cer(r,h) for r,h in zip(refs,hyps))/max(1,len(refs))

        def compute_metrics(eval_pred):
            p_ids, y_ids = eval_pred
            if isinstance(p_ids, tuple):
                p_ids = p_ids[0]
            # -100 marks ignored/padded positions and cannot be decoded.
            pad = tok.pad_token_id
            p_ids = np.where(p_ids != -100, p_ids, pad)
            y_ids = np.where(y_ids != -100, y_ids, pad)
            p_txt=tok.batch_decode(p_ids, skip_special_tokens=True)
            g_txt=tok.batch_decode(y_ids, skip_special_tokens=True)
            return {"exact_match": exact_match(p_txt,g_txt), "cer": cer_metric(g_txt,p_txt)}

        # The evaluation-schedule argument was renamed; pick whichever name
        # the installed transformers version accepts.
        accepted = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
        eval_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"
        args = Seq2SeqTrainingArguments(
            output_dir=out_dir, save_strategy="epoch",
            logging_steps=100, per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size, num_train_epochs=epochs,
            learning_rate=5e-4, weight_decay=0.0,
            predict_with_generate=True, generation_max_length=MAX_LEN, fp16=False,
            **{eval_key: "epoch"})

        collator=DataCollatorForSeq2Seq(tok, model=model)
        trnr=Seq2SeqTrainer(model=model, args=args, train_dataset=tr_tok, eval_dataset=dv_tok,
                            data_collator=collator, compute_metrics=compute_metrics)
        trnr.train()
        trnr.save_model(out_dir); tok.save_pretrained(out_dir)
        print("✅ Saved fine-tuned model to", out_dir)

        test_metrics = {}
        if test_df is not None and len(test_df) > 0:
            ts_tok=tokenized(test_df, max_dev)
            test_metrics = trnr.evaluate(eval_dataset=ts_tok, metric_key_prefix="test")
            print("Test metrics:", test_metrics)
        return test_metrics
    except Exception as e:
        print("⚠️ Skipping fine-tune:", e)
        return None

# Example (optional):
# t5_train_and_eval(train_df, dev_df, test_df, max_train=8000, max_dev=2000, epochs=1, batch_size=32)


In [None]:
# ================================================
# SECTION 6.1 — ROBUST LLM LOADER (NO TRAINING NEEDED, STRONG PROMPTS)
# ================================================
import os, torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# Use a writable HF cache
# setdefault keeps any HF_HOME the user already exported; the directory is
# created up front so later downloads have somewhere to land.
# NOTE(review): transformers is already imported in the setup cell; the Hugging
# Face libraries may resolve HF_HOME at import time, in which case setting it
# here is too late to take effect — confirm, and move before the import if so.
os.environ.setdefault("HF_HOME", "./.hf_cache")
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

DEVICE = torch.device("cpu")  # CPU-only for reliability

def _try_load_seq2seq(name, local=False):
    """Load tokenizer and encoder-decoder model ``name`` onto ``DEVICE``.

    ``local=True`` restricts loading to files already on disk. The model is
    put in eval mode. Returns ``(tokenizer, model, name, True)`` where the
    final flag marks the model as seq2seq. Loading errors propagate.
    """
    tokenizer = AutoTokenizer.from_pretrained(name, local_files_only=local)
    net = AutoModelForSeq2SeqLM.from_pretrained(name, local_files_only=local)
    net.to(DEVICE)
    net.eval()
    is_seq2seq = True
    return tokenizer, net, name, is_seq2seq

def _try_load_causal(name, local=False):
    """Load tokenizer and decoder-only model ``name`` onto ``DEVICE``.

    If the tokenizer has no pad token but does have an EOS token, EOS is
    reused as the pad token. The model is put in eval mode. Returns
    ``(tokenizer, model, name, False)`` where the final flag marks the model
    as not seq2seq. Loading errors propagate.
    """
    tokenizer = AutoTokenizer.from_pretrained(name, local_files_only=local)
    # GPT-2 usually lacks pad_token; reuse EOS to avoid warnings
    needs_pad = tokenizer.pad_token is None and tokenizer.eos_token is not None
    if needs_pad:
        tokenizer.pad_token = tokenizer.eos_token
    net = AutoModelForCausalLM.from_pretrained(name, local_files_only=local)
    net.to(DEVICE)
    net.eval()
    is_seq2seq = False
    return tokenizer, net, name, is_seq2seq

def load_llm_with_fallbacks():
    """Return the first model that loads, trying a fixed preference order.

    Order: locally fine-tuned T5 (disk only), two specialised correction
    models, two FLAN-T5 sizes, GPT-2 (causal), and finally vanilla
    ``t5-small``. Every failed attempt is reported on stdout; only a failure
    of the final ``t5-small`` attempt is allowed to propagate.

    Returns:
        ``(tokenizer, model, name, is_seq2seq)`` as produced by the loaders.
    """
    # (loader, model id, local files only?, custom failure label or None)
    attempts = [
        # 1) your fine-tuned T5 (if you trained later)
        (_try_load_seq2seq, "models/t5_speller", True, "Local T5 not found:"),
        # 2) specialized correction models (better than generic small models)
        (_try_load_seq2seq, "oliverguhr/spelling-correction-english-base", False, None),
        (_try_load_seq2seq, "vennify/t5-base-grammar-correction", False, None),
        # 3) FLAN-T5 (instruction-tuned)
        (_try_load_seq2seq, "google/flan-t5-base", False, None),
        (_try_load_seq2seq, "google/flan-t5-small", False, None),
        # 4) GPT-2 (causal LM) as a last resort before vanilla T5
        (_try_load_causal, "gpt2", False, None),
    ]
    for loader, model_id, local_only, label in attempts:
        try:
            return loader(model_id, local=local_only)
        except Exception as e:
            print(label if label is not None else f"{model_id} not available:", e)

    # 5) vanilla T5-small (errors here are not caught)
    return _try_load_seq2seq("t5-small", local=False)

# Load once at cell execution; these four module-level names are what the
# generation helpers and llm_correct_sentence below read.
tok_llm, llm_model, llm_name, IS_SEQ2SEQ = load_llm_with_fallbacks()
print(f"✅ Loaded LLM: {llm_name} | seq2seq={IS_SEQ2SEQ} | device={DEVICE}")

# ---------- Helpers ----------
def _generate_seq2seq(prompt: str) -> str:
    """Run the loaded seq2seq model on ``prompt`` and return the decoded,
    stripped top beam (6-beam search, no sampling, at most 96 new tokens)."""
    encoded = tok_llm([prompt], return_tensors="pt")
    model_inputs = {}
    for key, tensor in encoded.items():
        model_inputs[key] = tensor.to(DEVICE)

    decode_opts = dict(
        max_new_tokens=96,
        num_beams=6,
        length_penalty=1.0,
        early_stopping=True,
        do_sample=False,
    )
    with torch.inference_mode():
        generated = llm_model.generate(**model_inputs, **decode_opts)

    texts = tok_llm.batch_decode(generated, skip_special_tokens=True)
    return texts[0].strip()

def _generate_causal(prompt: str) -> str:
    """Run the loaded causal model on ``prompt`` and return the decoded,
    stripped first output sequence (6-beam search, no sampling, at most 96
    new tokens).

    The tokenizer's pad token is set to EOS first if it is still missing.
    """
    pad_missing = tok_llm.pad_token is None and tok_llm.eos_token is not None
    if pad_missing:
        tok_llm.pad_token = tok_llm.eos_token

    encoded = tok_llm(prompt, return_tensors="pt")
    model_inputs = {}
    for key, tensor in encoded.items():
        model_inputs[key] = tensor.to(DEVICE)

    with torch.inference_mode():
        decode_opts = dict(
            max_new_tokens=96,
            num_beams=6,
            early_stopping=True,
            do_sample=False,
            pad_token_id=tok_llm.pad_token_id,
        )
        generated = llm_model.generate(**model_inputs, **decode_opts)

    decoded = tok_llm.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()

def _unchanged(a: str, b: str) -> bool:
    """True when ``a`` and ``b`` are equal after trimming surrounding whitespace."""
    left, right = a.strip(), b.strip()
    return left == right

# Few-shot examples help smaller models a lot.
# One instruction line followed by two Input/Corrected pairs; each pair ends
# with a blank line so the next "Input:" can be appended directly.
FEWSHOT = "\n".join([
    "Correct spelling and grammar.",
    "Input: Ths sentnce has soem mispelings.",
    "Corrected: This sentence has some misspellings.",
    "",
    "Input: I cant beleive tehres no mlik.",
    "Corrected: I can't believe there's no milk.",
    "",
    "",
])

def _completion_after_prompt(gen: str, prompt: str) -> str:
    """Return the first non-empty line the model produced after ``prompt``.

    ``gen`` is expected to be the decoded output of a causal model, which
    normally starts with the prompt itself. If the prompt prefix does not
    match verbatim, the text following the prompt's last "Corrected:" marker
    is located by counting markers instead. Returns "" if nothing usable
    follows.
    """
    marker = "Corrected:"
    stem = prompt.strip()
    if gen.startswith(stem):
        tail = gen[len(stem):]
    else:
        pieces = gen.split(marker)
        answer_idx = prompt.count(marker)
        tail = pieces[answer_idx] if len(pieces) > answer_idx else gen
    for line in tail.splitlines():
        if line.strip():
            return line.strip()
    return ""

def llm_correct_sentence(text: str) -> str:
    """
    Robust sentence correction:
    - Tries multiple strong prompts in order, falls back if unchanged.
    - Works for both seq2seq (T5/FLAN) and causal (GPT-2).

    Prompts are chosen from the loaded model's name and de-duplicated so no
    prompt is generated twice. An output is accepted when it is non-empty,
    differs from the input, and (seq2seq only) is not an echo of the prompt.
    For causal models only the model's own continuation is considered: the
    prompt, including its few-shot examples, is removed and the first
    generated line is taken. If no prompt yields an accepted output, the
    stripped input is returned unchanged.

    Args:
        text: sentence to correct; ``None`` or blank returns "".

    Returns:
        The corrected sentence, or the stripped input if nothing changed.
    """
    text = (text or "").strip()
    if not text:
        return ""

    if IS_SEQ2SEQ:
        name = llm_name.lower()
        prompts = []

        # Specialized models often expect plain or "spell:" input
        if "oliverguhr/spelling-correction" in name:
            prompts += [text, f"spell: {text}"]

        # The vennify model card uses the "grammar: " task prefix; try it first.
        if "vennify/t5-base-grammar-correction" in name:
            prompts += [f"grammar: {text}", f"fix grammar: {text}", f"proofread: {text}"]

        # FLAN prefers instruction format
        if "flan" in name:
            prompts += [
                f"Proofread and correct spelling and grammar.\nReturn only the corrected sentence.\nInput: {text}\nCorrected:",
                FEWSHOT + f"Input: {text}\nCorrected:",
            ]

        # Generic T5 fallbacks
        prompts += [f"fix spelling: {text}", f"spell: {text}"]

        # Drop repeated prompts (order preserved); each costs a full beam search.
        for p in dict.fromkeys(prompts):
            out = _generate_seq2seq(p)
            if out and not _unchanged(out, text) and not _unchanged(out, p):
                return out
        return text

    # GPT-2: instruction + few-shot prompt; keep only what follows the prompt.
    prompt_variants = [
        FEWSHOT + f"Input: {text}\nCorrected:",
        f"Correct spelling and grammar.\nInput: {text}\nCorrected:",
    ]
    for p in prompt_variants:
        cand = _completion_after_prompt(_generate_causal(p), p)
        if cand and not _unchanged(cand, text):
            return cand
    return text

# If your GUI calls `correct_with_t5`, point it here.
# Plain alias: if llm_correct_sentence is missing, fail now with a NameError
# rather than later inside the GUI callback.
correct_with_t5 = llm_correct_sentence


Local T5 not found: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.


tokenizer_config.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)f3815f99f42fcc48674a6f5d4e785cbc16448b80:   0%|          | 0.00/558M [00:00<?, ?B/s]

✅ Loaded LLM: oliverguhr/spelling-correction-english-base | seq2seq=True | device=cpu


In [None]:

# ================================================
# SECTION 7 — SIMPLE GUI (IPYWIDGETS): N-GRAM + LLM
# ================================================
# Only a missing widget stack triggers the text fallback; any other error
# (e.g. a failure while building or displaying the widgets) is left to surface.
try:
    import ipywidgets as widgets
    from IPython.display import display
except ImportError as e:
    print("ipywidgets not available. You can call the functions directly:")
    print("correct_sentence_ngram('text', lm, vocab_set)  and  llm_correct_sentence('text')")
    print(f"(import error: {e})")
else:
    ta = widgets.Textarea(
        value="Ths sentnce has soem mispelings.",
        description="Input:",
        layout=widgets.Layout(width="100%", height="80px")
    )
    btn = widgets.Button(description="Correct", button_style="primary")
    out = widgets.Output()

    def on_click(_):
        """Run both correctors on the textarea content and show the results."""
        # Generation is slow on CPU; block repeat clicks until it finishes.
        btn.disabled = True
        try:
            with out:
                out.clear_output()
                print("— N-gram sentence correction —")
                print(correct_sentence_ngram(ta.value, lm, vocab_set, topk=5))
                print("\n— LLM sentence correction —")
                print(llm_correct_sentence(ta.value))
        finally:
            btn.disabled = False

    btn.on_click(on_click)
    display(ta, btn, out)
    print("✅ GUI ready. Type text and click Correct.")


Textarea(value='Ths sentnce has soem mispelings.', description='Input:', layout=Layout(height='80px', width='1…

Button(button_style='primary', description='Correct', style=ButtonStyle())

Output()

✅ GUI ready. Type text and click Correct.
