In [14]:
#!/usr/bin/env python3

from collections import Counter, defaultdict
from tqdm import tqdm
import json
import math

# ---------------------------------------------------------------------
# Utility: read tokenized corpus
# ---------------------------------------------------------------------
def read_corpus(path):
    """Each line = tokens separated by space."""
    corpus = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            toks = line.strip().split()
            if toks:
                corpus.append(toks)
    return corpus
def stream_corpus(filepath, limit=None):
    corpus = []
    with open(filepath, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            corpus.append(line)
            if limit and i >= limit:
                break
    return " ".join(corpus)

# ---------------------------------------------------------------------
# Common: vocabulary counting
# ---------------------------------------------------------------------
def get_vocab(sentences):
    vocab = Counter()
    for tokens in sentences:
        for tok in tokens:
            vocab[tok] += 1
    return vocab

# ---------------------------------------------------------------------
# --------------------------  BPE  ------------------------------------
# ---------------------------------------------------------------------
def bpe_get_stats(vocab):
    """Count frequency of symbol pairs across all words."""
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return pairs

def bpe_merge_vocab(vocab, pair):
    """Merge given symbol pair in all words."""
    a, b = pair
    pattern = ' '.join(pair)
    replacement = a + b
    new_vocab = {}
    for word, freq in vocab.items():
        new_word = word.replace(pattern, replacement)
        new_vocab[new_word] = freq
    return new_vocab

def train_bpe(tokenized_corpus, num_merges=32000):
    # 1Ô∏è‚É£ Prepare initial vocab (char-level)
    vocab = Counter()
    for sent in tokenized_corpus:
        for word in sent:
            chars = ' '.join(list(word)) + ' </w>'
            vocab[chars] += 1

    merges = []
    print(f"[BPE] Starting training with {len(vocab)} unique words...")
    for i in tqdm(range(num_merges), desc="BPE merges"):
        pairs = bpe_get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = bpe_merge_vocab(vocab, best)
        merges.append(best)
    print(f"[BPE] Done {len(merges)} merges.")
    return merges, vocab

# ---------------------------------------------------------------------
# -----------------------  WORDPIECE  ---------------------------------
# ---------------------------------------------------------------------
def wordpiece_train(tokenized_corpus, target_vocab_size=32000):
    word_freqs = Counter()
    for sent in tokenized_corpus:
        for word in sent:
            word_freqs[word] += 1

    # Initialize subwords as individual characters + '</w>'
    subword_vocab = Counter()
    for w, f in word_freqs.items():
        for ch in list(w) + ['</w>']:
            subword_vocab[ch] += f

    merges = []
    print(f"[WordPiece] Initial subword vocab = {len(subword_vocab)}")

    for i in tqdm(range(target_vocab_size - len(subword_vocab)), desc="WordPiece merges"):
        # Count pair frequencies
        pair_freqs = Counter()
        for w, f in word_freqs.items():
            symbols = list(w) + ['</w>']
            for i2 in range(len(symbols) - 1):
                pair_freqs[(symbols[i2], symbols[i2 + 1])] += f

        if not pair_freqs:
            break

        # Compute likelihood score = freq(pair) / (freq(a) * freq(b))
        best_pair, best_score = None, -1.0
        for (a, b), freq in pair_freqs.items():
            score = freq / (subword_vocab[a] * subword_vocab[b] + 1e-10)
            if score > best_score:
                best_pair, best_score = (a, b), score

        # Merge best pair
        new_token = a + b
        subword_vocab[new_token] = pair_freqs[best_pair]
        merges.append(best_pair)
        if len(subword_vocab) >= target_vocab_size:
            break

    print(f"[WordPiece] Final vocab size ‚âà {len(subword_vocab)}")
    return merges, subword_vocab

# ---------------------------------------------------------------------
# --------------------------  MAIN  -----------------------------------
# ---------------------------------------------------------------------
if __name__ == "__main__":
    import os, json

    input_file = r"C:\Users\rani\Desktop\nlp lab\lab1\hindi_tokens.txt"

    outdir = r"C:\Users\rani\Desktop\nlp lab\lab9\output1"
    os.makedirs(outdir, exist_ok=True)

    
    corpus = stream_corpus(input_file, limit=100000)  # limit lines for testing

    num_merges = 32000
    vocab_size = 32000

    # ------------------ üîπ BPE Algorithm ------------------
    print("\nüöÄ Training Byte Pair Encoding (BPE)...")
    merges_bpe, final_vocab_bpe = train_bpe(corpus, num_merges=num_merges)

    with open(os.path.join(outdir, "bpe_merges.json"), "w", encoding="utf-8") as f:
        json.dump(merges_bpe, f, ensure_ascii=False, indent=2)

    with open(os.path.join(outdir, "bpe_vocab.json"), "w", encoding="utf-8") as f:
        json.dump(list(final_vocab_bpe.keys()), f, ensure_ascii=False, indent=2)

    # ------------------ üîπ WordPiece Algorithm ------------------
    print("\nüöÄ Training WordPiece...")
    merges_wp, vocab_wp = wordpiece_train(corpus, target_vocab_size=vocab_size)

    with open(os.path.join(outdir, "wordpiece_merges.json"), "w", encoding="utf-8") as f:
        json.dump(merges_wp, f, ensure_ascii=False, indent=2)

    with open(os.path.join(outdir, "wordpiece_vocab.json"), "w", encoding="utf-8") as f:
        json.dump(list(vocab_wp.keys()), f, ensure_ascii=False, indent=2)

    # ‚úÖ Step 5: Confirm output
    print("\n‚úÖ Training complete!")
    print("üìÇ Files saved in:", outdir)




üöÄ Training Byte Pair Encoding (BPE)...
[BPE] Starting training with 454 unique words...


BPE merges:   1%|‚ñè         | 451/32000 [00:00<00:17, 1842.46it/s]


[BPE] Done 451 merges.

üöÄ Training WordPiece...
[WordPiece] Initial subword vocab = 455


WordPiece merges: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31545/31545 [00:42<00:00, 742.47it/s]


[WordPiece] Final vocab size ‚âà 456

‚úÖ Training complete!
üìÇ Files saved in: C:\Users\rani\Desktop\nlp lab\lab9\output1
