In [3]:
# Add-One (Laplace) Smoothing
def add_one_smoothing(ngram_counts, vocab_size):
    """Apply add-one (Laplace) smoothing to a table of n-gram counts.

    Each n-gram present in ``ngram_counts`` is mapped to
    ``(count + 1) / (N + vocab_size)``, where ``N`` is the sum of all
    counts in the table.

    Args:
        ngram_counts: Mapping from n-gram to its observed count.
        vocab_size: Value added to the denominator (the number of event
            types the extra unit of mass is spread over).

    Returns:
        dict: One smoothed value per n-gram in ``ngram_counts``. N-grams
        that are absent from the input get no entry.
    """
    observed_total = sum(ngram_counts.values())
    return {
        gram: (freq + 1) / (observed_total + vocab_size)
        for gram, freq in ngram_counts.items()
    }


# Add-k Smoothing
def add_k_smoothing(ngram_counts, vocab_size, k=0.5):
    """Apply add-k smoothing to a table of n-gram counts.

    Each n-gram present in ``ngram_counts`` is mapped to
    ``(count + k) / (N + k * vocab_size)``, where ``N`` is the sum of all
    counts in the table.

    Args:
        ngram_counts: Mapping from n-gram to its observed count.
        vocab_size: Number of event types; scaled by ``k`` in the denominator.
        k: Pseudo-count added to every observed count (default 0.5).

    Returns:
        dict: One smoothed value per n-gram in ``ngram_counts``. N-grams
        that are absent from the input get no entry.
    """
    observed_total = sum(ngram_counts.values())
    return {
        gram: (freq + k) / (observed_total + k * vocab_size)
        for gram, freq in ngram_counts.items()
    }


# Add Token Type Smoothing
def add_token_type_smoothing(ngram_counts, token_types):
    """Smooth n-gram counts by adding the token-type count to each of them.

    Each n-gram present in ``ngram_counts`` is mapped to
    ``(count + token_types) / (N + token_types)``, where ``N`` is the sum
    of all counts in the table.

    NOTE(review): ``token_types`` is added in full to every numerator but
    only once to the denominator, so the returned values generally do not
    sum to 1 across the table. Confirm this is the intended formula.

    Args:
        ngram_counts: Mapping from n-gram to its observed count.
        token_types: Number added to both numerator and denominator.

    Returns:
        dict: One smoothed value per n-gram in ``ngram_counts``.
    """
    observed_total = sum(ngram_counts.values())
    return {
        gram: (freq + token_types) / (observed_total + token_types)
        for gram, freq in ngram_counts.items()
    }


In [None]:

def load_ngrams_from_txt(file_name):
    """Read an ``ngram<TAB>frequency`` text file into a Counter.

    Each line is stripped and split on tabs. Lines that do not yield
    exactly two fields are skipped silently. An n-gram containing a space
    is stored under a tuple of its whitespace-separated tokens; any other
    n-gram is stored under the plain string. If the same n-gram appears
    more than once, the last occurrence wins.

    Args:
        file_name: Path of the UTF-8 text file to read.

    Returns:
        collections.Counter: Mapping from n-gram key to integer frequency.

    Raises:
        ValueError: If a frequency field cannot be parsed by ``int``.
    """
    from collections import Counter

    table = Counter()
    with open(file_name, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if len(fields) == 2:
                text, raw_freq = fields
                value = int(raw_freq)
                # Multi-word n-grams become tuples; single tokens stay strings.
                key = tuple(text.split()) if " " in text else text
                table[key] = value
    return table
def save_ngrams_to_txt(counter, file_name):
    """Write an n-gram table to a UTF-8 text file, one entry per line.

    Every entry is written as ``ngram<TAB>value`` followed by a newline.
    Tuple keys are joined with single spaces; any other key is written
    as-is. Entries are written in the mapping's iteration order, and an
    existing file at ``file_name`` is overwritten.

    Args:
        counter: Mapping from n-gram (string or tuple of strings) to value.
        file_name: Path of the file to write.
    """
    with open(file_name, "w", encoding="utf-8") as out:
        for key, val in counter.items():
            label = " ".join(key) if isinstance(key, tuple) else key
            out.write(f"{label}\t{val}\n")


if __name__ == "__main__":
    # Driver: for each n-gram order, load the raw count table, apply the three
    # smoothing schemes, and write each result to "<name>_<scheme>.txt".
    #
    # NOTE(review): `vocab_size` below is the number of distinct n-grams in the
    # loaded table (len of the Counter), not the size of the word vocabulary.
    # Confirm that this is the quantity the smoothing formulas are meant to use.

    # Unigrams are handled on their own so the results stay available under
    # dedicated names (uni_add1, uni_addk, uni_toktype).
    unigrams = load_ngrams_from_txt("unigrams.txt")
    vocab_size = len(unigrams)

    uni_add1 = add_one_smoothing(unigrams, vocab_size)
    uni_addk = add_k_smoothing(unigrams, vocab_size, k=0.5)
    uni_toktype = add_token_type_smoothing(unigrams, vocab_size)

    save_ngrams_to_txt(uni_add1, "unigrams_add1.txt")
    save_ngrams_to_txt(uni_addk, "unigrams_addk.txt")
    save_ngrams_to_txt(uni_toktype, "unigrams_toktype.txt")

    # The check mark was previously stored as mojibake (its UTF-8 bytes decoded
    # with a single-byte codec); the intended character is restored here.
    print("✅ Smoothed unigram models saved!")

    # Repeat for bigrams, trigrams, quadrigrams. `counts`, `vocab_size`, `add1`,
    # `addk` and `toktype` are rebound on every iteration, so after the loop
    # they hold the values for the last table processed.
    for name in ["bigrams", "trigrams", "quadrigrams"]:
        print(f"Processing {name}...")
        counts = load_ngrams_from_txt(f"{name}.txt")
        vocab_size = len(counts)

        add1 = add_one_smoothing(counts, vocab_size)
        addk = add_k_smoothing(counts, vocab_size, k=0.5)
        toktype = add_token_type_smoothing(counts, vocab_size)

        save_ngrams_to_txt(add1, f"{name}_add1.txt")
        save_ngrams_to_txt(addk, f"{name}_addk.txt")
        save_ngrams_to_txt(toktype, f"{name}_toktype.txt")


In [4]:
# Run the smoothing pipeline for the quadrigram table only, printing a progress
# line before each smoothing scheme. All three schemes are computed before any
# output file is written.
for name in ["quadrigrams"]:
    print(f"Processing {name}...")
    counts = load_ngrams_from_txt(f"{name}.txt")
    # Number of distinct n-grams in the loaded table.
    vocab_size = len(counts)

    print("Processing add1")
    add1 = add_one_smoothing(counts, vocab_size)
    print("Processing add_k")
    addk = add_k_smoothing(counts, vocab_size, k=0.5)
    print("Processing add_token_type")
    toktype = add_token_type_smoothing(counts, vocab_size)

    save_ngrams_to_txt(add1, f"{name}_add1.txt")
    save_ngrams_to_txt(addk, f"{name}_addk.txt")
    save_ngrams_to_txt(toktype, f"{name}_toktype.txt")

Processing quadrigrams...
Processing add1
Processing add_k
Processing add_token_type
