In [None]:
import kenlm
import math

# 1. Load the 4-gram language model.
#    kenlm.Model is handed a single file path here; either a plain
#    .arpa.gz file or a binary .klm file can be used. To load the binary
#    model instead, point MODEL_PATH at '4-gram.trie.klm'.
MODEL_PATH = '4-gram.arpa.gz'
model = kenlm.Model(MODEL_PATH)

In [None]:
# 2. Prepare your sentences
#    Every entry needs its own trailing comma: Python silently concatenates
#    adjacent string literals, which would merge two sentences into one.
sentences = [
    "This is a well-formed sentence.",
    "An entirely novel utterance appears here.",
    "haer oheirn iubiqub oqfn nioqwionocnsfwcvd",
]

# 3. Score each sentence
for sent in sentences:
    # Upper-case the text before scoring.
    # NOTE(review): presumably the model's vocabulary is upper-case -- confirm
    # against the model file; otherwise every word would score as unknown.
    sent = sent.upper()

    # a) Log10-probability of the whole sentence (KenLM's native unit).
    log10p = model.score(sent, bos=True, eos=True)

    # b) Convert to log2-probability: log2(p) = log10(p) / log10(2).
    log2p = log10p / math.log10(2)

    # c) Per-word perplexity: 10 ** (-log10p / N).
    #    With bos=True and eos=True the score is the sum of one term per
    #    whitespace-separated word plus one term for </s>. The <s> marker is
    #    only used as context and contributes no term of its own, so
    #    N = number of words + 1.
    num_tokens = len(sent.split()) + 1
    perp = 10 ** (-log10p / num_tokens)

    print(f"Sentence: {sent}")
    print(f"  log₁₀-probability: {log10p:.4f}")
    print(f"  log₂-probability: {log2p:.4f}")
    print(f"  Approx. per-word perplexity: {perp:.2f}\n")
