In [8]:
import math
import csv
import re

# Notebook configuration: input/output paths and the sentence cap.
# Bigram model file; lines with three whitespace-separated fields are read
# as "w1 w2 prob" (presumably add-one smoothed, per the file name).
bigram_file = "bigrams_add1.txt"
# Unigram model file; lines with two whitespace-separated fields are read
# as "word prob" (presumably add-one smoothed, per the file name).
unigram_file = "unigrams_add1.txt"
# Raw text that is split into sentences and scored.
sentences_file = "sentences.txt"
# Destination CSV for the per-sentence scores.
out_csv = "sentence_probabilities.csv"
# Only the first `max_sentences` sentences are scored.
max_sentences = 1000


In [13]:

# Load LM
def _load_ngram_probs(path, unigram_probs, bigram_probs):
    """Read one n-gram probability file into the given lookup tables.

    Each line is split on whitespace. A two-field line ("word prob") is stored
    in ``unigram_probs[word]``; a three-field line ("w1 w2 prob") is stored in
    ``bigram_probs[(w1, w2)]``. Lines with any other number of fields are
    skipped. Both tables are updated in place.

    Raises:
        ValueError: if the last field of a two- or three-field line is not a
            valid float; the message names the file and line number.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if len(parts) not in (2, 3):
                # Blank or otherwise unrecognised line: ignored.
                continue
            try:
                prob = float(parts[-1])
            except ValueError as exc:
                raise ValueError(
                    f"{path}:{line_no}: invalid probability {parts[-1]!r}"
                ) from exc
            if len(parts) == 2:
                unigram_probs[parts[0]] = prob
            else:
                bigram_probs[(parts[0], parts[1])] = prob


# Both tables are rebuilt from scratch so re-running the cell is idempotent.
unigram_probs = {}
bigram_probs = {}

_load_ngram_probs(unigram_file, unigram_probs, bigram_probs)
print(f"Loaded LM: {len(unigram_probs)} unigrams")

_load_ngram_probs(bigram_file, unigram_probs, bigram_probs)
print(f"Loaded LM: {len(bigram_probs)} bigrams")

# Load sentences
with open(sentences_file, 'r', encoding='utf-8') as f:
    text = f.read()

# Split sentences using Hindi danda (।) or punctuation, then drop fragments
# that are empty after trimming whitespace.
# The list is built from scratch here: the previous version called
# `sentences.extend(...)` on a name that was never initialised, which raises
# NameError on a fresh kernel and silently accumulates duplicates on re-runs.
stripped_parts = (part.strip() for part in re.split(r'[।!?.]', text))
sentences = [part for part in stripped_parts if part]

# Keep only the first `max_sentences` sentences.
sentences = sentences[:max_sentences]
print(f"Processing {len(sentences)} sentences")

def tokenize(sentence):
    """Extract the Devanagari tokens from a sentence.

    The sentence is lower-cased, then every maximal run of characters in the
    Unicode range U+0900-U+097F becomes one token, in order of appearance.
    All other characters act as separators and are dropped.

    Returns:
        list[str]: the tokens (empty if the sentence has no such characters).
    """
    lowered = sentence.lower()
    matches = re.finditer(r'[\u0900-\u097F]+', lowered)
    return [match.group(0) for match in matches]

def sentence_prob_unigram(tokens, unigram_probs):
    """Natural-log probability of a token list under the unigram model.

    The end marker '</s>' is appended to ``tokens`` and the log of each word's
    probability is accumulated. A word missing from ``unigram_probs`` uses the
    '<unk>' entry, or 1e-8 when that entry is missing too.

    Args:
        tokens: list of word strings.
        unigram_probs: mapping word -> probability.

    Returns:
        float: sum of log-probabilities over ``tokens`` plus '</s>'.
    """
    total = 0.0
    for word in tokens + ['</s>']:
        if word in unigram_probs:
            p = unigram_probs[word]
        else:
            p = unigram_probs.get('<unk>', 1e-8)
        total += math.log(p)
    return total

def sentence_prob_bigram(tokens, bigram_probs, unigram_probs):
    """Natural-log probability of a token list under the bigram model.

    The tokens are wrapped in '<s>' ... '</s>' and every adjacent pair is
    scored. When a pair is missing from ``bigram_probs``, the score falls back
    to the unigram probability of the second word, then to the '<unk>'
    unigram entry, and finally to 1e-8.

    Args:
        tokens: list of word strings.
        bigram_probs: mapping (w1, w2) -> probability.
        unigram_probs: mapping word -> probability, used only for fallback.

    Returns:
        float: sum of log-probabilities over all adjacent pairs.
    """
    padded = ['<s>'] + tokens + ['</s>']
    total = 0.0
    for prev_word, cur_word in zip(padded, padded[1:]):
        p = bigram_probs.get((prev_word, cur_word))
        if p is None:
            # Unseen pair: fall back to the unigram table.
            p = unigram_probs.get(cur_word, unigram_probs.get('<unk>', 1e-8))
        total += math.log(p)
    return total

def perplexity(log_prob, n_tokens):
    """Convert a total natural-log probability into per-token perplexity.

    Computes exp(-log_prob / N), where N is ``n_tokens`` clamped to at least 1
    so that a zero token count cannot cause a division by zero.
    """
    effective_len = max(n_tokens, 1)
    mean_neg_log_prob = -log_prob / effective_len
    return math.exp(mean_neg_log_prob)

# Score every sentence under both models. Each result row is
# (sentence, unigram log-prob, unigram prob, bigram log-prob, bigram prob).
results = []
for s in sentences:
    tokens = tokenize(s)
    # Token count including </s>; not consumed by anything visible in this cell.
    n_tokens = len(tokens) + 1

    log_u = sentence_prob_unigram(tokens, unigram_probs)
    log_b = sentence_prob_bigram(tokens, bigram_probs, unigram_probs)

    # Raw probabilities are recovered by exponentiating the log-probs.
    p_u = math.exp(log_u)
    p_b = math.exp(log_b)

    row = (s, log_u, p_u, log_b, p_b)
    results.append(row)

# ---------- Save CSV ----------
# Write one header row followed by one row per scored sentence.
with open(out_csv, 'w', newline='', encoding='utf-8') as csv_handle:
    csv_writer = csv.writer(csv_handle)
    csv_writer.writerow(['sentence', 'logprob_unigram', 'prob_unigram', 'logprob_bigram','prob_bigram' ])
    csv_writer.writerows(results)

print(f"Saved results to {out_csv}")


Loaded LM: 510917 unigrams
Loaded LM: 4604982 bigrams
Processing 646 sentences
Saved results to sentence_probabilities.csv


3️⃣ Why the log-probabilities differ between the unigram and bigram models

Context sensitivity:
Unigram ignores word order → may assign moderate probability to unlikely sequences.
Bigram considers order → likely sequences get higher log-prob, unlikely sequences get lower log-prob.

Smoothing effects:
Smoothing assigns very small probabilities to unseen or rare bigrams, so for some sentences the bigram log-prob is more negative than the unigram log-prob.

Sentence length impact:
Longer sentences amplify differences because log-prob is additive.

<b>A more negative log-prob value indicates a rarer sentence, as calculated from the n-gram model.</b>