In [2]:
from collections import defaultdict

def compute_ngram_probs(counts_higher, counts_lower, discount=0.5):
    probs = {}
    for ngram, c in counts_higher.items():
        prefix = ngram[:-1]
        c_prefix = counts_lower.get(prefix, 0)
        if c_prefix > 0:
            probs[ngram] = max(c - discount, 0) / c_prefix
    return probs

def compute_backoff_weights(counts_higher, counts_lower, probs_higher, probs_lower):
    backoff = defaultdict(float)
    contexts = set([ngram[:-1] for ngram in counts_higher.keys()])
    for h in contexts:
        seen_words = [ngram[-1] for ngram in counts_higher.keys() if ngram[:-1]==h]
        sum_disc = sum(probs_higher[(h+(w,))] for w in seen_words if (h+(w,)) in probs_higher)
        sum_lower = sum(probs_lower.get((h[1:],w), 0) for w in seen_words)  # backoff context
        if (1 - sum_lower) > 0:
            backoff[h] = (1 - sum_disc) / (1 - sum_lower)
        else:
            backoff[h] = 1.0
    return backoff

def katz_prob(word, context, probs4, probs3, probs2, probs1, back4, back3, back2):
    # context = tuple of previous words (up to 3)
    if len(context) >= 3 and (context + (word,)) in probs4:
        return probs4[context + (word,)]
    elif len(context) >= 3 and context in back4:
        return back4[context] * katz_prob(word, context[1:], probs3, probs2, probs1, back3, back2, {})
    elif len(context) >= 2 and (context + (word,)) in probs3:
        return probs3[context + (word,)]
    elif len(context) >= 2 and context in back3:
        return back3[context] * katz_prob(word, context[1:], probs2, probs1, {}, {}, {}, {})
    elif len(context) >= 1 and (context + (word,)) in probs2:
        return probs2[context + (word,)]
    elif len(context) >= 1 and context in back2:
        return back2[context] * probs1.get((word,), 1e-8)
    else:
        return probs1.get((word,), 1e-8)


In [5]:
import pickle

counts_n1=[]
counts_n3=[]
counts_n4=[]
counts_n2=[]
with open("C://Users/rani/Desktop/nlp lab/lab5/counts_n1.pkl", "rb") as f:
    counts_n1 = pickle.load(f)  # for unigram or bigram as per your data
with open("C://Users/rani/Desktop/nlp lab/lab5/counts_n2.pkl", "rb") as f:
    counts_n2 = pickle.load(f) 
with open("C://Users/rani/Desktop/nlp lab/lab5/counts_n3.pkl", "rb") as f:
    counts_n3 = pickle.load(f) 
with open("C://Users/rani/Desktop/nlp lab/lab5/counts_n4.pkl", "rb") as f:
    counts_n4 = pickle.load(f) 

# Compute discounted probs for each order
probs4 = compute_ngram_probs(counts_n4, counts_n3)
probs3 = compute_ngram_probs(counts_n3, counts_n2)
probs2 = compute_ngram_probs(counts_n2, counts_n1)
probs1 = {k: v/sum(counts_n1.values()) for k,v in counts_n1.items()}

# Compute backoff weights
back4 = compute_backoff_weights(counts_n4, counts_n3, probs4, probs3)
back3 = compute_backoff_weights(counts_n3, counts_n2, probs3, probs2)
back2 = compute_backoff_weights(counts_n2, counts_n1, probs2, probs1)


KeyboardInterrupt: 

In [15]:
print('calculating back3...')
back3 = compute_backoff_weights(counts_n3, counts_n2, probs3, probs2)
print('calculating back2...')
back2 = compute_backoff_weights(counts_n2, counts_n1, probs2, probs1)

calculating back3...
calculating back2...


In [25]:
print(back2)

defaultdict(<class 'float'>, {('लिखने',): 0.5, ('पीओएस',): 0.5, ('फर्स्\u200dट',): 0.5, ('एच',): 0.5, ('गड्ढा',): 0.5, ('डीडवाना',): 0.5, ('देख',): 0.3157894736842105, ('रफी',): 0.5, ('जुर्माना,',): 0.5, ('प्रबुद्ध',): 0.5, ('साथ',): 0.2973933649289099, ('2-3',): 0.5, ('महेश्वरी',): 0.5, ('कार्ड’',): 0.5, ('जवाहरलाल',): 0.20000000000000007, ('मालिक',): 0.4, ('-26',): 0.5, ('कंधे',): 0.33333333333333337, ('क्रांति',): 0.4285714285714286, ('जेटली',): 0.375, ('बनवाने',): 0.5, ('कछला',): 0.5, ('फाफामऊ,',): 0.5, ('सोफे',): 0.5, ('Suzuki',): 0.5, ("'मुख्\u200dयमंत्री",): 0.5, ('हथकड़ी',): 0.5, ('चुप',): 0.5, ('ईलाही',): 0.5, ('धुंदा',): 0.5, ('पेट्रोलियम',): 0.5, ('संभाला,',): 0.5, ('जनसैलाब',): 0.5, ('फ्लाइओवर',): 0.5, ('लैपटॉप',): 0.5, ('आरोग्य',): 0.25, ('पट्टी',): 0.5, ('आत्मा',): 0.375, ('सफाया',): 0.5, ('पेंशन,',): 0.5, ('मनाया',): 0.40909090909090906, ('अभीष्टï',): 0.5, ('02ः',): 0.5, ('मरते',): 0.5, ('एशिया',): 0.3125, ('वाहिद',): 0.5, ('चंडीगढ़',): 0.30000000000000004, ('वनवास',): 0

In [55]:
import random
import numpy as np

def generate_sentence_greedy(katz_prob_fn, start_tokens, max_len=20, vocab=None, top_k=10, min_len=5):
    sentence = list(start_tokens)

    for _ in range(max_len):
        context = tuple(sentence[-(len(start_tokens)):])
        # Get probabilities for all words
        word_probs = np.array([katz_prob_fn(w, context) for w in vocab])
        
        # Normalize to sum=1 (important)
        if word_probs.sum() == 0:
            word_probs = np.ones_like(word_probs) / len(word_probs)
        else:
            word_probs /= word_probs.sum()
        
        # Get indices of top_k most probable words
        top_indices = np.argsort(word_probs)[-top_k:]
        top_words = [vocab[i] for i in top_indices]
        top_probs = word_probs[top_indices]
        
        # Re-normalize top-k probabilities
        top_probs = top_probs / top_probs.sum()
        
        # Randomly choose next word among top-k
        next_word = np.random.choice(top_words, p=top_probs)
        
        # Avoid early </s>
        if len(sentence) < min_len and next_word == '</s>':
            continue
        
        sentence.append(next_word)
        
        if next_word == '</s>':
            break
    
    return sentence


import numpy as np
import heapq
import random

def generate_sentence_beam(katz_prob_fn, start_tokens, beam_size=20, max_len=20, vocab=None, top_k=10, temperature=1.0):
    beam = [(0.0, list(start_tokens))]  # (neg log-prob, sequence)
    
    for _ in range(max_len):
        candidates = []
        for logp, seq in beam:
            context = tuple(seq[-(len(start_tokens)):])
            
            # Compute probabilities for all words
            probs = np.array([katz_prob_fn(w, context) for w in vocab])
            probs = np.maximum(probs, 1e-12)  # Avoid log(0)
            probs = probs / probs.sum()
            
            # Pick top-k candidates
            top_indices = np.argsort(probs)[-top_k:]
            top_probs = probs[top_indices]
            top_probs = top_probs / top_probs.sum()
            
            # Randomly sample one next word (weighted by probability)
            sampled_idx = np.random.choice(top_indices, p=top_probs)
            w = vocab[sampled_idx]
            p = probs[sampled_idx]
            
            candidates.append((logp - np.log(p), seq + [w]))
        
        # Keep best `beam_size` beams
        beam = heapq.nsmallest(beam_size, candidates, key=lambda x: x[0])
        
        # Stop if all beams end with </s>
        if all(seq[-1] == '</s>' for _, seq in beam):
            break

    # Randomly pick one of the final top beams (for more diversity)
    return random.choice(beam)[1]


def generate_sentences(katz_prob_fn, vocab, n_sentences=100, start_tokens=None,method='greedy', beam_size=20):
    sentences = []
    if start_tokens is None:
        start_tokens = ['<s>'] * (len(vocab[0].split()) if hasattr(vocab[0], "__iter__") else 3)
    
    for _ in range(n_sentences):
        if method == "greedy":
            sent = generate_sentence_greedy(katz_prob_fn, start_tokens, vocab=vocab)
        elif method == "beam":
            sent = generate_sentence_beam(katz_prob_fn, start_tokens, beam_size=beam_size, vocab=vocab)
        else:
            raise ValueError("Method must be 'greedy' or 'beam'")
        sentences.append(" ".join(str(tok) if not isinstance(tok, tuple) else tok[0] for tok in sent))
    return sentences


import numpy as np
import random
import heapq

# Example: Katz probability function for quadrigrams (or any n-gram)
def get_katz_prob_fn(n, probs_list, backoff_list):
    """
    Returns a function f(word, context) → probability
    probs_list = [probs1, probs2, ..., probsN]
    backoff_list = [back1, back2, ..., backN-1]
    """
    def katz_prob(word, context):
        if n == 1:
            return probs_list[0].get((word,), 1e-8)
        else:
            # recursively backoff
            context = context[-(n-1):] if len(context) >= n-1 else context
            if tuple(context + (word,)) in probs_list[n-1]:
                return probs_list[n-1][tuple(context + (word,))]
            elif tuple(context) in backoff_list[n-2]:
                lower_fn = get_katz_prob_fn(n-1, probs_list, backoff_list)
                return backoff_list[n-2][tuple(context)] * lower_fn(word, context[1:])
            else:
                lower_fn = get_katz_prob_fn(n-1, probs_list, backoff_list)
                return lower_fn(word, context[1:])
    return katz_prob


In [35]:
# probs_list = [probs1, probs2, probs3, probs4]
# backoff_list = [back2, back3, back4]  # backoff weights for 2-,3-,4-grams
vocab = [w[0] for w in counts_n1.keys()]  # unigram vocabulary
start_tokens = ['<s>', '<s>', '<s>']

# Get Katz probability function
katz_fn = get_katz_prob_fn(4, [probs1, probs2, probs3, probs4], [back2,back3, back4])

# Generate 100 greedy sentences
greedy_sentences = generate_sentences(katz_fn, vocab, n_sentences=10, start_tokens=start_tokens, method="greedy")

# Optionally save to files
with open("quadrigram_greedy.txt", "w", encoding="utf-8") as f:
    for s in greedy_sentences:
        print(s)
        f.write(s + "\n")
        



<s> <s> <s> से पहले दिन इनकी शूटिंग थी तो इन्हें सुबह आठ बजे बुलाया गया है </s>
<s> <s> <s> के एक आश्चर्यजनक चयन प्रदान की जाती है, लेकिन मैं नहीं चाहता कि लोग पान मसाला और गुटखा की बिक्री
<s> <s> <s> में खलल नहीं डाला जाएगा </s>
<s> <s> <s> को एक दिन के भीतर अपने ही नेता के लिए यह दिन मनाया गया, महिलाओं ने पुलिस को दी
<s> <s> <s> है और वो है </s>
<s> <s> <s> के एक लाख आईटी प्रोफेशनल के साथ ही कहा कि कोहली की टीम में एसआई अरविंद कुमार, अधिकारी,
<s> <s> <s> है कि जब वे एफआईआर दर्ज कराने पुलिस स्टेशन गए, तो पुलिस ने शिकायत की
<s> <s> <s> और इस आधार पर इसे मानव अधिकार का उल्लंघन माना जाता है </s>
<s> <s> <s> ने जो फैसला सुनाया </s>
<s> <s> <s> के घातकों को एक से बढ़कर एक बकवास फिल्में भी दी </s>


In [56]:
# Generate 100 beam search sentences
beam_sentences = generate_sentences(katz_fn, vocab, n_sentences=5, start_tokens=start_tokens, method="beam", beam_size=20)

with open("quadrigram_beam.txt", "w", encoding="utf-8") as f:
    for s in beam_sentences:
        print(s)
        f.write(s + "\n")

<s> <s> <s> फरवरी, शनिवार तुला सात रहेगी जाइए, ल़डो शनिवार तक तक शनिवार 2 जाइए, 2 रहेगी तुला 22 2 2 जाइए,
<s> <s> <s> जाइए, जाइए, रहेगी जाइए, सात ल़डो तक फरवरी, फरवरी, जाइए, जाइए, फरवरी, फरवरी, जाइए, ल़डो रहेगी फरवरी, 22 2 तुला
<s> <s> <s> 2 शनिवार तक तुला 2 रहेगी ल़डो ल़डो फरवरी, ल़डो तुला जाइए, रहेगी जाइए, फरवरी, तुला शनिवार तुला तुला तक
<s> <s> <s> तुला 22 2 फरवरी, तक 22 ल़डो फरवरी, 22 2 शनिवार ल़डो फरवरी, जाइए, फरवरी, ल़डो जाइए, 22 सात ल़डो
<s> <s> <s> जाइए, तक रहेगी ल़डो फरवरी, 22 2 जाइए, तुला फरवरी, तुला रहेगी रहेगी जाइए, ल़डो 22 सात सात ल़डो तक


In [27]:
context = ('<s>', '<s>', '<s>')
for w, p in sorted([(w, katz_fn(w, context)) for w in vocab], key=lambda x: -x[1])[:10]:
    print(w, p)


</s> 0.05689900426742532
के 0.03862019914651493
में 0.02922475106685633
की 0.024310099573257467
है 0.02290896159317212
को 0.018421052631578946
से 0.01729018492176387
ने 0.013527738264580369
का 0.012660028449502134
और 0.012261735419630156


<h2>kneser ney smoothing</h2>

In [37]:
from collections import Counter, defaultdict
import random
import numpy as np

# ---------- Step 1.2: Kneser–Ney probability ----------
def kneser_ney_prob(word, context, ngram_counts, lower_order_probs, D=0.75):
    """
    Compute Kneser–Ney probability for a given word given context.
    context: tuple of (n-1) words
    """
    n = len(context) + 1
    if n == 1:
        return ngram_counts[0][(word,)] / sum(ngram_counts[0].values())

    # Numerator and denominator for conditional probability
    full_ngram = context + (word,)
    context_count = sum(
        c for ngram, c in ngram_counts[n-1].items() if ngram[:-1] == context
    )

    # Discounted probability term
    count_full = ngram_counts[n-1][full_ngram]
    if context_count > 0:
        discounted = max(count_full - D, 0) / context_count
    else:
        discounted = 0

    # Compute λ (backoff weight)
    num_unique_continuations = len(
        {ngram for ngram in ngram_counts[n-1] if ngram[:-1] == context}
    )
    lambda_weight = (D * num_unique_continuations / context_count) if context_count > 0 else 1.0

    # Recursive lower-order probability
    lower_context = context[1:]
    lower_prob = lower_order_probs(word, lower_context)
    return discounted + lambda_weight * lower_prob

# ---------- Step 1.3: Recursive wrapper ----------
def get_kneser_ney_prob_fn(ngram_counts, D=0.75):
    def kn_func(word, context):
        n = len(context) + 1
        if n == 1:
            return ngram_counts[0][(word,)] / sum(ngram_counts[0].values())
        else:
            lower_order_fn = get_kneser_ney_prob_fn(ngram_counts[:-1], D)
            return kneser_ney_prob(word, context, ngram_counts, lower_order_fn, D)
    return kn_func


In [39]:
ngram_counts = [counts_n1, counts_n2, counts_n3, counts_n4]

# Create the Kneser–Ney probability function
kneser_ney_fn = get_kneser_ney_prob_fn(ngram_counts)

In [40]:
def generate_sentence_greedy_kn(kneser_ney_fn, start_tokens, vocab, max_len=20):
    sentence = list(start_tokens)
    for _ in range(max_len):
        context = tuple(sentence[-(len(start_tokens)):])
        word_probs = {w: kneser_ney_fn(w, context) for w in vocab}
        next_word = max(word_probs, key=word_probs.get)
        sentence.append(next_word)
        if next_word == '</s>':
            break
    return sentence

def generate_sentence_beam_kn(kneser_ney_fn, start_tokens, vocab, beam_size=20, max_len=20):
    beams = [(list(start_tokens), 0.0)]  # (sentence, log_prob)

    for _ in range(max_len):
        new_beams = []
        for sent, log_prob in beams:
            context = tuple(sent[-(len(start_tokens)):])
            probs = np.array([kneser_ney_fn(w, context) for w in vocab])
            if probs.sum() == 0:
                continue
            probs = probs / probs.sum()

            # Select top beam_size candidates for this context
            top_indices = np.argsort(probs)[-beam_size:]
            for idx in top_indices:
                w = vocab[idx]
                new_sent = sent + [w]
                new_log_prob = log_prob + np.log(probs[idx] + 1e-12)
                new_beams.append((new_sent, new_log_prob))

        # Keep only top beam_size beams
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]

        # Stop if all beams end with </s>
        if all(sent[-1] == '</s>' for sent, _ in beams):
            break

    # Return the highest probability sentence
    best_sentence = max(beams, key=lambda x: x[1])[0]
    return best_sentence



In [None]:
start_tokens = ('<s>', '<s>', '<s>')
vocab = list(counts_n1.keys())  # vocabulary from unigrams

greedy_sentences = [
    generate_sentence_greedy_kn(kneser_ney_fn, start_tokens, vocab)
    for _ in range(10)
]


In [44]:
print(greedy_sentences)

['<s> <s> <s> से पहले दिन इनकी शूटिंग थी तो इन्हें सुबह आठ बजे बुलाया गया है </s>', '<s> <s> <s> के एक आश्चर्यजनक चयन प्रदान की जाती है, लेकिन मैं नहीं चाहता कि लोग पान मसाला और गुटखा की बिक्री', '<s> <s> <s> में खलल नहीं डाला जाएगा </s>', '<s> <s> <s> को एक दिन के भीतर अपने ही नेता के लिए यह दिन मनाया गया, महिलाओं ने पुलिस को दी', '<s> <s> <s> है और वो है </s>', '<s> <s> <s> के एक लाख आईटी प्रोफेशनल के साथ ही कहा कि कोहली की टीम में एसआई अरविंद कुमार, अधिकारी,', '<s> <s> <s> है कि जब वे एफआईआर दर्ज कराने पुलिस स्टेशन गए, तो पुलिस ने शिकायत की', '<s> <s> <s> और इस आधार पर इसे मानव अधिकार का उल्लंघन माना जाता है </s>', '<s> <s> <s> ने जो फैसला सुनाया </s>', '<s> <s> <s> के घातकों को एक से बढ़कर एक बकवास फिल्में भी दी </s>']


In [None]:

beam_sentences = [
    generate_sentence_beam_kn(kneser_ney_fn, start_tokens, vocab, beam_size=20)
    for _ in range(10)
]


In [46]:
print(beam_sentences)

['<s> <s> <s> के एक आश्चर्यजनक चयन प्रदान करता है सभी प्रकार के खरीफ फसलों का रकबा पिछले साल के मुकाबले यह 161', '<s> <s> <s> के एक आश्चर्यजनक चयन प्रदान करता है सभी प्रकार के खरीफ फसलों का रकबा पिछले साल के मुकाबले यह 161', '<s> <s> <s> के एक आश्चर्यजनक चयन प्रदान करता है सभी प्रकार के खरीफ फसलों का रकबा पिछले साल के मुकाबले यह 161', '<s> <s> <s> के एक आश्चर्यजनक चयन प्रदान करता है सभी प्रकार के खरीफ फसलों का रकबा पिछले साल के मुकाबले यह 161', '<s> <s> <s> के एक आश्चर्यजनक चयन प्रदान करता है सभी प्रकार के खरीफ फसलों का रकबा पिछले साल के मुकाबले यह 161']
