In [4]:
from collections import defaultdict
import random

class NGramLanguageModel:
    """Count-based n-gram language model using relative-frequency estimates.

    Each training sentence is padded with ``n - 1`` ``'<s>'`` tokens at the
    start and one ``'</s>'`` token at the end.  Probabilities are plain
    ``count(context + word) / count(context)`` ratios; no smoothing is applied.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        ngram_counts: Maps each observed n-gram tuple to its count.
        context_counts: Maps each observed (n-1)-token context tuple to its
            count.  For a unigram model the only context is ``()``.
        vocab: Every token seen in training, including the padding markers
            that were actually added to a sentence.
    """

    def __init__(self, n):
        """Create an untrained model of order ``n``.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be at least 1, got {!r}".format(n))
        self.n = n
        self.ngram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocab = set()

    def _context_key(self, context):
        """Return the last ``n - 1`` items of ``context`` as a tuple key.

        A unigram model has no context at all.  That case is handled
        explicitly because ``context[-0:]`` would return the whole sequence
        rather than an empty one.
        """
        order = self.n - 1
        if order == 0:
            return ()
        return tuple(context[-order:])

    def train(self, corpus):
        """Accumulate n-gram and context counts from ``corpus``.

        Args:
            corpus: Iterable of sentences, each an iterable of hashable
                tokens.  Calling ``train`` again adds to the existing counts.
        """
        order = self.n - 1
        padding = ['<s>'] * order
        for sentence in corpus:
            # list() lets tuples or other iterables be used as sentences.
            tokens = padding + list(sentence) + ['</s>']
            self.vocab.update(tokens)

            # ``end`` indexes the predicted word; the ``order`` tokens before
            # it form the conditioning context.
            for end in range(order, len(tokens)):
                context = tuple(tokens[end - order:end])
                self.ngram_counts[context + (tokens[end],)] += 1
                self.context_counts[context] += 1

    def predict_next(self, context):
        """Return the most frequent word observed after ``context``.

        Only the last ``n - 1`` items of ``context`` are used.  If that
        context was never seen in training, a uniformly random vocabulary
        item is returned instead.  Ties between equally frequent words are
        broken by the smallest ``str(word)`` so the result does not depend
        on set iteration order.

        Raises:
            IndexError: If the context is unseen and the vocabulary is empty
                (raised by ``random.choice``).
        """
        context = self._context_key(context)
        if self.context_counts.get(context, 0) == 0:
            return random.choice(list(self.vocab))  # unseen context

        # .get() avoids inserting zero-count entries into the defaultdict.
        lookup = self.ngram_counts.get
        counts = {word: lookup(context + (word,), 0) for word in self.vocab}
        # All candidates share the same denominator, so the highest count is
        # also the highest probability.
        best = max(counts.values())
        return min((word for word, count in counts.items() if count == best), key=str)

    def get_probability(self, context, word):
        """Return the relative-frequency estimate of ``word`` given ``context``.

        Only the last ``n - 1`` items of ``context`` are used.  Returns
        ``0.0`` when the context was never seen in training.
        """
        context = self._context_key(context)
        count_context = self.context_counts.get(context, 0)
        if count_context == 0:
            return 0.0  # zero denominator
        return self.ngram_counts.get(context + (word,), 0) / count_context


# Demo: fit a trigram model on a four-sentence toy corpus and query it.
corpus = [
    ["I", "love", "machine", "learning"],
    ["I", "love", "deep", "learning"],
    ["deep", "learning", "is", "fun"],
    ["machine", "learning", "is", "powerful"],
]

# n=3: each word is conditioned on the two tokens that precede it.
model = NGramLanguageModel(n=3)
model.train(corpus)

context = ["I", "love"]
print(
    "Next word prediction for context",
    context,
    "→",
    model.predict_next(context),
)

# Conditional probabilities of two candidate continuations of the same context.
print("P('machine' | 'I', 'love') =", model.get_probability(context, "machine"))
print("P('learning' | 'I', 'love') =", model.get_probability(context, "learning"))  # trigram absent from the corpus


Next word prediction for context ['I', 'love'] → deep
P('machine' | 'I', 'love') = 0.5
P('learning' | 'I', 'love') = 0.0
