In [4]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import defaultdict, Counter
import random

In [5]:
# word_tokenize relies on the Punkt sentence-tokenizer models. Older NLTK
# releases load them from the 'punkt' resource, while NLTK 3.9+ loads them from
# 'punkt_tab'. Fetch both so the notebook runs on a fresh environment with
# either an old or a current NLTK.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Toy training text: one sentence per line, with a leading and a trailing
# newline. Every n-gram count in this notebook is derived from this string.
corpus = (
    "\n"
    "Language models are statistical tools used in natural language processing.\n"
    "They help predict the next word in a sentence.\n"
    "Unigrams consider individual words.\n"
    "Bigrams look at pairs of consecutive words.\n"
    "Trigrams extend that to three-word sequences.\n"
)

In [7]:
# Lower-case the corpus, tokenize it, and keep only tokens made up entirely of
# alphabetic characters (str.isalpha), which drops punctuation tokens.
tokens = [tok for tok in word_tokenize(corpus.lower()) if tok.isalpha()]

In [8]:
# Frequency tables used by the samplers below.
# Unigram keys are plain words; bigram/trigram keys are tuples of 2/3 words.
unigram_counts = Counter(tokens)
bigram_counts, trigram_counts = (
    Counter(ngrams(tokens, size)) for size in (2, 3)
)

In [9]:
def get_next_word_unigram():
    """Sample one word, weighted by its frequency in the corpus.

    Reads the module-level ``unigram_counts`` mapping (word -> count) and
    ignores any context.

    Returns:
        A single word drawn with probability proportional to its count.
    """
    vocabulary = list(unigram_counts)
    frequencies = unigram_counts.values()
    (picked,) = random.choices(vocabulary, weights=frequencies, k=1)
    return picked

In [10]:

def get_next_word_bigram(context):
    """Sample the word that follows ``context`` according to bigram counts.

    Args:
        context: The preceding word.

    Returns:
        A word drawn with probability proportional to how often it followed
        ``context`` in ``bigram_counts``. If ``context`` never starts a
        bigram, the result of ``get_next_word_unigram()`` is returned instead.
    """
    followers = {}
    for pair, count in bigram_counts.items():
        if pair[0] == context:
            followers[pair[1]] = count

    if followers:
        options = list(followers)
        return random.choices(options, weights=followers.values(), k=1)[0]

    # Unseen context: back off to the context-free model.
    return get_next_word_unigram()

In [11]:
def get_next_word_trigram(context):
    """Sample the word that follows a two-word ``context`` using trigram counts.

    Args:
        context: The two preceding words, as any sequence (tuple or list).

    Returns:
        A word drawn with probability proportional to how often it followed
        ``context`` in ``trigram_counts``. If the context was never seen, the
        result of ``get_next_word_bigram`` on the last context word is
        returned instead.
    """
    # Trigram keys are tuples, and a list never compares equal to a tuple.
    # Normalise first so a list context (e.g. ``sentence[-2:]``) matches
    # instead of silently always falling back to the bigram model.
    context = tuple(context)
    candidates = {ng[2]: count for ng, count in trigram_counts.items() if ng[:2] == context}
    if not candidates:
        # Unseen context: back off to the bigram model on the last word.
        return get_next_word_bigram(context[-1])
    return random.choices(list(candidates), weights=candidates.values(), k=1)[0]

In [12]:
def generate_sentence(model='unigram', seed=None, length=10):
    """Generate a space-separated string of at most ``length`` sampled words.

    Args:
        model: ``'unigram'``, ``'bigram'`` or ``'trigram'``.
        seed: Optional starting word(s) for the bigram/trigram models, given
            as a single string or a sequence of strings. Missing context
            words (one for bigram, two for trigram) are drawn from the
            unigram model. Ignored by the unigram model.
        length: Total number of words in the result, seed words included.
            Values <= 0 yield an empty string.

    Returns:
        The generated words joined by single spaces.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    if model not in ('unigram', 'bigram', 'trigram'):
        # Previously an unknown model silently produced an empty string.
        raise ValueError(
            f"Unknown model {model!r}; expected 'unigram', 'bigram' or 'trigram'"
        )
    if length <= 0:
        return ''

    if model == 'unigram':
        return ' '.join(get_next_word_unigram() for _ in range(length))

    # Normalise the seed into a list of words.
    if seed is None:
        sentence = []
    elif isinstance(seed, str):
        sentence = [seed]
    else:
        sentence = list(seed)

    # Top the seed up to the model's context size with unigram draws.
    context_size = 1 if model == 'bigram' else 2
    while len(sentence) < context_size:
        sentence.append(get_next_word_unigram())

    # The seed counts toward the requested length and must not exceed it.
    del sentence[length:]

    while len(sentence) < length:
        if model == 'bigram':
            next_word = get_next_word_bigram(sentence[-1])
        else:
            next_word = get_next_word_trigram(tuple(sentence[-2:]))
        sentence.append(next_word)

    return ' '.join(sentence)

In [13]:
# Fix the RNG state so that Restart & Run All prints the same samples each
# time; without a seed every execution of this cell produces different text.
DEMO_SEED = 42
random.seed(DEMO_SEED)

print("\n🔹 Unigram prediction:")
print(generate_sentence('unigram'))

print("\n🔹 Bigram prediction (seed='language'):")
print(generate_sentence('bigram', seed='language'))

print("\n🔹 Trigram prediction (seed=('language', 'models')):")
print(generate_sentence('trigram', seed=('language', 'models')))


🔹 Unigram prediction:
in sentence pairs tools words consider sentence the sentence of

🔹 Bigram prediction (seed='language'):
language models are statistical tools used in a sentence unigrams

🔹 Trigram prediction (seed=('language', 'models')):
language models are statistical tools used in natural language processing
