In [1]:
import nltk
# Fetch the 'punkt' data package from NLTK; the call reports "already up-to-date"
# and returns True when the package is present locally.
# NOTE(review): presumably needed by word_tokenize; newer NLTK releases may look
# for a 'punkt_tab' resource instead — verify against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import math
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize

In [10]:
# Toy training corpus: five short English sentences about education, one per line.
corpus = """
Students study hard to achieve good grades.
Teachers guide students through difficult concepts.
Examinations test the knowledge of students.
Online learning platforms provide flexible education.
Research projects improve practical understanding.
"""

In [11]:
def preprocess(text):
    """Lowercase ``text`` and split it into tokens with NLTK's ``word_tokenize``.

    Args:
        text: Raw input string.

    Returns:
        The list of tokens produced by ``word_tokenize`` for the lowercased text.
    """
    return word_tokenize(text.lower())

# Tokenize the whole corpus once; this flat token list is the input to the n-gram models.
tokens = preprocess(corpus)

In [12]:
def build_ngram_model(tokens, n):
    """Count, for every (n-1)-token context, the words that follow it.

    Args:
        tokens: Sequence of tokens to slide the n-gram window over.
        n: Order of the model (1 = unigram, 2 = bigram, 3 = trigram, ...).

    Returns:
        A ``defaultdict(Counter)`` mapping each context tuple of length
        ``n - 1`` to a ``Counter`` of the words observed right after it.
        For ``n == 1`` the only context is the empty tuple ``()``. If
        ``tokens`` is shorter than ``n`` the model is empty.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # With n < 1 the slices below would wrap around (negative indices) and
    # silently produce meaningless contexts, so reject it up front.
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n!r}")

    context_len = n - 1
    model = defaultdict(Counter)
    for start in range(len(tokens) - context_len):
        split = start + context_len
        context = tuple(tokens[start:split])
        model[context][tokens[split]] += 1
    return model

In [13]:
# Build the unigram, bigram and trigram count tables from the corpus tokens.
unigram_model, bigram_model, trigram_model = (
    build_ngram_model(tokens, order) for order in (1, 2, 3)
)

In [14]:
def laplace_prob(model, context, word, vocab_size):
    """Return the add-one (Laplace) smoothed probability of ``word`` after ``context``.

    Computes ``(count(context, word) + 1) / (count(context) + vocab_size)``.

    Args:
        model: Mapping from context tuples to mappings of following-word
            counts (e.g. the ``defaultdict(Counter)`` returned by
            ``build_ngram_model``).
        context: Tuple of preceding tokens to condition on.
        word: Candidate next token.
        vocab_size: Number of distinct word types used in the smoothing
            denominator.

    Returns:
        The smoothed probability as a float. An unseen context yields
        ``1 / vocab_size``.

    Raises:
        ZeroDivisionError: If the context is unseen and ``vocab_size`` is 0.
    """
    # Look the context up with .get() instead of model[context]: indexing a
    # defaultdict with an unseen context inserts an empty entry, which would
    # mutate the model and make later ``context in model`` checks succeed.
    followers = model.get(context)
    if followers is None:
        context_count = 0
        word_count = 0
    else:
        context_count = sum(followers.values())
        word_count = followers.get(word, 0)
    return (word_count + 1) / (context_count + vocab_size)

In [15]:
def _top_followers(model, context, vocab_size, top_k):
    """Rank the words seen after ``context`` in ``model`` by Laplace probability."""
    probs = {
        word: laplace_prob(model, context, word, vocab_size)
        for word in model[context]
    }
    return sorted(probs, key=probs.get, reverse=True)[:top_k]


def autocomplete(input_text, top_k=5):
    """Suggest up to ``top_k`` next words for ``input_text`` using n-gram backoff.

    The input is tokenized with ``preprocess``. The trigram model is tried on
    the last two tokens, then the bigram model on the last token; if neither
    context was seen in the corpus, the most frequent unigrams are returned.

    Args:
        input_text: Text typed so far.
        top_k: Maximum number of suggestions to return.

    Returns:
        A list of candidate next words, most probable first. Candidates with
        equal probability keep the order in which the model stores them,
        because the sort is stable.
    """
    query_tokens = preprocess(input_text)

    # Laplace smoothing needs the size of the model's vocabulary, not the
    # number of distinct tokens in the query. All candidates in one context
    # share the same denominator, so this does not change the ranking, only
    # makes the underlying probabilities correct.
    unigram_counts = unigram_model.get((), {})
    vocab_size = len(unigram_counts)

    # Back off from the longest context (trigram) to the shorter one (bigram).
    for model, context_len in ((trigram_model, 2), (bigram_model, 1)):
        if len(query_tokens) >= context_len:
            context = tuple(query_tokens[-context_len:])
            if context in model:
                return _top_followers(model, context, vocab_size, top_k)

    # Fallback to unigram relative frequencies; the total is computed once
    # instead of once per word.
    total = sum(unigram_counts.values())
    unigram_probs = {
        word: count / total
        for word, count in unigram_counts.items()
    }
    return sorted(unigram_probs, key=unigram_probs.get, reverse=True)[:top_k]

In [16]:
# Show the suggested next words for a few single-word prompts.
for prompt in ("students", "online", "research", "teachers"):
    print(autocomplete(prompt))

['study', 'through', '.']
['learning']
['projects']
['guide']
