In [1]:
import nltk
from nltk.corpus import brown
from nltk.util import ngrams
from collections import Counter
import pandas as pd
import string

# Download the Brown corpus into the local NLTK data directory so that
# `brown.words()` can read it in the cleaning step below.
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

## 1. Cleaning the corpus

In [2]:
# Clean the tokens: keep only purely alphabetic words, then lowercase them.
# `str.isalpha` rejects punctuation, but also numbers and any token that
# contains a hyphen or an apostrophe.
tokenized_text = list(map(str.lower, filter(str.isalpha, brown.words())))

## 2. Function for computing n-gram frequencies

In [4]:
def get_ngram_frequencies(tokens, n):
    """Count every n-gram in ``tokens`` and return the counts as a DataFrame.

    Parameters
    ----------
    tokens : iterable of str
        Token sequence to extract n-grams from.
    n : int
        Number of tokens per n-gram; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        One row per distinct n-gram, with columns ``'ngram'`` (the n-gram
        itself) and ``'freq'`` (how often it occurs), ordered by descending
        ``'freq'``. N-grams with equal counts keep their first-seen order.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}.")

    # Counter consumes the n-gram iterator directly; no intermediate list needed.
    freq = Counter(ngrams(tokens, n))
    df = pd.DataFrame(list(freq.items()), columns=['ngram', 'freq'])

    # A stable sort keeps tied n-grams in first-seen order, so the row order
    # (and any tie-breaking done on it later) is reproducible across runs.
    return df.sort_values(by='freq', ascending=False, kind='stable')

## 3. Predicting top-k next words based on n-grams

In [7]:
def predict_next_words(context, ngram_df, n, k=5):
    """Return the ``k`` most frequent words that follow ``context``.

    Parameters
    ----------
    context : str
        Whitespace-separated context words; lowercased before matching.
        Must contain exactly ``n - 1`` words.
    ngram_df : pandas.DataFrame
        Frame with an ``'ngram'`` column (sequences of tokens) and a
        ``'freq'`` column (their counts).
    n : int
        Order of the n-grams stored in ``ngram_df``.
    k : int, default 5
        Maximum number of predictions to return.

    Returns
    -------
    list
        Last token of each matching n-gram, most frequent first. Empty when
        no n-gram starts with ``context``.

    Raises
    ------
    ValueError
        If ``context`` does not contain exactly ``n - 1`` words.
    """
    context_tokens = context.lower().split()
    if len(context_tokens) != n - 1:
        raise ValueError(f"Context should have exactly {n - 1} words for {n}-gram prediction.")

    # Match n-grams starting with the context. The mask is built explicitly
    # with dtype=bool so that it is a valid row filter even for an empty frame.
    prefix = tuple(context_tokens)
    matches = pd.Series(
        [tuple(ngram[:n - 1]) == prefix for ngram in ngram_df['ngram']],
        index=ngram_df.index,
        dtype=bool,
    )
    filtered = ngram_df[matches]

    # Nothing to rank; returning early also avoids calling nlargest on an
    # empty frame whose 'freq' column may not be numeric.
    if filtered.empty:
        return []

    top_k = filtered.nlargest(k, 'freq')
    return [ngram[-1] for ngram in top_k['ngram']]

## 4. Testing

In [6]:
# Build the trigram frequency table from the cleaned Brown tokens
trigram_df = get_ngram_frequencies(tokens=tokenized_text, n=3)

# Show the five most frequent continuations of "the united"
predict_next_words(context="the united", ngram_df=trigram_df, n=3, k=5)

['states', 'nations', 'kingdom', 'declining', 'steel']