In [1]:
import nltk
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import random
import numpy as np

In [2]:
# Download the required NLTK resources (same order as listed)
for resource in ('reuters', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Trigram table: model[(w1, w2)][w3] -> count, later overwritten with a
# probability. Missing entries default to 0.0 at both nesting levels.
model = defaultdict(lambda: defaultdict(float))

# Function to pad sentences manually
def pad_sentence(sentence, n=3):
    """Return the tokens of *sentence* with ``None`` padding on both sides.

    Args:
        sentence: Iterable of tokens (list, tuple, generator, ...). It is
            not modified.
        n: Order of the n-gram model. ``n - 1`` ``None`` markers are added
            to each side, so the default ``n=3`` adds two per side.

    Returns:
        A new list: padding, then the tokens, then padding.
    """
    padding = [None] * (n - 1)
    # Unpacking (rather than list concatenation) accepts any iterable,
    # not only a list.
    return [*padding, *sentence, *padding]

# Count every trigram of each padded Reuters sentence:
# model[(first, second)][third] holds how often `third` followed the pair.
for sentence in reuters.sents():
    for first, second, third in trigrams(pad_sentence(sentence)):
        model[first, second][third] += 1

# Turn the counts into probabilities with add-one smoothing.
# Each count gets +1 and the denominator grows by the number of distinct
# continuations seen for that context, so every context's values sum to 1.
# Only continuations that were actually observed receive probability mass.
for context, followers in model.items():
    denominator = float(sum(followers.values())) + len(followers)
    for word, count in followers.items():
        # Values are replaced in place; no keys are added or removed.
        followers[word] = (count + 1) / denominator

print("Model has been successfully built!")

# Function to generate random text
def generate_text(starting_words, model, length=100):
    """Generate text by sampling a trigram model one token at a time.

    Args:
        starting_words: Iterable of seed tokens. Its last two tokens form the
            first lookup context. The argument itself is never modified.
        model: Mapping from a ``(w1, w2)`` context tuple to a mapping of
            ``next_word -> weight``. Weights may be probabilities or raw
            counts. The model is only read, never modified.
        length: Maximum number of tokens to hold, counting the seed tokens
            and any ``None`` padding tokens that were sampled.

    Returns:
        The tokens joined by single spaces, with ``None`` padding (and any
        other falsy token) left out.

    Raises:
        ValueError: Propagated from ``random.choices`` if the weights of a
            context do not add up to a positive, finite total.
    """
    text = list(starting_words)  # Copy; also accepts tuples and other iterables

    while len(text) < length:
        # Use .get() instead of indexing: indexing a defaultdict would insert
        # an empty entry for every unseen context (growing the model as a
        # side effect), and indexing a plain dict would raise KeyError.
        candidates = model.get(tuple(text[-2:]))
        if not candidates:
            break  # Unknown context: nothing to continue with

        # random.choices takes relative weights, so no manual normalisation
        # is needed and integer counts work as well as probabilities.
        words = list(candidates)
        weights = list(candidates.values())
        text.append(random.choices(words, weights=weights)[0])

        # Two padding tokens in a row mark the end of a sentence.
        if text[-2:] == [None, None]:
            break

    return ' '.join([t for t in text if t])

# Print several independently sampled sentences that share one two-word seed
starting_words = ["today", "the"]
num_samples = 5
for _ in range(num_samples):
    generated_text = generate_text(starting_words, model)
    print(generated_text)

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\skong\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\skong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\skong\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Model has been successfully built!
today the time they have been thinly traded on the unusual rise in manufacturing industries to win mandates , Morgan added .
today the public sector debt fell by 5 . 25 European currency dealers said .
today the company stressed in a 53 pct of Intercontinental Bank Holding Co , said it plans to join in when the Fed because it said .
today the Turkish research ship which Greece had asked GATT to arbitrate on the commercial creditors would start within two to three billion stg level for the shareholder group consisting of 38 . 20 billion in 1985 .
today the options reflect the merger called for stepped - up spending as highly unlikely that the substantial exchange rate trends , said Charles Stichler , an American army base near Athens as a huge rise in retail operating profit rose to 6 . 5p vs 5 , 545 , 160 short tons of low returns on commercial bills to a Senate Finance Committee ' s expenses and other market dependent influences just about weathered 