In [21]:
!pip install nltk




In [22]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
from nltk import trigrams
from nltk import ConditionalFreqDist as CFD

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


#### Part 1:  Building an N-Gram Language Model Using the NLTK Brown Corpus

In this section, a statistical LM is created using the **Brown Corpus** from NLTK library.
The data is preprocessed into all lower case words and adds the BOS and EOS for every sentence.

The LM is built using a **Trigram Model:**

$$
P(w_i \mid w_{i-2}, w_{i-1})
$$



In [23]:

tokens = []

for sentence in brown.sents():
    tokens.append("<s>") # Add BOS and EOS tags for each sentence
    tokens.append("<s>") # Trigrams have two BOS tags

    for words in sentence:
        tokens.append(words.lower()) # Reduce vocabulary size The != the
    tokens.append("</s>")

In [24]:
# Build Trigram model using conditional frequency distribution
trigram = []

for word1, word2, word3 in trigrams(tokens):
    trigram.append(((word1,word2),word3))

trigram_table = CFD(trigram)

#### Part 2 - 3: Generate Predictions "I am ... and Your are ..."

By using the Trigram model, simple predictions are made following `I am` and `You are`

In [25]:
import random
def predict_next_word(start):
    """
    Predicts the next word in a sequence using a trigram model with weighted sampling.

    Args:
        start (str): The starting sequence of words.

    Returns:
        str: The predicted next word.
    """
    start_words = start.lower().split()
    # Take the previous two words
    word1 = start_words[-2]
    word2 = start_words[-1]

    prediction = trigram_table[(word1,word2)]

    if len(prediction) == 0:
        return "</s>"

    # Dont be so determinsitic, add variety
    choices = list(prediction.keys())
    weights = list(prediction.values())

    return random.choices(choices, weights=weights)[0]

In [26]:
start = "you are"
next_words = []

for _ in range(10):
    word = predict_next_word(start)
    next_words.append(word)

for word in next_words:
    print(start + " " + word)


you are horrified
you are about
you are not
you are being
you are ready
you are a
you are bound
you are not
you are the
you are now


In [27]:
def generate_sentence(start, max_len=10):
    """
    Generates a sentence using a trigram model.

    Args:
        start (str): The starting sequence of words.
        max_len (int, optional): The maximum length of the generated sentence. Defaults to 10.

    Returns:
        str: The generated sentence
    """
    words = start.lower().split()

    if len(words) == 1:
        words = ["<s>" , words[0]]
    elif len(words) == 0:
        words = ["<s>" , "<s>"]

    for _ in range(max_len):
        start_now = " ".join(words)
        next_word = predict_next_word(start_now)

        if next_word == "</s>":
            break

        words.append(next_word)

        if next_word == "</s>":
            break

    return " ".join(words)

In [28]:
import re

PUNCT_TO_REMOVE = {"''", "``", "--"}

def clean_sentence(sentence):
    """
    Cleans a sentence by removing punctuation and extra spaces.

    Args:
        sentence (str): The sentence to be cleaned.

    Returns:
        str: The cleaned sentence.
    """
    words = sentence.split()

    cleaned_words = []
    for w in words:
        if w not in PUNCT_TO_REMOVE:
            cleaned_words.append(w)

    if len(cleaned_words) > 2 and cleaned_words[2] in PUNCT_TO_REMOVE:
        cleaned_words.pop(2)

    text = ""
    for i in range(len(cleaned_words)):
        if i == 0:
            text = cleaned_words[i]
        else:
            text += " " + cleaned_words[i]

    text = re.sub(r"\s+([.,?!;:])", r"\1", text)

    text = re.sub(r"([.,?!])\1+", r"\1", text)

    return text


In [29]:
sentence = "You are"
you_are_sentences = []

for _ in range(10):

    you_are_sentences.append(generate_sentence(sentence))

for i in you_are_sentences:
    print(clean_sentence(i))


you are in a future which, had set so solidly as
you are willing to compromise, stood by her in death
you are unable to propagate, i felt that they did not
you are getting out-moded.
you are once again assure all peoples and times in recent events
you are a bit of soap and water, street scenes.
you are able to force walter to tell you this much having
you are really closer to this proposed merger than profit and loss
you are talking about a series of articles, there was a
you are.


In [30]:
sentence = "I am"
i_am_sentences = []

for _ in range(10):
    i_am_sentences.append(generate_sentence(sentence))

for i in i_am_sentences:
    print(clean_sentence(i))

i am not privileged to know and apprehend it, since silence
i am padding it a familiar vaudeville device, with production of
i am deliberately raising the policy may not have to carry the
i am not now, or peril, or so before serving
i am married?
i am taunting you as he did sleep, and tragedies facing
i am also registrar.
i am deliberately raising the average human body, attempting to connect
i am an old guy running a darker color.
i am, robinson answered warily.


#### Part 4: Calculate the probabilities of the predicted words

To compute the probabilities of a sentence, use the **Trigram Formula**

$$\prod_{i=1}^{n} P(word_i|word_{i-2},word_{i-1}) = \prod_{i=1}^{n} \frac{count(word_{i-2}, word_{i-1}, word_i)}{count(word_{i-2}, word_{i-1})}$$

Note: Laplace smoothing (Add 1) was added, incase any raw counts have `zero`

*modified bigram formula with laplace smoothing:*
$$P_{LAP}(w_i | h) = \frac {Count(h,w_i) +1}{Count(h) + |V|}$$

In [31]:
V = len(set(tokens))

def trigram_probability(w1, w2, w3):
    """
    Computes the probability of a trigram using Laplace smoothing.

    Args:
        w1 (str): The first word in the trigram.
        w2 (str): The second word in the trigram.
        w3 (str): The third word in the trigram.
    """
    numerator = trigram_table[(w1, w2)][w3] + 1
    denominator = trigram_table[(w1, w2)].N() + V

    return numerator / denominator


In [32]:
def compute_sentence_probability(sentence):
    """
    Computes the probability of a sentence using the trigram model.

    Args:
        sentence (str): The sentence to compute the probability of.

    Returns:
        float: The probability of the sentence.
    """
    words = sentence.lower().split()

    # Add's BOS and EOS to each sentence
    words.insert(0, "<s>")
    words.insert(0, "<s>")
    words.append("</s>")

    probability = 1.0

    for i in range(2, len(words)):
        w1 = words[i-2]
        w2 = words[i-1]
        w3 = words[i]

        probability *= trigram_probability(w1, w2, w3)

    return probability


In [33]:
you_are_sentence_prob = []

for sentence in you_are_sentences:
    P = compute_sentence_probability(sentence)
    you_are_sentence_prob.append((sentence, P))
    # Prints in .e format, since probabilities are often very small
    print(f"{P:>12.2e}  |  {sentence}")


    2.12e-54  |  you are in a future which , had set so solidly as
    1.10e-54  |  you are willing to compromise , stood by her in death ''
    7.26e-52  |  you are unable to propagate , i felt that they did not
    3.87e-24  |  you are getting out-moded .
    2.44e-55  |  you are once again assure all peoples and times in recent events
    2.19e-53  |  you are a bit of soap and water , street scenes .
    4.53e-54  |  you are able to force walter to tell you this much having
    3.65e-55  |  you are really closer to this proposed merger than profit and loss
    6.05e-50  |  you are talking about a series of articles , there was a
    5.05e-14  |  you are .


#### Part 5: Computer Perplexity of the model

`Perplexity` will measure how suprised the model is by a given sentence. A `high` perplexity the model is very suprised, a `low` perplexity the model is confident in it's predictions

$$PP(sentence) = P(sentence)^{-1/n}$$


In [34]:
def compute_perplexity(sentence):
    """
    Computes the perplexity of a sentence using the trigram model.

    Args:
        sentence (str): The sentence to compute the perplexity of.

    Returns:
        float: The perplexity of the sentence.
    """
    p = compute_sentence_probability(sentence)

    words = sentence.lower().split()
    words.insert(0,"<s>")
    words.insert(0,"<s>")
    words.append("</s>")

    return p ** (-1.0/len(words))

In [35]:
for sentence, prob in you_are_sentence_prob:
    pp = compute_perplexity(sentence)
    print(f"{pp:>10.2f}  |  {sentence}")

   3786.59  |  you are in a future which , had set so solidly as
   3956.94  |  you are willing to compromise , stood by her in death ''
   2566.15  |  you are unable to propagate , i felt that they did not
    844.28  |  you are getting out-moded .
   4373.95  |  you are once again assure all peoples and times in recent events
   3240.91  |  you are a bit of soap and water , street scenes .
   3599.83  |  you are able to force walter to tell you this much having
   4257.83  |  you are really closer to this proposed merger than profit and loss
   1910.86  |  you are talking about a series of articles , there was a
    164.50  |  you are .


#### Part 6: Gemini

Using the 10 sentences that was generated above, Google-Gemini-1.5-flash will create a generate a story (within 500 words).

It ensures the story contains no DEROGATORY, TOXICITY, VIOLENCE, HARASSMENT, HATE_SPEECH, SEXUAL, etc.  

In [36]:
!pip install -U google-generativeai



In [37]:
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

genai.configure(api_key="AIzaSyAqp3_4pzwrSrZhkplQ9B3Jg0c_94llqBE")

# Had SDK issues, switched models to 2.5 flash
model = genai.GenerativeModel("models/gemini-2.5-flash")

SAFETY_SETTINGS = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

sys_prompt = (
    "Create a coherent, creative story under 500 words by taking the following sentences.\n"
    "Requirements:\n"
    "- PG-13 appropriate content only\n"
    "- No political or geopolitical manipulation\n"
    "- Clear, coherent narrative (get rid of any incoherent source material)\n"
    "- No coercion, threats, or non consensual scenarios\n"
    "- Natural flow of integration of source material\n"
    "\nSource sentences to use:\n"
)

for s in you_are_sentences:
    sys_prompt += " " + s + "\n"

response = model.generate_content(
    sys_prompt,
    safety_settings=SAFETY_SETTINGS
)

print(response.text)


The polished chrome skyscrapers outside Dr. Aris Thorne's window reflected a city where **you are in a future which, had set so solidly as** to feel immutable. He ran a finger over the cold glass, a stark contrast to the warmth of his memories. "They say **you are getting out-moded**," he murmured to his reflection, a sentiment he'd heard often about his focus on biomimicry over pure synthetics.

He thought of Elara, his research partner, whose final project had been deemed too costly. "I remember when **you are willing to compromise**, yet **stood by her in death**," he whispered, recalling the difficult decision to shut down their prototype after the accident that claimed her. Their vision for a self-sustaining ecosystem, designed to re-seed desolate environments, was dismissed because ultimately, **you are unable to propagate** such complex, organic systems at the industrial scale the corporations demanded. "I felt that they did not understand," he’d often confided, the loss still r