In [7]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize, FreqDist
from nltk.util import bigrams, trigrams
from nltk.probability import FreqDist, ConditionalFreqDist
from collections import defaultdict


# Sample text that is tokenized below to build the n-gram frequency tables.
# The leading and trailing newlines come from the triple-quoted layout.
corpus = """
Natural language processing (NLP) is a field of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful.
"""


# Lower-case before tokenizing so that counts are case-insensitive.
tokens = word_tokenize(corpus.lower())

# Materialize the 1-, 2- and 3-gram sequences with the same helper so the
# three orders are produced uniformly.
unigrams, bigrams_list, trigrams_list = (
    list(nltk.ngrams(tokens, order)) for order in (1, 2, 3)
)

# One frequency table per n-gram order.
unigram_freq, bigram_freq, trigram_freq = (
    FreqDist(grams) for grams in (unigrams, bigrams_list, trigrams_list)
)

# Group bigram counts by their first word, then rescale every group in place
# so that the stored values for each first word sum to 1. The container stays
# a ConditionalFreqDist, so later code can still index it as [w1][w2] and
# call .max() on each group.
bigram_probabilities = ConditionalFreqDist(bigrams_list)
for w1 in bigram_probabilities:
    followers = bigram_probabilities[w1]
    total_count = sum(followers.values())
    for w2 in followers:
        followers[w2] /= total_count


def predict_next_word(word, bigram_probabilities):
    """Return the most likely word to follow ``word``, or ``None``.

    Args:
        word: The conditioning word (the first element of a bigram).
        bigram_probabilities: Mapping from a word to a distribution over the
            words that follow it. Each non-empty distribution must provide a
            ``max()`` method returning its highest-scoring sample.

    Returns:
        The result of ``max()`` on the distribution stored for ``word``, or
        ``None`` when ``word`` is not in the mapping or its distribution is
        empty.
    """
    # Use .get rather than indexing: a defaultdict-style mapping (such as
    # NLTK's ConditionalFreqDist) would otherwise grow a new empty entry as a
    # side effect of looking up an unseen word.
    followers = bigram_probabilities.get(word)

    # An entry can exist but be empty (for example after an earlier indexed
    # lookup of an unseen word); an empty distribution has no maximum, so it
    # is reported the same way as a missing word.
    if not followers:
        return None
    return followers.max()


# Print the one-line summary of each frequency table under its heading.
for heading, table in (
    ("Unigrams:", unigram_freq),
    ("\nBigrams:", bigram_freq),
    ("\nTrigrams:", trigram_freq),
):
    print(heading)
    print(table)

# List every conditional value P(w2|w1). The inner loop iterates the
# distribution itself (not .items()) to keep its own iteration order.
print("\nBigram Probabilities:")
for w1 in bigram_probabilities:
    for w2 in bigram_probabilities[w1]:
        print(f"P({w2}|{w1}) = {bigram_probabilities[w1][w2]:.4f}")

# Demonstrate next-word prediction for a single example word.
word = 'the'
next_word = predict_next_word(word, bigram_probabilities)
print(f"\nNext word prediction for '{word}': {next_word}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unigrams:
<FreqDist with 38 samples and 56 outcomes>

Bigrams:
<FreqDist with 54 samples and 55 outcomes>

Trigrams:
<FreqDist with 54 samples and 54 outcomes>

Bigram Probabilities:
P(language|natural) = 1.0000
P(processing|language) = 0.3333
P(.|language) = 0.3333
P(in|language) = 0.3333
P((|processing) = 1.0000
P(nlp|() = 0.5000
P(ai|() = 0.5000
P()|nlp) = 0.5000
P(is|nlp) = 0.5000
P(is|)) = 0.5000
P(that|)) = 0.5000
P(a|is) = 0.3333
P(to|is) = 0.3333
P(both|is) = 0.3333
P(field|a) = 0.5000
P(way|a) = 0.5000
P(of|field) = 1.0000
P(artificial|of) = 0.5000
P(nlp|of) = 0.5000
P(intelligence|artificial) = 1.0000
P((|intelligence) = 1.0000
P()|ai) = 1.0000
P(focuses|that) = 0.5000
P(is|that) = 0.5000
P(on|focuses) = 1.0000
P(the|on) = 1.0000
P(interaction|the) = 0.5000
P(ultimate|the) = 0.5000
P(between|interaction) = 1.0000
P(computers|between) = 1.0000
P(and|computers) = 0.5000
P(to|computers) = 0.5000
P(humans|and) = 0.3333
P(generate|and) = 0.3333
P(useful|and) = 0.3333
P(through|hum