In [1]:
import nltk
from nltk import bigrams, trigrams, FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import random

# Fetch the Punkt tokenizer data before tokenizing.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

# One-sentence demo corpus.
text = "Natural language processing (NLP) is a field of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language."

# Lowercase first so that e.g. "Natural" and "natural" count as the same token.
tokens = word_tokenize(text.lower())

# Build the 1-, 2- and 3-gram lists up front; each entry is a tuple of tokens.
unigrams = list(ngrams(tokens, 1))
bigrams_list = list(ngrams(tokens, 2))
trigrams_list = list(ngrams(tokens, 3))

# Each report is a header line followed by the list on its own line.
print("Unigrams:", unigrams, sep="\n")
print("\nBigrams:", bigrams_list, sep="\n")
print("\nTrigrams:", trigrams_list, sep="\n")

# Conditional bigram probability: count(w1, w2) / count(w1).
bigram_freq = FreqDist(bigrams_list)
unigram_freq = FreqDist(tokens)
bigram_prob = {
    pair: pair_count / unigram_freq[pair[0]]
    for pair, pair_count in bigram_freq.items()
}

print("\nBigram Probabilities:")
for bigram, prob in bigram_prob.items():
    print(f"{bigram}: {prob:.4f}")

# Next word prediction
def predict_next_word(word, bigram_prob, top_n=1):
    """Return the most probable words to follow ``word``.

    Args:
        word: The preceding token to condition on.
        bigram_prob: Mapping of ``(first, second)`` bigram tuples to the
            probability of ``second`` given ``first``.
        top_n: Maximum number of candidates to return. ``None`` returns every
            candidate; zero or a negative value returns an empty list.

    Returns:
        A list of ``(next_word, probability)`` tuples sorted by descending
        probability, or an empty list when ``word`` never starts a bigram.
    """
    # Rank by the stored probability. Counting the dictionary keys would give
    # every candidate a count of 1, because each bigram key is unique.
    candidates = [
        (bigram[1], prob)
        for bigram, prob in bigram_prob.items()
        if bigram[0] == word
    ]
    if not candidates:
        return []

    # sorted() is stable, so equally probable words keep their insertion order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    if top_n is None:
        return ranked
    return ranked[:max(top_n, 0)]

# Demo: look up the top follower of a single seed word and print each
# returned (word, score) pair on its own line.
word = 'the'
predictions = predict_next_word(word, bigram_prob)

print(f"\nNext word prediction for '{word}':")
for prediction, freq in predictions:
    print(f"{prediction}: {freq}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unigrams:
[('natural',), ('language',), ('processing',), ('(',), ('nlp',), (')',), ('is',), ('a',), ('field',), ('of',), ('artificial',), ('intelligence',), ('(',), ('ai',), (')',), ('that',), ('focuses',), ('on',), ('the',), ('interaction',), ('between',), ('computers',), ('and',), ('humans',), ('through',), ('natural',), ('language',), ('.',)]

Bigrams:
[('natural', 'language'), ('language', 'processing'), ('processing', '('), ('(', 'nlp'), ('nlp', ')'), (')', 'is'), ('is', 'a'), ('a', 'field'), ('field', 'of'), ('of', 'artificial'), ('artificial', 'intelligence'), ('intelligence', '('), ('(', 'ai'), ('ai', ')'), (')', 'that'), ('that', 'focuses'), ('focuses', 'on'), ('on', 'the'), ('the', 'interaction'), ('interaction', 'between'), ('between', 'computers'), ('computers', 'and'), ('and', 'humans'), ('humans', 'through'), ('through', 'natural'), ('natural', 'language'), ('language', '.')]

Trigrams:
[('natural', 'language', 'processing'), ('language', 'processing', '('), ('processing'