In [3]:
import nltk
from nltk import word_tokenize, FreqDist
from nltk.util import ngrams
from collections import defaultdict, Counter
import random

nltk.download('punkt')

# Sample text corpus
text = "Artificial intelligence and machine learning are transforming the world of technology and creating new opportunities for innovation."

# Tokenize the text
tokens = word_tokenize(text.lower())

# 1. Unigrams
unigrams = tokens
unigram_freq = FreqDist(unigrams)

print("Unigrams:", unigrams)
print("Unigram frequencies:", unigram_freq)

# 2. Bigrams
bigrams = list(ngrams(tokens, 2))
bigram_freq = FreqDist(bigrams)

print("\nBigrams:", bigrams)
print("Bigram frequencies:", bigram_freq)

# 3. Trigrams
trigrams = list(ngrams(tokens, 3))
trigram_freq = FreqDist(trigrams)

print("\nTrigrams:", trigrams)
print("Trigram frequencies:", trigram_freq)

# 4. Bigram probabilities
bigram_prob = defaultdict(lambda: defaultdict(lambda: 0))
for w1, w2 in bigrams:
    bigram_prob[w1][w2] += 1

for w1 in bigram_prob:
    total_count = float(sum(bigram_prob[w1].values()))
    for w2 in bigram_prob[w1]:
        bigram_prob[w1][w2] /= total_count

print("\nBigram probabilities:")
for w1 in bigram_prob:
    for w2 in bigram_prob[w1]:
        print(f"P({w2}|{w1}) = {bigram_prob[w1][w2]}")

# 5. Next word prediction
def predict_next_word(current_word):
    if current_word in bigram_prob:
        next_word = max(bigram_prob[current_word], key=bigram_prob[current_word].get)
        return next_word
    else:
        return None

# Example predictions
current_words = ["this", "a", "for"]
for word in current_words:
    next_word = predict_next_word(word)
    print(f"\nNext word prediction for '{word}': {next_word}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unigrams: ['artificial', 'intelligence', 'and', 'machine', 'learning', 'are', 'transforming', 'the', 'world', 'of', 'technology', 'and', 'creating', 'new', 'opportunities', 'for', 'innovation', '.']
Unigram frequencies: <FreqDist with 17 samples and 18 outcomes>

Bigrams: [('artificial', 'intelligence'), ('intelligence', 'and'), ('and', 'machine'), ('machine', 'learning'), ('learning', 'are'), ('are', 'transforming'), ('transforming', 'the'), ('the', 'world'), ('world', 'of'), ('of', 'technology'), ('technology', 'and'), ('and', 'creating'), ('creating', 'new'), ('new', 'opportunities'), ('opportunities', 'for'), ('for', 'innovation'), ('innovation', '.')]
Bigram frequencies: <FreqDist with 17 samples and 17 outcomes>

Trigrams: [('artificial', 'intelligence', 'and'), ('intelligence', 'and', 'machine'), ('and', 'machine', 'learning'), ('machine', 'learning', 'are'), ('learning', 'are', 'transforming'), ('are', 'transforming', 'the'), ('transforming', 'the', 'world'), ('the', 'world', '