In [12]:
import nltk
from nltk import word_tokenize, FreqDist
from nltk.util import ngrams
from collections import defaultdict, Counter
import random

nltk.download('punkt')

# Sample text corpus
text = "Machine learning models are used for predictive analytics. Predictive analytics is important in data science. Data science involves machine learning models."

# Tokenize the text
tokens = word_tokenize(text.lower())

# 1. Unigrams
unigrams = tokens
unigram_freq = FreqDist(unigrams)

print("Unigrams:", unigrams)
print("Unigram frequencies:", unigram_freq)

# 2. Bigrams
bigrams = list(ngrams(tokens, 2))
bigram_freq = FreqDist(bigrams)

print("\nBigrams:", bigrams)
print("Bigram frequencies:", bigram_freq)

# 3. Trigrams
trigrams = list(ngrams(tokens, 3))
trigram_freq = FreqDist(trigrams)

print("\nTrigrams:", trigrams)
print("Trigram frequencies:", trigram_freq)

# 4. Bigram probabilities
bigram_prob = defaultdict(lambda: defaultdict(lambda: 0))
for w1, w2 in bigrams:
    bigram_prob[w1][w2] += 1

for w1 in bigram_prob:
    total_count = float(sum(bigram_prob[w1].values()))
    for w2 in bigram_prob[w1]:
        bigram_prob[w1][w2] /= total_count

print("\nBigram probabilities:")
for w1 in bigram_prob:
    for w2 in bigram_prob[w1]:
        print(f"P({w2}|{w1}) = {bigram_prob[w1][w2]}")

# 5. Next word prediction
def predict_next_word(current_word):
    if current_word in bigram_prob:
        next_word = max(bigram_prob[current_word], key=bigram_prob[current_word].get)
        return next_word
    else:
        return None

# Example predictions
current_words = ["predictive", "machine", "data"]
for word in current_words:
    next_word = predict_next_word(word)
    print(f"\nNext word prediction for '{word}': {next_word}")




Unigrams: ['machine', 'learning', 'models', 'are', 'used', 'for', 'predictive', 'analytics', '.', 'predictive', 'analytics', 'is', 'important', 'in', 'data', 'science', '.', 'data', 'science', 'involves', 'machine', 'learning', 'models', '.']
Unigram frequencies: <FreqDist with 15 samples and 24 outcomes>

Bigrams: [('machine', 'learning'), ('learning', 'models'), ('models', 'are'), ('are', 'used'), ('used', 'for'), ('for', 'predictive'), ('predictive', 'analytics'), ('analytics', '.'), ('.', 'predictive'), ('predictive', 'analytics'), ('analytics', 'is'), ('is', 'important'), ('important', 'in'), ('in', 'data'), ('data', 'science'), ('science', '.'), ('.', 'data'), ('data', 'science'), ('science', 'involves'), ('involves', 'machine'), ('machine', 'learning'), ('learning', 'models'), ('models', '.')]
Bigram frequencies: <FreqDist with 19 samples and 23 outcomes>

Trigrams: [('machine', 'learning', 'models'), ('learning', 'models', 'are'), ('models', 'are', 'used'), ('are', 'used', 'for

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
