# Unigram

In [1]:
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize

# Ensure required NLTK data is downloaded
nltk.download('punkt')

# Sample text
text = "This is a sample text for unigram analysis. Unigram analysis is useful."

# Tokenize and convert to lowercase
tokens = word_tokenize(text.lower())

# Unigrams
unigrams = FreqDist(tokens)
print("Unigrams:")
for word, freq in unigrams.items():
    print(f"{word}: {freq}")


Unigrams:
this: 1
is: 2
a: 1
sample: 1
text: 1
for: 1
unigram: 2
analysis: 2
.: 2
useful: 1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Bigram

In [2]:
import nltk
from nltk import bigrams, FreqDist
from nltk.tokenize import word_tokenize

# Ensure required NLTK data is downloaded
nltk.download('punkt')

# Sample text
text = "This is a sample text for bigram analysis. Bigram analysis is insightful."

# Tokenize and convert to lowercase
tokens = word_tokenize(text.lower())

# Bigrams
bi_grams = list(bigrams(tokens))
bigram_freq = FreqDist(bi_grams)
print("Bigrams:")
for bigram, freq in bigram_freq.items():
    print(f"{bigram}: {freq}")


Bigrams:
('this', 'is'): 1
('is', 'a'): 1
('a', 'sample'): 1
('sample', 'text'): 1
('text', 'for'): 1
('for', 'bigram'): 1
('bigram', 'analysis'): 2
('analysis', '.'): 1
('.', 'bigram'): 1
('analysis', 'is'): 1
('is', 'insightful'): 1
('insightful', '.'): 1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Trigram

In [3]:
import nltk
from nltk import trigrams, FreqDist
from nltk.tokenize import word_tokenize

# Ensure required NLTK data is downloaded
nltk.download('punkt')

# Sample text
text = "This is a sample text for trigram analysis. Trigram analysis provides context."

# Tokenize and convert to lowercase
tokens = word_tokenize(text.lower())

# Trigrams
tri_grams = list(trigrams(tokens))
trigram_freq = FreqDist(tri_grams)
print("Trigrams:")
for trigram, freq in trigram_freq.items():
    print(f"{trigram}: {freq}")


Trigrams:
('this', 'is', 'a'): 1
('is', 'a', 'sample'): 1
('a', 'sample', 'text'): 1
('sample', 'text', 'for'): 1
('text', 'for', 'trigram'): 1
('for', 'trigram', 'analysis'): 1
('trigram', 'analysis', '.'): 1
('analysis', '.', 'trigram'): 1
('.', 'trigram', 'analysis'): 1
('trigram', 'analysis', 'provides'): 1
('analysis', 'provides', 'context'): 1
('provides', 'context', '.'): 1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Bigram Probabilities

In [4]:
import nltk
from nltk import bigrams, ConditionalFreqDist
from nltk.tokenize import word_tokenize

# Ensure required NLTK data is downloaded
nltk.download('punkt')

# Sample text
text = "This is a sample text for bigram probabilities. Bigram probabilities are useful."

# Tokenize and convert to lowercase
tokens = word_tokenize(text.lower())

# Bigram probabilities
bigram_cfd = ConditionalFreqDist(bigrams(tokens))
print("Bigram Probabilities:")
for word in bigram_cfd:
    total_count = bigram_cfd[word].N()
    for next_word in bigram_cfd[word]:
        probability = bigram_cfd[word][next_word] / total_count
        print(f"P({next_word}|{word}) = {probability:.4f}")


Bigram Probabilities:
P(is|this) = 1.0000
P(a|is) = 1.0000
P(sample|a) = 1.0000
P(text|sample) = 1.0000
P(for|text) = 1.0000
P(bigram|for) = 1.0000
P(probabilities|bigram) = 1.0000
P(.|probabilities) = 0.5000
P(are|probabilities) = 0.5000
P(bigram|.) = 1.0000
P(useful|are) = 1.0000
P(.|useful) = 1.0000


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Next Word Prediction

In [5]:
import nltk
from nltk import bigrams, ConditionalFreqDist
from nltk.tokenize import word_tokenize

# Ensure required NLTK data is downloaded
nltk.download('punkt')

# Sample text
text = "vettri is a boy."

# Tokenize and convert to lowercase
tokens = word_tokenize(text.lower())

# Bigram probabilities
bigram_cfd = ConditionalFreqDist(bigrams(tokens))
print("Bigram Probabilities:")
for word in bigram_cfd:
    total_count = bigram_cfd[word].N()
    for next_word in bigram_cfd[word]:
        probability = bigram_cfd[word][next_word] / total_count
        print(f"P({next_word}|{word}) = {probability:.4f}")


Bigram Probabilities:
P(is|vettri) = 1.0000
P(a|is) = 1.0000
P(boy|a) = 1.0000
P(.|boy) = 1.0000


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
