<a href="https://colab.research.google.com/github/saigupta2025/NLP/blob/main/Statistical_Language_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
from itertools import count
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
# Make sure the 'punkt_tab' resource is present before tokenizing.
nltk.download('punkt_tab')

text = "I love to learn Natural Language Processing using Python."
print(len(text))  # length of the sentence in characters

# Break the sentence into tokens and report them with their count.
tokens = word_tokenize(text)
print("Tokens:", tokens)
print(len(tokens))

# Build the 1-, 2- and 3-gram lists in one pass over the n-gram orders.
unigrams, bigrams, trigrams = (
    list(ngrams(tokens, order)) for order in (1, 2, 3)
)

# Print every list under its own heading (each heading starts on a fresh line).
for heading, grams in (
    ("\nUnigrams:", unigrams),
    ("\nBigrams:", bigrams),
    ("\nTrigrams:", trigrams),
):
    print(heading, grams)


57
Tokens: ['I', 'love', 'to', 'learn', 'Natural', 'Language', 'Processing', 'using', 'Python', '.']
10

Unigrams: [('I',), ('love',), ('to',), ('learn',), ('Natural',), ('Language',), ('Processing',), ('using',), ('Python',), ('.',)]

Bigrams: [('I', 'love'), ('love', 'to'), ('to', 'learn'), ('learn', 'Natural'), ('Natural', 'Language'), ('Language', 'Processing'), ('Processing', 'using'), ('using', 'Python'), ('Python', '.')]

Trigrams: [('I', 'love', 'to'), ('love', 'to', 'learn'), ('to', 'learn', 'Natural'), ('learn', 'Natural', 'Language'), ('Natural', 'Language', 'Processing'), ('Language', 'Processing', 'using'), ('Processing', 'using', 'Python'), ('using', 'Python', '.')]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


N-gram Model From a Text File (Complete Code)

In [28]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk import FreqDist

# Builds unigram, bigram and trigram relative-frequency tables from a text
# file and prints one line per distinct n-gram.
#
# The loop variable is called `freq` (not `count`) so that this cell does not
# rebind the notebook-global name `count`, and so that it matches the naming
# used by the other probability cells in this notebook.

# STEP 1: READ TEXT FILE

file_path = "/content/input.txt"   # your file name

# The whole file is read at once and lower-cased, so counts are case-insensitive.
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read().lower()

print("File Loaded Successfully!\n")

# STEP 2: TOKENIZATION
tokens = word_tokenize(text)
print("Tokens:", tokens, "\n")

# STEP 3: UNIGRAM MODEL
# P(w) = Count(w) / total number of tokens

unigram_freq = FreqDist(tokens)
total_unigrams = len(tokens)

print("===== UNIGRAM PROBABILITIES =====")
for word, freq in unigram_freq.items():
    prob = freq / total_unigrams
    print(f"{word:15s} Count={freq:2d}  P(w)={prob:.4f}")

print("\n")

# STEP 4: BIGRAM MODEL
# P(w2 | w1) = Count(w1, w2) / Count(w1)

bigrams = list(ngrams(tokens, 2))
bigram_freq = FreqDist(bigrams)

print("===== BIGRAM PROBABILITIES =====")
for (w1, w2), freq in bigram_freq.items():
    prob = freq / unigram_freq[w1]   # P(w2 | w1)
    print(f"({w1:10s}, {w2:10s})  Count={freq:2d}  P({w2}|{w1})={prob:.4f}")

print("\n")

# STEP 5: TRIGRAM MODEL
# P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2)

trigrams = list(ngrams(tokens, 3))
trigram_freq = FreqDist(trigrams)

print("===== TRIGRAM PROBABILITIES =====")
for (w1, w2, w3), freq in trigram_freq.items():
    # Every trigram's first two tokens were also counted as a bigram above,
    # so the denominator is never zero.
    prob = freq / bigram_freq[(w1, w2)]   # P(w3 | w1, w2)
    print(f"({w1}, {w2}, {w3})  Count={freq}  P({w3}|{w1},{w2})={prob:.4f}")


File Loaded Successfully!

Tokens: ['natural', 'language', 'processing', 'is', 'good', 'language', 'models', 'help', 'computers', 'understand', 'text', '.', 'nlp', 'is', 'a', 'part', 'of', 'artificial', 'intelligence', '&', 'great', '.'] 

===== UNIGRAM PROBABILITIES =====
natural         Count= 1  P(w)=0.0455
language        Count= 2  P(w)=0.0909
processing      Count= 1  P(w)=0.0455
is              Count= 2  P(w)=0.0909
good            Count= 1  P(w)=0.0455
models          Count= 1  P(w)=0.0455
help            Count= 1  P(w)=0.0455
computers       Count= 1  P(w)=0.0455
understand      Count= 1  P(w)=0.0455
text            Count= 1  P(w)=0.0455
.               Count= 2  P(w)=0.0909
nlp             Count= 1  P(w)=0.0455
a               Count= 1  P(w)=0.0455
part            Count= 1  P(w)=0.0455
of              Count= 1  P(w)=0.0455
artificial      Count= 1  P(w)=0.0455
intelligence    Count= 1  P(w)=0.0455
&               Count= 1  P(w)=0.0455
great           Count= 1  P(w)=0.0455


==

In [19]:
# Setup cell: brings in the tokenizer, the frequency counter and the bigram
# helper, then fetches the 'punkt_tab' NLTK resource.
import nltk
from nltk import word_tokenize, FreqDist
from nltk.util import bigrams
# Presumably the data word_tokenize needs at run time — TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [20]:
# Prints unigram probabilities P(w) and bigram conditional probabilities
# P(w2 | w1) for a short two-sentence example.

# Imported here so the cell does not depend on the global name `bigrams`:
# other cells in this notebook assign a plain list to `bigrams`, after which
# calling it as a function would raise "TypeError: 'list' object is not callable".
from nltk.util import ngrams

text = "I love learning NLP. NLP is interesting."

tokens = word_tokenize(text)
total_tokens = len(tokens)  # loop-invariant denominator for P(w)

# Unigram frequencies
fdist = FreqDist(tokens)
print("Unigram Probabilities:")
# Iterating the FreqDist itself (rather than .items()) is kept on purpose so
# the printed order stays the same as before.
for word in fdist:
    print(word, fdist[word] / total_tokens)

# Bigram model
print("\nBigrams with Probabilities:")
bigram_list = list(ngrams(tokens, 2))  # same pairs the bigrams() helper yields
bigram_freq = FreqDist(bigram_list)

for bg in bigram_freq:
    # P(w2 | w1) = Count(w1, w2) / Count(w1), where bg == (w1, w2)
    print(bg, bigram_freq[bg] / fdist[bg[0]])


Unigram Probabilities:
NLP 0.2222222222222222
. 0.2222222222222222
I 0.1111111111111111
love 0.1111111111111111
learning 0.1111111111111111
is 0.1111111111111111
interesting 0.1111111111111111

Bigrams with Probabilities:
('I', 'love') 1.0
('love', 'learning') 1.0
('learning', 'NLP') 1.0
('NLP', '.') 0.5
('.', 'NLP') 0.5
('NLP', 'is') 0.5
('is', 'interesting') 1.0
('interesting', '.') 1.0


In [23]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk import FreqDist

# Prints the unigram, bigram and trigram relative-frequency estimates for a
# single example sentence.

text = "Natural Language Processing is a combo of computer science, Artificial intelligence and language grammar. "

# Step 1: normalise case, then tokenize
lowered_text = text.lower()
tokens = word_tokenize(lowered_text)

# Count tables for every n-gram order used below.
unigram_freq = FreqDist(tokens)
total_unigrams = len(tokens)
bigrams, trigrams = (list(ngrams(tokens, order)) for order in (2, 3))
bigram_freq = FreqDist(bigrams)
trigram_freq = FreqDist(trigrams)

# UNIGRAM PROBABILITY DISTRIBUTION: P(w) = Count(w) / number of tokens

print("----- UNIGRAM PROBABILITIES -----")
for word in unigram_freq:  # direct iteration, so the listing order is unchanged
    word_count = unigram_freq[word]
    prob = word_count / total_unigrams
    print(f"P({word}) = {prob:.4f}")

# BIGRAM PROBABILITY DISTRIBUTION: P(w2 | w1) = Count(w1, w2) / Count(w1)

print("\n----- BIGRAM PROBABILITIES -----")
for pair, pair_count in bigram_freq.items():
    w1, w2 = pair
    context_count = unigram_freq[w1]
    prob = pair_count / context_count
    print(f"P({w2} | {w1}) = {prob:.4f}")

# TRIGRAM PROBABILITY DISTRIBUTION: P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2)

print("\n----- TRIGRAM PROBABILITIES -----")
for triple, triple_count in trigram_freq.items():
    w1, w2, w3 = triple
    # The leading two tokens of the trigram form the conditioning bigram.
    prob = triple_count / bigram_freq[triple[:2]]
    print(f"P({w3} | {w1}, {w2}) = {prob:.4f}")


----- UNIGRAM PROBABILITIES -----
P(language) = 0.1250
P(natural) = 0.0625
P(processing) = 0.0625
P(is) = 0.0625
P(a) = 0.0625
P(combo) = 0.0625
P(of) = 0.0625
P(computer) = 0.0625
P(science) = 0.0625
P(,) = 0.0625
P(artificial) = 0.0625
P(intelligence) = 0.0625
P(and) = 0.0625
P(grammar) = 0.0625
P(.) = 0.0625

----- BIGRAM PROBABILITIES -----
P(language | natural) = 1.0000
P(processing | language) = 0.5000
P(is | processing) = 1.0000
P(a | is) = 1.0000
P(combo | a) = 1.0000
P(of | combo) = 1.0000
P(computer | of) = 1.0000
P(science | computer) = 1.0000
P(, | science) = 1.0000
P(artificial | ,) = 1.0000
P(intelligence | artificial) = 1.0000
P(and | intelligence) = 1.0000
P(language | and) = 1.0000
P(grammar | language) = 0.5000
P(. | grammar) = 1.0000

----- TRIGRAM PROBABILITIES -----
P(processing | natural, language) = 1.0000
P(is | language, processing) = 1.0000
P(a | processing, is) = 1.0000
P(combo | is, a) = 1.0000
P(of | a, combo) = 1.0000
P(computer | combo, of) = 1.0000
P(sci

In [26]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk import FreqDist

# Builds one pandas table per n-gram order (1, 2, 3) holding each n-gram's
# count and its relative-frequency probability, then prints the tables.

text = "Natural Language Processing is a combo of computer science, Artificial intelligence and language grammar."

# Step 1: lower-case the sentence and tokenize it
tokens = word_tokenize(text.lower())

# UNIGRAM PROBABILITY TABLE -- P(w) = Count(w) / number of tokens

unigram_freq = FreqDist(tokens)
total_unigrams = len(tokens)

unigram_rows = list(unigram_freq.items())  # (word, count) pairs
unigram_data = {
    'Word': [w for w, _ in unigram_rows],
    'Count': [c for _, c in unigram_rows],
    'Probability P(w)': [c / total_unigrams for _, c in unigram_rows],
}

df_unigram = pd.DataFrame(unigram_data)
print("UNIGRAM PROBABILITY TABLE:\n")
print(df_unigram, "\n")

# BIGRAM PROBABILITY TABLE -- P(w2 | w1) = Count(w1, w2) / Count(w1)

bigrams = list(ngrams(tokens, 2))
bigram_freq = FreqDist(bigrams)

bigram_rows = list(bigram_freq.items())  # ((w1, w2), count) pairs
bigram_data = {
    'w1': [pair[0] for pair, _ in bigram_rows],
    'w2': [pair[1] for pair, _ in bigram_rows],
    'Count (w1,w2)': [c for _, c in bigram_rows],
    'Probability P(w2|w1)': [c / unigram_freq[pair[0]] for pair, c in bigram_rows],
}

df_bigram = pd.DataFrame(bigram_data)
print("BIGRAM PROBABILITY TABLE:\n")
print(df_bigram, "\n")


# TRIGRAM PROBABILITY TABLE -- P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2)

trigrams = list(ngrams(tokens, 3))
trigram_freq = FreqDist(trigrams)

trigram_rows = list(trigram_freq.items())  # ((w1, w2, w3), count) pairs
trigram_data = {
    'w1': [triple[0] for triple, _ in trigram_rows],
    'w2': [triple[1] for triple, _ in trigram_rows],
    'w3': [triple[2] for triple, _ in trigram_rows],
    'Count (w1,w2,w3)': [c for _, c in trigram_rows],
    # The first two tokens of each trigram select the conditioning bigram count.
    'Probability P(w3|w1,w2)': [c / bigram_freq[triple[:2]] for triple, c in trigram_rows],
}

df_trigram = pd.DataFrame(trigram_data)
print("TRIGRAM PROBABILITY TABLE:\n")
print(df_trigram)


UNIGRAM PROBABILITY TABLE:

            Word  Count  Probability P(w)
0        natural      1            0.0625
1       language      2            0.1250
2     processing      1            0.0625
3             is      1            0.0625
4              a      1            0.0625
5          combo      1            0.0625
6             of      1            0.0625
7       computer      1            0.0625
8        science      1            0.0625
9              ,      1            0.0625
10    artificial      1            0.0625
11  intelligence      1            0.0625
12           and      1            0.0625
13       grammar      1            0.0625
14             .      1            0.0625 

BIGRAM PROBABILITY TABLE:

              w1            w2  Count (w1,w2)  Probability P(w2|w1)
0        natural      language              1                   1.0
1       language    processing              1                   0.5
2     processing            is              1                   1.0