In [8]:
from collections import Counter, defaultdict

# corpus: tokenized sentences, each wrapped in <s> ... </s> boundary markers
corpus = [
    ['<s>', 'I', 'love', 'NLP', '</s>'],
    ['<s>', 'I', 'love', 'deep', 'learning', '</s>'],
    ['<s>', 'deep', 'learning', 'is', 'fun', '</s>'],
]

# 1. Compute unigram and bigram counts
# Counter tallies items directly and iterates in first-seen order, so the
# tables below are printed in the order tokens/bigrams appear in the corpus.
unigram_counts = Counter()
bigram_counts = Counter()

for sentence in corpus:
    unigram_counts.update(sentence)
    # Pair every token with its successor to form the sentence's bigrams.
    bigram_counts.update(zip(sentence, sentence[1:]))

print("Unigram Counts:")
for word, count in unigram_counts.items():
    print(f"{word} : {count}")
print("")

print("Bigram Counts:")
for bigram, count in bigram_counts.items():
    print(f"{bigram} : {count}")
print("")

# 2. Estimate bigram probabilities using MLE: P(w2 | w1) = C(w1, w2) / C(w1)
# Every token that starts a bigram was also counted as a unigram above, so
# the denominator is always at least 1 and no zero-count fallback is needed.
bigram_probs = {
    bigram: count / unigram_counts[bigram[0]]
    for bigram, count in bigram_counts.items()
}

print("Bigram Probabilities (MLE):")
for bigram, prob in bigram_probs.items():
    print(f"{bigram} : {prob:.3f}")
print("")

# 3. Implement a function that calculates the probability of any given sentence
def calculate_sentence_probability(sentence, bigram_probs):
    """Return the probability of a tokenized sentence under a bigram model.

    The result is the product of the probabilities of every adjacent token
    pair in ``sentence``.

    Args:
        sentence: Sequence of tokens, including any boundary markers.
        bigram_probs: Mapping from ``(w1, w2)`` tuples to P(w2 | w1).

    Returns:
        The product of the bigram probabilities. Returns 0.0 as soon as a
        pair is missing from ``bigram_probs`` or has probability zero, and
        1.0 when the sentence has fewer than two tokens (no pairs to score).
    """
    running_product = 1.0
    for pair in zip(sentence, sentence[1:]):
        pair_prob = bigram_probs.get(pair, 0.0)
        # An unseen or zero-probability bigram zeroes the whole product,
        # so stop early instead of multiplying through.
        if pair_prob == 0.0:
            return 0.0
        running_product *= pair_prob
    return running_product

# 4. Score both test sentences with the bigram model and report which one
#    the model prefers (the one with the higher probability).
sentence1 = ['<s>', 'I', 'love', 'NLP', '</s>']
sentence2 = ['<s>', 'I', 'love', 'deep', 'learning', '</s>']

prob1, prob2 = (
    calculate_sentence_probability(tokens, bigram_probs)
    for tokens in (sentence1, sentence2)
)

print(
    "Sentence Probabilities:",
    f"P(S1) = {prob1}",
    f"P(S2) = {prob2}",
    "",
    sep="\n",
)

# Pick the message first, then print once.
if prob1 > prob2:
    verdict = "Model prefers S1 because it has higher probability."
elif prob2 > prob1:
    verdict = "Model prefers S2 because it has higher probability."
else:
    verdict = "The model assigns equal probability to both sentences."
print(verdict)

Unigram Counts:
<s> : 3
I : 2
love : 2
NLP : 1
</s> : 3
deep : 2
learning : 2
is : 1
fun : 1

Bigram Counts:
('<s>', 'I') : 2
('I', 'love') : 2
('love', 'NLP') : 1
('NLP', '</s>') : 1
('love', 'deep') : 1
('deep', 'learning') : 2
('learning', '</s>') : 1
('<s>', 'deep') : 1
('learning', 'is') : 1
('is', 'fun') : 1
('fun', '</s>') : 1

Bigram Probabilities (MLE):
('<s>', 'I') : 0.667
('I', 'love') : 1.000
('love', 'NLP') : 0.500
('NLP', '</s>') : 1.000
('love', 'deep') : 0.500
('deep', 'learning') : 1.000
('learning', '</s>') : 0.500
('<s>', 'deep') : 0.333
('learning', 'is') : 0.500
('is', 'fun') : 1.000
('fun', '</s>') : 1.000

Sentence Probabilities:
P(S1) = 0.3333333333333333
P(S2) = 0.16666666666666666

Model prefers S1 because it has higher probability.
