## Additional exercise #1  (CSE 628) 

Doubts:
1. Can an N-gram contain words from two different sentences?

In [2]:
import random
import string
from collections import Counter
from decimal import *

import nltk
import numpy as np
from nltk import bigrams, trigrams
from nltk.corpus import gutenberg

#Limit all Decimal arithmetic in this notebook to 6 significant digits
decimal_context = getcontext()
decimal_context.prec = 6

In [3]:
#Importing data: sentences of the corpus, skipping the first two
data = gutenberg.sents('austen-sense.txt')
data = data[2:]

#Normalise every sentence: strip punctuation characters from each token, drop
#tokens that end up empty, lower-case the rest and wrap the result in the
#<s> ... </s> sentence de-limiters
new_data = []
for sentence in data:
    stripped = [''.join(ch for ch in token if ch not in string.punctuation) for token in sentence]
    kept = [str(token.lower()) for token in stripped if token]
    new_data.append(['<s>'] + kept + ['</s>'])

#Training set: the first 1000 sentences; test set: the following 1000 sentences
train_set = new_data[:1000]
test_set = new_data[1000:2000]

print("Size of training set: {} sentences".format(len(train_set)))
print("Size of test set: {} sentences".format(len(test_set)))

Size of training set: 1000 sentences
Size of test set: 1000 sentences


In [4]:
#Flatten the training sentences into one token stream (de-limiters included)
train_words = [word for sentence in train_set for word in sentence]
num_words = len(train_words)

#NOTE: the n-grams are built over the flattened stream, so they also contain
#n-grams that span a sentence boundary, e.g. ('</s>', '<s>')
#Calculating bigrams
train_bigrams = list(bigrams(train_words))
#Calculating trigrams
train_trigrams = list(trigrams(train_words))

#Counting unigrams, bigrams and trigrams encountered in the training set.
#Counter tallies each list in a single pass (calling list.count() once per
#distinct item rescans the whole list every time, which is quadratic).
#The tallies are converted to plain dicts so that an unseen key raises KeyError.
uni_count = dict(Counter(train_words))
bi_count = dict(Counter(train_bigrams))
tri_count = dict(Counter(train_trigrams))

print("Number of unigrams: {}".format(len(uni_count)))
print("Number of bigrams: {}".format(len(bi_count)))
print("Number of trigrams: {}".format(len(tri_count)))


Number of unigrams: 2882
Number of bigrams: 14341
Number of trigrams: 21306


We can observe that the number of unique N-grams increases as the value of N increases. This is because longer word sequences repeat less often: for example, repeating 3-word combinations (trigrams) are rarer than repeating 2-word combinations (bigrams), which in turn are rarer than repeating single words (unigrams).

In [5]:
#Calculating MLE on test sentences using unigram model
test_uni_mle = []
for sent in test_set:
    uni_mle = 1
    #Skip the leading '<s>'; the closing '</s>' is scored like any other token
    for t in sent[1:]:
        #Direct dict membership is a hash lookup ('in uni_count.keys()' scans a list)
        if t not in uni_count:
            #A word never seen in training makes the whole sentence probability 0
            uni_mle = 0
            break
        #P(w) = count(w) / total number of training tokens
        uni_mle *= Decimal(uni_count[t])/Decimal(num_words)
    test_uni_mle.append(uni_mle)

print("# of test sentences that get a non-zero probability according to unigram model: {} (out of {})".format(
    np.count_nonzero(test_uni_mle), len(test_set)))

# of test sentences that get a non-zero probability according to unigram model: 388 (out of 1000)


In [6]:
#Calculating MLE on test sentences using bigram model
test_bi_mle = []
for sent in test_set:
    bi_mle = 1
    test_bigrams = list(bigrams(sent))
    for bg in test_bigrams:
        #Direct dict membership is a hash lookup ('in bi_count.keys()' scans a list)
        if bg not in bi_count:
            #A bigram never seen in training makes the whole sentence probability 0
            bi_mle = 0
            break
        #P(w2 | w1) = count(w1, w2) / count(w1)
        bi_mle *= Decimal(bi_count[bg])/Decimal(uni_count[bg[0]])
    test_bi_mle.append(bi_mle)

print("# of test sentences that get a non-zero probability according to bigram model: {} (out of {})".format(
    np.count_nonzero(test_bi_mle), len(test_set)))

# of test sentences that get a non-zero probability according to bigram model: 29 (out of 1000)


In [7]:
#Calculating MLE on test sentences using trigram model
test_tri_mle = []
for sent in test_set:
    test_trigrams = list(trigrams(sent))
    #A sentence with fewer than 3 tokens has no trigrams: it scores 0, but it is
    #still recorded so that test_tri_mle stays aligned with test_set
    tri_mle = 1 if test_trigrams else 0
    for tg in test_trigrams:
        #Direct dict membership is a hash lookup ('in tri_count.keys()' scans a list)
        if tg not in tri_count:
            #A trigram never seen in training makes the whole sentence probability 0
            tri_mle = 0
            break
        #P(w3 | w1, w2) = count(w1, w2, w3) / count(w1, w2)
        tri_mle *= Decimal(tri_count[tg])/Decimal(bi_count[(tg[0], tg[1])])
    test_tri_mle.append(tri_mle)

print("# of test sentences that get a non-zero probability according to trigram model: {} (out of {})".format(
    np.count_nonzero(test_tri_mle), len(test_set)))

# of test sentences that get a non-zero probability according to trigram model: 16 (out of 1000)


In [32]:
#Making up 5 non-sensical sentences from training set and calculating their MLE estimates
#A dedicated, seeded generator makes the sentences reproducible on every run
#without touching the state of the global random module
RANDOM_SENTENCE_SEED = 628
sentence_rng = random.Random(RANDOM_SENTENCE_SEED)

all_rands = []
#Words are drawn from the training tokens, excluding the sentence de-limiters
sample_words = [w for w in train_words if w != '<s>' and w != '</s>']
print("Generated random sentences:")
for _ in range(5):
    #Select sentence length at random (5 to 9 words)
    sent_len = sentence_rng.randint(5, 9)
    #Draw that many tokens from distinct positions of the training stream
    rand_sent = ['<s>'] + sentence_rng.sample(sample_words, sent_len) + ['</s>']
    all_rands.append(rand_sent)
    print(' '.join(rand_sent))

Generated random sentences:
<s> for time never each atone </s>
<s> which future on them assurance of a general </s>
<s> exhausted tendency either long death think relative to </s>
<s> pencil be elinor of sure always john benefit would </s>
<s> he the as we address persuading </s>


In [33]:
#Unigram MLE of every random sentence
rand_uni_mle = []
for sent in all_rands:
    uni_mle = 1
    #Skip the leading '<s>'; the closing '</s>' is scored like any other token
    for t in sent[1:]:
        #Direct dict membership is a hash lookup ('in uni_count.keys()' scans a list)
        if t not in uni_count:
            uni_mle = 0
            break
        #P(w) = count(w) / total number of training tokens
        uni_mle *= Decimal(uni_count[t])/Decimal(num_words)
    rand_uni_mle.append(uni_mle)
#One numbered line per sentence, however many sentences there are
print("Unigram MLEs for the random sentences: \n" +
      ''.join("{}. {}\n".format(n, p) for n, p in enumerate(rand_uni_mle, 1)))

#Bigram MLE of every random sentence
rand_bi_mle = []
for sent in all_rands:
    bi_mle = 1
    rand_bigrams = list(bigrams(sent))
    for bg in rand_bigrams:
        #Direct dict membership is a hash lookup ('in bi_count.keys()' scans a list)
        if bg not in bi_count:
            #A bigram never seen in training makes the whole sentence probability 0
            bi_mle = 0
            break
        #P(w2 | w1) = count(w1, w2) / count(w1)
        bi_mle *= Decimal(bi_count[bg])/Decimal(uni_count[bg[0]])
    rand_bi_mle.append(bi_mle)

#One numbered line per sentence, however many sentences there are
print("Bigram MLEs for the random sentences: \n" +
      ''.join("{}. {}\n".format(n, p) for n, p in enumerate(rand_bi_mle, 1)))

#Trigram MLE of every random sentence
rand_tri_mle = []
for sent in all_rands:
    rand_trigrams = list(trigrams(sent))
    #A sentence with fewer than 3 tokens has no trigrams: it scores 0, but it is
    #still recorded so that rand_tri_mle stays aligned with all_rands
    tri_mle = 1 if rand_trigrams else 0
    for tg in rand_trigrams:
        #Direct dict membership is a hash lookup ('in tri_count.keys()' scans a list)
        if tg not in tri_count:
            #A trigram never seen in training makes the whole sentence probability 0
            tri_mle = 0
            break
        #P(w3 | w1, w2) = count(w1, w2, w3) / count(w1, w2)
        tri_mle *= Decimal(tri_count[tg])/Decimal(bi_count[(tg[0], tg[1])])
    rand_tri_mle.append(tri_mle)

#One numbered line per sentence, however many sentences there are
print("Trigram MLEs for the random sentences: \n" +
      ''.join("{}. {}\n".format(n, p) for n, p in enumerate(rand_tri_mle, 1)))


Unigram MLEs for the random sentences: 
1. 1.60733E-17
2. 6.82827E-23
3. 5.22307E-30
4. 6.96985E-28
5. 1.87249E-18

Bigram MLEs for the random sentences: 
1. 0
2. 0
3. 0
4. 0
5. 0

Trigram MLEs for the random sentences: 
1. 0
2. 0
3. 0
4. 0
5. 0



In [None]:
#(I)
#Implementing add-1 smoothing with unigram model. V = #train plus test words
#V: number of distinct word types seen in the training and test sets combined
V = len(set(train_words).union(w for sent in test_set for w in sent))

add1_uni_mle = []
for sent in test_set:
    uni_mle = 1
    #Skip the leading '<s>'; the closing '</s>' is scored like any other token
    for t in sent[1:]:
        #Add-1 estimate: (count(w) + 1) / (N + V); an unseen word has count 0,
        #so it gets a small non-zero probability instead of zeroing the sentence
        uni_cnt = uni_count.get(t, 0) + 1
        uni_mle *= Decimal(uni_cnt)/(Decimal(num_words)+V)
    add1_uni_mle.append(uni_mle)

print("# of test sentences that get a non-zero probability according to unigram model with add-1 smoothing: {} (out of {})".format(
    np.count_nonzero(add1_uni_mle), len(test_set)))