In [30]:
from collections import Counter
from collections import defaultdict
from math import log10, pow

In [31]:
def file_read(filename: str):
    """Read a text file and return one token list per line.

    Each line is stripped of surrounding whitespace, split on single space
    characters, and wrapped with the sentence markers "<st>" (first element)
    and "<sp>" (last element).

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of string tokens per line of the file.
    """
    with open(filename, 'r') as handle:
        stripped_lines = [raw.strip() for raw in handle.readlines()]
    # Splitting on " " (not arbitrary whitespace) keeps empty tokens wherever
    # two spaces are adjacent.
    return [["<st>", *text.split(" "), "<sp>"] for text in stripped_lines]

# Preprocessing: read the training file as lists of tokens
lines = file_read("train.txt")

# Lowercase every token of every sentence
lines = [list(map(str.lower, sentence)) for sentence in lines]

In [32]:
#Unigram counter function
def unigram_count(lines: list[list]):
    """Count how many times each token occurs across all sentences.

    Args:
        lines: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A Counter mapping each token to its number of occurrences; keys are
        inserted in order of first appearance.
    """
    unigram_counter = Counter()
    for word_lines in lines:
        # update() adds counts in place. `counter += Counter(...)` gives the
        # same counts but re-filters the whole accumulated counter on every
        # sentence, which makes the loop needlessly quadratic.
        unigram_counter.update(word_lines)
    return unigram_counter

unigram_counter = unigram_count(lines)

# Report every word with its count, followed by the total number of tokens
total_word_count = sum(unigram_counter.values())
for word in unigram_counter:
    print(word + ":", unigram_counter[word])
print("total word count:", total_word_count)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
credit: 15
incedentals: 1
50.00: 1
debit: 4
200: 11
clearly: 13
mistake: 14
rudest: 1
person: 29
spoken: 1
give: 26
him: 14
bank: 4
account: 10
routing: 2
ss: 1
authorization: 2
code: 1
care: 21
smart: 1
information: 10
anyone: 25
yet: 16
alone: 5
already: 18
taken: 12
should: 35
corporate: 6
situation: 11
credited: 2
questioned: 1
didnt: 9
answer: 4
accounts: 1
sake: 1
moment: 7
stepped: 1
entrance: 13
luxury: 6
began: 4
minute: 22
@: 6
coolest: 2
separate: 17
toilet: 24
desserts: 1
cheese: 5
eno: 1
yummy: 5
w: 6
accommodating: 6
polite: 11
23rd: 3
plasma: 5
toiletries: 10
closed: 14
umbrella: 3
handy: 5
rained: 1
connection: 4
9.95/24hrs: 1
gorgeous: 8
makeover: 2
gaping: 1
hole: 5
kid: 2
mauled: 1
giant: 3
earthworm: 1
severely: 3
chipped: 2
ok: 18
dont: 5
mind: 12
slumming: 1
just-out-of-college-sister: 1
apartment: 4
170+: 1
stuff: 11
retrieved: 1
dumpster: 1
drive: 8
curt: 2
isnt: 1
job: 11
horribly: 3
star: 50
--: 

In [33]:
# Unigram probabilities: each word's count divided by the total token count
p_unigram = {item: count/total_word_count for item, count in unigram_counter.items()}

for word in p_unigram:
    print(word + ":", p_unigram[word])


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
handed: 5.5121929708515235e-05
credit: 0.0001653657891255457
incedentals: 1.1024385941703048e-05
50.00: 1.1024385941703048e-05
debit: 4.409754376681219e-05
200: 0.00012126824535873351
clearly: 0.0001433170172421396
mistake: 0.00015434140318384265
rudest: 1.1024385941703048e-05
person: 0.0003197071923093884
spoken: 1.1024385941703048e-05
give: 0.0002866340344842792
him: 0.00015434140318384265
bank: 4.409754376681219e-05
account: 0.00011024385941703047
routing: 2.2048771883406096e-05
ss: 1.1024385941703048e-05
authorization: 2.2048771883406096e-05
code: 1.1024385941703048e-05
care: 0.00023151210477576398
smart: 1.1024385941703048e-05
information: 0.00011024385941703047
anyone: 0.00027560964854257615
yet: 0.00017639017506724877
alone: 5.5121929708515235e-05
already: 0.00019843894695065485
taken: 0.00013229263130043657
should: 0.00038585350795960664
corporate: 6.614631565021828e-05
situation: 0.00012126824535873351
credited: 

In [34]:
# Bigram counts
def bigram_count(lines: list):
    """Count adjacent token pairs in every sentence.

    Args:
        lines: List of sentences, each a sequence of tokens.

    Returns:
        A tuple (bigram_counter, total_bigram_count). bigram_counter is a
        nested defaultdict with bigram_counter[previous][current] holding the
        pair's count; total_bigram_count is the number of pairs seen.
    """
    bigram_counter = defaultdict(lambda: defaultdict(int))
    total_bigram_count = 0
    for sentence in lines:
        previous = None
        for position, current in enumerate(sentence):
            # The first token of a sentence has no predecessor to pair with.
            if position:
                bigram_counter[previous][current] += 1
                total_bigram_count += 1
            previous = current
    return bigram_counter, total_bigram_count


# Count adjacent token pairs over the sentences currently bound to `lines`
bigram_counter, total_bigram_count = bigram_count(lines)
print("total_bigram_count", total_bigram_count)
# Prints the entire nested count table
print(bigram_counter)

total_bigram_count 90196


In [35]:
# conditional probabilities for bigrams
def calculate_bigram_probabilities(bigram_counter: defaultdict, unigram_counter: defaultdict):
    """Turn bigram counts into conditional probabilities.

    For every observed pair, the probability of `post` given `pre` is
    count(pre, post) / unigram_counter[pre].

    Args:
        bigram_counter: Mapping pre -> {post: count}.
        unigram_counter: Mapping token -> count, indexed by every `pre` that
            has at least one follower.

    Returns:
        A nested defaultdict with p_bigram[pre][post] set for observed pairs;
        looking up an unobserved pair yields 0.0.
    """
    p_bigram = defaultdict(lambda: defaultdict(float))

    for pre, followers in bigram_counter.items():
        if not followers:
            # Nothing observed after `pre`: no row is created for it.
            continue
        pre_total = unigram_counter[pre]
        row = p_bigram[pre]
        for post, pair_count in followers.items():
            row[post] = pair_count/pre_total
    return p_bigram

# Conditional probabilities from the bigram and unigram counts
p_bigram = calculate_bigram_probabilities(bigram_counter, unigram_counter)
# Print bigram probabilities
# For each preceding word: the word, a newline, then its dict of next-word probabilities
for pre_word, dict_ in p_bigram.items():
    print(pre_word, "\n",dict_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
turns 
 defaultdict(<class 'float'>, {'out': 1.0})
non 
 defaultdict(<class 'float'>, {'smoker': 0.3333333333333333, 'smoking': 0.6666666666666666})
smoker 
 defaultdict(<class 'float'>, {'there': 1.0})
children 
 defaultdict(<class 'float'>, {'.': 0.25, 'and': 0.5, ',': 0.25})
hampton 
 defaultdict(<class 'float'>, {'inn': 1.0})
switching 
 defaultdict(<class 'float'>, {'back': 1.0})
error 
 defaultdict(<class 'float'>, {'and': 0.3333333333333333, 'which': 0.3333333333333333, '.': 0.3333333333333333})
owned 
 defaultdict(<class 'float'>, {'up': 1.0})
16thz 
 defaultdict(<class 'float'>, {'floor': 1.0})
happens 
 defaultdict(<class 'float'>, {'in': 0.3333333333333333, 'to': 0.3333333333333333, 'and': 0.3333333333333333})
lcd 
 defaultdict(<class 'float'>, {'screen': 0.5, 'tv': 0.5})
extrodinary 
 defaultdict(<class 'float'>, {'quality': 1.0})
~30 
 defaultdict(<class 'float'>, {'usd': 1.0})
usd 
 defaultdict(<class 'float

In [36]:
#Unknown Words Handling Part-1

# Replace every training token whose corpus frequency is <= 5 with '<unk>'.
# The sentence markers '<st>' and '<sp>' are never replaced. `lines` is
# modified in place.
#
# Frequencies are counted once, up front. The previous version called
# list.count() and list.index() and re-sliced the flat token list for every
# token, which is quadratic in corpus size; because a word with <= 5
# occurrences has all of them replaced and a word with more has none
# replaced, counting up front gives the same `lines` and `fileList`.
token_frequency = Counter(token for sentence in lines for token in sentence)
for sentence in lines:
    for j, token in enumerate(sentence):
        if token_frequency[token] <= 5 and token != '<st>' and token != '<sp>':
            sentence[j] = '<unk>'

# Flat list of the training tokens after replacement; file_read_val reads it
# to decide which validation/test tokens are known.
fileList = [token for sentence in lines for token in sentence]
print(lines)

[['<st>', 'i', 'booked', 'two', 'rooms', 'four', 'months', 'in', 'advance', 'at', 'the', 'talbott', '.', 'we', 'were', 'placed', 'on', 'the', 'top', 'floor', 'next', 'to', 'the', 'elevators', ',', 'which', 'are', 'used', 'all', 'night', 'long', '.', 'when', '<unk>', 'to', 'the', 'front', 'desk', ',', 'i', 'was', 'told', 'that', 'they', 'were', 'simply', '<unk>', 'my', 'request', 'for', 'an', 'upper', 'floor', ',', 'which', 'i', 'had', 'requested', 'for', 'a', 'better', 'view', '.', 'i', 'am', 'looking', 'at', 'a', 'brick', 'wall', ',', 'and', 'getting', 'no', 'sleep', '.', 'he', 'also', 'told', 'me', 'that', 'they', 'had', 'received', 'complaints', 'before', 'from', 'guests', 'on', 'the', '<unk>', 'floor', ',', 'and', 'were', 'aware', 'of', 'the', 'noise', 'problem', '.', 'why', 'then', 'did', 'they', 'place', 'us', 'on', 'this', 'floor', 'when', 'the', 'hotel', 'is', 'not', 'totally', 'booked', '?', 'a', 'request', 'for', 'an', 'upper', 'floor', 'does', 'not', '<unk>', '<unk>', 'someo

In [37]:
#Unknown Words Handling Part-2

# Number of tokens (including the '<st>'/'<sp>' markers) in the file most
# recently read by file_read_val.
total_word_count_in_test=0
def file_read_val(filename: str):
    """Read a validation/test file and map out-of-vocabulary tokens to '<unk>'.

    Each line is stripped, lowercased, split on single spaces and wrapped
    with '<st>' / '<sp>'. Any token that is not one of those two markers and
    does not occur in the module-level `fileList` is replaced by '<unk>'.

    Side effect: sets the global `total_word_count_in_test` to the number of
    tokens (markers included) in this file. The count is reset on every call;
    previously it kept accumulating across calls, so reading a second file
    left a total that matched neither file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of string tokens per line of the file.
    """
    global total_word_count_in_test
    total_word_count_in_test = 0
    # Build the vocabulary set once: testing membership against the flat
    # token list costs a full scan for every token.
    vocabulary = set(fileList)
    #Preprocessing on validation/test set
    with open(filename, 'r') as f:
        raw_lines = [x.strip().lower() for x in f.readlines()]
    sentences = []
    for raw in raw_lines:
        tokens = ["<st>"] + raw.split(" ") + ["<sp>"]
        total_word_count_in_test += len(tokens)
        #Replacing words which are not in training set with '<unk>' in validation/test set
        sentences.append([
            token if token in ('<st>', '<sp>') or token in vocabulary else '<unk>'
            for token in tokens
        ])
    return sentences

# Load the validation set; tokens not found in the training token list become '<unk>'
val_lines = file_read_val("val.txt")
# Prints the complete list of validation sentences
print(val_lines)

[['<st>', 'i', 'stayed', 'for', 'four', 'nights', 'while', 'attending', 'a', 'conference', '.', 'the', 'hotel', 'is', 'in', 'a', 'great', 'spot', '-', 'easy', 'walk', 'to', 'michigan', 'ave', 'shopping', 'or', 'rush', '<unk>', ',', 'but', 'just', 'off', 'the', 'busy', '<unk>', '.', 'the', 'room', 'i', 'had', 'was', 'spacious', ',', 'and', 'very', '<unk>', '.', 'the', 'staff', 'was', 'friendly', ',', 'and', 'the', 'fitness', 'center', ',', 'while', 'not', 'huge', ',', 'was', '<unk>', 'and', 'clean', '.', 'i', "'ve", 'stayed', 'at', 'a', 'number', 'of', 'hotels', 'in', 'chicago', ',', 'and', 'this', 'one', 'is', 'my', 'favorite', '.', 'internet', 'was', "n't", 'free', ',', 'but', 'at', '$', '10', 'for', '24', 'hours', 'is', '<unk>', 'than', 'most', 'business', 'hotels', ',', 'and', 'it', 'worked', 'very', 'well', '.', '<sp>'], ['<st>', 'we', 'love', 'the', 'location', 'and', '<unk>', 'to', 'everything', '.', 'the', 'staff', 'was', 'very', 'friendly', 'and', 'courteous', '.', 'they', 'wer

In [38]:
# Unigram probabilities with Add-1 smoothing
def unigram_count_smoothing(lines: list[list]):
    """Count how many times each token occurs across all sentences.

    Despite the name, no smoothing is applied here: the function only
    produces raw counts, and smoothing is applied where the probabilities
    are computed from them.

    Args:
        lines: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A Counter mapping each token to its number of occurrences; keys are
        inserted in order of first appearance.
    """
    unigram_counter = Counter()
    for word_lines in lines:
        # update() adds counts in place. `counter += Counter(...)` gives the
        # same counts but re-filters the whole accumulated counter on every
        # sentence, which makes the loop needlessly quadratic.
        unigram_counter.update(word_lines)
    return unigram_counter

# Recount on `lines` as currently bound (after cell In [36] it holds the
# <unk>-replaced training sentences)
unigram_counter = unigram_count_smoothing(lines)
total_word_count = sum(unigram_counter.values())

# Unsmoothed estimates; len(p_unigram) is used below as the vocabulary size
p_unigram = {item: count/total_word_count for item, count in unigram_counter.items()}

# Add-1 estimate: (count + 1) / (total tokens + vocabulary size)
p_unigram_with_add_1 = {
    item: (count+1)/(total_word_count+len(p_unigram))
    for item, count in unigram_counter.items()
}

for word in p_unigram_with_add_1:
    print(word + ":", p_unigram_with_add_1[word])

<st>: 0.0055760869565217395
i: 0.018554347826086958
booked: 0.0009456521739130435
two: 0.0014021739130434783
rooms: 0.0021956521739130434
four: 0.0002282608695652174
months: 9.782608695652174e-05
in: 0.013695652173913043
advance: 8.695652173913044e-05
at: 0.008108695652173913
the: 0.057532608695652174
talbott: 0.0003152173913043478
.: 0.051010869565217394
we: 0.012141304347826088
were: 0.0062934782608695655
placed: 9.782608695652174e-05
on: 0.006967391304347826
top: 0.00044565217391304347
floor: 0.0015
next: 0.0011739130434782609
to: 0.02272826086956522
elevators: 0.00035869565217391304
,: 0.03206521739130435
which: 0.0019239130434782609
are: 0.0033804347826086956
used: 0.000391304347826087
all: 0.003032608695652174
night: 0.002173913043478261
long: 0.0004347826086956522
when: 0.0029782608695652175
<unk>: 0.09303260869565218
front: 0.0013695652173913043
desk: 0.0017391304347826088
was: 0.019858695652173915
told: 0.0008478260869565217
that: 0.007554347826086956
they: 0.00532608695652173

In [39]:
# Unigram probabilities with Add-k(For example, here we have taken k=0.5) smoothing
def add_k_unigram_probabilities(k):
    """Return add-k smoothed unigram probabilities.

    Reads the module-level `unigram_counter`, `total_word_count` and
    `p_unigram`; each word gets
    (count + k) / (total_word_count + k * len(p_unigram)).

    Args:
        k: Smoothing constant added to every count.

    Returns:
        A dict mapping each word in `unigram_counter` to its probability.
    """
    # The denominator does not depend on the word, so compute it once.
    denominator = total_word_count+(k*len(p_unigram))
    return {item: (count+k)/denominator for item, count in unigram_counter.items()}
# Show the estimates for k = 0.5
p_unigram_with_add_half = add_k_unigram_probabilities(0.5)
for word in p_unigram_with_add_half:
    print(word + ":", p_unigram_with_add_half[word])

<st>: 0.005610044442498413
i: 0.018680079689997155
booked: 0.0009468660376119272
two: 0.001406616021192285
rooms: 0.002205705278367669
four: 0.0002244017776999365
months: 9.304463953412001e-05
in: 0.01378702629332049
advance: 8.20982113536353e-05
at: 0.00816056220855135
the: 0.05793397114521532
talbott: 0.00031197320314381416
.: 0.05136611423692449
we: 0.012221687063511176
were: 0.006332508702410404
placed: 9.304463953412001e-05
on: 0.007011187249600455
top: 0.00044333034130963065
floor: 0.0015051338748166474
next: 0.001176741029402106
to: 0.022883508111303283
elevators: 0.000355758915865753
,: 0.03228648991833965
which: 0.001932044573855551
are: 0.0033988659500405016
used: 0.00038859820040720714
all: 0.003048580248264991
night: 0.0021838124220066994
long: 0.00043238391312914597
when: 0.0029938481073625675
<unk>: 0.09368500558267837
front: 0.0013737767366508308
desk: 0.0017459552947873109
was: 0.01999365107165532
told: 0.0008483481839875648
that: 0.0076022943713466296
they: 0.005358276

In [40]:
# Bigram probabilities with Add-k(default k=1) smoothing
def bigram_count_smoothing(lines: list):
    """Count adjacent token pairs in every sentence (raw counts only).

    Args:
        lines: List of sentences, each a list of tokens.

    Returns:
        A tuple (bigram_counter, total_bigram_count). bigram_counter is a
        nested defaultdict with bigram_counter[previous][current] holding the
        pair's count; total_bigram_count is the number of pairs seen.
    """
    bigram_counter = defaultdict(lambda: defaultdict(int))
    total_bigram_count = 0
    for sentence in lines:
        # Pair each token with its successor.
        for previous, current in zip(sentence, sentence[1:]):
            bigram_counter[previous][current] += 1
            total_bigram_count += 1
    return bigram_counter, total_bigram_count


# Recount bigrams on `lines`, which cell In [36] modified in place by the '<unk>' replacement
bigram_counter, total_bigram_count = bigram_count_smoothing(lines)


def calculate_bigram_probabilities_smoothing(bigram_counter: defaultdict, unigram_counter: defaultdict,k:float=1, vocab_size=None):
    """Compute add-k smoothed conditional probabilities for observed bigrams.

    For every pair present in `bigram_counter`:
        P(post | pre) = (count(pre, post) + k) / (unigram_counter[pre] + k * V)

    Only observed pairs are stored; callers that need a probability for an
    unobserved pair must supply the k / (unigram_counter[pre] + k * V)
    fallback themselves.

    Args:
        bigram_counter: Mapping pre -> {post: count}.
        unigram_counter: Mapping token -> count.
        k: Smoothing constant added to every pair count (default 1).
        vocab_size: Vocabulary size V. Defaults to len(unigram_counter).
            Earlier this was read from the module-level `p_unigram`, a hidden
            dependency; `p_unigram` is built with one entry per key of
            `unigram_counter`, so the default is the same value whenever the
            two are in sync.

    Returns:
        A nested defaultdict with p_bigram[pre][post] set for observed pairs;
        looking up an unobserved pair yields 0.0.
    """
    if vocab_size is None:
        vocab_size = len(unigram_counter)
    p_bigram = defaultdict(lambda: defaultdict(float))

    for pre, post_dict in bigram_counter.items():
        for post, count in post_dict.items():
            p_bigram[pre][post] = (count+k)/(unigram_counter[pre]+(k*vocab_size))
    return p_bigram

# Smoothed bigram probabilities with k left at its default of 1
p_bigram = calculate_bigram_probabilities_smoothing(bigram_counter, unigram_counter)
# For each preceding word: the word, a newline, then its dict of next-word probabilities
for pre_word, dict_ in p_bigram.items():
    print(pre_word, "\n",dict_)

<st> 
 defaultdict(<class 'float'>, {'i': 0.06208425720620843, 'the': 0.027161862527716185, 'we': 0.04878048780487805, 'my': 0.024390243902439025, 'recently': 0.0011086474501108647, 'after': 0.004434589800443459, '<unk>': 0.006097560975609756, 'while': 0.0033259423503325942, 'this': 0.02106430155210643, 'just': 0.0066518847006651885, 'ambassador': 0.0011086474501108647, 'did': 0.0011086474501108647, 'simply': 0.0011086474501108647, 'booked': 0.0022172949002217295, 'even': 0.0011086474501108647, 'from': 0.0016629711751662971, 'what': 0.0022172949002217295, 'so': 0.0016629711751662971, 'having': 0.0016629711751662971, 'review': 0.0011086474501108647, 'stayed': 0.012749445676274944, 'for': 0.0011086474501108647, 'great': 0.0033259423503325942, 'do': 0.0016629711751662971, 'how': 0.0011086474501108647, 'beautiful': 0.0016629711751662971, 'attended': 0.0011086474501108647, 'first': 0.002771618625277162, 'it': 0.0016629711751662971, 'a': 0.0016629711751662971, 'visiting': 0.00110864745011086

In [41]:
# Calculate Perplexity for validation data using smoothed training data
def calculate_perplexity_bigram(prob_dict: defaultdict, lines,k:float=1):
    """Perplexity of `lines` under an add-k smoothed bigram model.

    Pairs missing from `prob_dict` get the add-k probability of an unseen
    bigram, k / (unigram_counter[prev] + k * len(p_unigram)), computed from
    the module-level `unigram_counter` and `p_unigram`.

    The average is taken over the number of bigrams actually scored in
    `lines`. The previous version divided by the global
    `total_word_count_in_test`, which also counts one '<st>' per sentence
    (and does not depend on `lines` at all), so it understated perplexity.

    Args:
        prob_dict: Mapping prev -> {cur: probability} for observed bigrams.
        lines: List of sentences, each a list of tokens.
        k: Smoothing constant the model was built with (default 1).

    Returns:
        10 ** (mean negative log10 probability per bigram).

    Raises:
        ValueError: If `lines` contains no bigram to score.
    """
    neg_log_prob = 0.0
    n_bigrams = 0
    for sentence in lines:
        for i in range(1, len(sentence)):
            prev, cur = sentence[i-1], sentence[i]
            # .get() avoids inserting empty rows into a defaultdict model.
            row = prob_dict.get(prev)
            prob = row.get(cur) if row is not None else None
            if prob is None:
                prob = k/(unigram_counter[prev]+(k*len(p_unigram)))
            neg_log_prob += -log10(prob)
            n_bigrams += 1
    if n_bigrams == 0:
        raise ValueError("cannot compute perplexity: no bigrams in input")
    return pow(10, neg_log_prob/n_bigrams)

def calculate_perplexity_unigram(prob_dict: dict, lines):
    """Perplexity of `lines` under a unigram model.

    Every token of every sentence is scored, including any '<st>'/'<sp>'
    markers present in the sentences. The average is taken over the tokens
    actually scored rather than over the global `total_word_count_in_test`,
    so the result depends only on the arguments.

    Args:
        prob_dict: Mapping token -> probability; must contain every token
            that occurs in `lines`.
        lines: List of sentences, each a list of tokens.

    Returns:
        10 ** (mean negative log10 probability per token).

    Raises:
        KeyError: If a token is missing from `prob_dict`.
        ValueError: If `lines` contains no tokens.
    """
    neg_log_prob = 0.0
    n_tokens = 0
    for sentence in lines:
        for token in sentence:
            neg_log_prob += -log10(prob_dict[token])
            n_tokens += 1
    if n_tokens == 0:
        raise ValueError("cannot compute perplexity: no tokens in input")
    return pow(10, neg_log_prob/n_tokens)
# Smoothing constants to compare
k_list = [100, 10, 5, 2, 0.1, 0.05, 0.01, 5e-3, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]

# Perplexity for Unigrams
laplace_perplexity_unigram = calculate_perplexity_unigram(p_unigram_with_add_1, val_lines)
addk_perplexity_list_unigram = [
    calculate_perplexity_unigram(add_k_unigram_probabilities(k), val_lines)
    for k in k_list
]

print("Perplexity for laplace for unigrams for validation data using smoothed training data", laplace_perplexity_unigram)
for k_value, addk_perplexity in zip(k_list, addk_perplexity_list_unigram):
    print("Perplexity for addk with k =", k_value, "for unigrams for validation data using smoothed training data:", addk_perplexity)

# Perplexity for Bigrams
laplace_perplexity_bigram = calculate_perplexity_bigram(
    calculate_bigram_probabilities_smoothing(bigram_counter, unigram_counter),
    val_lines,
)
# The same k is passed to the model builder and to the perplexity function
# (which uses it for bigrams missing from the model).
addk_perplexity_list_bigram = [
    calculate_perplexity_bigram(
        calculate_bigram_probabilities_smoothing(bigram_counter, unigram_counter, k),
        val_lines,
        k,
    )
    for k in k_list
]

print("Perplexity for laplace for bigrams for validation data using smoothed training data", laplace_perplexity_bigram)
for k_value, addk_perplexity in zip(k_list, addk_perplexity_list_bigram):
    print("Perplexity for addk with k =", k_value, "for bigrams for validation data using smoothed training data:", addk_perplexity)


Perplexity for laplace for unigrams for validation data using smoothed training data 194.20160566209827
Perplexity for addk with k = 100 for unigrams for validation data using smoothed training data: 281.6956820887556
Perplexity for addk with k = 10 for unigrams for validation data using smoothed training data: 199.63776712729762
Perplexity for addk with k = 5 for unigrams for validation data using smoothed training data: 196.0957725432855
Perplexity for addk with k = 2 for unigrams for validation data using smoothed training data: 194.55410700959075
Perplexity for addk with k = 0.1 for unigrams for validation data using smoothed training data: 193.98089643242923
Perplexity for addk with k = 0.05 for unigrams for validation data using smoothed training data: 193.97175997404952
Perplexity for addk with k = 0.01 for unigrams for validation data using smoothed training data: 193.9647080081497
Perplexity for addk with k = 0.005 for unigrams for validation data using smoothed training data:

In [42]:
#Eval, Analysis and Findings

# Calculate Perplexity for training set
def calculate_perplexity_bigram_for_training_data(prob_dict: defaultdict, lines):
    """Perplexity of `lines` under a bigram model that contains every pair in them.

    The average is taken over the number of bigrams actually scored. The
    previous version divided by the global `total_word_count`, a token count
    that includes one '<st>' per sentence, while only len(sentence) - 1 terms
    per sentence are summed; that understated perplexity.

    Args:
        prob_dict: Mapping prev -> {cur: probability}. It is indexed directly,
            so a nested-defaultdict model returns 0.0 for a pair it does not
            contain, which makes log10 raise.
        lines: List of sentences, each a list of tokens.

    Returns:
        10 ** (mean negative log10 probability per bigram).

    Raises:
        ValueError: If `lines` contains no bigram to score, or if a pair has
            probability 0 (math domain error from log10).
    """
    neg_log_prob = 0.0
    n_bigrams = 0
    for sentence in lines:
        for i in range(1, len(sentence)):
            neg_log_prob += -log10(prob_dict[sentence[i-1]][sentence[i]])
            n_bigrams += 1
    if n_bigrams == 0:
        raise ValueError("cannot compute perplexity: no bigrams in input")
    return pow(10, neg_log_prob/n_bigrams)

def calculate_perplexity_unigram_for_training_data(prob_dict: dict, lines):
    """Perplexity of `lines` under a unigram model.

    Every token of every sentence is scored. The average is taken over the
    tokens actually scored rather than over the global `total_word_count`,
    so the result depends only on the arguments.

    Args:
        prob_dict: Mapping token -> probability; must contain every token
            that occurs in `lines`.
        lines: List of sentences, each a list of tokens.

    Returns:
        10 ** (mean negative log10 probability per token).

    Raises:
        KeyError: If a token is missing from `prob_dict`.
        ValueError: If `lines` contains no tokens.
    """
    neg_log_prob = 0.0
    n_tokens = 0
    for sentence in lines:
        for token in sentence:
            neg_log_prob += -log10(prob_dict[token])
            n_tokens += 1
    if n_tokens == 0:
        raise ValueError("cannot compute perplexity: no tokens in input")
    return pow(10, neg_log_prob/n_tokens)
# Smoothing constants to compare
k_list = [100, 10, 5, 2, 0.1, 0.05, 0.01, 5e-3, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]

# Perplexity for Unigrams
laplace_perplexity_unigram_for_training_data = calculate_perplexity_unigram_for_training_data(p_unigram_with_add_1, lines)
addk_perplexity_list_unigram_for_training_data = [
    calculate_perplexity_unigram_for_training_data(add_k_unigram_probabilities(k), lines)
    for k in k_list
]

print("Perplexity for laplace for unigrams on training set", laplace_perplexity_unigram_for_training_data)
for k_value, addk_perplexity in zip(k_list, addk_perplexity_list_unigram_for_training_data):
    print("Perplexity for addk with k =", k_value, "for unigrams on training set:", addk_perplexity)

# Perplexity for Bigrams
laplace_perplexity_bigram_for_training_data = calculate_perplexity_bigram_for_training_data(
    calculate_bigram_probabilities_smoothing(bigram_counter, unigram_counter),
    lines,
)
addk_perplexity_list_bigram_for_training_data = [
    calculate_perplexity_bigram_for_training_data(
        calculate_bigram_probabilities_smoothing(bigram_counter, unigram_counter, k),
        lines,
    )
    for k in k_list
]

print("Perplexity for laplace for bigrams on training set", laplace_perplexity_bigram_for_training_data)
for k_value, addk_perplexity in zip(k_list, addk_perplexity_list_bigram_for_training_data):
    print("Perplexity for addk with k =", k_value, "for bigrams on training set:", addk_perplexity)

Perplexity for laplace for unigrams on training set 203.43529735042708
Perplexity for addk with k = 100 for unigrams on training set: 289.79828805958977
Perplexity for addk with k = 10 for unigrams on training set: 208.07759511935907
Perplexity for addk with k = 5 for unigrams on training set: 204.87434519371746
Perplexity for addk with k = 2 for unigrams on training set: 203.65059055149268
Perplexity for addk with k = 0.1 for unigrams on training set: 203.3566767889746
Perplexity for addk with k = 0.05 for unigrams on training set: 203.35603548400942
Perplexity for addk with k = 0.01 for unigrams on training set: 203.35582909968167
Perplexity for addk with k = 0.005 for unigrams on training set: 203.35582262815038
Perplexity for addk with k = 0.001 for unigrams on training set: 203.35582055565217
Perplexity for addk with k = 0.0001 for unigrams on training set: 203.3558204698648
Perplexity for addk with k = 1e-05 for unigrams on training set: 203.35582046876604
Perplexity for addk wit

In [43]:
#Eval, Analysis and Findings

# Calculate Perplexity for validation data using unsmoothed training data
def calculate_perplexity_bigram_using_unsmoothed_training_data(prob_dict: defaultdict, lines):
    """Perplexity of `lines` under an unsmoothed bigram model.

    An unsmoothed model gives probability 0 to any pair it has not seen. The
    previous version passed that 0 to log10 and crashed with a math domain
    error; a zero-probability event makes the perplexity infinite, so this
    version returns float("inf") instead. Lookups use .get() so a
    defaultdict model is not mutated.

    The average is taken over the number of bigrams actually scored, not the
    global `total_word_count_in_test` (which also counts '<st>' tokens).

    Args:
        prob_dict: Mapping prev -> {cur: probability} for observed bigrams.
        lines: List of sentences, each a list of tokens.

    Returns:
        10 ** (mean negative log10 probability per bigram), or float("inf")
        if any bigram in `lines` has probability 0 or is absent.

    Raises:
        ValueError: If `lines` contains no bigram to score.
    """
    neg_log_prob = 0.0
    n_bigrams = 0
    for sentence in lines:
        for i in range(1, len(sentence)):
            row = prob_dict.get(sentence[i-1])
            prob = row.get(sentence[i], 0.0) if row is not None else 0.0
            if prob <= 0:
                # One impossible bigram makes the whole corpus impossible.
                return float("inf")
            neg_log_prob += -log10(prob)
            n_bigrams += 1
    if n_bigrams == 0:
        raise ValueError("cannot compute perplexity: no bigrams in input")
    return pow(10, neg_log_prob/n_bigrams)

def calculate_perplexity_unigram_using_unsmoothed_training_data(prob_dict: dict, lines):
    """Perplexity of `lines` under an unsmoothed unigram model.

    Every token of every sentence is scored. A token that is missing from
    `prob_dict`, or has probability 0, is treated as a zero-probability
    event and makes the result float("inf") (previously a missing token
    raised KeyError and a zero probability crashed log10).

    The average is taken over the tokens actually scored rather than over the
    global `total_word_count_in_test`, so the result depends only on the
    arguments.

    Args:
        prob_dict: Mapping token -> probability.
        lines: List of sentences, each a list of tokens.

    Returns:
        10 ** (mean negative log10 probability per token), or float("inf")
        if any token has probability 0 or is absent.

    Raises:
        ValueError: If `lines` contains no tokens.
    """
    neg_log_prob = 0.0
    n_tokens = 0
    for sentence in lines:
        for token in sentence:
            prob = prob_dict.get(token, 0.0)
            if prob <= 0:
                return float("inf")
            neg_log_prob += -log10(prob)
            n_tokens += 1
    if n_tokens == 0:
        raise ValueError("cannot compute perplexity: no tokens in input")
    return pow(10, neg_log_prob/n_tokens)

def calculate_bigram_probabilities_without_smoothing(bigram_counter: defaultdict, unigram_counter: defaultdict):
    """Compute unsmoothed conditional probabilities for observed bigrams.

    Each observed pair gets count(pre, post) / unigram_counter[pre].

    Args:
        bigram_counter: Mapping pre -> {post: count}.
        unigram_counter: Mapping token -> count, indexed by every `pre` that
            has at least one follower.

    Returns:
        A nested defaultdict with p_bigram[pre][post] set for observed pairs;
        looking up an unobserved pair yields 0.0.
    """
    p_bigram = defaultdict(lambda: defaultdict(float))

    observed_pairs = (
        (pre, post, pair_count)
        for pre, followers in bigram_counter.items()
        for post, pair_count in followers.items()
    )
    for pre, post, pair_count in observed_pairs:
        p_bigram[pre][post] = pair_count/unigram_counter[pre]
    return p_bigram

# Perplexity for Unigrams
# Unsmoothed estimate: count / total token count
p_unigram_without_smoothing = {
    item: count/total_word_count for item, count in unigram_counter.items()
}
perplexity_unigram = calculate_perplexity_unigram_using_unsmoothed_training_data(p_unigram_without_smoothing, val_lines)

print("Perplexity for unigrams using unsmoothed training data", perplexity_unigram)

# Perplexity for Bigrams
# Below code is giving ValueError as counts of unseen bigrams are 0 in unsmoothed training set and these 0s are passed to log10() function
# perplexity_bigram = calculate_perplexity_bigram_using_unsmoothed_training_data(calculate_bigram_probabilities_without_smoothing(bigram_counter, unigram_counter), val_lines)

# print("Perplexity for bigrams using unsmoothed training data", perplexity_bigram)

Perplexity for unigrams using unsmoothed training data 193.9629810526624
