import sys
import nltk
import math
import time
import numpy
START_SYMBOL = '*'
STOP_SYMBOL = 'STOP'
RARE_SYMBOL = '_RARE_'
RARE_WORD_MAX_FREQ = 5
LOG_PROB_OF_ZERO = -1000
# TODO: IMPLEMENT THIS FUNCTION
# Receives a list of tagged sentences and processes each sentence to generate a list of words and a list of tags.
# Each sentence is a string of space-separated "WORD/TAG" tokens, with a newline character at the end.
# Remember to include start and stop symbols in your returned lists, as defined by the constants START_SYMBOL and STOP_SYMBOL.
# brown_words (the list of words) should be a list where every element is a list of the words of a particular sentence.
# brown_tags (the list of tags) should be a list where every element is a list of the tags of a particular sentence.
def split_wordtags(brown_train):
brown_words = []
brown_tags = []
START = START_SYMBOL+'/'+START_SYMBOL
STOP = STOP_SYMBOL+'/'+STOP_SYMBOL
for item in brown_train:
words_tmp = []
tags_tmp = []
sentence = [START, START] + item.strip().split(' ')+[STOP]
for token in sentence:
words_tmp.append(token.rsplit('/',1)[0])
tags_tmp.append(token.rsplit('/',1)[1])
brown_words.append(words_tmp)
brown_tags.append(tags_tmp)
return brown_words, brown_tags
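# A minimal sanity check for split_wordtags, kept out of the assignment pipeline.
# The toy "WORD/TAG" sentence below is an assumption for illustration, not Brown data.
def _demo_split_wordtags():
    words, tags = split_wordtags(["The/DET dog/NOUN barks/VERB\n"])
    assert words == [[START_SYMBOL, START_SYMBOL, 'The', 'dog', 'barks', STOP_SYMBOL]]
    assert tags == [[START_SYMBOL, START_SYMBOL, 'DET', 'NOUN', 'VERB', STOP_SYMBOL]]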
# TODO: IMPLEMENT THIS FUNCTION
# This function takes tags from the training data and calculates tag trigram probabilities.
# It returns a python dictionary where the keys are tuples that represent the tag trigram, and the values are the base-2 log probability of that trigram.
def calc_trigrams(brown_tags):
q_values = {}
bigram_count = {}
trigram_count = {}
for item in brown_tags:
bigram_tmp = nltk.bigrams(item)
trigram_tmp = nltk.trigrams(item)
for bigram in bigram_tmp:
if bigram in bigram_count:
bigram_count[bigram] += 1
else:
bigram_count[bigram] = 1
for trigram in trigram_tmp:
if trigram in trigram_count:
trigram_count[trigram] += 1
else:
trigram_count[trigram] =1
for trigram in trigram_count:
q_values[trigram] = math.log(trigram_count[trigram], 2) - math.log(bigram_count[trigram[:2]],2)
return q_values
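# A toy illustration of calc_trigrams (assumed tag sequence, not assignment data):
# with a single training sentence every observed trigram is conditionally certain,
# so its base-2 log probability is exactly 0.0.
def _demo_calc_trigrams():
    q = calc_trigrams([[START_SYMBOL, START_SYMBOL, 'DET', 'NOUN', 'VERB', STOP_SYMBOL]])
    assert q[(START_SYMBOL, START_SYMBOL, 'DET')] == 0.0
    assert q[('DET', 'NOUN', 'VERB')] == 0.0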
# This function takes output from calc_trigrams() and outputs it in the proper format
def q2_output(q_values, filename):
outfile = open(filename, "w")
    trigrams = sorted(q_values.keys())
for trigram in trigrams:
output = " ".join(['TRIGRAM', trigram[0], trigram[1], trigram[2], str(q_values[trigram])])
outfile.write(output + '\n')
outfile.close()
# TODO: IMPLEMENT THIS FUNCTION
# Takes the words from the training data and returns a set of all of the words that occur more than 5 times (use RARE_WORD_MAX_FREQ)
# brown_words is a python list where every element is a python list of the words of a particular sentence.
# Note: words that appear exactly 5 times should be considered rare!
def calc_known(brown_words):
known_words = set([])
word_count = {}
for item in brown_words:
for word in item:
if word in word_count:
word_count[word] += 1
else:
word_count[word] = 1
for item in word_count:
if word_count[item] > RARE_WORD_MAX_FREQ:
known_words.add(item)
return known_words
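# A small sanity check for calc_known on assumed toy counts: a word must occur
# strictly more than RARE_WORD_MAX_FREQ (5) times to count as known.
def _demo_calc_known():
    sentences = [['the'] * 6 + ['aardvark'] * 5]
    assert calc_known(sentences) == set(['the'])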
# TODO: IMPLEMENT THIS FUNCTION
# Takes the words from the training data and a set of words that should not be replaced with '_RARE_'
# Returns the equivalent of brown_words, but with unknown words replaced by '_RARE_' (use the RARE_SYMBOL constant)
def replace_rare(brown_words, known_words):
brown_words_rare = []
for item in brown_words:
tmp = []
for word in item:
if word in known_words:
tmp.append(word)
else:
tmp.append(RARE_SYMBOL)
brown_words_rare.append(tmp)
return brown_words_rare
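# A small sanity check for replace_rare on assumed toy data: any word outside
# known_words is replaced by the RARE_SYMBOL placeholder.
def _demo_replace_rare():
    assert replace_rare([['the', 'aardvark']], set(['the'])) == [['the', RARE_SYMBOL]]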
# This function takes the output from replace_rare and outputs it to a file
def q3_output(rare, filename):
outfile = open(filename, 'w')
for sentence in rare:
outfile.write(' '.join(sentence[2:-1]) + '\n')
outfile.close()
# TODO: IMPLEMENT THIS FUNCTION
# Calculates emission probabilities and creates a set of all possible tags
# The first return value is a python dictionary where each key is a tuple in which the first element is a word
# and the second is a tag, and the value is the base-2 log probability of the emission of the word given the tag
# The second return value is a set of all possible tags for this data set
def calc_emission(brown_words_rare, brown_tags):
e_values = {}
taglist = set([])
tag_count = {}
word_tag_count = {}
for i in xrange(len(brown_tags)):
sentence = brown_words_rare[i]
tags = brown_tags[i]
for j in xrange(len(tags)):
word = sentence[j]
tag = tags[j]
if (word, tag) in word_tag_count:
word_tag_count[(word,tag)] += 1
else:
word_tag_count[(word,tag)] =1
if tag in tag_count:
tag_count[tag] += 1
else:
tag_count[tag] = 1
for item in word_tag_count:
e_values[item] = math.log(word_tag_count[item],2) - math.log(tag_count[item[1]],2)
for item in tag_count:
taglist.add(item)
return e_values, taglist
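# A toy illustration of calc_emission (assumed data): if 'dog' is the only word
# ever tagged NOUN, then the base-2 log of P(dog | NOUN) is 0.0, and the returned
# tag set contains exactly the tags seen in training.
def _demo_calc_emission():
    e, tags = calc_emission([['dog', 'barks']], [['NOUN', 'VERB']])
    assert e[('dog', 'NOUN')] == 0.0
    assert tags == set(['NOUN', 'VERB'])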
# This function takes the output from calc_emission() and outputs it
def q4_output(e_values, filename):
outfile = open(filename, "w")
    emissions = sorted(e_values.keys())
for item in emissions:
output = " ".join([item[0], item[1], str(e_values[item])])
outfile.write(output + '\n')
outfile.close()
# TODO: IMPLEMENT THIS FUNCTION
# This function takes data to tag (brown_dev_words), a set of all possible tags (taglist), a set of all known words (known_words),
# trigram probabilities (q_values) and emission probabilities (e_values) and outputs a list where every element is a tagged sentence
# (in the WORD/TAG format, separated by spaces and with a newline at the end, just like our input tagged data)
# brown_dev_words is a python list where every element is a python list of the words of a particular sentence.
# taglist is a set of all possible tags
# known_words is a set of all known words
# q_values is from the return of calc_trigrams()
# e_values is from the return of calc_emission()
# The return value is a list of tagged sentences in the format "WORD/TAG", separated by spaces. Each sentence is a string with a
# terminal newline, not a list of tokens. Remember also that the output should not contain the "_RARE_" symbol, but rather the
# original words of the sentence!
def viterbi(brown_dev_words, taglist, known_words, q_values, e_values):
tagged = []
print len(brown_dev_words)
Pi_Init = {}
for u in taglist:
for v in taglist:
Pi_Init[(u,v)] = LOG_PROB_OF_ZERO
for item in brown_dev_words:
sentence = item + [STOP_SYMBOL]
n = len(sentence)
cur_len = 0
Path_Pre = {}
Path_Pre[(START_SYMBOL, START_SYMBOL)] = [START_SYMBOL, START_SYMBOL]
Bigram_Pre = [(START_SYMBOL, START_SYMBOL)]
Pi_Pre = {}
Pi_Pre[(START_SYMBOL, START_SYMBOL)] = 0
while cur_len < n:
Path_Cur = {}
Bigram_Cur = []
Pi_Cur = {}
if cur_len == n-1:
word = STOP_SYMBOL
tagspace = [STOP_SYMBOL]
else:
word = sentence[cur_len]
if word not in known_words:
word = RARE_SYMBOL
tagspace = list(taglist)
for v in tagspace:
emi_tmp = (word, v)
if emi_tmp not in e_values:
continue
for u in taglist:
w_tmp = ''
for w in taglist:
if (w,u) not in Bigram_Pre:
continue
trigram_cur = (w,u,v)
if trigram_cur not in q_values:
q_values[trigram_cur] = LOG_PROB_OF_ZERO
if (u,v) not in Pi_Cur:
Pi_Cur[(u,v)] = Pi_Pre[(w,u)]+q_values[trigram_cur]+e_values[emi_tmp]
w_tmp = w
elif Pi_Pre[(w,u)]+q_values[trigram_cur]+e_values[emi_tmp] > Pi_Cur[(u,v)]:
Pi_Cur[(u,v)] = Pi_Pre[(w,u)]+q_values[trigram_cur]+e_values[emi_tmp]
w_tmp = w
if w_tmp != '':
Path_Cur[(u,v)] = Path_Pre[(w_tmp,u)]+[v]
Bigram_Cur.append((u,v))
Pi_Pre = dict(Pi_Cur)
Bigram_Pre = list(Bigram_Cur)
Path_Pre = dict(Path_Cur)
cur_len += 1
st = ''
        # pick the best final tag bigram by its Viterbi log probability
        bigram_max = Bigram_Pre[0]
        for bigram in Bigram_Pre:
            if Pi_Pre[bigram] > Pi_Pre[bigram_max]:
                bigram_max = bigram
        # drop the two start symbols and the stop symbol from the best path before printing
        for i, tag in enumerate(Path_Pre[bigram_max][2:-1]):
st = st + sentence[i]+'/'+tag+' '
tagged.append(st.strip()+'\n')
if len(tagged) % 100 == 0:
print len(tagged)
return tagged
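# A tiny end-to-end check of viterbi on an assumed one-sentence model (not the
# Brown estimates): trained and decoded on the same words, the tagger should
# reproduce the training tags and restore the original words in the output.
def _demo_viterbi():
    words, tags = split_wordtags(["the/DET dog/NOUN barks/VERB\n"])
    q = calc_trigrams(tags)
    e, taglist = calc_emission(words, tags)
    known = set(['the', 'dog', 'barks'])
    out = viterbi([['the', 'dog', 'barks']], taglist, known, q, e)
    assert out == ['the/DET dog/NOUN barks/VERB\n']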
# This function takes the output of viterbi() and outputs it to file
def q5_output(tagged, filename):
outfile = open(filename, 'w')
for sentence in tagged:
outfile.write(sentence)
outfile.close()
# TODO: IMPLEMENT THIS FUNCTION
# This function uses nltk to create the taggers described in question 6
# brown_words and brown_tags are the data to be used in training
# brown_dev_words is the data that should be tagged
# The return value is a list of tagged sentences in the format "WORD/TAG", separated by spaces. Each sentence is a string with a
# terminal newline, not a list of tokens.
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
# Hint: use the following line to format data to what NLTK expects for training
training = [ zip(brown_words[i],brown_tags[i]) for i in xrange(len(brown_words)) ]
# IMPLEMENT THE REST OF THE FUNCTION HERE
tagged = []
t0 = nltk.DefaultTagger('NOUN')
t1 = nltk.BigramTagger(training, backoff = t0)
t2 = nltk.TrigramTagger(training, backoff = t1)
for item in brown_dev_words:
st = ''
tags = t2.tag(item)
for subitem in tags:
st = st+subitem[0]+'/'+subitem[1]+' '
tagged.append(st.strip()+'\n')
return tagged
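# A toy check of nltk_tagger on assumed data (requires nltk, which is already
# imported above): the bigram/trigram taggers back off to the 'NOUN' default for
# unseen contexts, so every output token still carries some tag.
def _demo_nltk_tagger():
    words = [[START_SYMBOL, START_SYMBOL, 'the', 'dog', 'barks', STOP_SYMBOL]]
    tags = [[START_SYMBOL, START_SYMBOL, 'DET', 'NOUN', 'VERB', STOP_SYMBOL]]
    out = nltk_tagger(words, tags, [['the', 'dog', 'barks']])
    assert len(out) == 1 and out[0].endswith('\n')
    assert all('/' in token for token in out[0].split())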
# This function takes the output of nltk_tagger() and outputs it to file
def q6_output(tagged, filename):
outfile = open(filename, 'w')
for sentence in tagged:
outfile.write(sentence)
outfile.close()
DATA_PATH = 'data/'
OUTPUT_PATH = 'output/'
def main():
# start timer
time.clock()
# open Brown training data
infile = open(DATA_PATH + "Brown_tagged_train.txt", "r")
brown_train = infile.readlines()
infile.close()
# split words and tags, and add start and stop symbols (question 1)
brown_words, brown_tags = split_wordtags(brown_train)
# calculate tag trigram probabilities (question 2)
q_values = calc_trigrams(brown_tags)
# question 2 output
q2_output(q_values, OUTPUT_PATH + 'B2.txt')
# calculate list of words with count > 5 (question 3)
known_words = calc_known(brown_words)
    # get a version of brown_words with rare words replaced with '_RARE_' (question 3)
brown_words_rare = replace_rare(brown_words, known_words)
# question 3 output
q3_output(brown_words_rare, OUTPUT_PATH + "B3.txt")
# calculate emission probabilities (question 4)
e_values, taglist = calc_emission(brown_words_rare, brown_tags)
# question 4 output
q4_output(e_values, OUTPUT_PATH + "B4.txt")
    # delete unnecessary data
del brown_train
del brown_words_rare
# open Brown development data (question 5)
infile = open(DATA_PATH + "Brown_dev.txt", "r")
brown_dev = infile.readlines()
infile.close()
# format Brown development data here
brown_dev_words = []
for sentence in brown_dev:
brown_dev_words.append(sentence.split(" ")[:-1])
# do viterbi on brown_dev_words (question 5)
viterbi_tagged = viterbi(brown_dev_words, taglist, known_words, q_values, e_values)
# question 5 output
q5_output(viterbi_tagged, OUTPUT_PATH + 'B5.txt')
# do nltk tagging here
nltk_tagged = nltk_tagger(brown_words, brown_tags, brown_dev_words)
# question 6 output
q6_output(nltk_tagged, OUTPUT_PATH + 'B6.txt')
# print total time to run Part B
print "Part B time: " + str(time.clock()) + ' sec'
if __name__ == "__main__": main()