-
Notifications
You must be signed in to change notification settings - Fork 1
/
build_vocabulary.py
33 lines (28 loc) · 1.34 KB
/
build_vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json
from preprocessing import preprocess
# Base names of the corpus JSON files. main() builds a vocabulary for every
# entry except the last one (recipe_links).
fileNames = [
    'ingredients',
    'directions',
    'dish_names',
    'recipe_links',
]
def main():
    '''
    Builds the stemmed and unstemmed vocabularies for 3 corpora.

    For every name in ``fileNames`` except the last one, loads
    ``corpus/<name>.json``, passes each entry to ``preprocess`` twice (once
    with its defaults, once with ``stem=False``) and writes the unique
    tokens to ``vocabulary/<name>.json`` and
    ``vocabulary/<name>_autocomplete.json`` respectively.

    All paths are relative to the current working directory; the
    ``vocabulary`` directory must already exist.
    '''
    # flush explicitly: with no trailing newline the message could otherwise
    # stay in the stdout buffer until 'Done.' is printed
    print('Building Vocabulary...', end=' ', flush=True)
    for fileName in fileNames[:-1]:  # we don't need recipe_links
        vocab, autocompleteVocab = set(), set()  # one stemmed, one unstemmed
        # explicit encoding so reading does not depend on the platform locale
        with open("corpus/{}.json".format(fileName), 'r',
                  encoding='utf-8') as f:
            stringList = json.load(f)
        for text in stringList:
            vocab.update(preprocess(text))
            autocompleteVocab.update(preprocess(text, stem=False))
        # A set isn't JSON serializable; sorting also makes the output files
        # identical from run to run (assumes tokens are strings).
        with open('vocabulary/{}.json'.format(fileName), 'w',
                  encoding='utf-8') as vocabFile:
            json.dump(sorted(vocab), vocabFile)
        with open('vocabulary/{}_autocomplete.json'.format(fileName), 'w',
                  encoding='utf-8') as vocabFile:
            json.dump(sorted(autocompleteVocab), vocabFile)
    print('Done.')
# Build the vocabularies only when executed as a script, not when imported.
if __name__ == "__main__":
    main()