-
Notifications
You must be signed in to change notification settings - Fork 9
/
build_vocab.py
executable file
·88 lines (71 loc) · 2.4 KB
/
build_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
import argparse
from collections import Counter
#import pdb
import pickle
import re
import sys
# Command-line options for the vocabulary builder.
# Registered table-driven so flag, type, default and help stay aligned.
parser = argparse.ArgumentParser()
_ARG_SPECS = (
    ("--train", str, "", "training file"),
    ("--output", str, "vocab_dict.pkl", "output file"),
    ("--min_count", int, 5, "minimum frequency of a word"),
)
for _flag, _type, _default, _help in _ARG_SPECS:
    parser.add_argument(_flag, type=_type, default=_default, help=_help)
# Build the vocabulary.
def file_split(f, delim=' \t\n', bufsize=1024):
    """Lazily yield delimiter-separated tokens from file object *f*.

    Reads *bufsize* characters at a time so arbitrarily large files can
    be tokenized without loading them into memory.  A token that
    straddles a read boundary is carried over in ``prev`` until its end
    is seen in a later chunk.

    Args:
        f: an open text-mode file-like object supporting ``read(n)``.
        delim: characters treated as separators.  They are inserted into
            a regex character class verbatim, so regex-special characters
            would need escaping by the caller.
        bufsize: number of characters to read per chunk.

    Yields:
        str: each token in file order.  An empty string may be yielded
        when a chunk begins with a delimiter; callers should filter
        empty results (as build_vocab does with ``if word:``).
    """
    # Compile the splitter once; the original rebuilt the pattern string
    # (and re-parsed it) on every chunk read.  '+' == '{1,}' collapses
    # runs of consecutive delimiters.
    splitter = re.compile('[' + delim + ']+')
    prev = ''  # partial token carried across chunk boundaries
    while True:
        s = f.read(bufsize)
        if not s:
            break
        tokens = splitter.split(s)
        if len(tokens) > 1:
            # First piece completes the carried-over partial token; the
            # last piece may itself be incomplete, so carry it forward.
            yield prev + tokens[0]
            prev = tokens[-1]
            for tok in tokens[1:-1]:
                yield tok
        else:
            # No delimiter in this chunk: the whole chunk extends prev.
            prev += s
    if prev:
        # Flush the final token when the file does not end in a delimiter.
        yield prev
def _read_index_file(path):
    """Map each stripped line of *path* to its 0-based line number."""
    with open(path) as f:
        return {line.strip(): i for i, line in enumerate(f)}


def build_vocab(args):
    """Build the word vocabulary plus POS-tag and dep-relation index maps.

    Args:
        args: parsed namespace with attributes ``train`` (path to the
            training corpus, tokenized by whitespace) and ``min_count``
            (words seen fewer times than this are dropped).

    Returns:
        Tuple of:
            word2idx (dict): word -> rank index (0 = most frequent),
            word_list (list): kept words sorted by descending frequency,
            freq (dict): word -> raw count, for kept words only,
            pos2idx (dict): POS tag -> index, read from pos_tag_set.txt,
            dep2idx (dict): dependency relation -> index, read from
                dep_rel_set.txt.
    """
    vocab = Counter()
    word_count = 0
    # Stream the corpus token-by-token; the 'with' closes the handle
    # (the original passed an open() result to file_split and leaked it).
    with open(args.train) as train_f:
        for word in file_split(train_f):
            if word:
                vocab[word] += 1
                word_count += 1
                if word_count % 10000 == 0:
                    # Lightweight progress indicator: current vocab size.
                    sys.stdout.write('%d\r' % len(vocab))
    # Drop rare words, then rank survivors by descending frequency.
    freq = {k: v for k, v in vocab.items() if v >= args.min_count}
    word_count = sum(freq.values())
    word_list = sorted(freq, key=freq.get, reverse=True)
    word2idx = {w: i for i, w in enumerate(word_list)}
    print("Vocab size: %ld" % len(word2idx))
    print("Words in train file: %ld" % word_count)
    # FAQ page: https://nlp.stanford.edu/software/parser-faq.shtml
    # POS tags: https://catalog.ldc.upenn.edu/docs/LDC99T42/tagguid1.pdf
    # Dep rels: http://universaldependencies.org/u/dep/index.html
    pos2idx = _read_index_file('pos_tag_set.txt')
    dep2idx = _read_index_file('dep_rel_set.txt')
    return word2idx, word_list, freq, pos2idx, dep2idx
if __name__ == '__main__':
    args = parser.parse_args()
    print("Starting training using file %s" % args.train)
    # Record the training-file size on the args namespace so it travels
    # with the rest of the configuration.  'with' closes the handle
    # (the original opened it only to seek/tell and never closed it).
    with open(args.train) as train_file:
        train_file.seek(0, 2)  # seek to end-of-file to measure its size
        vars(args)['file_size'] = train_file.tell()
    # objs = word2idx, word_list, freq, pos2idx, dep2idx
    objs = build_vocab(args)
    with open(args.output, 'wb') as f:
        pickle.dump(objs, f, pickle.HIGHEST_PROTOCOL)