build_corpus.py (forked from Kyubyong/wordvectors)
#!/usr/bin/python2
# coding: utf-8
import argparse
import codecs
import lxml.etree as ET
import os
import regex
# arguments setting
parser = argparse.ArgumentParser()
parser.add_argument('--lcode', help='ISO 639-1 code of target language. See `lcodes.txt`.')
parser.add_argument('--max_corpus_size', type=int, default=1000000000, help='the maximum size of the corpus. Feel free to adjust it according to your computing power.')
args = parser.parse_args()
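# Example invocation (assumed setup; the Wikipedia dump must already sit under data/, see `fname` below):
#   python2 build_corpus.py --lcode ko --max_corpus_size 1000000000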
lcode = args.lcode
if lcode == 'ko':
    from konlpy.tag import Kkma # pip install konlpy. See http://konlpy.org/en/v0.4.4/ for further information.
    kkma = Kkma()
    print "kkma successfully loaded!"
elif lcode == 'ja':
    import MeCab # See https://pypi.python.org/pypi/mecab-python/0.996
    mecab = MeCab.Tagger("-Owakati")
    print "mecab successfully loaded!"
elif lcode == 'zh':
    import jieba # See https://pypi.python.org/pypi/jieba/
    print "jieba successfully loaded!"
elif lcode == 'vi':
    from pyvi.pyvi import ViTokenizer # See https://pypi.python.org/pypi/pyvi
    print "pyvi successfully loaded!"
elif lcode == 'th':
    import pythai # See https://pypi.python.org/pypi/pythai
    print "pythai successfully loaded!"
# elif lcode == 'ar':
#     os.environ['CLASSPATH'] = "../stanford-segmenter-2015-12-09"
#     from nltk.tokenize.stanford_segmenter import StanfordSegmenter
#     segmenter = StanfordSegmenter(path_to_jar="../stanford-segmenter-2015-12-09/stanford-segmenter-3.6.0.jar",
#                                   path_to_sihan_corpora_dict="../stanford-segmenter-2015-12-09/data",
#                                   path_to_model="../stanford-segmenter-2015-12-09/data/pku.gz",
#                                   path_to_dict="../stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz")
#     print "StanfordSegmenter successfully loaded!"
max_corpus_size = args.max_corpus_size
fname = "{}wiki-20161201-pages-articles-multistream.xml".format(lcode)
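# The dump is read from data/<fname>, e.g. data/kowiki-20161201-pages-articles-multistream.xml.
# The date stamp is hard-coded; adjust `fname` if you downloaded a different dump.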
def clean_text(text):
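    '''Strips wiki/HTML markup and characters outside the target script.
    Args:
      text: A string. Raw wiki markup from a <text> element.
    Returns:
      Cleaned text containing (mostly) characters of the target language.
    '''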
    global lcode
    # Common
    text = regex.sub("(?s)<ref>.+?</ref>", "", text) # remove reference links
    text = regex.sub("(?s)<[^>]+>", "", text) # remove html tags
    text = regex.sub("&[a-z]+;", "", text) # remove html entities
    text = regex.sub("(?s){{.+?}}", "", text) # remove markup tags
    text = regex.sub("(?s){.+?}", "", text) # remove markup tags
    text = regex.sub("(?s)\[\[([^]]+\|)", "", text) # remove link target strings
    text = regex.sub("(?s)\[\[([^]]+\:.+?]])", "", text) # remove media links
    text = regex.sub("[']{5}", "", text) # remove italic+bold symbols
    text = regex.sub("[']{3}", "", text) # remove bold symbols
    text = regex.sub("[']{2}", "", text) # remove italic symbols
    if lcode in ['ko']: # korean
        text = regex.sub(u"[^ \r\n\p{Hangul}.?!]", " ", text) # Replace unacceptable characters with a space.
    elif lcode in ['ja']: # japanese
        text = regex.sub(u"[^\r\n\p{Han}\p{Hiragana}\p{Katakana}ー。!?]", "", text)
    elif lcode in ['zh']: # chinese
        text = regex.sub(u"[^\r\n\p{Han}。!?]", "", text)
    elif lcode in ['th']: # thai
        text = regex.sub(u"[^ \r\n\p{Thai}.?!]", " ", text)
    elif lcode in ['ru']: # russian
        text = regex.sub(u"[^ \r\n\p{Cyrillic}.?!\-]", " ", text)
        text = text.lower()
    # elif lcode in ['ar']: # arabic
    #     text = regex.sub(u"[^ \r\n\p{Arabic}.?!\-]", " ", text)
    elif lcode in ['hi']: # hindi
        text = regex.sub(u"[^ \r\n\p{Devanagari}.।?!\-]", " ", text)
    elif lcode in ['bn']: # bengali
        text = regex.sub(u"[^ \r\n\p{Bengali}.।?!\-]", " ", text)
    elif lcode in ['de']: # german
        text = regex.sub(u"[^ \r\n\p{Latin}\-'‘’.?!]", " ", text)
    else: # Mostly european languages
        text = regex.sub(u"[^ \r\n\p{Latin}\-'‘’.?!]", " ", text)
        text = text.lower()
    # Common
    text = regex.sub("[ ]{2,}", " ", text) # Squeeze spaces.
    return text
def sentence_segment(text):
    '''
    Args:
      text: A string. An unsegmented paragraph.
    Returns:
      A list of sentences.
    '''
    global lcode
    if lcode in ['ja', 'zh']:
        sents = regex.split(u"([。!?])?[\n]+|[。!?]", text)
    elif lcode in ['th']:
        sents = regex.split("[\n]+", text) # split on newlines; str.split would treat "[\n]+" literally
    elif lcode in ['hi', 'bn']: # hindi, bengali
        sents = regex.split(u"([.।?!])?[\n]+|[.।?!] ", text)
    elif lcode in ['de']: # german
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
        sents = [sent[0].lower() + sent[1:] for sent in sents if sent is not None and len(sent) > 1]
    else:
        sents = regex.split("([.?!])?[\n]+|[.?!] ", text)
    return sents
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split()
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else: # Mostly european languages
        words = sent.split()
    return words
def build_corpus():
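    '''Streams the Wikipedia dump, cleans and segments each article, and writes
    space-separated tokens, one sentence per line, to data/<lcode>.txt until the
    file exceeds max_corpus_size bytes.
    '''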
    global lcode, max_corpus_size, fname
    with codecs.open("data/{}.txt".format(lcode), 'w', 'utf-8') as fout:
        i = 1
        ns = "{http://www.mediawiki.org/xml/export-0.10/}" # MediaWiki XML namespace
        for _, elem in ET.iterparse("data/{}".format(fname), tag=ns+"text"):
            running_text = elem.text
            try:
                running_text = clean_text(running_text)
                sents = sentence_segment(running_text)
                for sent in sents:
                    if sent is not None:
                        words = word_segment(sent)
                        if len(words) > 10:
                            if lcode in ['ja']:
                                fout.write(" ".join(words).decode('utf8') + "\n")
                            else:
                                fout.write(" ".join(words) + "\n")
            except:
                continue # it's okay as we have a pretty big corpus!
            elem.clear() # We need to save memory!
            if i % 1000 == 0:
                print i,
                fsize = os.path.getsize("data/{}.txt".format(lcode))
                if fsize > max_corpus_size:
                    break
            i += 1
if __name__ == "__main__":
    build_corpus()
    print "Done"