Skip to content

Commit

Permalink
Processing XML for enwik9 data (#1292)
Browse files Browse the repository at this point in the history
  • Loading branch information
parmeet committed May 2, 2021
1 parent 064e7f2 commit ede6ce6
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 3 deletions.
6 changes: 6 additions & 0 deletions docs/source/data_functional.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,9 @@ torchtext.data.functional
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: numericalize_tokens_from_iterator


:hidden:`filter_wikipedia_xml`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: filter_wikipedia_xml
14 changes: 11 additions & 3 deletions docs/source/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ The following datasets are available:
Text Classification
^^^^^^^^^^^^^^^^^^^

TextClassificationDataset
~~~~~~~~~~~~~~~~~~~~~~~~~

AG_NEWS
~~~~~~~

Expand Down Expand Up @@ -126,6 +123,7 @@ CoNLL2000Chunking

.. autofunction:: CoNLL2000Chunking


Question Answer
^^^^^^^^^^^^^^^

Expand All @@ -139,3 +137,13 @@ SQuAD 2.0
~~~~~~~~~

.. autofunction:: SQuAD2


Unsupervised Learning
^^^^^^^^^^^^^^^^^^^^^

EnWik9
~~~~~~

.. autofunction:: EnWik9

69 changes: 69 additions & 0 deletions torchtext/data/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,72 @@ def numericalize_tokens_from_iterator(vocab, iterator, removed_tokens=None):
else:
yield iter(map(lambda x: vocab[x],
filter(lambda x: x not in removed_tokens, tokens)))


_patterns = [(r'<.*>', ''),
(r'&amp;', '&'),
(r'&lt;', '<'),
(r'&gt;', '>'),
(r'<ref[^<]*<\/ref>', ''),
(r'<[^>]*>', ''),
(r'\[http:[^] ]*', '['),
(r'\|thumb', ''),
(r'\|left', ''),
(r'\|right', ''),
(r'\|\d+px', ''),
(r'\[\[image:[^\[\]]*\|', ''),
(r'\[\[category:([^|\]]*)[^]]*\]\]', '[[$1]]'),
(r'\[\[[a-z\-]*:[^\]]*\]\]', ''),
(r'\[\[[^\|\]]*\|', '[['),
(r'\{\{[^\}]*\}\}', ''),
(r'\{[^\}]*\}', ''),
(r'\[', ''),
(r'\]', ''),
(r'&[^;]*;', ' '),
(r'A', 'a'), (r'B', 'b'), (r'C', 'c'),
(r'D', 'd'), (r'E', 'e'), (r'F', 'f'),
(r'G', 'g'), (r'H', 'h'), (r'I', 'i'),
(r'J', 'j'), (r'K', 'k'), (r'L', 'l'),
(r'M', 'm'), (r'N', 'n'), (r'O', 'o'),
(r'P', 'p'), (r'Q', 'q'), (r'R', 'r'),
(r'S', 's'), (r'T', 't'), (r'U', 'u'),
(r'V', 'v'), (r'W', 'w'), (r'X', 'x'),
(r'Y', 'y'), (r'Z', 'z'),
(r'0', ' zero '), (r'1', ' one '), (r'2', ' two '),
(r'3', ' three '), (r'4', ' four '), (r'5', ' five '),
(r'6', ' six '), (r'7', ' seven '), (r'8', ' eight '),
(r'9', ' nine '),
(r'[^a-z\n]+', ' '),
(r'\n ', ''),
(r'\s+', ' '),
(r'\n\s*\n', r'\n')
]


def filter_wikipedia_xml(text_iterator):
    r"""Filter Wikipedia XML lines according to https://github.com/facebookresearch/fastText/blob/master/wikifil.pl

    Redirect lines are dropped entirely; every other line is normalized with
    the ``_patterns`` substitutions (markup stripping, lowercasing, digit
    spelling) and yielded if anything non-empty remains.

    Args:
        text_iterator: An iterator type object that yields strings. Examples
            include string lists, text IO, generators etc.

    Yields:
        Filtered and normalized text lines (stripped, never empty).

    Raises:
        TypeError: If ``text_iterator`` does not support iterator semantics.

    Examples:
        >>> from torchtext.data.functional import filter_wikipedia_xml
        >>> from torchtext.datasets import EnWik9
        >>> data_iter = EnWik9(split='train')
        >>> filter_data_iter = filter_wikipedia_xml(data_iter)
        >>> file_name = '.data/EnWik9/enwik9'
        >>> filter_data_iter = filter_wikipedia_xml(open(file_name, 'r'))
    """

    # Fail fast with a clear message for non-iterable inputs. Catch only the
    # TypeError that iter() raises — the original bare `except:` would have
    # masked unrelated errors (including KeyboardInterrupt/SystemExit).
    try:
        iter(text_iterator)
    except TypeError:
        raise TypeError("Input {} must support iterator semantics".format(text_iterator)) from None

    norm_transform = custom_replace(_patterns)
    for line in text_iterator:
        # Skip redirect pages, as wikifil.pl does.
        if '#redirect' in line or '#REDIRECT' in line:
            continue
        line = list(norm_transform([line]))[0].strip()
        if line:
            yield line

0 comments on commit ede6ce6

Please sign in to comment.