Skip to content

Commit

Permalink
Try to generalize wikiner reading - currently the download format is a
Browse files Browse the repository at this point in the history
bz2 file with one thing in it, but an older layout I had kept the text
itself in a "raw" subdirectory

ignore leftover bz2 files

Some of the input files are Windows (Latin-1) encoded, unfortunately
  • Loading branch information
AngledLuffa committed May 28, 2022
1 parent fde9755 commit 815b411
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 8 deletions.
19 changes: 14 additions & 5 deletions stanza/utils/datasets/ner/prepare_ner_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,16 +445,25 @@ def process_wikiner(paths, dataset):
base_input_path = os.path.join(paths["NERBASE"], dataset)
base_output_path = paths["NER_DATA_DIR"]

raw_input_path = os.path.join(base_input_path, "raw")
input_files = glob.glob(os.path.join(raw_input_path, "aij-wikiner*"))
expected_filename = "aij*wikiner*"
input_files = [x for x in glob.glob(os.path.join(base_input_path, expected_filename)) if not x.endswith("bz2")]
if len(input_files) == 0:
raise FileNotFoundError("Could not find any raw wikiner files in %s" % raw_input_path)
raw_input_path = os.path.join(base_input_path, "raw")
input_files = [x for x in glob.glob(os.path.join(raw_input_path, expected_filename)) if not x.endswith("bz2")]
if len(input_files) > 1:
raise FileNotFoundError("Found too many raw wikiner files in %s: %s" % (raw_input_path, ", ".join(input_files)))
elif len(input_files) > 1:
raise FileNotFoundError("Found too many raw wikiner files in %s: %s" % (raw_input_path, ", ".join(input_files)))
raise FileNotFoundError("Found too many raw wikiner files in %s: %s" % (base_input_path, ", ".join(input_files)))

if len(input_files) == 0:
raise FileNotFoundError("Could not find any raw wikiner files in %s or %s" % (base_input_path, raw_input_path))

csv_file = os.path.join(base_output_path, short_name + "_csv")
print("Converting raw input %s to space separated file in %s" % (input_files[0], csv_file))
preprocess_wikiner(input_files[0], csv_file)
try:
preprocess_wikiner(input_files[0], csv_file)
except UnicodeDecodeError:
preprocess_wikiner(input_files[0], csv_file, encoding="iso8859-1")

# this should create train.bio, dev.bio, and test.bio
print("Splitting %s to %s" % (csv_file, base_output_path))
Expand Down
6 changes: 3 additions & 3 deletions stanza/utils/datasets/ner/preprocess_wikiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

import sys

def preprocess_wikiner(input_file, output_file):
with open(input_file) as fin:
with open(output_file, "w") as fout:
def preprocess_wikiner(input_file, output_file, encoding="utf-8"):
with open(input_file, encoding=encoding) as fin:
with open(output_file, "w", encoding="utf-8") as fout:
for line in fin:
line = line.strip()
if not line:
Expand Down

0 comments on commit 815b411

Please sign in to comment.