Skip to content

Commit

Permalink
Pass a list of annotated lines to maltparser instead of a filename.
Browse files Browse the repository at this point in the history
  • Loading branch information
EmilStenstrom committed Apr 16, 2016
1 parent 4c1a6b2 commit 62a1d86
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 89 deletions.
89 changes: 6 additions & 83 deletions conll.py
@@ -1,85 +1,8 @@
# Maps a single SUC morphological tag (presumably the Stockholm-Umea
# Corpus tagset -- TODO confirm) to its Universal Dependencies
# "Feature=Value" equivalents.  One SUC tag may expand to several UD
# features (e.g. "IMP" -> finite verb form + imperative mood); tags that
# map to an empty list have no UD counterpart and contribute nothing.
suc2ufeat = {
    "AKT": ["Voice=Act"],
    "DEF": ["Definite=Def"],
    "GEN": ["Case=Gen"],
    "IND": ["Definite=Ind"],
    "INF": ["VerbForm=Inf"],
    "IMP": ["VerbForm=Fin", "Mood=Imp"],
    "KOM": ["Degree=Cmp"],
    "KON": ["Mood=Sub"],
    "NEU": ["Gender=Neut"],
    "NOM": ["Case=Nom"],
    "MAS": ["Gender=Masc"],
    "OBJ": ["Case=Acc"],
    "PLU": ["Number=Plur"],
    "POS": ["Degree=Pos"],
    "PRF": ["VerbForm=Part", "Tense=Past"],
    "PRT": ["VerbForm=Fin", "Tense=Past"],
    "PRS": ["VerbForm=Fin", "Tense=Pres"],
    "SFO": ["Voice=Pass"],
    "SIN": ["Number=Sing"],
    "SMS": [],
    "SUB": ["Case=Nom"],
    "SUP": ["VerbForm=Sup"],
    "SUV": ["Degree=Sup"],
    "UTR": ["Gender=Com"],
    "AN": [],
    "-": [],
}
def tagged_to_tagged_conll(annotated_sentences, tagged_conll):
    """Write annotated sentences to *tagged_conll* in CoNLL format.

    annotated_sentences: iterable of sentences, each a sequence of
        (word, lemma, ud_tags, suc_tags) tuples, where ud_tags packs the
        UD tag and its features as "TAG|FEAT1|FEAT2|...".
    tagged_conll: a writable text file object.
    """
    for sentence in annotated_sentences:
        # CoNLL token ids are 1-based within each sentence.
        for t_id, (word, lemma, ud_tags, suc_tags) in enumerate(sentence, start=1):
            # Split off the tag; everything after the first "|" is the
            # feature string, which may itself contain more "|" separators.
            ud_tag, ud_features = ud_tags.split("|", maxsplit=1)
            print("\t".join([str(t_id), word, lemma, ud_tag, suc_tags, ud_features]),
                  file=tagged_conll)
        # Sentences are separated by a blank line in CoNLL files.
        print(file=tagged_conll)

def _suc_features_to_ud(suc_tags):
    """Map a SUC "TAG|FEAT|..." string to a sorted UD feature string.

    Returns "_" when no UD feature applies.  Unknown SUC features raise
    KeyError via the suc2ufeat lookup, as before.
    """
    suc_tag, suc_features = suc_tags.split("|", 1)
    ud_feature_list = []
    for suc_feature in suc_features.split("|"):
        # Don't include suc_features with multiple options (e.g. "SIN/PLU")
        # in the UD features.
        if "/" not in suc_feature:
            ud_feature_list += suc2ufeat[suc_feature]

    # Finite verbs default to indicative mood unless already marked
    # imperative or subjunctive.
    if ("VerbForm=Fin" in ud_feature_list
            and "Mood=Imp" not in ud_feature_list
            and "Mood=Sub" not in ud_feature_list):
        ud_feature_list += ["Mood=Ind"]

    if suc_tag in ["HA", "HD", "HP", "HS"]:
        ud_feature_list += ["PronType=Int,Rel"]

    if suc_tag in ["HS", "PS"]:
        ud_feature_list += ["Poss=Yes"]  # Test this!

    return "|".join(sorted(ud_feature_list)) or "_"


def tagged_to_tagged_conll(tagged, tagged_conll):
    """Read a .tag file and write to the corresponding .tagged.conll file.

    tagged: iterable of tab-separated lines ("token\\tSUC[\\tUD\\tlemma]");
        blank lines separate sentences.
    tagged_conll: a writable text file object receiving six-column
        CoNLL rows with 1-based token ids per sentence.
    """
    t_id = 1
    for line in tagged:
        line = line.strip()
        if not line:
            # Sentence boundary: pass the blank separator through and
            # restart token numbering.
            print(line, file=tagged_conll)
            t_id = 1
            continue

        fields = line.split('\t')
        token = fields[0]
        suc_tags = fields[1]
        # Short rows (< 4 fields) carry no separate UD tag or lemma:
        # fall back to the SUC tag and a "_" placeholder.
        ud_tag = suc_tags if len(fields) < 4 else fields[2]
        lemma = '_' if len(fields) < 4 else fields[3]

        # Only tags carrying a "|" have SUC features to translate.
        ud_features = _suc_features_to_ud(suc_tags) if "|" in suc_tags else '_'

        print("\t".join([str(t_id), token, lemma, ud_tag, suc_tags, ud_features]),
              file=tagged_conll)
        t_id += 1
    print(file=tagged_conll)
13 changes: 7 additions & 6 deletions swe-pipeline.py
Expand Up @@ -138,17 +138,19 @@ def process_file(options, filename, tmp_dir, lemmatizer, suc_tagger, ud_tagger):
tokenized_filename = output_filename(tmp_dir, filename, "tok")
tagged_filename = output_filename(tmp_dir, filename, "tag")

sentences = run_tokenization(options, filename)
annotated_sentences = []

with open(tokenized_filename, "w", encoding="utf-8") as tokenized, \
open(tagged_filename, "w", encoding="utf-8") as tagged:

sentences = run_tokenization(options, filename)

# Run only one pass over sentences for writing to both files
for sentence in sentences:
write_to_file(tokenized, sentence)

if options.tagged or options.parsed:
lemmas, ud_tags_list, suc_tags_list = run_tagging_and_lemmatization(sentence, lemmatizer, suc_tagger, ud_tagger)
annotated_sentences.append(zip(sentence, lemmas, ud_tags_list, suc_tags_list))

ud_tag_list = [ud_tags[:ud_tags.find("|")] for ud_tags in ud_tags_list]
if lemmas and ud_tags_list:
Expand All @@ -160,7 +162,7 @@ def process_file(options, filename, tmp_dir, lemmatizer, suc_tagger, ud_tagger):

parsed_filename = ""
if options.parsed:
parsed_filename = parse(options, filename, tmp_dir)
parsed_filename = parse(options, filename, annotated_sentences, tmp_dir)

write_to_output(options, tokenized_filename, tagged_filename, parsed_filename)

Expand Down Expand Up @@ -191,8 +193,7 @@ def run_tagging_and_lemmatization(sentence, lemmatizer, suc_tagger, ud_tagger):

return lemmas, ud_tags_list, suc_tags_list

def parse(options, filename, tmp_dir):
tagged_filename = output_filename(tmp_dir, filename, "tag")
def parse(options, filename, annotated_sentences, tmp_dir):
tagged_conll_filename = output_filename(tmp_dir, filename, "tag.conll")
parsed_filename = output_filename(tmp_dir, filename, "conll")
log_filename = output_filename(tmp_dir, filename, "log")
Expand All @@ -209,7 +210,7 @@ def parse(options, filename, tmp_dir):

# Conversion from .tag file to tagged.conll (input format for the parser)
tagged_conll_file = open(tagged_conll_filename, "w", encoding="utf-8")
tagged_to_tagged_conll(open(tagged_filename, "r", encoding="utf-8"), tagged_conll_file)
tagged_to_tagged_conll(annotated_sentences, tagged_conll_file)
tagged_conll_file.close()

# Run the parser
Expand Down

0 comments on commit 62a1d86

Please sign in to comment.