Skip to content

Commit

Permalink
Pass a list of annotated lines to maltparser instead of a filename.
Browse files Browse the repository at this point in the history
  • Loading branch information
EmilStenstrom committed Apr 16, 2016
1 parent 4c1a6b2 commit 62a1d86
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 89 deletions.
89 changes: 6 additions & 83 deletions conll.py
@@ -1,85 +1,8 @@
# Maps a single SUC morphological tag (presumably the Stockholm-Umea
# Corpus tagset -- TODO confirm) to its Universal Dependencies
# "Feature=Value" equivalents.  One SUC tag may expand to several UD
# features (e.g. "IMP" -> finite verb form + imperative mood); tags that
# map to an empty list have no UD counterpart and contribute nothing.
suc2ufeat = {
    "AKT": ["Voice=Act"],
    "DEF": ["Definite=Def"],
    "GEN": ["Case=Gen"],
    "IND": ["Definite=Ind"],
    "INF": ["VerbForm=Inf"],
    "IMP": ["VerbForm=Fin", "Mood=Imp"],
    "KOM": ["Degree=Cmp"],
    "KON": ["Mood=Sub"],
    "NEU": ["Gender=Neut"],
    "NOM": ["Case=Nom"],
    "MAS": ["Gender=Masc"],
    "OBJ": ["Case=Acc"],
    "PLU": ["Number=Plur"],
    "POS": ["Degree=Pos"],
    "PRF": ["VerbForm=Part", "Tense=Past"],
    "PRT": ["VerbForm=Fin", "Tense=Past"],
    "PRS": ["VerbForm=Fin", "Tense=Pres"],
    "SFO": ["Voice=Pass"],
    "SIN": ["Number=Sing"],
    "SMS": [],
    "SUB": ["Case=Nom"],
    "SUP": ["VerbForm=Sup"],
    "SUV": ["Degree=Sup"],
    "UTR": ["Gender=Com"],
    "AN": [],
    "-": [],
}
def tagged_to_tagged_conll(annotated_sentences, tagged_conll):
    """Write annotated sentences to *tagged_conll* in CoNLL format.

    annotated_sentences: iterable of sentences, each a sequence of
        (word, lemma, ud_tags, suc_tags) tuples, where ud_tags packs the
        UD tag and its features as "TAG|FEAT1|FEAT2|...".
    tagged_conll: a writable text file object.
    """
    for sentence in annotated_sentences:
        # CoNLL token ids are 1-based within each sentence.
        for t_id, (word, lemma, ud_tags, suc_tags) in enumerate(sentence, start=1):
            # Split off the tag; everything after the first "|" is the
            # feature string, which may itself contain more "|" separators.
            ud_tag, ud_features = ud_tags.split("|", maxsplit=1)
            print("\t".join([str(t_id), word, lemma, ud_tag, suc_tags, ud_features]),
                  file=tagged_conll)
        # Sentences are separated by a blank line in CoNLL files.
        print(file=tagged_conll)

def _suc_features_to_ud(suc_tags):
    """Map a SUC "TAG|FEAT|..." string to a sorted UD feature string.

    Returns "_" when no UD feature applies.  Unknown SUC features raise
    KeyError via the suc2ufeat lookup, as before.
    """
    suc_tag, suc_features = suc_tags.split("|", 1)
    ud_feature_list = []
    for suc_feature in suc_features.split("|"):
        # Don't include suc_features with multiple options (e.g. "SIN/PLU")
        # in the UD features.
        if "/" not in suc_feature:
            ud_feature_list += suc2ufeat[suc_feature]

    # Finite verbs default to indicative mood unless already marked
    # imperative or subjunctive.
    if ("VerbForm=Fin" in ud_feature_list
            and "Mood=Imp" not in ud_feature_list
            and "Mood=Sub" not in ud_feature_list):
        ud_feature_list += ["Mood=Ind"]

    if suc_tag in ["HA", "HD", "HP", "HS"]:
        ud_feature_list += ["PronType=Int,Rel"]

    if suc_tag in ["HS", "PS"]:
        ud_feature_list += ["Poss=Yes"]  # Test this!

    return "|".join(sorted(ud_feature_list)) or "_"


def tagged_to_tagged_conll(tagged, tagged_conll):
    """Read a .tag file and write to the corresponding .tagged.conll file.

    tagged: iterable of tab-separated lines ("token\\tSUC[\\tUD\\tlemma]");
        blank lines separate sentences.
    tagged_conll: a writable text file object receiving six-column
        CoNLL rows with 1-based token ids per sentence.
    """
    t_id = 1
    for line in tagged:
        line = line.strip()
        if not line:
            # Sentence boundary: pass the blank separator through and
            # restart token numbering.
            print(line, file=tagged_conll)
            t_id = 1
            continue

        fields = line.split('\t')
        token = fields[0]
        suc_tags = fields[1]
        # Short rows (< 4 fields) carry no separate UD tag or lemma:
        # fall back to the SUC tag and a "_" placeholder.
        ud_tag = suc_tags if len(fields) < 4 else fields[2]
        lemma = '_' if len(fields) < 4 else fields[3]

        # Only tags carrying a "|" have SUC features to translate.
        ud_features = _suc_features_to_ud(suc_tags) if "|" in suc_tags else '_'

        print("\t".join([str(t_id), token, lemma, ud_tag, suc_tags, ud_features]),
              file=tagged_conll)
        t_id += 1
    print(file=tagged_conll)
13 changes: 7 additions & 6 deletions swe-pipeline.py
Expand Up @@ -138,17 +138,19 @@ def process_file(options, filename, tmp_dir, lemmatizer, suc_tagger, ud_tagger):
tokenized_filename = output_filename(tmp_dir, filename, "tok")
tagged_filename = output_filename(tmp_dir, filename, "tag")

sentences = run_tokenization(options, filename)
annotated_sentences = []

with open(tokenized_filename, "w", encoding="utf-8") as tokenized, \
open(tagged_filename, "w", encoding="utf-8") as tagged:

sentences = run_tokenization(options, filename)

# Run only one pass over sentences for writing to both files
for sentence in sentences:
write_to_file(tokenized, sentence)

if options.tagged or options.parsed:
lemmas, ud_tags_list, suc_tags_list = run_tagging_and_lemmatization(sentence, lemmatizer, suc_tagger, ud_tagger)
annotated_sentences.append(zip(sentence, lemmas, ud_tags_list, suc_tags_list))

ud_tag_list = [ud_tags[:ud_tags.find("|")] for ud_tags in ud_tags_list]
if lemmas and ud_tags_list:
Expand All @@ -160,7 +162,7 @@ def process_file(options, filename, tmp_dir, lemmatizer, suc_tagger, ud_tagger):

parsed_filename = ""
if options.parsed:
parsed_filename = parse(options, filename, tmp_dir)
parsed_filename = parse(options, filename, annotated_sentences, tmp_dir)

write_to_output(options, tokenized_filename, tagged_filename, parsed_filename)

Expand Down Expand Up @@ -191,8 +193,7 @@ def run_tagging_and_lemmatization(sentence, lemmatizer, suc_tagger, ud_tagger):

return lemmas, ud_tags_list, suc_tags_list

def parse(options, filename, tmp_dir):
tagged_filename = output_filename(tmp_dir, filename, "tag")
def parse(options, filename, annotated_sentences, tmp_dir):
tagged_conll_filename = output_filename(tmp_dir, filename, "tag.conll")
parsed_filename = output_filename(tmp_dir, filename, "conll")
log_filename = output_filename(tmp_dir, filename, "log")
Expand All @@ -209,7 +210,7 @@ def parse(options, filename, tmp_dir):

# Conversion from .tag file to tagged.conll (input format for the parser)
tagged_conll_file = open(tagged_conll_filename, "w", encoding="utf-8")
tagged_to_tagged_conll(open(tagged_filename, "r", encoding="utf-8"), tagged_conll_file)
tagged_to_tagged_conll(annotated_sentences, tagged_conll_file)
tagged_conll_file.close()

# Run the parser
Expand Down

0 comments on commit 62a1d86

Please sign in to comment.