Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

script to Python3 #19

Merged
merged 1 commit into from
Oct 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 192 additions & 16 deletions docs/scripts/configurations.json
Original file line number Diff line number Diff line change
@@ -1,16 +1,192 @@
{"localpaths":{"ontonotes":"../../data/ontonotes/","ewt":"../../data/google/ewt/","questionbank":"../../data/google/questionbank/","bolt-cts-3":"../../data/bolt/CTS/CTS/03",
"bolt-cts-2":"../../data/bolt/CTS/CTS/02","bolt-cts-1":"../../data/bolt/CTS/CTS/01","bolt-sms-4":"../../data/bolt/SMS/SMS/04","bolt-sms-3":"../../data/bolt/SMS/SMS/03",
"bolt-sms-2":"../../data/bolt/SMS/SMS/02","bolt-sms-1":"../../data/bolt/SMS/SMS/01","bolt-sms-5":"../../data/bolt/SMS/SMS/05",
"bolt-df-1":"../../data/bolt/DF/DF/01","bolt-df-2":"../../data/bolt/DF/DF/02","bolt-df-3":"../../data/bolt/DF/DF/03","bolt-df-4":"../../data/bolt/DF/DF/04","bolt-df-5":"../../data/bolt/DF/DF/05","bolt-df-6":"../../data/bolt/DF/DF/06","bolt-df-7":"../../data/bolt/DF/DF/07"},
"modifications":{"ontonotes":["",""], "ewt":["/00","/penntree"], "questionbank":["questionbank/00",""], "bolt-cts-3":["CTS/CTS/03",""], "bolt-cts-2":["CTS/CTS/02 ",""], "bolt-cts-1":["CTS/CTS/03",""],
"bolt-sms-3":["SMS/SMS/03",""],"bolt-sms-2":["SMS/SMS/02",""],"bolt-sms-1":["SMS/SMS/01",""],"bolt-sms-4":["SMS/SMS/04",""],"bolt-sms-5":["SMS/SMS/05",""],
"bolt-df-7":["DF/DF/07",""],"bolt-df-6":["DF/DF/06",""],"bolt-df-5":["DF/DF/05",""],"bolt-df-4":["DF/DF/04",""],"bolt-df-3":["DF/DF/03",""],"bolt-df-2":["DF/DF/02",""],"bolt-df-1":["DF/DF/01",""]},
"filemoddict":{"questionbank":[["QB-revised","QB-revised.v1"]]},
"metadata":{"ontonotes":"data/files/data/english/annotations/", "ewt":"data/", "questionbank":"data/","bolt-cts-3":"/data/translation-alternates-included/penntree/","bolt-cts-2":"/data/translation-alternates-included/penntree/","bolt-cts-1":"/data/translation-alternates-included/penntree/",
"bolt-sms-4":"/data/translation-alternates-included/penntree/","bolt-sms-3":"/data/translation-alternates-included/penntree/","bolt-sms-2":"/data/penntree/","bolt-sms-1":"/data/penntree","bolt-sms-5":"/data/translation-alternates-included/penntree/",
"bolt-df-7":"/data/translation-alternates-removed/penntree/","bolt-df-6":"/data/source/","bolt-df-5":"/data/penntree/","bolt-df-4":"/data/penntree/","bolt-df-3":"/data/penntree/","bolt-df-2":"/data/penntree/","bolt-df-1":"/data/penntree/"
},
"filemoddict":{"questionbank":[["QB-revised","QB-revised.v1"]],"bolt-df-6":[[".xml.",".xml.meta_removed."]]},
"flagdict":{"questionbank":["--topless"],"ewt":["--topless"], "ontonotes":[], "bolt-cts-3":["--topless"],"bolt-cts-2":["--topless"],"bolt-cts-1":["--topless"],
"bolt-sms-5":["--topless"],"bolt-sms-4":["--topless"],"bolt-sms-3":["--topless"],"bolt-sms-2":["--topless"],"bolt-sms-1":["--topless"],
"bolt-df-1":["--topless"],"bolt-df-2":["--topless"],"bolt-df-3":["--topless"],"bolt-df-4":["--topless"],"bolt-df-5":["--topless"],"bolt-df-6":["--topless"],"bolt-df-7":["--topless"]}}
{
"localpaths": {
"ontonotes": "../../data/ontonotes/",
"ewt": "../../data/google/ewt/",
"questionbank": "../../data/google/questionbank/",
"bolt-cts-3": "../../data/bolt/CTS/CTS/03",
"bolt-cts-2": "../../data/bolt/CTS/CTS/02",
"bolt-cts-1": "../../data/bolt/CTS/CTS/01",
"bolt-sms-4": "../../data/bolt/SMS/SMS/04",
"bolt-sms-3": "../../data/bolt/SMS/SMS/03",
"bolt-sms-2": "../../data/bolt/SMS/SMS/02",
"bolt-sms-1": "../../data/bolt/SMS/SMS/01",
"bolt-sms-5": "../../data/bolt/SMS/SMS/05",
"bolt-df-1": "../../data/bolt/DF/DF/01",
"bolt-df-2": "../../data/bolt/DF/DF/02",
"bolt-df-3": "../../data/bolt/DF/DF/03",
"bolt-df-4": "../../data/bolt/DF/DF/04",
"bolt-df-5": "../../data/bolt/DF/DF/05",
"bolt-df-6": "../../data/bolt/DF/DF/06",
"bolt-df-7": "../../data/bolt/DF/DF/07"
},
"modifications": {
"ontonotes": [
"",
""
],
"ewt": [
"/00",
"/penntree"
],
"questionbank": [
"questionbank/00",
""
],
"bolt-cts-3": [
"CTS/CTS/03",
""
],
"bolt-cts-2": [
"CTS/CTS/02",
""
],
"bolt-cts-1": [
"CTS/CTS/01",
""
],
"bolt-sms-3": [
"SMS/SMS/03",
""
],
"bolt-sms-2": [
"SMS/SMS/02",
""
],
"bolt-sms-1": [
"SMS/SMS/01",
""
],
"bolt-sms-4": [
"SMS/SMS/04",
""
],
"bolt-sms-5": [
"SMS/SMS/05",
""
],
"bolt-df-7": [
"DF/DF/07",
""
],
"bolt-df-6": [
"DF/DF/06",
""
],
"bolt-df-5": [
"DF/DF/05",
""
],
"bolt-df-4": [
"DF/DF/04",
""
],
"bolt-df-3": [
"DF/DF/03",
""
],
"bolt-df-2": [
"DF/DF/02",
""
],
"bolt-df-1": [
"DF/DF/01",
""
]
},
"filemoddict": {
"questionbank": [
[
"QB-revised",
"QB-revised.v1"
]
]
},
"metadata": {
"ontonotes": "data/files/data/english/annotations/",
"ewt": "data/",
"questionbank": "data/",
"bolt-cts-3": "/data/translation-alternates-included/penntree/",
"bolt-cts-2": "/data/translation-alternates-included/penntree/",
"bolt-cts-1": "/data/translation-alternates-included/penntree/",
"bolt-sms-4": "/data/translation-alternates-included/penntree/",
"bolt-sms-3": "/data/translation-alternates-included/penntree/",
"bolt-sms-2": "/data/penntree/",
"bolt-sms-1": "/data/penntree",
"bolt-sms-5": "/data/translation-alternates-included/penntree/",
"bolt-df-7": "/data/translation-alternates-removed/penntree/",
"bolt-df-6": "/data/source/",
"bolt-df-5": "/data/penntree/",
"bolt-df-4": "/data/penntree/",
"bolt-df-3": "/data/penntree/",
"bolt-df-2": "/data/penntree/",
"bolt-df-1": "/data/penntree/"
},
"filemoddict": {
"questionbank": [
[
"QB-revised",
"QB-revised.v1"
]
],
"bolt-df-6": [
[
".xml.",
".xml.meta_removed."
]
]
},
"flagdict": {
"questionbank": [
"--topless"
],
"ewt": [
"--topless"
],
"ontonotes": [],
"bolt-cts-3": [
"--topless"
],
"bolt-cts-2": [
"--topless"
],
"bolt-cts-1": [
"--topless"
],
"bolt-sms-5": [
"--topless"
],
"bolt-sms-4": [
"--topless"
],
"bolt-sms-3": [
"--topless"
],
"bolt-sms-2": [
"--topless"
],
"bolt-sms-1": [
"--topless"
],
"bolt-df-1": [
"--topless"
],
"bolt-df-2": [
"--topless"
],
"bolt-df-3": [
"--topless"
],
"bolt-df-4": [
"--topless"
],
"bolt-df-5": [
"--topless"
],
"bolt-df-6": [
"--topless"
],
"bolt-df-7": [
"--topless"
]
}
}
48 changes: 23 additions & 25 deletions docs/scripts/skeleton2conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@
import re
import string
from collections import defaultdict
from cStringIO import StringIO
from io import StringIO

WORD_COLUMN=3
LEMMA_COLUMN=6




MIN_VERBOSITY = 0
MED_VERBOSITY = 5
MAX_VERBOSITY = 10
Expand Down Expand Up @@ -389,7 +387,7 @@ def parse_sexpr(s):
if parens == 0:
try:
x = parse_sexpr("".join(cur))
except InvalidSexprException, e:
except InvalidSexprException as e:
raise InvalidSexprException("Parent: %s" % s, e)

if x:
Expand Down Expand Up @@ -843,8 +841,8 @@ def pretty_print_table(rows, separator=None, out_file=None):

if(out_file == None):
for row in r_c_matrix:
print " ".join(row).strip()
print
print(" ".join(row).strip())
print()

elif(out_file == "-"):
rows=[]
Expand Down Expand Up @@ -1106,7 +1104,7 @@ def start(input_fname, conll_fname, output_fname, encoding, changes):

if DEBUG:
if columns[LEMMA_COLUMN] == a_list_of_lemmas[w].lemma.strip():
print "found the same lemma"
print("found the same lemma")
else:
raise Exception("Something is wrong: %s %s %s" % (columns[LEMMA_COLUMN], a_list_of_lemmas[w].lemma.strip(), " ".join(columns)))

Expand All @@ -1121,7 +1119,7 @@ def start(input_fname, conll_fname, output_fname, encoding, changes):
pretty_print_table_string = pretty_print_table(rows, out_file="-")

if output_fname == "-":
print pretty_print_table_string
print(pretty_print_table_string)
else:
with codecs.open(output_fname, "a", encoding) as outf:
outf.write("%s\n" % (pretty_print_table_string))
Expand All @@ -1131,7 +1129,7 @@ def start(input_fname, conll_fname, output_fname, encoding, changes):

elif(line.startswith("#")):
if output_fname == "-":
print line.strip()
print(line.strip())
else:
with codecs.open(output_fname, "a", encoding) as outf:
outf.write("%s\n" % (line.strip()))
Expand All @@ -1154,27 +1152,27 @@ def start(input_fname, conll_fname, output_fname, encoding, changes):
sys.argv.remove("--gb18030")

if len(sys.argv[1:]) == 2 and sys.argv[1] in ["--help", "-h"] and sys.argv[2] in transformations:
print
print " ", transformations[sys.argv[2]].__doc__
print()
print(" ", transformations[sys.argv[2]].__doc__)
elif not sys.argv[1:] or "--help" in sys.argv[1:] or "-h" in sys.argv[1:]:
print
print "-"*120
print "Usage: python skeleton2conll.py <ontonotes-parse-file> <input-skel-file> <conll-output-file> [transformations] ..."
print "\nAllowed transforms:"
print()
print("-"*120)
print("Usage: python skeleton2conll.py <ontonotes-parse-file> <input-skel-file> <conll-output-file> [transformations] ...")
print("\nAllowed transforms:")

max_key_len = max(len(t) for t in transformations) + 1 # +1 for colon

for key in transformations:
print " %s %s" %(("%s:"%key).rjust(max_key_len),
transformations[key].__doc__.split("\n")[0])

print " %s %s" % ("--text:".rjust(max_key_len),
"Produce text output instead of parse trees")
print
print
print "Example:"
print "python skeleton2conll.py <ontonotes-release-directory>/data/.../bc/cnn/00/cnn_0000.parse conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_skel conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_conll -edited --text"
print "-"*120
print(" %s %s" %(("%s:"%key).rjust(max_key_len),
transformations[key].__doc__.split("\n")[0]))

print(" %s %s" % ("--text:".rjust(max_key_len),
"Produce text output instead of parse trees"))
print()
print()
print("Example:")
print("python skeleton2conll.py <ontonotes-release-directory>/data/.../bc/cnn/00/cnn_0000.parse conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_skel conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_conll -edited --text")
print("-"*120)
else:
input_fname, conll_fname, output_fname, changes = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4:]
start(codecs.open(input_fname).read().replace("( (","(TOP ("), conll_fname, output_fname, encoding, changes)
Binary file removed docs/scripts/skeleton2conll.pyc
Binary file not shown.