Skip to content

Commit

Permalink
[foliasplit] do shallow copy/move by default #20
Browse files (browse the repository at this point in the history)
  • Loading branch information
proycon committed Nov 18, 2020
1 parent 0510297 commit 5343d06
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 5 deletions.
2 changes: 1 addition & 1 deletion foliatools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""FoLiA-tools contains various Python-based command line tools for working with FoLiA XML (Format for Linguistic Annotation)"""

-VERSION = "2.4.0"
+VERSION = "2.4.1"
10 changes: 7 additions & 3 deletions foliatools/foliasplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import folia.main as folia
import folia.fql as fql

-def split(doc, expression, batchsize=1, copymetadata=False, require_submetadata=False, suffix_template="_{:04d}", alter_ids=False, external=False, callback=None):
+def split(doc, expression, batchsize=1, copymetadata=False, require_submetadata=False, suffix_template="_{:04d}", alter_ids=False, external=False, callback=None, deep=False):
query = fql.Query("SELECT " + expression)
childdoc = None
prevmatch = None
Expand Down Expand Up @@ -66,7 +66,10 @@ def split(doc, expression, batchsize=1, copymetadata=False, require_submetadata=
raise Exception("Unable to find child from parent! This shouldn't happen")
match.parent.data[i] = folia.External(doc, src=childdoc.id + ".folia.xml", id=childdoc.id, processor=proc)
substituted = True
-matchcopy = match.copy(childdoc, id_suffix if alter_ids else "")
+if deep:
+    matchcopy = match.copy(childdoc, id_suffix if alter_ids else "") #deep copy
+else:
+    matchcopy = match.move(childdoc, id_suffix if alter_ids else "") #shallow copy
matchcopy.metadata = None
body.append(matchcopy)
if len(body.data) == batchsize:
Expand All @@ -86,6 +89,7 @@ def main():
parser.add_argument('--copymetadata','-m',help="Copy all metadata from the parent document to the children", action='store_true', required=False)
parser.add_argument('--batchsize','-b',type=int, help="Batch size: create documents with this many matches", action='store', required=False, default=1)
parser.add_argument('--alterids','-i',help="Alter the IDs of all split elements by appending a suffix", action='store_true', required=False)
+parser.add_argument('--deep',help="Make a deep copy (slower)", action='store_true', required=False)
parser.add_argument('--suffixtemplate',help="A template for adding suffixes to IDs, in Python's format syntax", action='store_true', required=False, default="_{:04d}")
parser.add_argument('--submetadata',help="Only split elements that have associated submetadata (extra parameter as the query can't capture this)", action='store_true', required=False)
parser.add_argument('--query','-q',type=str, help="Query to select elements to split, this is an FQL SELECT expression without the SELECT statement, it can be as simple as the element type, e.g. s for sentences or more complex like: 'div OF https://your.set WHERE class = \"chapter\"' ", action='store', required=True)
Expand All @@ -102,7 +106,7 @@ def main():

for filename in args.files:
doc = folia.Document(file=filename)
-for i, childdoc in enumerate(split(doc, args.query, args.batchsize, args.copymetadata, args.submetadata, args.suffixtemplate, args.alterids, args.external)):
+for i, childdoc in enumerate(split(doc, args.query, args.batchsize, args.copymetadata, args.submetadata, args.suffixtemplate, args.alterids, args.external, args.deep)):
print("#" + str(i+1) + " - " + childdoc.id + ".folia.xml", file=sys.stderr)
childdoc.save(os.path.join(args.outputdir, childdoc.id) + ".folia.xml")

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def read(fname):

setup(
name = "FoLiA-tools",
-version = "2.4.0", #also change in __init__.py
+version = "2.4.1", #also change in __init__.py
author = "Maarten van Gompel",
author_email = "proycon@anaproy.nl",
description = ("FoLiA-tools contains various Python-based command line tools for working with FoLiA XML (Format for Linguistic Annotation)"),
Expand Down

0 comments on commit 5343d06

Please sign in to comment.