Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Added support for GZ and BZ2 compression in reading and writing FoLiA documents
  • Loading branch information...
commit 36e5abffb39cca5b0ee2f296ba9b0329ab04576f 1 parent 9860b4b
Maarten van Gompel authored
Showing with 79 additions and 11 deletions.
  1. +38 −11 formats/folia.py
  2. +41 −0 tests/folia.py
49 formats/folia.py
View
@@ -53,9 +53,11 @@
import io
import multiprocessing
import threading
+import bz2
+import gzip
FOLIAVERSION = '0.9.1'
-LIBVERSION = '0.9.1.31' #== FoLiA version + library revision
+LIBVERSION = '0.9.1.32' #== FoLiA version + library revision
#0.9.1.31 is the first version with Python 3 support
@@ -179,7 +181,7 @@ def parsecommonarguments(object, doc, annotationtype, required, allowed, **kwarg
if not Attrib.CLASS in supported:
raise ValueError("Class is not supported for " + object.__class__.__name__)
object.cls = kwargs['class']
- del kwargs['class']
+ del kwargs['class']
elif 'cls' in kwargs:
if not Attrib.CLASS in supported:
raise ValueError("Class is not supported on " + object.__class__.__name__)
@@ -3716,15 +3718,31 @@ def __init__(self, *args, **kwargs):
self.id = kwargs['id']
elif 'file' in kwargs:
self.filename = kwargs['file']
- if not self.bypassleak:
- self.load(self.filename)
- else:
- f = io.open(self.filename,'r',encoding='utf-8')
+ if self.filename[-4:].lower() == '.bz2':
+ f = bz2.BZ2File(self.filename)
+ contents = f.read()
+ f.close()
+ self.tree = xmltreefromstring(contents,self.bypassleak)
+ del contents
+ self.parsexml(self.tree.getroot())
+ elif self.filename[-3:].lower() == '.gz':
+ f = gzip.GzipFile(self.filename)
contents = f.read()
f.close()
- #contents is bytes (utf-8 bytes)
self.tree = xmltreefromstring(contents,self.bypassleak)
- self.parsexml(self.tree.getroot())
+ del contents
+ self.parsexml(self.tree.getroot())
+ else:
+ if not self.bypassleak:
+ self.load(self.filename)
+ else:
+ f = io.open(self.filename,'r',encoding='utf-8')
+ contents = f.read()
+ f.close()
+ #contents is decoded text (io.open in 'r' mode with encoding='utf-8' returns unicode, not bytes)
+ self.tree = xmltreefromstring(contents,self.bypassleak)
+ del contents
+ self.parsexml(self.tree.getroot())
elif 'string' in kwargs:
self.tree = xmltreefromstring(kwargs['string'],self.bypassleak)
del kwargs['string']
@@ -3909,9 +3927,18 @@ def save(self, filename=None):
filename = self.filename
if not filename:
raise Exception("No filename specified")
- f = io.open(filename,'w',encoding='utf-8')
- f.write(self.xmlstring())
- f.close()
+ if filename[-4:].lower() == '.bz2':
+ f = bz2.BZ2File(filename,'wb')
+ f.write(self.xmlstring().encode('utf-8'))
+ f.close()
+ elif filename[-3:].lower() == '.gz':
+ f = gzip.GzipFile(filename,'wb')
+ f.write(self.xmlstring().encode('utf-8'))
+ f.close()
+ else:
+ f = io.open(filename,'w',encoding='utf-8')
+ f.write(self.xmlstring())
+ f.close()
def setcmdi(self,filename):
self.metadatatype = MetaDataType.CMDI
41 tests/folia.py
View
@@ -31,6 +31,8 @@
import os
import unittest
import io
+import gzip
+import bz2
FOLIAPATH = '../../FoLiA/'
@@ -89,6 +91,37 @@ def test1_readfromfile(self):
#sanity check: reading from file must yield the exact same data as reading from string
doc2 = folia.Document(string=FOLIAEXAMPLE)
self.assertEqual( doc, doc2)
+
+ def test1a_readfromfile(self):
+ """Reading from GZ file"""
+ global FOLIAEXAMPLE
+ #write example to file
+ f = gzip.GzipFile('/tmp/foliatest.xml.gz','w')
+ f.write(FOLIAEXAMPLE.encode('utf-8'))
+ f.close()
+
+ doc = folia.Document(file='/tmp/foliatest.xml.gz')
+ self.assertTrue(isinstance(doc,folia.Document))
+
+ #sanity check: reading from file must yield the exact same data as reading from string
+ doc2 = folia.Document(string=FOLIAEXAMPLE)
+ self.assertEqual( doc, doc2)
+
+
+ def test1b_readfromfile(self):
+ """Reading from BZ2 file"""
+ global FOLIAEXAMPLE
+ #write example to file
+ f = bz2.BZ2File('/tmp/foliatest.xml.bz2','w')
+ f.write(FOLIAEXAMPLE.encode('utf-8'))
+ f.close()
+
+ doc = folia.Document(file='/tmp/foliatest.xml.bz2')
+ self.assertTrue(isinstance(doc,folia.Document))
+
+ #sanity check: reading from file must yield the exact same data as reading from string
+ doc2 = folia.Document(string=FOLIAEXAMPLE)
+ self.assertEqual( doc, doc2)
def test2_readfromstring(self):
@@ -680,6 +713,14 @@ def test039_findspan(self):
def test099_write(self):
"""Sanity Check - Writing to file"""
self.doc.save('/tmp/foliasavetest.xml')
+
+ def test099b_write(self):
+ """Sanity Check - Writing to GZ file"""
+ self.doc.save('/tmp/foliasavetest.xml.gz')
+
+ def test099c_write(self):
+ """Sanity Check - Writing to BZ2 file"""
+ self.doc.save('/tmp/foliasavetest.xml.bz2')
def test100a_sanity(self):
"""Sanity Check - A - Checking output file against input (should be equal)"""
Please sign in to comment.
Something went wrong with that request. Please try again.