Permalink
Browse files

Added support for GZ and BZ2 compression in reading and writing FoLiA documents
  • Loading branch information...
1 parent 9860b4b commit 36e5abffb39cca5b0ee2f296ba9b0329ab04576f @proycon committed Apr 5, 2013
Showing with 79 additions and 11 deletions.
  1. +38 −11 formats/folia.py
  2. +41 −0 tests/folia.py
View
@@ -53,9 +53,11 @@
import io
import multiprocessing
import threading
+import bz2
+import gzip
FOLIAVERSION = '0.9.1'
-LIBVERSION = '0.9.1.31' #== FoLiA version + library revision
+LIBVERSION = '0.9.1.32' #== FoLiA version + library revision
#0.9.1.31 is the first version with Python 3 support
@@ -179,7 +181,7 @@ def parsecommonarguments(object, doc, annotationtype, required, allowed, **kwarg
if not Attrib.CLASS in supported:
raise ValueError("Class is not supported for " + object.__class__.__name__)
object.cls = kwargs['class']
- del kwargs['class']
+ del kwargs['class']
elif 'cls' in kwargs:
if not Attrib.CLASS in supported:
raise ValueError("Class is not supported on " + object.__class__.__name__)
@@ -3716,15 +3718,31 @@ def __init__(self, *args, **kwargs):
self.id = kwargs['id']
elif 'file' in kwargs:
self.filename = kwargs['file']
- if not self.bypassleak:
- self.load(self.filename)
- else:
- f = io.open(self.filename,'r',encoding='utf-8')
+ if self.filename[-4:].lower() == '.bz2':
+ f = bz2.BZ2File(self.filename)
+ contents = f.read()
+ f.close()
+ self.tree = xmltreefromstring(contents,self.bypassleak)
+ del contents
+ self.parsexml(self.tree.getroot())
+ elif self.filename[-3:].lower() == '.gz':
+ f = gzip.GzipFile(self.filename)
contents = f.read()
f.close()
- #contents is bytes (utf-8 bytes)
self.tree = xmltreefromstring(contents,self.bypassleak)
- self.parsexml(self.tree.getroot())
+ del contents
+ self.parsexml(self.tree.getroot())
+ else:
+ if not self.bypassleak:
+ self.load(self.filename)
+ else:
+ f = io.open(self.filename,'r',encoding='utf-8')
+ contents = f.read()
+ f.close()
+ #contents is bytes (utf-8 bytes)
+ self.tree = xmltreefromstring(contents,self.bypassleak)
+ del contents
+ self.parsexml(self.tree.getroot())
elif 'string' in kwargs:
self.tree = xmltreefromstring(kwargs['string'],self.bypassleak)
del kwargs['string']
@@ -3909,9 +3927,18 @@ def save(self, filename=None):
filename = self.filename
if not filename:
raise Exception("No filename specified")
- f = io.open(filename,'w',encoding='utf-8')
- f.write(self.xmlstring())
- f.close()
+ if filename[-4:].lower() == '.bz2':
+ f = bz2.BZ2File(filename,'wb')
+ f.write(self.xmlstring().encode('utf-8'))
+ f.close()
+ elif filename[-3:].lower() == '.gz':
+ f = gzip.GzipFile(filename,'wb')
+ f.write(self.xmlstring().encode('utf-8'))
+ f.close()
+ else:
+ f = io.open(filename,'w',encoding='utf-8')
+ f.write(self.xmlstring())
+ f.close()
def setcmdi(self,filename):
self.metadatatype = MetaDataType.CMDI
View
@@ -31,6 +31,8 @@
import os
import unittest
import io
+import gzip
+import bz2
FOLIAPATH = '../../FoLiA/'
@@ -89,6 +91,37 @@ def test1_readfromfile(self):
#sanity check: reading from file must yield the exact same data as reading from string
doc2 = folia.Document(string=FOLIAEXAMPLE)
self.assertEqual( doc, doc2)
+
+ def test1a_readfromfile(self):
+ """Reading from GZ file"""
+ global FOLIAEXAMPLE
+ #write example to file
+ f = gzip.GzipFile('/tmp/foliatest.xml.gz','w')
+ f.write(FOLIAEXAMPLE.encode('utf-8'))
+ f.close()
+
+ doc = folia.Document(file='/tmp/foliatest.xml.gz')
+ self.assertTrue(isinstance(doc,folia.Document))
+
+ #sanity check: reading from file must yield the exact same data as reading from string
+ doc2 = folia.Document(string=FOLIAEXAMPLE)
+ self.assertEqual( doc, doc2)
+
+
+ def test1b_readfromfile(self):
+ """Reading from BZ2 file"""
+ global FOLIAEXAMPLE
+ #write example to file
+ f = bz2.BZ2File('/tmp/foliatest.xml.bz2','w')
+ f.write(FOLIAEXAMPLE.encode('utf-8'))
+ f.close()
+
+ doc = folia.Document(file='/tmp/foliatest.xml.bz2')
+ self.assertTrue(isinstance(doc,folia.Document))
+
+ #sanity check: reading from file must yield the exact same data as reading from string
+ doc2 = folia.Document(string=FOLIAEXAMPLE)
+ self.assertEqual( doc, doc2)
def test2_readfromstring(self):
@@ -680,6 +713,14 @@ def test039_findspan(self):
def test099_write(self):
"""Sanity Check - Writing to file"""
self.doc.save('/tmp/foliasavetest.xml')
+
+ def test099b_write(self):
+ """Sanity Check - Writing to GZ file"""
+ self.doc.save('/tmp/foliasavetest.xml.gz')
+
+ def test099c_write(self):
+ """Sanity Check - Writing to BZ2 file"""
+ self.doc.save('/tmp/foliasavetest.xml.bz2')
def test100a_sanity(self):
"""Sanity Check - A - Checking output file against input (should be equal)"""

0 comments on commit 36e5abf

Please sign in to comment.