Permalink
Browse files

Updated libraries, including the FoLiA library, for Python 3 compatibility

  • Loading branch information...
1 parent 9302cd2 commit 0a19ff4f7f4cb488d252555a95d67b8e6710b0c3 @proycon committed Mar 29, 2013
Showing with 350 additions and 213 deletions.
  1. +4 −4 datatypes.py
  2. +11 −3 evaluation.py
  3. +15 −2 formats/cgn.py
  4. +30 −20 formats/dutchsemcor.py
  5. +101 −83 formats/folia.py
  6. +14 −9 formats/giza.py
  7. +46 −28 formats/moses.py
  8. +14 −14 formats/sonar.py
  9. +8 −3 formats/taggerdata.py
  10. +16 −4 formats/timbl.py
  11. +10 −2 lm/lm.py
  12. +17 −9 net.py
  13. +8 −2 search.py
  14. +14 −7 statistics.py
  15. +33 −21 tagger.py
  16. +9 −2 textprocessors.py
View
@@ -22,6 +22,7 @@
from __future__ import absolute_import
from pynlpl.common import u
+import random
import bisect
import array
@@ -273,10 +274,10 @@ def __getitem__(self, index):
raise
def __str__(self):
- return str(value)
+ return str(self.value)
def __unicode__(self): #Python 2.x
- return u(value)
+ return u(self.value)
@@ -324,7 +325,7 @@ def __setitem__(self, key, subtrie):
if not self.children: self.children = {}
subtrie.value = key
subtrie.parent = self
- self.children[key] = value
+ self.children[key] = subtrie
def append(self, sequence):
if not sequence:
@@ -403,7 +404,6 @@ def walk(self, leavesonly=True, maxdepth=None, _depth = 0):
def containsnullbyte(i):
assert isinstance(i,int)
while True:
- r = i % 256
if i % 256 == 0:
return True
if i >= 256:
View
@@ -17,6 +17,14 @@
from __future__ import division
from __future__ import absolute_import
from pynlpl.common import u
+import sys
+if sys.version < '3':
+ from codecs import getwriter
+ stderr = getwriter('utf-8')(sys.stderr)
+ stdout = getwriter('utf-8')(sys.stdout)
+else:
+ stderr = sys.stderr
+ stdout = sys.stdout
import io
@@ -29,7 +37,7 @@
import copy
import datetime
import os.path
-import sys
+
class ProcessFailed(Exception):
@@ -321,7 +329,7 @@ def startcommand(self, command, cwd, stdout, stderr, *arguments, **parameters):
else:
cmd += ' ' + key + str(value)
if printcommand:
- print("STARTING COMMAND: " + cmd.encode('utf-8'))
+ print("STARTING COMMAND: " + cmd.encode('utf-8'),file=stderr)
self.begintime = datetime.datetime.now()
if not cwd:
@@ -375,7 +383,7 @@ def poll(self, haltonerror=True):
if experiment.done():
done.append( experiment )
except ProcessFailed:
- print("ERROR: One experiment in the pool failed: " + repr(experiment.inputdata) + repr(experiment.parameters), file=sys.stderr)
+ print("ERROR: One experiment in the pool failed: " + repr(experiment.inputdata) + repr(experiment.parameters), file=stderr)
if haltonerror:
raise
else:
View
@@ -14,9 +14,22 @@
#
###############################################################
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+import sys
+if sys.version < '3':
+ from codecs import getwriter
+ stderr = getwriter('utf-8')(sys.stderr)
+ stdout = getwriter('utf-8')(sys.stdout)
+else:
+ stderr = sys.stderr
+ stdout = sys.stdout
+
from pynlpl.formats import folia
from pynlpl.common import Enum
-import sys
+
class InvalidTagException(Exception):
pass
@@ -81,7 +94,7 @@ def parse_cgn_postag(rawtag, raisefeatureexceptions = False):
tag.append( folia.Feature, subset=subset,cls=rawfeature)
break
if not found:
- print >>sys.stderr, "\t\tUnknown feature value: " + rawfeature + " in " + rawtag
+ print("\t\tUnknown feature value: " + rawfeature + " in " + rawtag, file=stderr)
if raisefeatureexceptions:
raise InvalidFeatureException("Unknown feature value: " + rawfeature + " in " + rawtag)
else:
@@ -1,26 +1,39 @@
#-*- coding:utf-8 -*-
###############################################################
-# Modified by Ruben Izquierdo
-# We need also to store the TIMBL distance to the nearest neighboor
-#
# PyNLPl - DutchSemCor
# by Maarten van Gompel (proycon)
# http://ilk.uvt.nl/~mvgompel
# Induction for Linguistic Knowledge Research Group
# Universiteit van Tilburg
#
# Licensed under GPLv3
+#
+# Modified by Ruben Izquierdo
+# We also need to store the TIMBL distance to the nearest neighbor
#
# Collection of formats for the DutchSemCor project
#
###############################################################
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+from pynlpl.common import u
+import sys
+if sys.version < '3':
+ from codecs import getwriter
+ stderr = getwriter('utf-8')(sys.stderr)
+ stdout = getwriter('utf-8')(sys.stdout)
+else:
+ stderr = sys.stderr
+ stdout = sys.stdout
from pynlpl.formats.timbl import TimblOutput
from pynlpl.statistics import Distribution
-import codecs
-from sys import stderr
+import io
+
class WSDSystemOutput(object):
def __init__(self, filename = None):
@@ -79,7 +92,7 @@ def __getitem__(self, word_id):
return self.data[word_id]
def load(self, filename):
- f = codecs.open(filename,'r','utf-8')
+ f = io.open(filename,'r',encoding='utf-8')
for line in f:
fields = line.strip().split(" ")
word_id = fields[0]
@@ -104,7 +117,7 @@ def load(self, filename):
f.close()
def save(self, filename):
- f = codecs.open(filename,'w','utf-8')
+ f = io.open(filename,'w',encoding='utf-8')
for word_id, senses,distance in self:
f.write(word_id)
for sense, confidence in senses:
@@ -117,11 +130,11 @@ def save(self, filename):
def out(self, filename):
for word_id, senses,distance in self:
- print word_id,distance,
+ print(word_id,distance,end="")
for sense, confidence in senses:
if confidence == None: confidence = "?"
- print " " + sense + " " + str(confidence),
- print
+ print(" " + sense + " " + str(confidence),end="")
+ print()
def senses(self, bestonly=False):
"""Returns a list of all predicted senses"""
@@ -135,24 +148,24 @@ def senses(self, bestonly=False):
def loadfromtimbl(self, filename):
- timbloutput = TimblOutput(codecs.open(filename,'r','utf-8'))
+ timbloutput = TimblOutput(io.open(filename,'r',encoding='utf-8'))
for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput):
if distance != None:
#distance='+vdi'+str(distance)
distance=float(distance)
if len(features) == 0:
- print >>stderr, "WARNING: Empty feature vector in " + filename + " (line " + str(i+1) + ") skipping!!"
+ print("WARNING: Empty feature vector in " + filename + " (line " + str(i+1) + ") skipping!!",file=stderr)
continue
word_id = features[0] #note: this is an assumption that must be adhered to!
if distribution:
self.append(word_id, distribution,distance)
def fromTimblToWsdout(self,fileTimbl,fileWsdout):
- timbloutput = TimblOutput(codecs.open(fileTimbl,'r','utf-8'))
- wsdoutfile = codecs.open(fileWsdout,'w','utf-8')
+ timbloutput = TimblOutput(io.open(fileTimbl,'r',encoding='utf-8'))
+ wsdoutfile = io.open(fileWsdout,'w',encoding='utf-8')
for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput):
if len(features) == 0:
- print >>stderr, "WARNING: Empty feature vector in " + fileTimbl + " (line " + str(i+1) + ") skipping!!"
+ print("WARNING: Empty feature vector in " + fileTimbl + " (line " + str(i+1) + ") skipping!!",file=stderr)
continue
word_id = features[0] #note: this is an assumption that must be adhered to!
if distribution:
@@ -169,7 +182,7 @@ class DataSet(object): #for testsets/trainingsets
def __init__(self, filename):
self.sense = {} #word_id => (sense_id, lemma,pos)
self.targetwords = {} #(lemma,pos) => [sense_id]
- f = codecs.open(filename,'r','utf-8')
+ f = io.open(filename,'r',encoding='utf-8')
for line in f:
if len(line) > 0 and line[0] != '#':
fields = line.strip('\n').split('\t')
@@ -197,10 +210,7 @@ def getpos(self, word_id):
return self.sense[self._sanitize(word_id)][2]
def _sanitize(self, word_id):
- if isinstance(word_id, unicode):
- return word_id
- else:
- return unicode(word_id,'utf-8')
+ return u(word_id)
def __contains__(self, word_id):
return (self._sanitize(word_id) in self.sense)
Oops, something went wrong.

0 comments on commit 0a19ff4

Please sign in to comment.