Permalink
Browse files

updated LM for Python 3 compatibility

  • Loading branch information...
1 parent 94c06d5 commit 95945a773613c89b51de61c89bf8204e80c1707c @proycon committed Mar 29, 2013
Showing with 33 additions and 12 deletions.
  1. +12 −2 lm/client.py
  2. +14 −7 lm/lm.py
  3. +1 −1 lm/makesrilmcc
  4. +2 −0 lm/server.py
  5. +4 −2 lm/srilm.py
View
@@ -1,15 +1,22 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+
import socket
class LMClient(object):
def __init__(self,host= "localhost",port=12346,n = 0):
self.BUFSIZE = 1024
self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) #Create the socket
- self.socket.settimeout(120)
+ self.socket.settimeout(120)
+ assert isinstance(port,int)
self.socket.connect((host, port)) #Connect to server
+ assert isinstance(n,int)
self.n = n
def scoresentence(self, sentence):
@@ -27,7 +34,10 @@ def __getitem__(self, ngram):
ngram = ngram.split(" ")
if len(ngram) != self.n:
raise Exception("This client instance has been set to send only " + str(self.n) + "-grams.")
- self.socket.send(" ".join(ngram)+ "\r\n")
+ ngram = " ".join(ngram)
+ if (sys.version < '3' and isinstance(ngram,unicode)) or( sys.version == '3' and isinstance(ngram,str)):
+ ngram = ngram.encode('utf-8')
+ self.socket.send(ngram + b"\r\n")
return float(self.socket.recv(self.BUFSIZE).strip())
View
@@ -8,20 +8,27 @@
#
#----------------------------------------------------------------
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+#from pynlpl.common import u
+
from pynlpl.statistics import FrequencyList, product
from pynlpl.textprocessors import Windower
-import codecs
+import io
from sys import stderr
class SimpleLanguageModel:
"""This is a very simple unsmoothed language model"""
+
def __init__(self, n=2, casesensitive = True, beginmarker = "<begin>", endmarker = "<end>"):
self.casesensitive = casesensitive
self.freqlistN = FrequencyList(None, self.casesensitive)
self.freqlistNm1 = FrequencyList(None, self.casesensitive)
-
- assert n >= 2
+
+ assert isinstance(n,int) and n >= 2
self.n = n
self.beginmarker = beginmarker
self.endmarker = endmarker
@@ -45,7 +52,7 @@ def append(self, sentence):
def load(self, filename):
self.freqlistN = FrequencyList(None, self.casesensitive)
self.freqlistNm1 = FrequencyList(None, self.casesensitive)
- f = codecs.open(filename,'r','utf-8')
+ f = io.open(filename,'r',encoding='utf-8')
mode = False
for line in f.readlines():
line = line.strip()
@@ -80,13 +87,13 @@ def load(self, filename):
type, count = line.split("\t")
self.freqlistN.count(type.split(' '),int(count))
except:
- print >>stderr,"Warning, could not parse line whilst loading frequency list: ", line
+ print("Warning, could not parse line whilst loading frequency list: ", line,file=stderr)
elif mode == 3:
try:
type, count = line.split("\t")
self.freqlistNm1.count(type.split(' '),int(count))
except:
- print >>stderr,"Warning, could not parse line whilst loading frequency list: ", line
+ print("Warning, could not parse line whilst loading frequency list: ", line,file=stderr)
if self.beginmarker:
self._begingram = [self.beginmarker] * (self.n-1)
@@ -95,7 +102,7 @@ def load(self, filename):
def save(self, filename):
- f = codecs.open(filename,'w','utf-8')
+ f = io.open(filename,'w',encoding='utf-8')
f.write("[simplelanguagemodel]\n")
f.write("n="+str(self.n)+"\n")
f.write("sentences="+str(self.sentences)+"\n")
View
@@ -27,7 +27,7 @@ export SRILMLIBS=$SRILM/lib/i686
if [ -z $1 ]; then
PYTHONVERSION=$2
else
- PYTHONVERSION="2.5"
+ PYTHONVERSION="2.7"
fi
g++ -fPIC -shared -I/usr/include/python$PYTHONVERSION -lpython$PYTHONVERSION -I$SRILM/src -I$SRILM/include -lboost_python srilm.cc $SRILMLIBS/liboolm.a $SRILMLIBS/libdstruct.a $SRILMLIBS/libmisc.a -o srilmcc.so
View
@@ -11,6 +11,8 @@
#
#----------------------------------------------------------------
+#No Python 3 support for twisted yet...
+
from twisted.internet import protocol, reactor
from twisted.protocols import basic
View
@@ -13,6 +13,10 @@
#
#----------------------------------------------------------------
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
import srilmcc
from pynlpl.textprocessors import Windower
@@ -38,8 +42,6 @@ def __contains__(self, key):
return self.model.exists( key )
def logscore(self, ngram):
- n = len(ngram)
-
#Bug work-around
#if "" in ngram or "_" in ngram or "__" in ngram:
# print >> sys.stderr, "WARNING: Invalid word in n-gram! Ignoring", ngram

0 comments on commit 95945a7

Please sign in to comment.