Skip to content

Commit

Permalink
more unicode bugfixes, added mst server option
Browse files Browse the repository at this point in the history
  • Loading branch information
sammthomson committed Apr 5, 2013
1 parent 074ab5c commit 397d39a
Show file tree
Hide file tree
Showing 10 changed files with 280 additions and 118 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ To compile:
sass --watch semviz/static/css/style.scss:semviz/static/css/style.css

Make sure `semviz.settings.SEMAFOR_HOME` points to a valid installation of
SEMAFOR (>= v3.0-alpha-02).
SEMAFOR (>= v3.0-alpha-03).

Make sure SEMAFOR is running in server mode:

cd $SEMAFOR_HOME
java -Xms4g -Xmx4g -jar target/Semafor-3.0-alpha-02.jar model-dir:<directory-of-trained-model> port:4444
java -Xms4g -Xmx4g -jar target/Semafor-3.0-alpha-03.jar model-dir:<directory-of-trained-model> port:4444

and `semviz.settings.SEMAFOR_HOST` and `semviz.settings.SEMAFOR_PORT` point to the running instance.

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
mock==1.0.1
nose==1.2.1
Flask==0.9
Flask-WTF==0.8
Jinja2==2.6
Expand Down
36 changes: 0 additions & 36 deletions semviz/malt_client.py

This file was deleted.

74 changes: 0 additions & 74 deletions semviz/semafor_client.py

This file was deleted.

170 changes: 170 additions & 0 deletions semviz/services.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
"""
Author: Sam Thomson (sthomson@cs.cmu.edu)
"""
import codecs
import json
import os
import socket
from tempfile import mkdtemp
from semviz.settings import SEMAFOR_HOST, SEMAFOR_PORT, MST_HOST, MST_PORT, SEMAFOR_HOME
from semviz.utils import reshape, deleting

# Number of fields that make up one token row when MST output is reshaped
# into conll rows.
NUM_CONLL_FIELDS = 10
# Default maximum number of bytes requested per socket recv() call.
DEFAULT_BUFFER_SIZE = 8192
# Default value handed to socket.settimeout(), i.e. seconds.
DEFAULT_TIMEOUT = 20.0
# Codec name used to encode requests and decode responses.
UTF_8 = 'utf8'


class SocketClient(object):
    """
    A client for interacting with a running TCP socket server.

    Instances only hold connection settings; every request opens its own
    connection and closes it again before returning.
    """
    def __init__(self, host, port, buffer_size=DEFAULT_BUFFER_SIZE, timeout=DEFAULT_TIMEOUT):
        """
        Stores the connection settings. Does not connect.

        host -- host name or address of the server
        port -- TCP port of the server
        buffer_size -- maximum number of bytes to read per recv() call
        timeout -- value passed to socket.settimeout() (seconds)
        """
        self.host = host
        self.port = port
        self.buffer_size = buffer_size
        self.timeout = timeout

    def make_request(self, request):
        """
        Sends request to the server and gets the server's response.

        The request is sent in full, then the write side of the connection
        is shut down. The response is read until recv() returns an empty
        string. Opens and closes a new connection for each request.

        request -- the byte string to send
        Returns the complete response as a byte string.
        Any socket error (including a timeout) propagates to the caller;
        the socket is closed in that case as well.
        """
        client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            client.settimeout(self.timeout)
            client.connect((self.host, self.port))
            client.sendall(request)
            # Half-close: signals that the request is complete while the
            # read side stays open for the response.
            client.shutdown(socket.SHUT_WR)
            response = []
            while True:
                chunk = client.recv(self.buffer_size)
                if not chunk:
                    break
                response.append(chunk)
        finally:
            # Always release the file descriptor, even if connecting,
            # sending or receiving failed.
            client.close()
        # recv() yields byte strings, so join with a byte-string separator.
        return b''.join(response)


class UnicodeSocketClient(SocketClient):
    """
    A SocketClient whose requests and responses are unicode text.
    Requests are encoded as utf8 before sending and responses are decoded
    from utf8 before being returned.
    """
    def make_request(self, request):
        """ Encodes request, sends it, and returns the decoded response. """
        encoded_request = request.encode(UTF_8)
        parent = super(UnicodeSocketClient, self)
        raw_response = parent.make_request(encoded_request)
        return raw_response.decode(UTF_8)


class SemaforClient(object):
    """
    Retrieves frame-semantic parses from a running SEMAFOR server.

    Sentences are first handed to a dependency parser; that parser's output
    is then sent to the server through a socket client.
    """
    def __init__(self, dependency_parser, socket_client):
        """
        Builds a client from its two collaborators.

        dependency_parser -- object whose get_parses(sentences) result is
                             sent to the server as the request
        socket_client -- object whose make_request(text) returns the
                         server's response text
        """
        self._client = socket_client
        self._dependency_parser = dependency_parser

    @staticmethod
    def create(dependency_parser, host=SEMAFOR_HOST, port=SEMAFOR_PORT):
        """ Convenience factory that connects via a UnicodeSocketClient. """
        connection = UnicodeSocketClient(host, port)
        return SemaforClient(dependency_parser, connection)

    def get_parse(self, sentence):
        """
        Gets the frame-semantic parse (decoded from json) of a single
        sentence string.
        """
        parses = self.get_parses([sentence])
        return parses[0]

    def get_parses(self, sentences):
        """
        Gets frame-semantic parses, one decoded json value per response
        line, from a list of sentence strings.
        """
        conll = self._dependency_parser.get_parses(sentences)
        return self._get_parses_from_conll(conll)

    def _get_parses_from_conll(self, dependency_parses):
        """
        Sends dependency-parsed sentences (conll text) to the server and
        decodes every line of the response as json.
        """
        response = self._client.make_request(dependency_parses)
        parses = []
        for line in response.splitlines():
            parses.append(json.loads(line))
        return parses


class PosTagger(object):
    """ Tokenizes and part-of-speech tags text by running a SEMAFOR script. """
    def tag_sentences(self, sentences):
        """
        Runs tokenization and part-of-speech tagging on a list of sentences.

        The sentences are joined with newlines and written (utf8) to a file
        named "sentence" inside a fresh temporary directory. The script
        ./bin/tokenizeAndPosTag.sh is then run from SEMAFOR_HOME with that
        file and the directory as arguments, and the utf8 contents of the
        "pos.tagged" file in the same directory are returned.

        The script's exit status is not checked.
        """
        # `deleting` comes from semviz.utils; presumably it removes the
        # directory when the block exits -- confirm there.
        with deleting(mkdtemp(suffix='XXXXXX', prefix='semafor.')) as temp_dir:
            sentence_path = os.path.join(temp_dir, "sentence")
            tagged_path = os.path.join(temp_dir, "pos.tagged")
            with codecs.open(sentence_path, 'w', encoding="utf8") as sink:
                sink.write(u'\n'.join(sentences))
            command = "cd %s && ./bin/tokenizeAndPosTag.sh %s %s" % (
                SEMAFOR_HOME, sentence_path, temp_dir)
            os.system(command)
            with codecs.open(tagged_path, encoding="utf8") as source:
                return source.read()


class MstClient(object):
    """
    A client for retrieving dependency parses from a running MSTParser server
    """
    def __init__(self, pos_tagger, socket_client):
        """
        Creates a new client with the given part-of-speech tagger and socket
        connection to a running MSTParser server.

        pos_tagger -- object whose tag_sentences(sentences) returns one line
                      of whitespace-separated token_TAG items per sentence
        socket_client -- object whose make_request(text) returns the
                         server's response text
        """
        self._pos_tagger = pos_tagger
        self._client = socket_client

    @staticmethod
    def create(pos_tagger, host=MST_HOST, port=MST_PORT):
        """ Convenience static constructor """
        return MstClient(pos_tagger, UnicodeSocketClient(host, port))

    @staticmethod
    def _reshape_conll(conll, pos):
        """
        Takes one output parse from MSTParser server, and converts it to conll
        MST changes tokens... we change them back here.

        conll -- one response line: the tab-separated fields of every token
                 of the sentence, flattened onto a single line
        pos -- the matching line of tagger output (token_TAG items)
        Returns the sentence with one tab-separated token row per line.
        """
        all_fields = conll.split(u'\t')
        # Only the LAST underscore separates the token from its tag, so a
        # token that itself contains '_' must be kept whole.
        tokens = [word.rsplit('_', 1)[0] for word in pos.split()]
        # give each token its own line
        rows = reshape(all_fields, NUM_CONLL_FIELDS)
        # Revert any changes MST made to tokens: overwrite the two token
        # columns (indices 1 and 2) with the original token.
        for i, token in enumerate(tokens):
            rows[i][1:3] = [token, token]
        return u'\n'.join(u'\t'.join(row) for row in rows)

    def get_parses(self, sentences):
        """
        Gets dependency parses as conll from a list of sentence strings.

        The sentences are tagged, the tagged text is sent to the server, and
        each response line is paired with the tagged line at the same
        position. The per-sentence results are separated by a blank line.
        """
        pos_tagged = self._pos_tagger.tag_sentences(sentences)
        response = self._client.make_request(pos_tagged)
        # reformat a response from the MSTParser server into proper conll.
        parse_pos_pairs = zip(response.splitlines(), pos_tagged.splitlines())
        return u'\n\n'.join(MstClient._reshape_conll(parse, pos)
                            for parse, pos in parse_pos_pairs)


class MaltClient(object):
    """
    Retrieves dependency parses by running the MaltParser script shipped
    with SEMAFOR.
    """
    def get_parses(self, sentences):
        """
        Gets dependency parses as conll text from a list of sentences.

        The sentences are joined with newlines and written (utf8) to a file
        named "sentence" inside a fresh temporary directory. The script
        ./bin/runMalt.sh is then run from SEMAFOR_HOME with that file and
        the directory as arguments, and the utf8 contents of the "conll"
        file in the same directory are returned.

        The script's exit status is not checked.
        """
        # TODO: server version of Malt?
        # `deleting` comes from semviz.utils; presumably it removes the
        # directory when the block exits -- confirm there.
        with deleting(mkdtemp(suffix='XXXXXX', prefix='semafor.')) as temp_dir:
            sentence_path = os.path.join(temp_dir, "sentence")
            conll_path = os.path.join(temp_dir, "conll")
            with codecs.open(sentence_path, 'w', encoding="utf8") as sink:
                sink.write(u'\n'.join(sentences))
            command = "cd %s && ./bin/runMalt.sh %s %s" % (
                SEMAFOR_HOME, sentence_path, temp_dir)
            os.system(command)
            with codecs.open(conll_path, encoding="utf8") as source:
                return source.read()
11 changes: 11 additions & 0 deletions semviz/settings.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
"""
Author: Sam Thomson (sthomson@cs.cmu.edu)
"""

import os

# SEMAFOR server settings
SEMAFOR_HOST = "localhost"
SEMAFOR_PORT = 4444

# Location of MaltParser and POS tagger executables.
# The default is the original developer checkout; set the SEMAFOR_HOME
# environment variable to use another installation without editing this file.
SEMAFOR_HOME = os.environ.get("SEMAFOR_HOME", "/Users/sam/code/semafor/semafor")

# MST Parser server settings
MST_HOST = "localhost"
MST_PORT = 12345
2 changes: 1 addition & 1 deletion semviz/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<h1>Semafor Demo</h1>

<form method="GET" onSubmit="semViz.submitSentence(); return false;">
Type a sentence and press "Submit."
Type an English sentence and press "Submit."
<br>
{{ form.sentence(rows=6, columns=200) }}
<br>
Expand Down
63 changes: 63 additions & 0 deletions semviz/test_services.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""
Author: Sam Thomson (sthomson@cs.cmu.edu)
"""
from unittest import TestCase
from mock import Mock
from semviz.services import MstClient, PosTagger, UnicodeSocketClient

# Raw input text handed to MstClient.get_parses in the test: two sentences,
# one per line.
SENTENCES = u"""My kitchen no longer smells.
0.2 miles later the trail and the creek run nearly level with each other and the sound of Minnehaha Falls fills the air."""
# Canned tagger output returned by the mocked PosTagger: one line per
# sentence, each item written as token_TAG.
POS_TAGGED = u"""My_PRP$ kitchen_NN no_RB longer_RB smells_VBZ ._.
0.2_CD miles_NNS later_RB the_DT trail_NN and_CC the_DT creek_NN run_NN nearly_RB level_NN with_IN each_DT other_JJ and_CC the_DT sound_NN of_IN Minnehaha_NNP Falls_NNP fills_VBZ the_DT air_NN ._."""
# Canned reply returned by the mocked socket client: one line per sentence,
# with the fields of every token flattened onto that line. Tokens are
# lower-cased and the number is replaced by <num>.
# NOTE(review): the code under test splits and joins fields on tabs; the
# separators here read as single spaces and some rows have fewer than ten
# fields -- confirm the fixture still contains the intended tabs.
MST_RESPONSE = u"""1 my my PRP$ PRP$ - 2 NMOD - - 2 kitchen kitchen NN NN - 5 SUB - - 3 no no RB RB - 5 VMOD - - 4 longer longer RB RB - AMOD - - 5 smells smells VBZ VBZ - 0 ROOT - - 6 . . . . - 5 P - -
1 <num> <num> CD CD - 2 NMOD - - 2 miles miles NNS NNS - 11 NMOD - - 3 later later RB RB - 11 NMOD - - 4 the the DT DT - 11 NMOD - - 5 trail trail NN NN - 11 NMOD - 6 and and CC CC - 11 NMOD - - 7 the the DT DT - 11 NMOD - - 8 creek creek NN NN - 11 NMOD - - 9 run run NN NN 11 NMOD - - 10 nearly nearly RB RB - 11 NMOD - 11 level level NN NN - 17 NMOD - - 12 with with IN IN - 11 NMOD - - 13 each each DT DT - 14 NMOD - - 14 other other JJ JJ 12 PMOD - - 15 and and CC CC - 17 NMOD - 16 the the DT DT - 17 NMOD - - 17 sound sound NN NN - 21 SUB - - 18 of of IN IN - 17 NMOD - - 19 minnehaha minnehaha NNP NNP - 20 NMOD - - 20 falls falls NNP NNP 18 PMOD - - 21 fills fills VBZ VBZ - 0 ROOT - 22 the the DT DT - 23 NMOD - - 23 air air NN NN - 21 OBJ - - 24 . . . - 21 P - -"""
# Expected MstClient.get_parses output: one token row per line, with the
# original tokens (case and number) restored in the two token columns.
# NOTE(review): get_parses joins sentences with a blank line; no blank line
# is visible between the two sentences below -- confirm against the real file.
CONLL = u"""1 My My PRP$ PRP$ - 2 NMOD - -
2 kitchen kitchen NN NN - 5 SUB - -
3 no no RB RB - 5 VMOD - -
4 longer longer RB RB - AMOD - -
5 smells smells VBZ VBZ - 0 ROOT - -
6 . . . . - 5 P - -
1 0.2 0.2 CD CD - 2 NMOD - -
2 miles miles NNS NNS - 11 NMOD - -
3 later later RB RB - 11 NMOD - -
4 the the DT DT - 11 NMOD - -
5 trail trail NN NN - 11 NMOD -
6 and and CC CC - 11 NMOD - -
7 the the DT DT - 11 NMOD - -
8 creek creek NN NN - 11 NMOD - -
9 run run NN NN 11 NMOD - -
10 nearly nearly RB RB - 11 NMOD -
11 level level NN NN - 17 NMOD - -
12 with with IN IN - 11 NMOD - -
13 each each DT DT - 14 NMOD - -
14 other other JJ JJ 12 PMOD - -
15 and and CC CC - 17 NMOD -
16 the the DT DT - 17 NMOD - -
17 sound sound NN NN - 21 SUB - -
18 of of IN IN - 17 NMOD - -
19 Minnehaha Minnehaha NNP NNP - 20 NMOD - -
20 Falls Falls NNP NNP 18 PMOD - -
21 fills fills VBZ VBZ - 0 ROOT -
22 the the DT DT - 23 NMOD - -
23 air air NN NN - 21 OBJ - -
24 . . . - 21 P - -"""


class TestMstClient(TestCase):
    """ Tests MstClient against a mocked tagger and a mocked MST server. """
    def test_get_parses(self):
        """
        get_parses should tag the input, send the tagged text to the server,
        and return the reply reshaped into conll with original tokens.
        """
        # show the full diff if the final comparison fails
        self.maxDiff = None
        # stand-ins for the tagger and the MST server connection
        tagger = Mock(spec=PosTagger)
        tagger.tag_sentences = Mock(return_value=POS_TAGGED)
        server = Mock(spec=UnicodeSocketClient)
        server.make_request = Mock(return_value=MST_RESPONSE)
        client = MstClient(tagger, server)
        # run the code under test
        actual = client.get_parses(SENTENCES)
        # each collaborator is called exactly once with the expected input
        tagger.tag_sentences.assert_called_once_with(SENTENCES)
        server.make_request.assert_called_once_with(POS_TAGGED)
        # every token row is on its own line and MST's token changes
        # have been reverted
        self.assertEqual(CONLL, actual)
Loading

0 comments on commit 397d39a

Please sign in to comment.