-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
more unicode bugfixes, added mst server option
- Loading branch information
1 parent
074ab5c
commit 397d39a
Showing
10 changed files
with
280 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
mock==1.0.1 | ||
nose==1.2.1 | ||
Flask==0.9 | ||
Flask-WTF==0.8 | ||
Jinja2==2.6 | ||
|
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
""" | ||
Author: Sam Thomson (sthomson@cs.cmu.edu) | ||
""" | ||
import codecs
import json
import os
import socket
import subprocess
from tempfile import mkdtemp

from semviz.settings import SEMAFOR_HOST, SEMAFOR_PORT, MST_HOST, MST_PORT, SEMAFOR_HOME
from semviz.utils import reshape, deleting
|
||
# Number of fields per token row; MstClient._reshape_conll passes it to
# reshape() to regroup the flat field list returned by the MST server.
NUM_CONLL_FIELDS = 10 | ||
# Default SocketClient buffer_size: the byte count passed to each recv() call.
DEFAULT_BUFFER_SIZE = 8192 | ||
# Default SocketClient timeout, handed to socket.settimeout().
DEFAULT_TIMEOUT = 20.0 | ||
# Codec name UnicodeSocketClient uses to encode requests and decode responses.
UTF_8 = 'utf8' | ||
|
||
|
||
class SocketClient(object):
    """
    A client for interacting with a running TCP socket server.
    A new connection is opened for every request and is always closed again
    before make_request returns or raises.
    """
    def __init__(self, host, port, buffer_size=DEFAULT_BUFFER_SIZE, timeout=DEFAULT_TIMEOUT):
        """
        Remembers where to connect; no connection is opened here.
        host, port: address of the server.
        buffer_size: number of bytes asked for in each recv() call.
        timeout: value passed to socket.settimeout() for each connection.
        """
        self.host = host
        self.port = port
        self.buffer_size = buffer_size
        self.timeout = timeout

    def make_request(self, request):
        """
        Sends request (a byte string) to the server and returns the server's
        complete response as a byte string.
        End of response is indicated by an empty chunk from recv().
        Opens and closes a new connection for each request.
        Socket errors (including timeouts) propagate to the caller.
        """
        client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            client.settimeout(self.timeout)
            client.connect((self.host, self.port))
            client.sendall(request)
            # We are done sending: shut down the write side, keep reading.
            client.shutdown(socket.SHUT_WR)
            chunks = []
            while True:
                chunk = client.recv(self.buffer_size)
                if not chunk:
                    break
                chunks.append(chunk)
        finally:
            # Release the socket on every path; it used to be leaked.
            client.close()
        # recv() yields byte strings, so join with a bytes separator
        # (b'' is the same object type as '' on Python 2).
        return b''.join(chunks)
|
||
|
||
class UnicodeSocketClient(SocketClient):
    """
    A SocketClient that speaks unicode text instead of raw bytes.
    Outgoing requests are encoded as utf8 and incoming responses are decoded
    from utf8.
    """
    def make_request(self, request):
        """ Sends the unicode request and returns the response as unicode. """
        encoded_request = request.encode(UTF_8)
        parent = super(UnicodeSocketClient, self)
        raw_response = parent.make_request(encoded_request)
        decoded_response = raw_response.decode(UTF_8)
        return decoded_response
|
||
|
||
class SemaforClient(object):
    """
    A client for retrieving frame-semantic parses from a running SEMAFOR
    server.
    """
    def __init__(self, dependency_parser, socket_client):
        """
        Creates a new client with the given dependency parser and connection
        to a running SEMAFOR server.
        dependency_parser: object whose get_parses(sentences) returns the
            dependency parses that are sent to the server.
        socket_client: object whose make_request(text) returns the server's
            response as text.
        """
        self._dependency_parser = dependency_parser
        self._client = socket_client

    @classmethod
    def create(cls, dependency_parser, host=SEMAFOR_HOST, port=SEMAFOR_PORT):
        """
        Convenience constructor: builds a client that talks to host:port
        through a UnicodeSocketClient.
        """
        # cls rather than a hard-coded class name, so subclasses get
        # instances of themselves.
        return cls(dependency_parser, UnicodeSocketClient(host, port))

    def get_parse(self, sentence):
        """
        Gets the frame-semantic parse of a single sentence string, i.e. the
        first element of get_parses([sentence]).
        """
        return self.get_parses([sentence])[0]

    def get_parses(self, sentences):
        """
        Gets frame-semantic parses as a list of decoded json values from a
        list of sentence strings.
        """
        dependency_parses = self._dependency_parser.get_parses(sentences)
        return self._get_parses_from_conll(dependency_parses)

    def _get_parses_from_conll(self, dependency_parses):
        """
        Gets frame-semantic parses from dependency-parsed English sentences
        in conll format.
        Each non-blank line of the server's response is decoded as one json
        document; blank lines are skipped instead of crashing json.loads.
        """
        response = self._client.make_request(dependency_parses)
        return [json.loads(line)
                for line in response.splitlines()
                if line.strip()]
|
||
|
||
class PosTagger(object):
    """ A client for running tokenization and part-of-speech tagging """
    def tag_sentences(self, sentences):
        """
        Runs tokenization and part-of-speech tagging on sentences.
        sentences: iterable of unicode strings; they are written to a
            temporary input file joined by newlines.
        Returns the contents of the "pos.tagged" file found in the temporary
        directory after the script has run, as unicode.
        Raises subprocess.CalledProcessError if the tagging script exits with
        a non-zero status (previously the status was ignored).
        """
        # `deleting` comes from semviz.utils; presumably it removes the
        # directory when the block exits -- confirm against that helper.
        with deleting(mkdtemp(suffix='XXXXXX', prefix='semafor.')) as temp_dir:
            input_filename = os.path.join(temp_dir, "sentence")
            output_filename = os.path.join(temp_dir, "pos.tagged")
            with codecs.open(input_filename, 'w', encoding="utf8") as input_file:
                input_file.write(u'\n'.join(sentences))
            # Argument list + cwd instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through untouched.
            subprocess.check_call(
                ["./bin/tokenizeAndPosTag.sh", input_filename, temp_dir],
                cwd=SEMAFOR_HOME)
            with codecs.open(output_filename, encoding="utf8") as output_file:
                return output_file.read()
|
||
|
||
class MstClient(object):
    """
    A client for retrieving dependency parses from a running MSTParser server
    """
    def __init__(self, pos_tagger, socket_client):
        """
        Creates a new client with the given part-of-speech tagger and socket
        connection to a running MSTParser server.
        pos_tagger: object whose tag_sentences(sentences) returns the
            POS-tagged text, one sentence per line.
        socket_client: object whose make_request(text) returns the server's
            response as text.
        """
        self._pos_tagger = pos_tagger
        self._client = socket_client

    @classmethod
    def create(cls, pos_tagger, host=MST_HOST, port=MST_PORT):
        """
        Convenience constructor: builds a client that talks to host:port
        through a UnicodeSocketClient.
        """
        # cls rather than a hard-coded class name, so subclasses get
        # instances of themselves.
        return cls(pos_tagger, UnicodeSocketClient(host, port))

    @staticmethod
    def _reshape_conll(conll, pos):
        """
        Takes one output parse from MSTParser server, and converts it to conll
        MST changes tokens... we change them back here.
        conll: one response line, all fields of all tokens tab-separated.
        pos: the matching POS-tagged input line of whitespace-separated
            token_TAG items.
        Returns one tab-separated row per token, rows joined by newlines.
        """
        all_fields = conll.split(u'\t')
        # The tag follows the LAST underscore, so split from the right:
        # a token such as "snake_case_NN" must yield "snake_case", not "snake".
        tokens = [word.rsplit('_', 1)[0] for word in pos.split()]
        # give each token its own line
        rows = reshape(all_fields, NUM_CONLL_FIELDS)
        # Revert any changes MST made to tokens: overwrite the two fields
        # after the index with the original token.
        for i, token in enumerate(tokens):
            rows[i][1:3] = [token, token]
        return u'\n'.join(u'\t'.join(row) for row in rows)

    def get_parses(self, sentences):
        """
        Gets dependency parses as conll for the given sentences.
        The sentences are POS-tagged, the tagged text is sent to the server,
        and each response line is reshaped using its matching tagged line.
        Returns the per-sentence conll blocks joined by blank lines.
        """
        pos_tagged = self._pos_tagger.tag_sentences(sentences)
        response = self._client.make_request(pos_tagged)
        # reformat a response from the MSTParser server into proper conll.
        parse_pos_pairs = zip(response.splitlines(), pos_tagged.splitlines())
        return u'\n\n'.join(self._reshape_conll(parse, pos)
                            for parse, pos in parse_pos_pairs)
|
||
|
||
class MaltClient(object):
    """
    A client for retrieving dependency parses from a MaltParser executable
    """
    def get_parses(self, sentences):
        """
        Gets dependency parses as conll for the given sentences.
        sentences: iterable of unicode strings; they are written to a
            temporary input file joined by newlines.
        Returns the contents of the "conll" file found in the temporary
        directory after the script has run, as unicode.
        Raises subprocess.CalledProcessError if the MaltParser script exits
        with a non-zero status (previously the status was ignored).
        """
        # TODO: server version of Malt?
        # `deleting` comes from semviz.utils; presumably it removes the
        # directory when the block exits -- confirm against that helper.
        with deleting(mkdtemp(suffix='XXXXXX', prefix='semafor.')) as temp_dir:
            input_filename = os.path.join(temp_dir, "sentence")
            output_filename = os.path.join(temp_dir, "conll")
            with codecs.open(input_filename, 'w', encoding="utf8") as input_file:
                input_file.write(u'\n'.join(sentences))
            # Argument list + cwd instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through untouched.
            subprocess.check_call(
                ["./bin/runMalt.sh", input_filename, temp_dir],
                cwd=SEMAFOR_HOME)
            with codecs.open(output_filename, encoding="utf8") as output_file:
                return output_file.read()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,14 @@ | ||
""" | ||
Author: Sam Thomson (sthomson@cs.cmu.edu) | ||
""" | ||
|
||
# SEMAFOR server settings: the host and port that SemaforClient.create
# connects to by default.
SEMAFOR_HOST = "localhost" | ||
SEMAFOR_PORT = 4444 | ||
|
||
# Location of MaltParser and POS tagger executables: the scripts
# ./bin/runMalt.sh and ./bin/tokenizeAndPosTag.sh are run from this directory.
# NOTE(review): this is a hard-coded path on one developer's machine and
# has to be edited for any other installation.
SEMAFOR_HOME = "/Users/sam/code/semafor/semafor" | ||
|
||
# MST Parser server settings: the host and port that MstClient.create
# connects to by default.
MST_HOST = "localhost" | ||
MST_PORT = 12345 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
""" | ||
Author: Sam Thomson (sthomson@cs.cmu.edu) | ||
""" | ||
from unittest import TestCase | ||
from mock import Mock | ||
from semviz.services import MstClient, PosTagger, UnicodeSocketClient | ||
|
||
# Fixtures for TestMstClient.test_get_parses.
# NOTE(review): MstClient._reshape_conll splits the server response on tab
# characters; the field separators below should be checked against the
# repository copy in case whitespace was altered when this text was captured.

# Raw input: two sentences, one per line.
SENTENCES = u"""My kitchen no longer smells. | ||
0.2 miles later the trail and the creek run nearly level with each other and the sound of Minnehaha Falls fills the air.""" | ||
# Value returned by the mocked POS tagger: the same two sentences as
# token_TAG items, one sentence per line.
POS_TAGGED = u"""My_PRP$ kitchen_NN no_RB longer_RB smells_VBZ ._. | ||
0.2_CD miles_NNS later_RB the_DT trail_NN and_CC the_DT creek_NN run_NN nearly_RB level_NN with_IN each_DT other_JJ and_CC the_DT sound_NN of_IN Minnehaha_NNP Falls_NNP fills_VBZ the_DT air_NN ._.""" | ||
# Value returned by the mocked MST server connection: one line per sentence
# holding the fields of every token; note the altered tokens ("my", "<num>",
# "minnehaha", "falls") compared with POS_TAGGED.
MST_RESPONSE = u"""1 my my PRP$ PRP$ - 2 NMOD - - 2 kitchen kitchen NN NN - 5 SUB - - 3 no no RB RB - 5 VMOD - - 4 longer longer RB RB - AMOD - - 5 smells smells VBZ VBZ - 0 ROOT - - 6 . . . . - 5 P - - | ||
1 <num> <num> CD CD - 2 NMOD - - 2 miles miles NNS NNS - 11 NMOD - - 3 later later RB RB - 11 NMOD - - 4 the the DT DT - 11 NMOD - - 5 trail trail NN NN - 11 NMOD - 6 and and CC CC - 11 NMOD - - 7 the the DT DT - 11 NMOD - - 8 creek creek NN NN - 11 NMOD - - 9 run run NN NN 11 NMOD - - 10 nearly nearly RB RB - 11 NMOD - 11 level level NN NN - 17 NMOD - - 12 with with IN IN - 11 NMOD - - 13 each each DT DT - 14 NMOD - - 14 other other JJ JJ 12 PMOD - - 15 and and CC CC - 17 NMOD - 16 the the DT DT - 17 NMOD - - 17 sound sound NN NN - 21 SUB - - 18 of of IN IN - 17 NMOD - - 19 minnehaha minnehaha NNP NNP - 20 NMOD - - 20 falls falls NNP NNP 18 PMOD - - 21 fills fills VBZ VBZ - 0 ROOT - 22 the the DT DT - 23 NMOD - - 23 air air NN NN - 21 OBJ - - 24 . . . - 21 P - -""" | ||
# Expected result of MstClient.get_parses: one token row per line, with the
# original tokens from POS_TAGGED restored.
CONLL = u"""1 My My PRP$ PRP$ - 2 NMOD - - | ||
2 kitchen kitchen NN NN - 5 SUB - - | ||
3 no no RB RB - 5 VMOD - - | ||
4 longer longer RB RB - AMOD - - | ||
5 smells smells VBZ VBZ - 0 ROOT - - | ||
6 . . . . - 5 P - - | ||
1 0.2 0.2 CD CD - 2 NMOD - - | ||
2 miles miles NNS NNS - 11 NMOD - - | ||
3 later later RB RB - 11 NMOD - - | ||
4 the the DT DT - 11 NMOD - - | ||
5 trail trail NN NN - 11 NMOD - | ||
6 and and CC CC - 11 NMOD - - | ||
7 the the DT DT - 11 NMOD - - | ||
8 creek creek NN NN - 11 NMOD - - | ||
9 run run NN NN 11 NMOD - - | ||
10 nearly nearly RB RB - 11 NMOD - | ||
11 level level NN NN - 17 NMOD - - | ||
12 with with IN IN - 11 NMOD - - | ||
13 each each DT DT - 14 NMOD - - | ||
14 other other JJ JJ 12 PMOD - - | ||
15 and and CC CC - 17 NMOD - | ||
16 the the DT DT - 17 NMOD - - | ||
17 sound sound NN NN - 21 SUB - - | ||
18 of of IN IN - 17 NMOD - - | ||
19 Minnehaha Minnehaha NNP NNP - 20 NMOD - - | ||
20 Falls Falls NNP NNP 18 PMOD - - | ||
21 fills fills VBZ VBZ - 0 ROOT - | ||
22 the the DT DT - 23 NMOD - - | ||
23 air air NN NN - 21 OBJ - - | ||
24 . . . - 21 P - -""" | ||
|
||
|
||
class TestMstClient(TestCase):
    """ Tests MstClient against a mocked POS tagger and MST server. """
    def test_get_parses(self):
        # never truncate the diff when the long fixtures do not match
        self.maxDiff = None
        # stand-in for the connection to the MST server
        fake_server = Mock(spec=UnicodeSocketClient)
        fake_server.make_request = Mock(return_value=MST_RESPONSE)
        # stand-in for the POS tagger
        fake_tagger = Mock(spec=PosTagger)
        fake_tagger.tag_sentences = Mock(return_value=POS_TAGGED)
        client = MstClient(fake_tagger, fake_server)
        # run the code under test
        actual = client.get_parses(SENTENCES)
        # the tagger must have seen the raw sentences, and the server the
        # tagged text
        fake_tagger.tag_sentences.assert_called_once_with(SENTENCES)
        fake_server.make_request.assert_called_once_with(POS_TAGGED)
        # the result must have one token row per line, with MST's token
        # changes reverted
        self.assertEqual(CONLL, actual)
Oops, something went wrong.