/
corenlp.py
56 lines (48 loc) · 1.81 KB
/
corenlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import json, requests
class StanfordCoreNLP:
    """Thin HTTP client for a running Stanford CoreNLP server."""

    def __init__(self, server_url):
        """Remember the server base URL, minus one trailing slash if present.

        Args:
            server_url: Base URL of the CoreNLP server.
        """
        # Slicing (rather than indexing [-1]) keeps an empty URL from raising
        # IndexError while behaving the same for every non-empty input.
        if server_url[-1:] == '/':
            server_url = server_url[:-1]
        self.server_url = server_url

    @staticmethod
    def _decode_json(payload):
        """Return *payload* parsed as JSON, or *payload* itself if it is not valid JSON."""
        try:
            # No ``encoding`` keyword: it was removed from json.loads in
            # Python 3.9, and passing it made every decode fail silently.
            return json.loads(payload)
        except ValueError:
            # Best effort by design: callers get the raw text back.
            return payload

    def annotate(self, text, properties=None):
        """POST *text* to the server and return its response.

        Args:
            text: The text to send; must be a ``str``.
            properties: Optional ``dict`` of server properties. It is sent
                as the ``properties`` query parameter, serialised with
                ``str()``.

        Returns:
            The parsed JSON when ``properties['outputFormat'] == 'json'`` and
            the response body is valid JSON; otherwise the response text.

        Raises:
            AssertionError: If *text* is not a ``str`` or *properties* is
                neither ``None`` nor a ``dict``.
            Exception: If the server cannot be reached.
        """
        assert isinstance(text, str)
        if properties is None:
            properties = {}
        else:
            assert isinstance(properties, dict)

        # Checks that the Stanford CoreNLP server is started.
        try:
            requests.get(self.server_url)
        except requests.exceptions.ConnectionError:
            raise Exception('Check whether you have started the CoreNLP server e.g.\n'
                            '$ cd stanford-corenlp-full-2015-12-09/ \n'
                            '$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer')

        response = requests.post(
            self.server_url,
            params={'properties': str(properties)},
            data=text.encode('utf-8'),
            headers={'Connection': 'close'},
        )
        output = response.text
        if properties.get('outputFormat') == 'json':
            output = self._decode_json(output)
        return output

    def tokensregex(self, text, pattern, filter):
        """Run :meth:`regex` against the ``/tokensregex`` endpoint."""
        return self.regex('/tokensregex', text, pattern, filter)

    def semgrex(self, text, pattern, filter):
        """Run :meth:`regex` against the ``/semgrex`` endpoint."""
        return self.regex('/semgrex', text, pattern, filter)

    def regex(self, endpoint, text, pattern, filter):
        """Send *text* to a pattern-matching endpoint of the server.

        Args:
            endpoint: Path appended to the server URL (e.g. ``/semgrex``).
            text: Request body. A ``str`` is encoded as UTF-8; anything
                else is passed through unchanged.
            pattern: Value of the ``pattern`` query parameter.
            filter: Value of the ``filter`` query parameter.

        Returns:
            The parsed JSON response, or the raw response text when the body
            is not valid JSON.
        """
        # Encode explicitly so non-Latin-1 text is sent as UTF-8, matching
        # what annotate() does, instead of failing in the transport layer.
        body = text.encode('utf-8') if isinstance(text, str) else text
        response = requests.get(
            self.server_url + endpoint,
            params={'pattern': pattern, 'filter': filter},
            data=body,
        )
        return self._decode_json(response.text)