In [1]:
import re
import json
import socket
from pathlib import Path
from CwnGraph import CwnBase
from tqdm.auto import tqdm

In [38]:
def build_xml(content):
    request_xml = """
<?xml version="1.0" ?>
<wordsegmentation version="0.1">
<option showcategory="2" />
<authentication username="seantyh" password="33669904" />
<text>{}</text>
</wordsegmentation>
    """
    return request_xml.format(content)

def parse_def(sock, text):
    try:
        request_xml = build_xml(text)
        sock.sendall(request_xml.encode("big5"))
    
        buf_size = 32
        data = b""
        while True:
            received = sock.recv(buf_size)                
            data += received    
            if data.endswith(b"\r\n"):
                break

    except Exception as ex:
        print("ERROR: ", ex)
        data = b""
    xml = data.decode("big5")
    sentences = re.findall("<sentence>(.*?)</sentence>", xml)
    return sentences, xml

def split_batch(batch, sep=None):
    if sep is None:
        sep = "S(NP(Head:N:你)|Head:Vi:好)#。(PERIODCATEGORY)"
    groups = []
    buf = []
    for x in batch:
        if sep in x:
            groups.append(buf)
            buf = []
        else:
            buf.append(x)
    if buf:
        groups.append(buf)
    return groups

In [39]:
## preparing definitions
cwn = CwnBase()
senses = cwn.get_all_senses()
all_defs = list(set(x.definition for x in senses if x.definition))

In [40]:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
    server_address = ('140.109.19.112', 8000)
    print('connecting to %s port %s' % server_address)
    sock.connect(server_address)
    
    batch_size = 1
    def_batch = all_defs[100:103]
    results, raw_resp = parse_def(sock, "\r\n你好。\r\n".join(def_batch))

connecting to 140.109.19.112 port 8000


In [41]:
results

['#1:1.[0] S(NP(head:NP(DET:三|Head:N:國)|Head:T:之)|Head:DET:一)#，(COMMACATEGORY)',
 '#2:1.[0] NP(Head:NP(N:西元|Head:N:221年)|PP(Head:P:至|NP(N:西元|Head:N:264年)))#，(COMMACATEGORY)',
 '#3:1.[0] S(NP(Head:N:劉備)|ADV:所|Head:Vt:建)#。(PERIODCATEGORY)',
 '#4:1.[0] S(NP(Head:N:你)|Head:Vi:好)#。(PERIODCATEGORY)',
 '#5:1.[0] VP(Head:Vt:瞞|ASP:著|NP(N:後述|Head:N:對象)|VP(Head:Vt:做|NP(Head:N:事)))#。(PERIODCATEGORY)',
 '#6:1.[0] S(NP(Head:N:你)|Head:Vi:好)#。(PERIODCATEGORY)',
 '#7:1.[0] S(NP(VP(Vi:囉囉嗦嗦|Head:Vt:說)|Head:M:個)|Head:Vi:不停)#。(PERIODCATEGORY)']

In [42]:
raw_resp

'<?xml version="1.0" ?><wordsegmentation version="0.1" charsetcode="big5" istag="0"><processstatus code="0">Success</processstatus><result><sentence>#1:1.[0] S(NP(head:NP(DET:三|Head:N:國)|Head:T:之)|Head:DET:一)#，(COMMACATEGORY)</sentence><sentence>#2:1.[0] NP(Head:NP(N:西元|Head:N:221年)|PP(Head:P:至|NP(N:西元|Head:N:264年)))#，(COMMACATEGORY)</sentence><sentence>#3:1.[0] S(NP(Head:N:劉備)|ADV:所|Head:Vt:建)#。(PERIODCATEGORY)</sentence><sentence>#4:1.[0] S(NP(Head:N:你)|Head:Vi:好)#。(PERIODCATEGORY)</sentence><sentence>#5:1.[0] VP(Head:Vt:瞞|ASP:著|NP(N:後述|Head:N:對象)|VP(Head:Vt:做|NP(Head:N:事)))#。(PERIODCATEGORY)</sentence><sentence>#6:1.[0] S(NP(Head:N:你)|Head:Vi:好)#。(PERIODCATEGORY)</sentence><sentence>#7:1.[0] S(NP(VP(Vi:囉囉嗦嗦|Head:Vt:說)|Head:M:個)|Head:Vi:不停)#。(PERIODCATEGORY)</sentence></result><ehownet word="" cate="" pos="" sense="" st="" class="" st2="" rule="" lc="" rc="" autotag=""/></wordsegmentation>\r\n'

In [32]:
def_batch = all_defs[100:103]

In [35]:
split_batch(results[:3])

[['#1:1.[0] S(NP(head:NP(DET:三|Head:N:國)|Head:T:之)|Head:DET:一)#，(COMMACATEGORY)',
  '#2:1.[0] NP(Head:NP(N:西元|Head:N:221年)|PP(Head:P:至|NP(N:西元|Head:N:264年)))#，(COMMACATEGORY)',
  '#3:1.[0] S(NP(Head:N:劉備)|ADV:所|Head:Vt:建)#。(PERIODCATEGORY)']]

In [37]:
"\u2027".encode("big5")

UnicodeEncodeError: 'big5' codec can't encode character '\u2027' in position 0: illegal multibyte sequence

In [33]:
res_batch = split_batch(results)

In [34]:
for def_x, res_x in zip(def_batch, res_batch):
    print(def_x, res_x)

三國之一，西元221年至西元264年，劉備所建。 ['#1:1.[0] S(NP(head:NP(DET:三|Head:N:國)|Head:T:之)|Head:DET:一)#，(COMMACATEGORY)', '#2:1.[0] NP(Head:NP(N:西元|Head:N:221年)|PP(Head:P:至|NP(N:西元|Head:N:264年)))#，(COMMACATEGORY)', '#3:1.[0] S(NP(Head:N:劉備)|ADV:所|Head:Vt:建)#。(PERIODCATEGORY)']
瞞著後述對象做事。 ['#5:1.[0] VP(Head:Vt:瞞|ASP:著|NP(N:後述|Head:N:對象)|VP(Head:Vt:做|NP(Head:N:事)))#。(PERIODCATEGORY)']
囉囉嗦嗦說個不停。 ['#7:1.[0] S(NP(VP(Vi:囉囉嗦嗦|Head:Vt:說)|Head:M:個)|Head:Vi:不停)#。(PERIODCATEGORY)']
