In [5]:
from nltk.parse import CoreNLPParser
from nltk.tree import ParentedTree

In [16]:
# Input corpus: read line by line by the processing loop and split on tabs.
input_file = "./raw_data/msr_paraphrase_train.txt"
# Destination for the extracted subject/predicate/object records (opened for writing).
output_file = "./trunk_data/spo_train_multiword.txt"
# Parser client pointed at a CoreNLP server on localhost:9000; the server
# presumably has to be running before any cell that parses a sentence.
parser = CoreNLPParser(url='http://localhost:9000')

In [10]:
def extract_subject(tree):
    """Collect the subject words found anywhere inside ``tree``.

    Args:
        tree: A parse (sub)tree exposing ``subtrees()``; each subtree exposes
            ``label()`` and indexing.

    Returns:
        A list with the first child of every subtree whose label starts with
        ``"NN"``, in traversal order. Empty when there is no such subtree.
    """
    return [
        node[0]
        for node in tree.subtrees()
        if node.label().startswith("NN")
    ]

def extract_predicate_object(tree):
    """Return the last verb and the last noun found inside ``tree``.

    Args:
        tree: A parse (sub)tree exposing ``subtrees()``; each subtree exposes
            ``label()`` and indexing.

    Returns:
        A ``(predicate, object)`` tuple. ``predicate`` is the first child of
        the last subtree whose label starts with ``"VB"`` and ``object`` the
        first child of the last subtree whose label starts with ``"NN"``.
        Either element is ``None`` when no matching subtree exists.
    """
    predicate = None
    # Initialised up front: previously a tree without any noun left this name
    # unbound and the return statement raised UnboundLocalError.
    obj = None
    for subtree in tree.subtrees():
        tag = subtree.label()
        if tag.startswith("VB"):
            predicate = subtree[0]
        if tag.startswith("NN"):
            obj = subtree[0]
    return predicate, obj

def extract_predicate(tree):
    """Collect the predicate words found anywhere inside ``tree``.

    Args:
        tree: A parse (sub)tree exposing ``subtrees()``; each subtree exposes
            ``label()`` and indexing.

    Returns:
        A list with the first child of every subtree whose label starts with
        ``"VB"``, in traversal order. Empty when there is no such subtree.
    """
    verb_nodes = (
        node for node in tree.subtrees() if node.label().startswith("VB")
    )
    return [node[0] for node in verb_nodes]

def extract_object(tree):
    """Collect the object words found anywhere inside ``tree``.

    Args:
        tree: A parse (sub)tree exposing ``subtrees()``; each subtree exposes
            ``label()`` and indexing.

    Returns:
        A list with the first child of every subtree whose label starts with
        ``"NN"``, in traversal order. Empty when there is no such subtree.
    """
    nouns = []
    for subtree in tree.subtrees():
        if not subtree.label().startswith("NN"):
            continue
        nouns.append(subtree[0])
    return nouns

def SPO_kernel(sentence):
    """Extract subject, predicate and object word lists from ``sentence``.

    The sentence is parsed with the module-level ``parser``. For every parse
    tree returned, the tallest subtree labelled ``"S"`` is used as the
    sentence root and its direct children are visited in order: each ``NP``
    child contributes one list of subject words, each ``VP`` child contributes
    one list of predicate words followed by one list of object words.

    Args:
        sentence: Raw sentence text to parse.

    Returns:
        A list of word lists in the order described above, or ``None`` as soon
        as a parse tree contains no subtree labelled ``"S"``.
    """
    spo = []
    for parse in parser.raw_parse(sentence):
        ptree = ParentedTree.convert(parse)

        # Choose the tallest "S" subtree; on a tie the first one in traversal
        # order is kept because the comparison is strict.
        sentence_root = None
        root_height = -1
        for candidate in ptree.subtrees(lambda t: t.label() == "S"):
            if candidate.height() > root_height:
                sentence_root = candidate
                root_height = candidate.height()

        if sentence_root is None:
            return None

        # Only the direct children of the sentence root are inspected, so they
        # are iterated directly instead of testing every descendant's parent.
        for child in sentence_root:
            if not isinstance(child, ParentedTree):
                continue  # a bare token has no label to inspect
            label = child.label()
            if label == "NP":
                spo.append(extract_subject(child))
            if label == "VP":
                spo.append(extract_predicate(child))
                spo.append(extract_object(child))
    return spo

def handle_line(line):
    """Turn one tab-separated input record into one output record.

    Args:
        line: A record with at least five tab-separated fields: quality label,
            first id, second id, first sentence, second sentence. A trailing
            line terminator is allowed and ignored.

    Returns:
        A newline-terminated string with the quality label and both ids copied
        through, followed by the ``SPO_kernel`` result for each sentence, all
        separated by tabs.

    Raises:
        IndexError: If ``line`` has fewer than five tab-separated fields.
    """
    # Strip the line terminator first; otherwise it stays attached to the last
    # field and is sent to the parser as part of the second sentence.
    tokens = line.rstrip("\r\n").split("\t")
    quality = tokens[0]
    id1 = tokens[1]
    id2 = tokens[2]
    sentence1 = tokens[3]
    sentence2 = tokens[4]
    spo1 = SPO_kernel(sentence1)
    spo2 = SPO_kernel(sentence2)
    return f"{quality}\t{id1}\t{id2}\t{spo1}\t{spo2}\n"

In [13]:
# Quick check of SPO_kernel on a single sample sentence; this call goes through
# `parser`, so it presumably needs the CoreNLP server to be reachable.
print(SPO_kernel("Amrozi accused his brother, whom he called \"the witness\", of deliberately distorting his evidence."))

[['Amrozi'], ['accused', 'called', 'distorting'], ['brother', 'witness', 'evidence']]


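Before running any cell that calls `parser`, start a Java server running the Stanford parser. Run the following command from the directory that contains the CoreNLP jar files; the port must match the URL given to `CoreNLPParser`:

```
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
-preload tokenize,ssplit,pos,lemma,ner,parse,depparse \
-status_port 9000 -port 9000 -timeout 15000
```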

In [11]:
# Convert every data row of the input corpus and write one output record per row.
with open(input_file, encoding="utf-8-sig") as in_file:
    # Write with an explicit encoding; without it the platform default is used,
    # which may not be able to represent text read from the UTF-8 input.
    with open(output_file, "w", encoding="utf-8") as out_file:
        for line in in_file:
            # Skip blank lines (they have no fields to split) and lines that
            # start with "Q" (presumably the column-header row).
            if not line.strip() or line.startswith("Q"):
                continue
            out_file.write(handle_line(line))