In [58]:
import os
import numpy as np
from tqdm import tqdm
from bisect import bisect_left, bisect_right
import spacy
import re

In [28]:
import requests

sentences = ['She hates him', 'The US fears Chinese politics', 'The doctor hates apples because an apple a day keeps the doctor away']

res = requests.post('http://127.0.0.1:6000/opinion/mine', json={'sentences': sentences})

In [29]:
res.json()["opinion"]

[{'opinion_frames': [{'expression': {'indices': [1], 'tokens': ['hates']},
    'holders': [{'indices': [0], 'tokens': ['She']}],
    'polarity': 'negative',
    'targets': [{'indices': [2], 'tokens': ['him']}]}],
  'sentence': 'She hates him',
  'tokenized': ['She', 'hates', 'him']},
 {'opinion_frames': [{'expression': {'indices': [2], 'tokens': ['fears']},
    'holders': [{'indices': [0, 1], 'tokens': ['The', 'US']}],
    'polarity': 'negative',
    'targets': [{'indices': [3, 4], 'tokens': ['Chinese', 'politics']}]}],
  'sentence': 'The US fears Chinese politics',
  'tokenized': ['The', 'US', 'fears', 'Chinese', 'politics']},
 {'opinion_frames': [{'expression': {'indices': [2], 'tokens': ['hates']},
    'holders': [{'indices': [0, 1], 'tokens': ['The', 'doctor']}],
    'polarity': 'negative',
    'targets': []}],
  'sentence': 'The doctor hates apples because an apple a day keeps the doctor away',
  'tokenized': ['The',
   'doctor',
   'hates',
   'apples',
   'because',
   'an',
   

In [18]:
completed_files = [str(i).zfill(3) for i in range(33)]

In [72]:
docs = []
opinion_tuples = []

for i in completed_files:
    ann_file = "../../data/80-bri-annotations/annotation_b00_{}.rsd.ann".format(i)
    txt_file = "../../data/80-bri-annotations/annotation_b00_{}.rsd.txt".format(i)

    doc = open(txt_file).read()
    ann_lines = open(ann_file).read().split("\n")

    spans = {}
    opinions = []

    for line in ann_lines:
        if line.startswith("T"):
            span_id, span_type, start, end = line.split()[:4]
            spans[span_id] = (span_type, int(start), int(end))
    
    for line in ann_lines:
        if line.startswith("E"):
            event_id = line.split()[0]
            event_type, event_head_span_id = line.split()[1].split(":")
            
            if event_type == "opinion":
                opinion_span = (spans[event_head_span_id][1], spans[event_head_span_id][2])
                opinion_text = doc[opinion_span[0]: opinion_span[1]]
                holder_spans, target_spans = [], []
                
                for value in line.split()[2:]:
                    role_type, span_id = value.split(":")
                    span = (spans[span_id][1], spans[span_id][2])
                    if role_type.startswith("HOLDER"):
                        holder_spans.append(span)
                    elif role_type.startswith("TARGET"):
                        target_spans.append(span)
                
                if len(holder_spans) > 0 and len(target_spans) > 0:
                    for holder_span in holder_spans:
                        holder_text = doc[holder_span[0]: holder_span[1]] if isinstance(holder_span, tuple) else holder_span
                        for target_span in target_spans:
                            target_text = doc[target_span[0]: target_span[1]] if isinstance(target_span, tuple) else target_span
                            opinion = dict(id=event_id, opinion=opinion_span, holder=holder_span, target=target_span, polarity=None, opinion_text=opinion_text, holder_text=holder_text, target_text=target_text)
                            opinions.append(opinion)
                elif len(holder_spans) == 0:
                    for target_span in target_spans:
                        target_text = doc[target_span[0]: target_span[1]] if isinstance(target_span, tuple) else target_span
                        opinion = dict(id=event_id, opinion=opinion_span, holder=None, target=target_span, polarity=None, opinion_text=opinion_text, holder_text=holder_text, target_text=target_text)
                        opinions.append(opinion)
                elif len(target_spans) == 0:
                    for holder_span in holder_spans:
                        holder_text = doc[holder_span[0]: holder_span[1]] if isinstance(holder_span, tuple) else holder_span
                        opinion = dict(id=event_id, opinion=opinion_span, holder=holder_span, target=None, polarity=None, opinion_text=opinion_text, holder_text=holder_text, target_text=target_text)
                        opinions.append(opinion)
    
    for line in ann_lines:
        if line.startswith("A"):
            values = line.split()
            opinion_id = values[2]
            for opinion in opinions:
                if opinion_id == opinion["id"]:
                    if values[1] == "AUTHOR":
                        opinion["holder"] = "AUTHOR"
                    elif values[1] == "SENTIMENT":
                        opinion["polarity"] = values[3]
    
    docs.append(doc)
    opinion_tuples.append(opinions)

In [73]:
for i, (doc, opinions) in enumerate(zip(docs, opinion_tuples)):
    print(f"doc {i}")
    for opinion in opinions:
        
        holder_text = "-"
        if opinion["holder"] is not None:
            if opinion["holder"] == "AUTHOR":
                holder_text = "WRITER"
            else:
                start, end = opinion["holder"]
                holder_text = doc[start: end]
        
        target_text = "-"
        if opinion["target"] is not None:
            start, end = opinion["target"]
            target_text = doc[start: end]
        
        polarity = "-"
        if opinion["polarity"] is not None:
            polarity = opinion["polarity"]

        start, end = opinion["opinion"]
        opinion_text = doc[start: end]
        
        print("{} --({}:{})--> {}".format(holder_text, opinion_text, polarity, target_text))
    print()

doc 0
scholars --(promote:Pos)--> BRI
Wenping --(sharing:Pos)--> China
Xu --(promote:Pos)--> China
he --(benefit:Pos)--> China
she --(result:Pos)--> Chinese
she --(result:Pos)--> companies
Wenping --(producer:Pos)--> China
researcher --(win-win:Pos)--> countries
fellows --(win-win:Pos)--> countries

doc 1
HE Wenping --(development:Pos)--> Chinese
Xi --(illuminating:Pos)--> BRI
Wenping --(effective:Pos)--> BRI
Wenping --(promote:Pos)--> BRI
WRITER --(positively:Pos)--> government

doc 2
he --(progress:Pos)--> China
Ngolle --(transforming:Pos)--> Chinese
Elvis Ngolle Ngolle --(boost:Pos)--> Chinese
Turay --(opportunities:Pos)--> China
he --(advantage:Pos)--> Chinese
Suliaman Turay --(eye-opener:Pos)--> China
Craig Allen --(miracle:Pos)--> China
Varaprasad Sekhar Dolla --(highly:Pos)--> China
scholar --(contribution:Pos)--> Chinese
Khairy Tourk --(look up to:Pos)--> China
he --(help:Pos)--> Chinese
Wang Yi --(welcomed:Pos)--> initiative
Keith Bennett --(greatest:Pos)--> BRI
he --(best:Pos

In [74]:
bri_docs = []

for doc, opinions in zip(docs, opinion_tuples):
    bri_doc = dict(doc=doc, opinions=opinions)
    
    sentences = []
    for sentence in doc.rstrip().split("\n"):
        sentences.append(sentence)
    
    sentence_starts = [0 for _ in range(len(sentences))]
    for i in range(len(sentences) - 1):
        sentence_starts[i + 1] = sentence_starts[i] + len(sentences[i]) + 1
    
    bri_doc["sentences"] = sentences
    bri_doc["sentence-starts"] = sentence_starts

    bri_docs.append(bri_doc)

In [75]:
for doc in bri_docs:
    for sentence, start in zip(doc["sentences"], doc["sentence-starts"]):
        if sentence != doc["doc"][start: start + len(sentence)]:
            print(sentence)
            print(doc["doc"][start: start + len(sentence)])
            print()

In [76]:
for doc in tqdm(bri_docs):
    res = requests.post('http://127.0.0.1:6000/opinion/mine', json={'sentences': doc["sentences"]})
    doc["pred"] = res.json()["opinion"]

100%|██████████| 33/33 [02:18<00:00,  4.19s/it]


In [34]:
def find_sentence_index(starts, span):
    for i in range(len(starts)):
        end = starts[i + 1] if i < len(starts) - 1 else 100000
        if starts[i] <= span[0] < span[1] < end:
            return i

In [77]:
n_opinions, n_sentence_opinions = 0, 0

for doc in bri_docs:
    starts = doc["sentence-starts"]
    for opinion in doc["opinions"]:
        sentence_indices = set()
        sentence_indices.add(find_sentence_index(starts, opinion["opinion"]))
        if opinion["target"] is not None:
            sentence_indices.add(find_sentence_index(starts, opinion["target"]))
        if isinstance(opinion["holder"], tuple):
            sentence_indices.add(find_sentence_index(starts, opinion["holder"]))
        if None not in sentence_indices and len(sentence_indices) == 1:
            n_sentence_opinions += 1
            opinion["sentence-index"] = list(sentence_indices)[0]
        n_opinions += 1

In [78]:
n_opinions, n_sentence_opinions

(201, 201)

In [47]:
nlp = spacy.load("en_core_web_sm")

In [82]:
new_bri_docs = []

def correct_span(start, doc, span, spacy_doc):
    i = 0
    length = len(re.sub("\s+", "", doc[span[0]: span[1]]))

    while i < len(spacy_doc):
        if span[0] - start == spacy_doc[i].idx:
            length2 = len(re.sub("\s+", "", spacy_doc[i].text))
            j = i + 1
            
            while j < len(spacy_doc) and length2 < length:
                length2 += len(re.sub("\s+", "", spacy_doc[j].text))
                j += 1
            
            if length == length2:
                return (i, j)
            
            i = j
        else:
            i += 1

for doc in tqdm(bri_docs):
    new_sentences = []

    for si, (start, sentence, pred_sentence) in enumerate(zip(doc["sentence-starts"], doc["sentences"], doc["pred"])):
        spacy_doc = nlp(sentence)
        spacy_tokens = [token.text for token in spacy_doc]
        pred_tokens = pred_sentence["tokenized"]
        
        pred_to_spacy = [-1 for _ in range(len(pred_tokens))]

        i, j = 0, 0
        while i < len(spacy_tokens) and j < len(pred_tokens):
            leni = len(spacy_tokens[i])
            lenj = len(pred_tokens[j])
            k = i + 1
            l = j + 1
            
            while k <= len(spacy_tokens) and l <= len(pred_tokens) and lenj != leni:
                if leni < lenj:
                    leni += len(spacy_tokens[k])
                    k += 1
                else:
                    lenj += len(pred_tokens[l])
                    l += 1
            
            if leni == lenj and "".join(spacy_tokens[i:k]) == "".join(pred_tokens[j:l]):
                pred_to_spacy[j] = i
                pred_to_spacy[l - 1] = k - 1
                i = k
                j = l
            else:
                break
        
        new_opinions = []
        old_opinions = []

        for opinion in doc["opinions"]:
            if opinion["sentence-index"] == si:
                opinion_expression = correct_span(start, doc["doc"], opinion["opinion"], spacy_doc)
                holder = correct_span(start, doc["doc"], opinion["holder"], spacy_doc) if isinstance(opinion["holder"], tuple) else opinion["holder"]
                target = correct_span(start, doc["doc"], opinion["target"], spacy_doc) if isinstance(opinion["target"], tuple) else opinion["target"]
                new_opinions.append(dict(opinion=opinion_expression, holder=holder, target=target, polarity=opinion["polarity"]))
                old_opinions.append(opinion)
        
        new_sentences.append(dict(text=sentence, tokens=spacy_tokens, gold=new_opinions, pred=pred_sentence["opinion_frames"], pred_to_spacy=pred_to_spacy, old_gold=old_opinions, start=start))
    
    new_bri_docs.append(new_sentences)

100%|██████████| 33/33 [00:05<00:00,  6.01it/s]


In [83]:
for sentence in new_bri_docs[0]:
    if len(sentence["old_gold"]) > 0:
        print(sentence["start"])
        print(sentence["text"])
        print(sentence["tokens"])
        print(sentence["old_gold"])
        print(sentence["gold"])
        print(sentence["pred"])
        print()

0
Brushing aside allegations that China’s Belt & Road Initiatives (BRI) was aimed at colonising smaller countries, senior Chinese scholars said that the BRI rather seeks to promote free trade through enhanced connectivity network and send its economic success to other countries.
['Brushing', 'aside', 'allegations', 'that', 'China', '’s', 'Belt', '&', 'Road', 'Initiatives', '(', 'BRI', ')', 'was', 'aimed', 'at', 'colonising', 'smaller', 'countries', ',', 'senior', 'Chinese', 'scholars', 'said', 'that', 'the', 'BRI', 'rather', 'seeks', 'to', 'promote', 'free', 'trade', 'through', 'enhanced', 'connectivity', 'network', 'and', 'send', 'its', 'economic', 'success', 'to', 'other', 'countries', '.']
[{'id': 'E1', 'opinion': (171, 178), 'holder': (128, 136), 'target': (151, 154), 'polarity': 'Pos', 'opinion_text': 'promote', 'holder_text': 'scholars', 'target_text': 'BRI', 'sentence-index': 0}]
[{'opinion': (30, 31), 'holder': (22, 23), 'target': (26, 27), 'polarity': 'Pos'}]
[{'expression': {

In [84]:
n_old, n_new = 0, 0

for doc in new_bri_docs:
    for sentence in doc:
        for old in sentence["old_gold"]:
            n_old += isinstance(old["opinion"], tuple) + isinstance(old["holder"], tuple) + isinstance(old["target"], tuple)
        for new in sentence["gold"]:
            n_new += isinstance(new["opinion"], tuple) + isinstance(new["holder"], tuple) + isinstance(new["target"], tuple)

In [85]:
n_old, n_new

(581, 579)

In [90]:
for doc in new_bri_docs:
    for sentence in doc:
        pred_to_spacy = sentence["pred_to_spacy"]
        new_pred = []

        for frame in sentence["pred"]:
            opinion = (pred_to_spacy[frame["expression"]["indices"][0]], pred_to_spacy[frame["expression"]["indices"][-1]] + 1)
            holders = [(pred_to_spacy[frame["holders"][i]["indices"][0]], pred_to_spacy[frame["holders"][i]["indices"][-1]] + 1) for i in range(len(frame["holders"]))]
            targets = [(pred_to_spacy[frame["targets"][i]["indices"][0]], pred_to_spacy[frame["targets"][i]["indices"][-1]] + 1) for i in range(len(frame["targets"]))]
            polarity = frame["polarity"]

            if "-1" not in opinion:
                holders = [holder for holder in holders if -1 not in holder]
                targets = [target for target in targets if -1 not in target]

                if len(holders) > 0 and len(targets) > 0:
                    for holder in holders:
                        for target in targets:
                            pred = dict(opinion=opinion, holder=holder, target=target, polarity=polarity)
                            new_pred.append(pred)
                
                elif len(holders) == 0:
                    for target in targets:
                        pred = dict(opinion=opinion, holder=None, target=target, polarity=polarity)
                        new_pred.append(pred)
                
                elif len(targets) == 0:
                    for holder in holders:
                        pred = dict(opinion=opinion, holder=holder, target=target, polarity=polarity)
                        new_pred.append(pred)
        
        sentence["new_pred"] = new_pred

In [91]:
for sentence in new_bri_docs[0]:
    if len(sentence["old_gold"]) > 0:
        print(sentence["start"])
        print(sentence["text"])
        print(sentence["tokens"])
        print(sentence["gold"])
        print(sentence["new_pred"])
        print()

0
Brushing aside allegations that China’s Belt & Road Initiatives (BRI) was aimed at colonising smaller countries, senior Chinese scholars said that the BRI rather seeks to promote free trade through enhanced connectivity network and send its economic success to other countries.
['Brushing', 'aside', 'allegations', 'that', 'China', '’s', 'Belt', '&', 'Road', 'Initiatives', '(', 'BRI', ')', 'was', 'aimed', 'at', 'colonising', 'smaller', 'countries', ',', 'senior', 'Chinese', 'scholars', 'said', 'that', 'the', 'BRI', 'rather', 'seeks', 'to', 'promote', 'free', 'trade', 'through', 'enhanced', 'connectivity', 'network', 'and', 'send', 'its', 'economic', 'success', 'to', 'other', 'countries', '.']
[{'opinion': (30, 31), 'holder': (22, 23), 'target': (26, 27), 'polarity': 'Pos'}]
[{'opinion': (2, 3), 'holder': None, 'target': (4, 7), 'polarity': 'negative'}, {'opinion': (23, 24), 'holder': (20, 23), 'target': (4, 7), 'polarity': 'positive'}, {'opinion': (28, 29), 'holder': (25, 27), 'target'

In [92]:
n_pred_tuples_overlap_sentiment = 0
n_gold_tuples_overlap_sentiment = 0
n_pred_tuples_overlap = 0
n_gold_tuples_overlap = 0
n_gold_tuples = 0
n_pred_tuples = 0

def is_span_overlap(span_x, span_y):
    return span_x[0] < span_y[1] and span_y[0] < span_x[1]

def is_overlap_tuple(tuple_x, tuple_y, attitude=False):
    holder_overlap = False
    if isinstance(tuple_x["holder"], tuple) and isinstance(tuple_y["holder"], tuple):
        holder_overlap = is_span_overlap(tuple_x["holder"], tuple_y["holder"])
    if isinstance(tuple_x["holder"], str) and isinstance(tuple_y["holder"], str):
        holder_overlap = tuple_x["holder"] == tuple_y["holder"]
    
    target_overlap = False
    if isinstance(tuple_x["target"], tuple) and isinstance(tuple_y["target"], tuple):
        target_overlap = is_span_overlap(tuple_x["target"], tuple_y["target"])
    
    attitude_match = True
    if attitude:
        polarity_x = tuple_x["polarity"][:3].lower()
        polarity_y = tuple_y["polarity"][:3].lower()
        attitude_match = polarity_x == polarity_y

    return holder_overlap and target_overlap and attitude_match

for doc in new_bri_docs:
    for sentence in doc:
        
        for gold_tuple in sentence["gold"]:
            for pred_tuple in sentence["new_pred"]:
                if is_overlap_tuple(gold_tuple, pred_tuple):
                    n_gold_tuples_overlap += 1
                    break
            for pred_tuple in sentence["new_pred"]:
                if is_overlap_tuple(gold_tuple, pred_tuple, attitude=True):
                    n_gold_tuples_overlap_sentiment += 1
                    break
        
        for pred_tuple in sentence["new_pred"]:
            for gold_tuple in sentence["gold"]:
                if is_overlap_tuple(pred_tuple, gold_tuple):
                    n_pred_tuples_overlap += 1
                    break
            for gold_tuple in sentence["gold"]:
                if is_overlap_tuple(pred_tuple, gold_tuple, attitude=True):
                    n_pred_tuples_overlap_sentiment += 1
                    break
        
        n_gold_tuples += len(sentence["gold"])
        n_pred_tuples += len(sentence["new_pred"])

In [97]:
precision_without_sentiment = (100 * n_pred_tuples_overlap)/n_pred_tuples
recall_without_sentiment = (100 * n_gold_tuples_overlap)/n_gold_tuples
f1_without_sentiment = (2 * precision_without_sentiment * recall_without_sentiment)/(precision_without_sentiment + recall_without_sentiment)

precision_with_sentiment = (100 * n_pred_tuples_overlap_sentiment)/n_pred_tuples
recall_with_sentiment = (100 * n_gold_tuples_overlap_sentiment)/n_gold_tuples
f1_with_sentiment = (2 * precision_with_sentiment * recall_with_sentiment)/(precision_with_sentiment + recall_with_sentiment)

print("binary (without sentiment): precision = {:5.1f}, recall = {:5.1f}, F1 = {:5.1f}".format(precision_without_sentiment, recall_without_sentiment, f1_without_sentiment))
print("binary (   with sentiment): precision = {:5.1f}, recall = {:5.1f}, F1 = {:5.1f}".format(precision_with_sentiment, recall_with_sentiment, f1_with_sentiment))

binary (without sentiment): precision =   7.3, recall =  42.8, F1 =  12.5
binary (   with sentiment): precision =   6.6, recall =  39.3, F1 =  11.3
