In [15]:
import torch
from torch import nn
from data import OpinionDataset
import config
import pandas as pd
import os
import json
from models.OpinionMinerV2 import OpinionMiner
from collections import Counter, defaultdict
from evaluate import OpinionPerformance
import spacy
import re

In [2]:
# Read the list of MPQA3 document ids (one id per line in the "doclist" file).
# A context manager is used so the file handle is closed deterministically.
with open(os.path.join(config.MPQA3_FOLDER, "doclist")) as doclist_file:
    mpqa3_doc_ids = doclist_file.read().strip().split("\n")

In [19]:
# Load the tokenized JSON of every listed document, in doclist order.
mpqa3_docs = []

for doc_id in mpqa3_doc_ids:
    tokenized_path = os.path.join(config.MPQA3_PROCESSED_FOLDER, doc_id, "tokenized.json")
    # Close each file as soon as it is parsed instead of leaking one handle per document.
    with open(tokenized_path) as tokenized_file:
        doc = json.load(tokenized_file)
    mpqa3_docs.append(doc)

In [4]:
def is_span_overlap(span_x, span_y):
    """Tell whether two inclusive spans share at least one position.

    Each span is an indexable pair whose element 0 is the start and element 1
    is the (inclusive) end.

    Returns:
        True when the spans overlap or touch at an endpoint, False otherwise.
    """
    # span_x begins after span_y has already ended: no overlap possible.
    if not span_x[0] <= span_y[1]:
        return False
    # Otherwise they overlap exactly when span_y begins no later than span_x ends.
    return span_y[0] <= span_x[1]

In [5]:
# Load the small English spaCy pipeline once; later cells use it for
# tokenization, named entities, noun chunks and part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

In [20]:
# Remaining budget of tokenization mismatches to print; it only limits the
# printed output, never the parsing.
n = 100

for doc in mpqa3_docs:
    for sentence in doc:

        # Collapse every whitespace run to a single space before parsing.
        # The pattern is a raw string so that "\s" is not treated as an
        # (invalid) string escape sequence.
        spacy_doc = nlp(re.sub(r"\s+", " ", sentence["text"]))
        spacy_tokens = [token.text for token in spacy_doc]

        # Report sentences where spaCy's tokens differ from the stored tokens,
        # until the print budget is used up.
        if n > 0 and spacy_tokens != sentence["tokens"]:
            print("spacy:  {}".format(spacy_tokens))
            print("tokens: {}".format(sentence["tokens"]))
            print()
            n -= 1

        # Always attach the parse: later cells read sentence["spacy_doc"] for
        # every document, so stopping early here would leave sentences without
        # that key and make those cells fail with a KeyError.
        sentence["spacy_doc"] = spacy_doc

In [21]:
# Collect the token length of every entity / event target of a "dse" annotation.
entity_sizes, event_sizes = [], []

# Route each recognised target type to the list that gathers its sizes;
# any other target type is ignored.
sizes_by_target_type = {"entity": entity_sizes, "event": event_sizes}

for doc in mpqa3_docs:
    for sentence in doc:
        for ex in sentence["dse"]:
            sizes = sizes_by_target_type.get(ex["target-type"])
            if sizes is None:
                continue
            # Target bounds are inclusive, hence the +1.
            first, last = ex["target"][0], ex["target"][1]
            sizes.append(last - first + 1)

In [22]:
# Show the distribution of target lengths: entities first, then events.
for target_sizes in (entity_sizes, event_sizes):
    print(Counter(target_sizes))

Counter({1: 1812, 3: 6, 2: 2})
Counter({1: 982, 3: 2})


In [23]:
# For the first 100 documents, print each sentence followed by spaCy's named
# entities and the annotated entity / event targets, for manual comparison.
for doc in mpqa3_docs[:100]:
    for sentence in doc:

        # Surface text of every named entity spaCy found in this sentence.
        spacy_entities = [span.text for span in sentence["spacy_doc"].ents]

        # Surface text of each annotated target, grouped by target type.
        target_entities, target_events = [], []
        for ex in sentence["dse"]:
            first, last = ex["target"][0], ex["target"][1]
            # Target bounds are inclusive token indices.
            target = " ".join(sentence["tokens"][first: last + 1])
            kind = ex["target-type"]
            if kind == "entity":
                target_entities.append(target)
            elif kind == "event":
                target_events.append(target)

        print(sentence["text"])

        # The loop variables below are deliberately named `ent` / `event`:
        # a later cell inspects the value left in `ent` after this cell runs.
        print("spacy entities:")
        for ent in spacy_entities:
            print("\t{}".format(ent))

        print("target entities:")
        for ent in target_entities:
            print("\t{}".format(ent))

        print("target events:")
        for event in target_events:
            print("\t{}".format(event))

        print()

["Opinion" U.S. Human Rights Claims Only Empty Rhetoric]
spacy entities:
	Opinion" U.S. Human Rights Claims Only Empty Rhetoric
target entities:
target events:
	Claims

The U.S. State Department on Monday published its annual report on the status of human rights in other countries in the year 2001.
spacy entities:
	The U.S. State Department
	Monday
	annual
	the year 2001
target entities:
target events:

In this report, when referring to Iran, the United States repeated its allegations against the Islamic Republic but failed to provide any evidence in support of its baseless charges.
spacy entities:
	Iran
	the United States
	the Islamic Republic
target entities:
	charges
	States
	charges
	charges
	Iran
target events:
	failed

Among the unfounded allegations was the claim that the Islamic Republic enjoys no social base and is an unpopular system because of its human rights violations.
spacy entities:
	the Islamic Republic
target entities:
	claim
	allegations
	allegations
target events:



In [24]:
# Scratch check: show whatever value the printing loops of the preceding cell
# left bound to `ent` (relies on leftover kernel state).
ent

'Rice'

In [25]:
# Scratch check: the leftover `ent` comes from loops over entity texts,
# so it is a plain string rather than a spaCy object.
type(ent)

str

In [26]:
# Small example sentence for exploring spaCy's entity and noun-chunk attributes.
text = "He lives in New York by the riverside."

In [49]:
# Parse the example sentence.
# NOTE: this rebinds `doc`, which other cells also use as the loop variable
# over corpus documents.
doc = nlp(text)

In [29]:
# Print the named entities spaCy finds in the example sentence.
# After the loop, `ent` stays bound to the last entity; the next cells inspect it.
for ent in doc.ents:
    print(ent)

New York


In [30]:
# Token index of the entity's first token within the parsed sentence.
ent.start

3

In [31]:
# `end` is exclusive: it is one past the index of the entity's last token.
ent.end

5

In [39]:
# Print the token index of each noun chunk's root (head) token.
for nc in doc.noun_chunks:
    print(nc.root.i)

0
4
7


In [52]:
# Measures how well spaCy-derived candidates (named entities, noun-chunk heads,
# verbs) recover the annotated entity / event targets of "dse" annotations.
# All spans are inclusive (start, end) token-index tuples.
#
# NOTE(review): spaCy token indices are compared directly with the dataset's
# target indices, which assumes both tokenizations agree for the sentence;
# the mismatch-check cell prints sentences where they do not.


def _span_within(inner, outer, margin=0):
    """Return True if inclusive span `inner` lies inside `outer` widened by `margin` tokens per side."""
    return outer[0] - margin <= inner[0] <= inner[1] <= outer[1] + margin


def _within_any_span(candidate, spans, margin=0):
    """Return True if `candidate` lies inside at least one of `spans` (each widened by `margin`)."""
    return any(_span_within(candidate, span, margin) for span in spans)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


n_target_entities, n_target_events = 0, 0
n_target_entities_outside_target_span, n_target_events_outside_target_span = 0, 0

# Number of tokens by which a target span is widened on each side for the "+ d" variants.
d = 2

n_target_entities_covered_by_spacy_entities, n_within_span_target_entities_covered_by_spacy_entities, n_spacy_entities = 0, 0, 0
n_target_entities_covered_by_spacy_noun_chunk_heads, n_within_span_target_entities_covered_by_spacy_noun_chunk_heads, n_spacy_noun_chunk_heads = 0, 0, 0
n_target_entities_covered_by_d_spacy_noun_chunk_heads, n_d_spacy_noun_chunk_heads = 0, 0

n_within_span_target_events_covered_by_spacy_verb, n_spacy_verbs = 0, 0
n_target_events_covered_by_d_spacy_verbs, n_d_spacy_verbs = 0, 0

for doc in mpqa3_docs:
    for sentence in doc:

        # Gold annotations, de-duplicated per sentence and grouped by target type.
        target_spans, target_entities, target_events = set(), set(), set()
        for ex in sentence["dse"]:
            target = tuple(ex["target"])
            if ex["target-type"] == "span":
                target_spans.add(target)
            elif ex["target-type"] == "entity":
                target_entities.add(target)
            elif ex["target-type"] == "event":
                # Explicit check (rather than a catch-all else) so that an
                # unexpected target type is not silently counted as an event;
                # this matches how the other cells classify targets.
                target_events.add(target)

        # Gold entities / events that are not contained in any gold target span.
        target_entities_outside_target_span = {
            entity for entity in target_entities if not _within_any_span(entity, target_spans)
        }
        target_events_outside_target_span = {
            event for event in target_events if not _within_any_span(event, target_spans)
        }
        within_span_target_entities = target_entities - target_entities_outside_target_span
        within_span_target_events = target_events - target_events_outside_target_span

        spacy_doc = sentence["spacy_doc"]

        # spaCy named entities fully contained in some target span
        # (ent.end is exclusive, hence the -1 to get an inclusive span).
        entity_candidates = {(ent.start, ent.end - 1) for ent in spacy_doc.ents}
        spacy_entities = {
            candidate for candidate in entity_candidates if _within_any_span(candidate, target_spans)
        }

        # Noun-chunk head tokens inside a target span, and inside a span widened by d.
        head_candidates = {(chunk.root.i, chunk.root.i) for chunk in spacy_doc.noun_chunks}
        spacy_noun_chunk_heads = {
            head for head in head_candidates if _within_any_span(head, target_spans)
        }
        d_spacy_noun_chunk_heads = {
            head for head in head_candidates if _within_any_span(head, target_spans, d)
        }

        # Verb tokens inside a target span, and inside a span widened by d.
        verb_candidates = {(token.i, token.i) for token in spacy_doc if token.pos_ == "VERB"}
        spacy_verbs = {
            verb for verb in verb_candidates if _within_any_span(verb, target_spans)
        }
        d_spacy_verbs = {
            verb for verb in verb_candidates if _within_any_span(verb, target_spans, d)
        }

        # Totals of gold items and of candidates.
        n_target_entities += len(target_entities)
        n_target_events += len(target_events)
        n_spacy_entities += len(spacy_entities)
        n_spacy_noun_chunk_heads += len(spacy_noun_chunk_heads)
        n_spacy_verbs += len(spacy_verbs)
        n_d_spacy_noun_chunk_heads += len(d_spacy_noun_chunk_heads)
        n_d_spacy_verbs += len(d_spacy_verbs)

        n_target_entities_outside_target_span += len(target_entities_outside_target_span)
        n_target_events_outside_target_span += len(target_events_outside_target_span)

        # Exact-match hits between gold targets and candidates.
        n_target_entities_covered_by_spacy_entities += len(target_entities & spacy_entities)
        n_target_entities_covered_by_spacy_noun_chunk_heads += len(target_entities & spacy_noun_chunk_heads)

        n_within_span_target_entities_covered_by_spacy_entities += len(within_span_target_entities & spacy_entities)
        n_within_span_target_entities_covered_by_spacy_noun_chunk_heads += len(within_span_target_entities & spacy_noun_chunk_heads)

        n_within_span_target_events_covered_by_spacy_verb += len(within_span_target_events & spacy_verbs)

        n_target_entities_covered_by_d_spacy_noun_chunk_heads += len(target_entities & d_spacy_noun_chunk_heads)
        n_target_events_covered_by_d_spacy_verbs += len(target_events & d_spacy_verbs)

# Gold items that can be matched by a within-span candidate at all.
n_within_span_target_entities = n_target_entities - n_target_entities_outside_target_span
n_within_span_target_events = n_target_events - n_target_events_outside_target_span

# Precision = hits / candidates, recall = hits / gold targets.
# Ratios with an empty denominator print as "nan" instead of raising ZeroDivisionError.
print("{:.2f} target entities lie outside target span".format(
    _safe_ratio(n_target_entities_outside_target_span, n_target_entities)))
print("{:.2f} target events   lie outside target span".format(
    _safe_ratio(n_target_events_outside_target_span, n_target_events)))
print("ner:                       precision = {:.2f}, recall = {:.2f}".format(
    _safe_ratio(n_target_entities_covered_by_spacy_entities, n_spacy_entities),
    _safe_ratio(n_target_entities_covered_by_spacy_entities, n_target_entities)))
print("ner (within span):         precision = {:.2f}, recall = {:.2f}".format(
    _safe_ratio(n_within_span_target_entities_covered_by_spacy_entities, n_spacy_entities),
    _safe_ratio(n_within_span_target_entities_covered_by_spacy_entities, n_within_span_target_entities)))
print("noun chunk:                precision = {:.2f}, recall = {:.2f}".format(
    _safe_ratio(n_target_entities_covered_by_spacy_noun_chunk_heads, n_spacy_noun_chunk_heads),
    _safe_ratio(n_target_entities_covered_by_spacy_noun_chunk_heads, n_target_entities)))
print("noun chunk (within span):  precision = {:.2f}, recall = {:.2f}".format(
    _safe_ratio(n_within_span_target_entities_covered_by_spacy_noun_chunk_heads, n_spacy_noun_chunk_heads),
    _safe_ratio(n_within_span_target_entities_covered_by_spacy_noun_chunk_heads, n_within_span_target_entities)))
print("noun chunk + d = {}:        precision = {:.2f}, recall = {:.2f}".format(
    d,
    _safe_ratio(n_target_entities_covered_by_d_spacy_noun_chunk_heads, n_d_spacy_noun_chunk_heads),
    _safe_ratio(n_target_entities_covered_by_d_spacy_noun_chunk_heads, n_target_entities)))
print("spacy verb (within span):  precision = {:.2f}, recall = {:.2f}".format(
    _safe_ratio(n_within_span_target_events_covered_by_spacy_verb, n_spacy_verbs),
    _safe_ratio(n_within_span_target_events_covered_by_spacy_verb, n_within_span_target_events)))
print("spacy verb + d = {}:        precision = {:.2f}, recall = {:.2f}".format(
    d,
    _safe_ratio(n_target_events_covered_by_d_spacy_verbs, n_d_spacy_verbs),
    _safe_ratio(n_target_events_covered_by_d_spacy_verbs, n_target_events)))

0.30 target entities lie outside target span
0.55 target events   lie outside target span
ner:                       precision = 0.20, recall = 0.12
ner (within span):         precision = 0.20, recall = 0.17
noun chunk:                precision = 0.35, recall = 0.63
noun chunk (within span):  precision = 0.35, recall = 0.89
noun chunk + d = 2:        precision = 0.31, recall = 0.68
spacy verb (within span):  precision = 0.37, recall = 0.66
spacy verb + d = 2:        precision = 0.24, recall = 0.49
