In [29]:
from pprint import pprint

def print_sentence_tags(tagged_sentence, tag_filter):
    for i, (wd,tags) in enumerate(tagged_sentence):
        tags = tags.intersection(tag_filter)
        stags = ""
        if tags:
            stags = ",".join(tags)
        print(str(i).ljust(3),wd.ljust(30), stags)

In [2]:
class Stack(object):
    def __init__(self, verbose=False):    
        self.stack = []
        self.verbose = verbose
    
    def tos(self):
        if self.len() == 0:
            return None
        #assert self.len() > 0, "Can't peek when stack is empty"
        return self.stack[-1]
    
    def pop(self):
        assert self.len() > 0, "Can't pop when stack is empty"
        item = self.stack.pop()
        if self.verbose:
            print("POPPING: %s" % item)
            print("LEN:     %i" % len(self.stack))
        return item
    
    def push(self, item):
        self.stack.append(item)
        if self.verbose:
            print("PUSHING: %s" % item)
            print("LEN:     %i" % len(self.stack))
    
    def len(self):
        return len(self.stack)

    def contains(self, item):
        return item in self.stack
    
    def __repr__(self):
        return "|".join(self.stack)

In [3]:
ROOT = "root"

def norm_arc(arc):
    return tuple(sorted(arc))

def norm_arcs(arcs):
    return set(map(norm_arc, arcs))

class Parser(object):
    def __init__(self, stack):
        self.stack = stack
        self.arcs = []
        self.normed_arcs = set()
        # nodes with heads
        self.children = set()
        self.actions = []

    def get_dependencies(self):
        return [(l, r) for (l, r) in self.arcs if r != ROOT and l != ROOT]

    def left_arc(self, buffer):
        tos = self.stack.pop()
        # Pre-condition
        # assert self.has_head(tos) == False
        arc = (tos, buffer)
        n_arc = norm_arc(arc)
        assert n_arc not in self.normed_arcs, "Arc already processed %s" % str(n_arc)
        self.arcs.append(arc)
        self.normed_arcs.add(arc)
        self.children.add(tos)
        self.actions.append("L ARC   : " + tos + "->" + buffer)

    def right_arc(self, buffer):
        tos = self.stack.tos()
        # normalize arc
        arc = (buffer, tos)
        n_arc = norm_arc(arc)
        assert n_arc not in self.normed_arcs, "Arc already processed %s" % str(n_arc)
        self.arcs.append(arc)
        self.normed_arcs.add(n_arc)
        self.actions.append("R ARC   : " + tos + "<-" + buffer)
        self.children.add(buffer)
        self.stack.push(buffer)

    def reduce(self):
        tos = self.stack.pop()
        # assert self.has_head(tos) == True
        self.actions.append("REDUCE  : Pop  %s" % tos)

    def shift(self, buffer):
        self.stack.push(buffer)
        self.actions.append("SHIFT   : Push %s" % buffer)

    def skip(self, buffer):
        self.actions.append("SKIP    : item %s" % buffer)

    def has_head(self, item):
        return item in self.children

    def in_stack(self, item):
        return self.stack.contains(item)

    def clone(self):
        cloney = Parser(self.stack.clone())
        cloney.arcs = list(self.arcs)
        cloney.normed_arcs = set(self.normed_arcs)
        # nodes with heads
        cloney.children = set(self.children)
        cloney.actions = list(self.actions)
        return cloney

In [4]:
from collections import defaultdict

SHIFT = "Shift"
REDUCE = "Reduce"
LARC = "LArc"
RARC = "Rarc"
SKIP = "Skip"

class Oracle(object):
    def __init__(self, crels, parser):
        self.parser = parser
        self.raw_crels = crels
        self.crels = norm_arcs(crels)  # type: Set[Tuple[str,str]]
        self.mapping = self.build_mappings(crels)

    def build_mappings(self, pairs):
        mapping = defaultdict(set)
        for c, res in pairs:
            mapping[c].add(res)
            mapping[res].add(c)
        return mapping

    def should_continue(self, action):
        # continue parsing if REDUCE or LARC
        return action in (REDUCE, LARC)

    def remove_relation(self, a, b):
        # as we can force it to execute actions that are invalid, we have to see if this is a valid relation to remove
        if a in self.mapping and b in self.mapping[a]:
            self.mapping[a].remove(b)
            if len(self.mapping[a]) == 0:
                del self.mapping[a]
            self.mapping[b].remove(a)
            if len(self.mapping[b]) == 0:
                del self.mapping[b]

    def consult(self, tos, buffer):
        """
        Performs optimal decision for parser
        If true, continue processing, else Consume Buffer
        """
        parser = self.parser
        a, b = norm_arc((tos, buffer))
        if (a, b) in self.crels:
            # TOS has arcs remaining? If so, we need RARC, else LARC
            if len(self.mapping[tos]) == 1:
                return LARC
            else:
                return RARC
        else:
            if buffer not in self.mapping:
                return SKIP
            # If the buffer has relations further down in the stack, we need to POP the TOS
            for item in self.mapping[buffer]:
                if item == tos:
                    continue
                if parser.in_stack(item):
                    return REDUCE
            # end for
            # ELSE
            return SHIFT

    def execute(self, action, tos, buffer):
        """
        Performs optimal decision for parser
        If true, continue processing, else Consume Buffer
        """
        parser = self.parser
        if action == LARC:
            parser.left_arc(buffer)
            self.remove_relation(tos, buffer)
        elif action == RARC:
            parser.right_arc(buffer)
            self.remove_relation(tos, buffer)
        elif action == REDUCE:
            parser.reduce()
        elif action == SHIFT:
            parser.shift(buffer)
        elif action == SKIP:
            parser.skip(buffer)
        else:
            raise Exception("Unknown parsing action %s" % action)
        return self.should_continue(action)

    def tos(self):
        return self.parser.stack.tos()

    def is_stack_empty(self):
        return self.parser.stack.len() == 0

    def clone(self):
        cloney = Oracle(set(self.raw_crels), self.parser.clone())
        # Need to ensure a deep clone of the mappings dict
        cloney.mapping = defaultdict(set)
        for key, set_vals in self.mapping.items():
            cloney.mapping[key].update(set_vals)
        return cloney

In [5]:
def test_oracle(codes, crels, orcl_fact, verbose=False):
    
    crels = set(crels)
    if verbose:
        prn_fun = lambda s="": print(s)
    else:
        prn_fun = lambda s="": None
    
    stack = Stack(False)
    stack.push(ROOT)
    parser = Parser(stack)
    oracle = orcl_fact(crels, parser)

    prn_fun("DEPS")
    for crel in sorted(crels):
        prn_fun("\t" + str(crel))
    prn_fun()

    PAD = 20
    LINE = PAD + len(ROOT) + 2 * len(codes) + 1

    for buffer in codes:
        prn_fun("-" * LINE)
        prn_fun(buffer)
        prn_fun("-" * LINE)

        while True:
            tos = stack.tos()
            action = oracle.consult(tos, buffer)
            if not oracle.execute(action, tos, buffer):
                prn_fun(parser.actions[-1].ljust(PAD) + " || STACK : " + str(stack))
                break

            prn_fun(parser.actions[-1].ljust(PAD) + " || STACK : " + str(stack))
            if stack.len() == 0:
                prn_fun("Empty stack, stopping")
                break

    prn_fun()
    prn_fun("*" * LINE)
    prn_fun("Stack")
    prn_fun("\t" + str(stack))
    deps = parser.get_dependencies()
    prn_fun("DEPS Actual")
    for crel in sorted(crels):
        prn_fun("\t" + str(crel))
    prn_fun("DEPS Pred")
    for dep in sorted(deps):
        prn_fun("\t" + str(dep))
    prn_fun("Actions")
    for a in parser.actions:
        prn_fun("\t" + a)
    prn_fun()
    prn_fun("Ordered Match?    " + str(set(deps) == crels))

    ndeps = norm_arcs(deps)
    ncrels = norm_arcs(crels)
    diff = (ndeps - ncrels).union(ncrels - ndeps)
    success = (len(diff) == 0)
    prn_fun("Un Ordered Match? " + str(success))
    if diff:
        prn_fun(diff)
    return success

In [6]:
test_pairs = []

test_pairs.append([
    ("A","B"),
])
test_pairs.append([
    ("A","B"),
    ("B","C"),
])
#C->B->A
test_pairs.append([
    ("C","B"),
    ("B","A"),
])
test_pairs.append([
    ("A","C"),
    ("B","C"),
])
test_pairs.append([
    ("A","B"),
    ("C","B"),
])
test_pairs.append([
    ("B","A"),
    ("B","C"),
])
test_pairs.append([
    ("A","C"),
    ("C","B"),
])

# Hard - has to flip relation
test_pairs.append([
    ("A","D"),
    ("D","B"),
    ("B","C"),
])
test_pairs.append([
    ("D","A"),
    ("D","B"),
    ("B","C"),
])
test_pairs.append([
    ("D","A"),
    ("B","D"),
    ("B","C"),
])

test_pairs.append([
    ("A","E"),
    ("E","B"),
    ("B","D"),
    ("D","C"),
])
test_pairs.append([
    ("A","D"),
    ("D","B"),
    ("B","C"),
    ("A", "F"),
    ("A", "E"),
])

test_pairs.append([
    ("A","D"),
    ("D","B"),
    ("B","C"),
    ("A", "F"),
    ("E", "F"),
])

oracle_fact = Oracle
for pairs in test_pairs:
    try:
        success = test_oracle("ABCDEF", pairs, oracle_fact, verbose=False)
    except:
        success = False
        
    if not success:
        print("Error for relations:")
        pprint(pairs)
        print()
        success = test_oracle("ABCDEF", pairs, oracle_fact, verbose=True)

## Visualize Parse for Tricker Graphs

### <span style="color:red">Doesn't Handle Cycles</span>
- So we remove the condition about only having a single parent

In [7]:
#[('1', '3'), ('1', '50'), ('3', '50')]
#['50', '1', '3']
pairs =[
    ("B","A"),
    ("B","C"),
    ("C","A"),
]
test_oracle("ABCDEF", pairs, Oracle, verbose=True)

DEPS
	('B', 'A')
	('B', 'C')
	('C', 'A')

-------------------------------------
A
-------------------------------------
SHIFT   : Push A     || STACK : root|A
-------------------------------------
B
-------------------------------------
R ARC   : A<-B       || STACK : root|A|B
-------------------------------------
C
-------------------------------------
L ARC   : B->C       || STACK : root|A
L ARC   : A->C       || STACK : root
SKIP    : item C     || STACK : root
-------------------------------------
D
-------------------------------------
SKIP    : item D     || STACK : root
-------------------------------------
E
-------------------------------------
SKIP    : item E     || STACK : root
-------------------------------------
F
-------------------------------------
SKIP    : item F     || STACK : root

*************************************
Stack
	root
DEPS Actual
	('B', 'A')
	('B', 'C')
	('C', 'A')
DEPS Pred
	('A', 'C')
	('B', 'A')
	('B', 'C')
Actions
	SHIFT   : Push A
	R ARC   : A<-B

True

In [8]:
pairs =[
    ("A","D"),
    ("D","B"),
    ("B","C"),
]
test_oracle("ABCDEF", pairs, Oracle, verbose=True)

DEPS
	('A', 'D')
	('B', 'C')
	('D', 'B')

-------------------------------------
A
-------------------------------------
SHIFT   : Push A     || STACK : root|A
-------------------------------------
B
-------------------------------------
SHIFT   : Push B     || STACK : root|A|B
-------------------------------------
C
-------------------------------------
R ARC   : B<-C       || STACK : root|A|B|C
-------------------------------------
D
-------------------------------------
REDUCE  : Pop  C     || STACK : root|A|B
L ARC   : B->D       || STACK : root|A
L ARC   : A->D       || STACK : root
SKIP    : item D     || STACK : root
-------------------------------------
E
-------------------------------------
SKIP    : item E     || STACK : root
-------------------------------------
F
-------------------------------------
SKIP    : item F     || STACK : root

*************************************
Stack
	root
DEPS Actual
	('A', 'D')
	('B', 'C')
	('D', 'B')
DEPS Pred
	('A', 'D')
	('B', 'D')
	('C',

True

## Non Projective Parse Should Fail Test

In [9]:
pairs =[
    ("A","C"),
    ("B","E"),
]
try:
    success = test_oracle("ABCDEF", pairs, Oracle, verbose=True)
except Exception as e:
    success = False
    raise e
assert success == False

DEPS
	('A', 'C')
	('B', 'E')

-------------------------------------
A
-------------------------------------
SHIFT   : Push A     || STACK : root|A
-------------------------------------
B
-------------------------------------
SHIFT   : Push B     || STACK : root|A|B
-------------------------------------
C
-------------------------------------
REDUCE  : Pop  B     || STACK : root|A
L ARC   : A->C       || STACK : root
SKIP    : item C     || STACK : root
-------------------------------------
D
-------------------------------------
SKIP    : item D     || STACK : root
-------------------------------------
E
-------------------------------------
SHIFT   : Push E     || STACK : root|E
-------------------------------------
F
-------------------------------------
SKIP    : item F     || STACK : root|E

*************************************
Stack
	root|E
DEPS Actual
	('A', 'C')
	('B', 'E')
DEPS Pred
	('A', 'C')
Actions
	SHIFT   : Push A
	SHIFT   : Push B
	REDUCE  : Pop  B
	L ARC   : A->C
	SKIP

## Test on Real Causal Relations (Limit to 2 or More Relations in a Sentence)

In [10]:
def normalize(code):
    return code.replace("Causer:","").replace("Result:","")

def normalize_cr(cr):
    return tuple(normalize(cr).split("->"))

In [11]:
normalize("Causer:14"),normalize("Result:50")

('14', '50')

In [12]:
normalize_cr('Causer:14->Result:50')

('14', '50')

In [13]:
import pickle 

training_pickled = "/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/training.pl"
with open(training_pickled, "rb+") as f:
    tagged_essays = pickle.load(f)
len(tagged_essays)

902

In [14]:
from collections import defaultdict

tag_freq = defaultdict(int)
unique_words = set()
for essay in tagged_essays:
    for sentence in essay.sentences:
        for word, tags in sentence:
            unique_words.add(word)
            for tag in tags:
                tag_freq[tag] += 1

EMPTY_TAG = "Empty"
#TODO - don't ignore Anaphor, other and rhetoricals here
cr_tags  = list((t for t in tag_freq.keys() if ( "->" in t) and not "Anaphor" in t and not "other" in t and not "rhetorical" in t))
reg_tags = set((t for t in tag_freq.keys() if ( "->" not in t) and (t == "explicit" or t[0].isdigit())))

## Parse Causal Relations Using Position Information to Differentiate Codes and Crels (So can have multiple of the same type)

In [60]:
def get_tags_relations_for(tagged_sentence, tag_freq, reg_tags, cr_tags):    
    most_common_tag  = [None] # seed with None
    most_common_crel = [None]
    tag_seq = []
    crel_seq = []
    
    crel_child_tags = defaultdict(set)    
    for i, (wd,tags) in enumerate(tagged_sentence):
        rtags = set([normalize(t) for t in tags])
        rtags = rtags.intersection(reg_tags)
        # Get tag seq
        tag = None
        if rtags:
            if len(rtags) > 1 and "explicit" in rtags:
                rtags.remove("explicit")
            tag = max(rtags, key = lambda t: tag_freq[t])
            # if no prev tag and the current matches -2 (a gap of one), skip over
            if tag != most_common_tag[-1] and \
                not(most_common_tag[-1] is None and (len(most_common_tag) > 2) and tag == most_common_tag[-2]):
                tag_seq.append((tag,i))
        most_common_tag.append(tag)
                            
        crels = tags.intersection(cr_tags)
        crel = None
        if crels:
            crel = max(crels, key = lambda cr: tag_freq[cr])
            if crel != most_common_crel[-1]\
                and not(most_common_crel[-1] is None and (len(most_common_crel) > 2) and crel == most_common_crel[-2]):
                crel_seq.append((crel, i))
        most_common_crel.append(crel)
                
        # to have child tags, need a tag sequence and a current valid regular tag
        if not tag or len(tag_seq) == 0 or not crel or len(crel_seq) == 0:
            continue

        if tag != tag_seq[-1][0]:
            raise Exception("Tags don't match % s" % str((i,tag,tag_seq[-1])))
        if crel != crel_seq[-1][0]:
            raise Exception("Crels don't match % s" % str((i,crel,crel_seq[-1])))

        l,r = normalize_cr(crel)        
        if tag in (l,r):
            crel_child_tags[crel_seq[-1]].add(tag_seq[-1])
    return tag_seq, crel_child_tags

In [61]:
from pprint import pprint

relations = []
skipped_sent = 0
skipped_crels = 0
num_sents = 0
num_csl = 0

diffs = []

too_many_kids = []

for essay_ix, essay in enumerate(tagged_essays):
    for sent_ix, tagged_sentence in enumerate(essay.sentences):
        
        tag_seq, crel_child_tags = get_tags_relations_for(tagged_sentence, tag_freq, reg_tags, cr_tags)
        
        num_sents += 1
        un_csl  = set()
        for i, (wd,tags) in enumerate(tagged_sentence):
            csl = tags.intersection(cr_tags)
            un_csl.update(csl)
        
        num_csl += len(un_csl)
        
        # Don't count sentences without any relations as skipped
        if un_csl:        
            supported_causal = set()
            for (crel,ix), posn_tags in crel_child_tags.items():
                unique_child_tags = set()
                for tag, ix in posn_tags:
                    unique_child_tags.add(tag)                    
                if len(unique_child_tags) < 2:
                    l,r = normalize_cr(crel)
                    # if l == r then we want to keep these
                    if len(unique_child_tags) == 0 or (len(unique_child_tags) == 1 and l != r):
                        skipped_crels += 1
                        continue
                if len(posn_tags) > 2:
                    too_many_kids.append((essay_ix, sent_ix, crel, list(tag_seq), dict(crel_child_tags)))
                supported_causal.add(crel)

            if not supported_causal:
                skipped_sent += 1
                continue
            # filter out any tags that were only part of unsupported causal relations
            #tag_seq = [tag for tag in tag_seq if tag in supported_codes]
            relations.append((essay_ix,sent_ix,supported_causal,tag_seq))
            
            if len(supported_causal) != len(un_csl):
                diffs.append((essay_ix, sent_ix, un_csl, supported_causal))
        else:
            if un_csl:
                diffs.append((essay_ix, sent_ix, un_csl, set()))
        #del tag_seq
        #del crel_child_tags
        
num_sents, len(relations), num_csl #skipped_sent, skipped_crels, 
#(8292, 2217, 3006)

(8292, 2197, 3006)

### How Many of the Relations Have More than 2 Child Tags?

In [62]:
len(too_many_kids)

6

## Do Any of these Have More than 3 Children?

In [88]:
for (essay_ix, sent_ix, crel, tag_seq, crel_child_tags) in too_many_kids:
    #for (crel,ix), posn_tags in crel_child_tags.items():
    pprint(crel_child_tags)

{('Causer:3->Result:50', 3): {('3', 7), ('50', 3), ('3', 15)}}
{('Causer:1->Result:5', 6): {('5', 20), ('5', 9), ('1', 6)}}
{('Causer:7->Result:50', 0): {('50', 0), ('7', 11), ('50', 16)}}
{('Causer:1->Result:3', 0): {('3', 12), ('1', 0), ('3', 5)}}
{('Causer:1->Result:50', 6): {('50', 10), ('50', 15), ('1', 6)}}
{('Causer:3->Result:50', 1): {('3', 1), ('50', 19), ('50', 12)}}


**>>> Create separate causal relations for each combo of children where l != r (unless a self to self relation)**

In [77]:
e_ix, s_ix, crel, tseq, cr_kids = too_many_kids[2]
crel, tseq, cr_kids

('Causer:7->Result:50',
 [('50', 0), ('explicit', 8), ('7', 11), ('50', 16)],
 {('Causer:7->Result:50', 0): {('50', 0), ('50', 16), ('7', 11)}})

In [90]:
get_tags_relations_for(sentence, tag_freq, reg_tags, cr_tags)

([('50', 0), ('explicit', 8), ('7', 11), ('50', 16)],
 defaultdict(set,
             {('Causer:7->Result:50', 0): {('50', 0), ('50', 16), ('7', 11)}}))

In [87]:
sentence = tagged_essays[e_ix].sentences[s_ix]
print_sentence_tags(sentence, [k for k in tag_freq.keys() if (":" in k and not "rhetorical" in k and not "->" in k) or k[0].isdigit()])
print()
print_sentence_tags(sentence, cr_tags)

0   coral                          50,Result:50
1   bleaching                      50,Result:50
2   is                             50,Result:50
3   when                           50,Result:50
4   coral                          50,Result:50
5   loses                          50,Result:50
6   it                             50,Result:50
7   color                          50,Result:50
8   due                            
9   too                            
10  the                            
11  algae                          Causer:7
12  that                           Causer:7
13  lives                          Causer:7
14  on                             Causer:7
15  the                            Causer:7
16  coral                          50,Causer:7
17  INFREQUENT                     50
18  .                              

0   coral                          Causer:7->Result:50
1   bleaching                      Causer:7->Result:50
2   is                             Causer:7->Result:50
3

In [184]:
2217/3006

0.7375249500998003

In [186]:
#normalize_cr("Causer:5->Result:10")

e_ix = essay_ix
s_ix = sent_ix
sentence = tagged_essays[e_ix].sentences[s_ix]
tag_seq, crel_children = get_tags_relations_for(sentence, tag_freq, reg_tags, set(cr_tags))
for pair in tag_seq:
    print(pair)
print("*" * 30)
for crel, kids in crel_children.items():
    print(crel)
    for k in kids:
        print(str(k))
    print()

('50', 1)
('explicit', 6)
('7', 9)
******************************
Causer:7->Result:50
('7', 9)
('50', 1)



## Test New Parsing Logic

In [91]:
get_tags_relations_for(sentence, tag_freq, reg_tags, cr_tags)

([('50', 0), ('explicit', 8), ('7', 11), ('50', 16)],
 defaultdict(set,
             {('Causer:7->Result:50', 0): {('50', 0), ('50', 16), ('7', 11)}}))

In [95]:
errors = 0
exs = []
for essay in tagged_essays:
    for sentence in essay.sentences:
        
        tag_seq, crel_children = get_tags_relations_for(sentence, tag_freq, reg_tags, cr_tags)
        if not tag_seq or not crel_children:
            continue
        crels = []
        for _, tag_pairs in crel_children.items():
            tag2pairs = defaultdict(set)
            for tag,ix in tag_pairs:
                tag2pairs[tag].add((tag,ix))
            for taga, pairsa in tag2pairs.items():                    
                for tagb, pairsb in tag2pairs.items():
                    if taga != tagb:
                        for pa in pairsa:
                            for pb in pairsb:                        
                                crels.append((pa,pb))

        try:
            success = test_oracle(tag_seq, crels, Oracle, verbose=False)
        except Exception as e:
            exs.append(e)
            success = False

        if not success:
            errors += 1
            print("Error for relations:", e_ix, ",", s_ix)
            pprint(crels)
            pprint(tag_seq)
            #print()
            #success = test_oracle(tag_seq, crels, Oracle, verbose=True)
            #break

Error for relations: 379 , 0
[(('50', 11), ('5', 2)), (('5', 2), ('50', 11))]
[('5', 2), ('explicit', 9), ('50', 11)]
Error for relations: 379 , 0
[(('7', 16), ('50', 24)), (('50', 24), ('7', 16))]
[('50', 8), ('7', 16), ('explicit', 22), ('50', 24)]
Error for relations: 379 , 0
[(('3', 1), ('4', 8)), (('4', 8), ('3', 1))]
[('explicit', 0), ('3', 1), ('explicit', 4), ('4', 8)]
Error for relations: 379 , 0
[(('50', 2), ('7', 8)), (('7', 8), ('50', 2))]
[('50', 2), ('explicit', 5), ('7', 8)]
Error for relations: 379 , 0
[(('50', 2), ('7', 8)), (('7', 8), ('50', 2))]
[('50', 2), ('explicit', 5), ('7', 8)]
Error for relations: 379 , 0
[(('50', 5), ('1', 14)), (('1', 14), ('50', 5))]
[('explicit', 3), ('50', 5), ('13', 9), ('1', 14), ('11', 18)]
Error for relations: 379 , 0
[(('7', 23), ('50', 16)), (('50', 16), ('7', 23))]
[('50', 8), ('50', 16), ('explicit', 20), ('7', 23), ('50', 46)]
Error for relations: 379 , 0
[(('7', 4), ('50', 12)), (('50', 12), ('7', 4))]
[('7', 4), ('explicit', 10

[('1', 0), ('explicit', 2), ('2', 3)]
Error for relations: 379 , 0
[(('50', 5), ('3', 2)), (('3', 2), ('50', 5))]
[('explicit', 0), ('3', 2), ('50', 5)]
Error for relations: 379 , 0
[(('6', 4), ('7', 13)), (('7', 13), ('6', 4))]
[('6', 4), ('explicit', 11), ('7', 13)]
Error for relations: 379 , 0
[(('3', 6), ('4', 11)), (('4', 11), ('3', 6))]
[('explicit', 5), ('3', 6), ('explicit', 9), ('4', 11)]
Error for relations: 379 , 0
[(('11', 2), ('13', 4)), (('13', 4), ('11', 2))]
[('11', 2), ('13', 4)]
Error for relations: 379 , 0
[(('11', 1), ('12', 2)), (('12', 2), ('11', 1))]
[('11', 1), ('12', 2)]
Error for relations: 379 , 0
[(('7', 15), ('6', 10)), (('6', 10), ('7', 15))]
[('6', 10), ('explicit', 14), ('7', 15)]
Error for relations: 379 , 0
[(('3', 7), ('1', 1)), (('1', 1), ('3', 7))]
[('1', 1), ('explicit', 5), ('3', 7)]
Error for relations: 379 , 0
[(('50', 5), ('1', 13)), (('1', 13), ('50', 5))]
[('50', 5), ('explicit', 10), ('1', 13)]
Error for relations: 379 , 0
[(('1', 1), ('3', 

[('50', 2), ('explicit', 7), ('3', 11)]
Error for relations: 379 , 0
[(('50', 6), ('6', 16)), (('6', 16), ('50', 6))]
[('50', 6), ('explicit', 10), ('6', 16)]
Error for relations: 379 , 0
[(('6', 1), ('7', 13)), (('7', 13), ('6', 1))]
[('6', 1), ('explicit', 2), ('7', 13)]
Error for relations: 379 , 0
[]
[('6', 26)]
Error for relations: 379 , 0
[]
[('explicit', 4), ('50', 7)]
Error for relations: 379 , 0
[(('7', 9), ('50', 4)), (('50', 4), ('7', 9))]
[('50', 4), ('explicit', 7), ('7', 9)]
Error for relations: 379 , 0
[(('50', 10), ('7', 2)), (('7', 2), ('50', 10))]
[('7', 2), ('explicit', 9), ('50', 10)]
Error for relations: 379 , 0
[(('14', 5), ('13', 1)), (('13', 1), ('14', 5))]
[('explicit', 0), ('13', 1), ('14', 5)]
Error for relations: 379 , 0
[(('13', 17), ('11', 7)), (('11', 7), ('13', 17))]
[('11', 7), ('explicit', 15), ('13', 17), ('explicit', 20), ('12', 22)]
Error for relations: 379 , 0
[(('7', 13), ('50', 7)), (('50', 7), ('7', 13))]
[('50', 7), ('explicit', 11), ('7', 13)]

[('3', 1), ('explicit', 13), ('3', 15), ('explicit', 20), ('4', 21), ('14', 25)]
Error for relations: 379 , 0
[(('50', 10), ('1', 14)), (('1', 14), ('50', 10))]
[('50', 10), ('explicit', 12), ('1', 14)]
Error for relations: 379 , 0
[(('3', 18), ('50', 24)), (('50', 24), ('3', 18))]
[('1', 11), ('explicit', 17), ('3', 18), ('explicit', 22), ('50', 24)]
Error for relations: 379 , 0
[(('50', 8), ('1', 2)), (('1', 2), ('50', 8))]
[('1', 2), ('explicit', 4), ('50', 8)]
Error for relations: 379 , 0
[(('5', 11), ('50', 8)), (('50', 8), ('5', 11))]
[('explicit', 3), ('50', 8), ('5', 11)]
Error for relations: 379 , 0
[(('12', 3), ('13', 15)), (('13', 15), ('12', 3))]
[('12', 3), ('explicit', 13), ('13', 15)]
Error for relations: 379 , 0
[(('13', 1), ('14', 4)), (('14', 4), ('13', 1))]
[('13', 1), ('14', 4)]
Error for relations: 379 , 0
[(('50', 13), ('7', 5)), (('7', 5), ('50', 13))]
[('7', 5), ('explicit', 11), ('50', 13)]
Error for relations: 379 , 0
[(('3', 2), ('5', 14)), (('5', 14), ('3', 

Error for relations: 379 , 0
[(('14', 11), ('4', 1)), (('4', 1), ('14', 11))]
[('4', 1), ('explicit', 9), ('14', 11)]
Error for relations: 379 , 0
[(('50', 12), ('13', 4)), (('13', 4), ('50', 12))]
[('13', 4),
 ('explicit', 5),
 ('50', 12),
 ('explicit', 14),
 ('13', 15),
 ('14', 17)]
Error for relations: 379 , 0
[(('50', 13), ('6', 5)), (('6', 5), ('50', 13))]
[('6', 5), ('explicit', 6), ('50', 13)]
Error for relations: 379 , 0
[(('7', 18), ('6', 9)), (('6', 9), ('7', 18))]
[('6', 9), ('explicit', 16), ('7', 18)]
Error for relations: 379 , 0
[(('50', 0), ('3', 5)), (('3', 5), ('50', 0))]
[('50', 0), ('explicit', 3), ('3', 5), ('11', 10)]
Error for relations: 379 , 0
[]
[('50', 5)]
Error for relations: 379 , 0
[]
[('explicit', 1), ('7', 5)]
Error for relations: 379 , 0
[(('3', 2), ('50', 20)), (('50', 20), ('3', 2))]
[('1', 0), ('3', 2), ('explicit', 18), ('50', 20)]
Error for relations: 379 , 0
[(('6', 12), ('50', 6)), (('50', 6), ('6', 12))]
[('50', 6), ('explicit', 9), ('6', 12)]
Er

[]
[('3', 0)]
Error for relations: 379 , 0
[(('3', 7), ('4', 18)), (('4', 18), ('3', 7))]
[('2', 1), ('explicit', 6), ('3', 7), ('4', 18), ('5', 25)]
Error for relations: 379 , 0
[(('7', 9), ('12', 1)), (('12', 1), ('7', 9))]
[('12', 1), ('explicit', 8), ('7', 9)]
Error for relations: 379 , 0
[]
[('6', 1)]
Error for relations: 379 , 0
[(('7', 2), ('50', 11)), (('50', 11), ('7', 2))]
[('explicit', 0), ('7', 2), ('explicit', 10), ('50', 11)]
Error for relations: 379 , 0
[(('6', 9), ('7', 1)), (('7', 1), ('6', 9))]
[('7', 1), ('explicit', 6), ('6', 9)]
Error for relations: 379 , 0
[(('4', 2), ('50', 13)), (('50', 13), ('4', 2))]
[('explicit', 0), ('4', 2), ('explicit', 9), ('50', 13)]
Error for relations: 379 , 0
[(('5', 13), ('4', 0)), (('4', 0), ('5', 13))]
[('4', 0), ('explicit', 12), ('5', 13)]
Error for relations: 379 , 0
[(('14', 9), ('4', 6)), (('4', 6), ('14', 9))]
[('3', 2), ('4', 6), ('explicit', 8), ('14', 9)]
Error for relations: 379 , 0
[(('50', 3), ('4', 9)), (('4', 9), ('50

Error for relations: 379 , 0
[(('50', 2), ('7', 9)), (('7', 9), ('50', 2))]
[('50', 2), ('explicit', 7), ('7', 9)]
Error for relations: 379 , 0
[(('1', 3), ('3', 7)),
 (('3', 7), ('1', 3)),
 (('50', 30), ('6', 23)),
 (('6', 23), ('50', 30))]
[('1', 3),
 ('explicit', 6),
 ('3', 7),
 ('explicit', 21),
 ('6', 23),
 ('explicit', 28),
 ('50', 30)]
Error for relations: 379 , 0
[(('1', 8), ('50', 23)), (('50', 23), ('1', 8))]
[('1', 8), ('3', 12), ('explicit', 22), ('50', 23)]
Error for relations: 379 , 0
[(('4', 5), ('3', 1)), (('3', 1), ('4', 5))]
[('explicit', 0), ('3', 1), ('explicit', 4), ('4', 5), ('14', 11)]
Error for relations: 379 , 0
[(('11', 18), ('13', 20)), (('13', 20), ('11', 18))]
[('explicit', 5),
 ('11', 7),
 ('13', 13),
 ('11', 18),
 ('explicit', 19),
 ('13', 20),
 ('explicit', 23),
 ('14', 24)]
Error for relations: 379 , 0
[(('7', 7), ('6', 2)), (('6', 2), ('7', 7))]
[('explicit', 0), ('6', 2), ('7', 7)]
Error for relations: 379 , 0
[]
[('5b', 2)]
Error for relations: 379 ,

Error for relations: 379 , 0
[(('2', 15), ('3', 20)), (('3', 20), ('2', 15))]
[('explicit', 14), ('2', 15), ('3', 20)]
Error for relations: 379 , 0
[(('50', 6), ('1', 0)), (('1', 0), ('50', 6))]
[('1', 0), ('explicit', 4), ('50', 6)]
Error for relations: 379 , 0
[(('5', 19), ('3', 7)), (('3', 7), ('5', 19))]
[('3', 7), ('explicit', 17), ('5', 19), ('3', 25)]
Error for relations: 379 , 0
[(('3', 9), ('5', 14)), (('5', 14), ('3', 9))]
[('explicit', 8), ('3', 9), ('explicit', 12), ('5', 14)]
Error for relations: 379 , 0
[(('14', 6), ('4', 0)), (('4', 0), ('14', 6))]
[('4', 0), ('14', 6)]
Error for relations: 379 , 0
[(('1', 10), ('50', 7)),
 (('50', 7), ('1', 10)),
 (('1', 20), ('2', 22)),
 (('2', 22), ('1', 20))]
[('50', 7),
 ('explicit', 9),
 ('1', 10),
 ('1', 20),
 ('2', 22),
 ('explicit', 27),
 ('3', 29)]
Error for relations: 379 , 0
[(('50', 2), ('7', 7)), (('7', 7), ('50', 2))]
[('50', 2), ('explicit', 4), ('7', 7)]
Error for relations: 379 , 0
[(('2', 9), ('1', 8)),
 (('1', 8), ('2

[('14', 6), ('explicit', 10), ('7', 11)]
Error for relations: 379 , 0
[(('50', 2), ('6', 12)), (('6', 12), ('50', 2))]
[('50', 2), ('explicit', 4), ('6', 12)]
Error for relations: 379 , 0
[(('6', 3), ('7', 12)), (('7', 12), ('6', 3))]
[('6', 3), ('explicit', 10), ('7', 12)]
Error for relations: 379 , 0
[(('3', 0), ('5', 12)),
 (('5', 12), ('3', 0)),
 (('4', 21), ('3', 17)),
 (('3', 17), ('4', 21))]
[('3', 0), ('explicit', 10), ('5', 12), ('explicit', 16), ('3', 17), ('4', 21)]
Error for relations: 379 , 0
[(('50', 21), ('3', 12)),
 (('3', 12), ('50', 21)),
 (('3', 44), ('50', 41)),
 (('50', 41), ('3', 44))]
[('50', 6),
 ('explicit', 10),
 ('3', 12),
 ('50', 21),
 ('50', 41),
 ('explicit', 43),
 ('3', 44)]
Error for relations: 379 , 0
[(('1', 10), ('50', 6)), (('50', 6), ('1', 10))]
[('explicit', 4), ('50', 6), ('1', 10)]
Error for relations: 379 , 0
[(('4', 9), ('5', 6)), (('5', 6), ('4', 9))]
[('explicit', 5), ('5', 6), ('4', 9)]
Error for relations: 379 , 0
[(('50', 14), ('1', 6)),
 

[('1', 0), ('explicit', 2), ('2', 3), ('1', 16)]
Error for relations: 379 , 0
[(('13', 11), ('11', 0)), (('11', 0), ('13', 11))]
[('11', 0), ('explicit', 9), ('13', 11), ('explicit', 15), ('7', 17)]
Error for relations: 379 , 0
[(('1', 9), ('50', 3)), (('50', 3), ('1', 9))]
[('50', 3), ('explicit', 7), ('1', 9)]
Error for relations: 379 , 0
[]
[('explicit', 5), ('50', 6)]
Error for relations: 379 , 0
[]
[('1', 2), ('11', 5)]
Error for relations: 379 , 0
[(('5b', 12), ('7', 20)), (('7', 20), ('5b', 12))]
[('7', 1), ('5b', 12), ('explicit', 18), ('7', 20)]
Error for relations: 379 , 0
[(('4', 17), ('50', 7)), (('50', 7), ('4', 17))]
[('50', 7), ('explicit', 9), ('4', 17)]
Error for relations: 379 , 0
[(('1', 11), ('3', 1)), (('3', 1), ('1', 11))]
[('3', 1), ('explicit', 9), ('1', 11)]
Error for relations: 379 , 0
[(('1', 1), ('3', 9)), (('3', 9), ('1', 1))]
[('explicit', 0), ('1', 1), ('explicit', 8), ('3', 9)]
Error for relations: 379 , 0
[(('50', 14), ('3', 19)), (('3', 19), ('50', 14)

### <span style="color:red">TODO - call success if we recover the original crel, as we are generating a lot of crels</span>
### <span style="color:red">TODO - Also check against fully supported as we want to get that to 100%</span>

In [96]:
errors

2302

## For the Unsupported Relations, are the Missing Tags in the Previous or Subsequent Sentence?

In [188]:
from pprint import pprint
def print_sentence(sentence):
    for wd, tags in sentence:
        print(wd.ljust(20), [t for t in tags if t[0].isdigit()])

for essay_ix, sent_ix, un_csl, supported_causal in diffs[0:10]:
    sentence = tagged_essays[essay_ix].sentences[sent_ix]
    pprint(un_csl)
    pprint(supported_causal)
    print_sentence(sentence)
    if sent_ix > 0:
        print("--Previous--")    
        print_sentence(tagged_essays[essay_ix].sentences[sent_ix-1])
    if sent_ix < len(tagged_essays[essay_ix].sentences)-1:
        print("--Next--")    
        print_sentence(tagged_essays[essay_ix].sentences[sent_ix+1])

    print() 

{'Causer:4->Result:14', 'Causer:3->Result:4'}
{'Causer:4->Result:14'}
the                  []
amount               ['4']
of                   ['4']
co2                  ['4']
can                  []
threaten             ['14']
the                  ['14']
health               ['14']
of                   ['14']
the                  ['14']
coral                ['14']
.                    []
--Previous--
as                   []
the                  []
temperature          ['3']
of                   ['3']
water                ['3']
increases            ['3']
.                    []

{'Causer:3->Result:6', 'Causer:1->Result:3'}
{'Causer:3->Result:6'}
corals               ['6']
stress               ['6']
from                 []
sea                  ['3']
temperatures         ['3']
increasing           ['3']
.                    []
--Previous--
plus                 []
,                    []
climate              []
change               []
affects              []
coral                ['50']
ble

## TODO 
- Re-train tagging model, adding tags where reg tag is missing but is included in a causer or result tag. 
- Also include explicit in the predicted tags.
- Need to handle relations where same code -> same code

## 4 Errors Below Look Are from Non-Projective Parses
**NOTES**
With only 4 errors as 4 missed relations, hardly worth worrying about. 
One solution would be to train a forward and a backward parser, parse the sentence in both directions and merge the deps. In each case that would pick up all deps.

In [106]:
for i, (essay_ix, sent_ix, supported_causal, tag_seq) in enumerate(relations[:]):
    supported_causal = sorted(supported_causal)
    crels = [normalize_cr(crel) for crel in supported_causal]
    for l,r in crels:
        if l == r:
            print(i, l,r)
            print(relations[i])
            print()

37 50 50
(24, 13, {'Causer:1->Result:50', 'Causer:50->Result:50'}, ['explicit', '1', '50', 'explicit', '50'])

493 50 50
(189, 4, {'Causer:13->Result:50', 'Causer:50->Result:50'}, ['explicit', '13', 'explicit', '50', 'explicit', '50'])

527 50 50
(197, 10, {'Causer:5b->Result:50', 'Causer:50->Result:50'}, ['5b', 'explicit', '50', 'explicit', '50'])

766 11 11
(276, 12, {'Causer:11->Result:11', 'Causer:3->Result:4'}, ['4', '3', 'explicit', '11'])



In [107]:
#Why is the last one missing the 11->11 relation?
tagged_essays[276].sentences[12]
#looks to be an unsupported relation

[('balance', set()),
 ('between', set()),
 ('co2', {'4', 'Causer:3->Result:4', 'Result', 'Result:4'}),
 ('and', {'Causer:3->Result:4'}),
 ('water', {'3', 'Causer', 'Causer:3', 'Causer:3->Result:4'}),
 ('temperature', {'3', 'Causer', 'Causer:3', 'Causer:3->Result:4'}),
 ('is', set()),
 ('also', set()),
 ('threaten', {'explicit'}),
 ('by', {'explicit'}),
 ('extreme',
  {'11',
   'Causer',
   'Causer:11',
   'Causer:11->Result:11',
   'Result',
   'Result:11'}),
 ('storms',
  {'11',
   'Causer',
   'Causer:11',
   'Causer:11->Result:11',
   'Result',
   'Result:11'}),
 ('.', set())]

In [203]:
errors = 0
exs = []
for e_ix,s_ix, supported_causal, tag_seq in relations[:]:
    # remove indexes
    tag_seq = list(zip(*tag_seq))[0]
    
    supported_causal = sorted(supported_causal)
    crels = [normalize_cr(crel) for crel in supported_causal]
    
    try:
        success = test_oracle(tag_seq, crels, Oracle, verbose=False)
    except Exception as e:
        exs.append(e)
        success = False
        
    if not success:
        errors += 1
        print("Error for relations:", e_ix, ",", s_ix)
        pprint(crels)
        pprint(tag_seq)
        #print()
        #success = test_oracle(tag_seq, crels, Oracle, verbose=True)
        #break

Error for relations: 8 , 0
[('7', '50')]
('50', '50', 'explicit', '7', '50')
Error for relations: 23 , 3
[('11', '12'), ('12', '13'), ('13', '50')]
('11', '11', 'explicit', '12', 'explicit', '13', 'explicit', '50')
Error for relations: 24 , 13
[('1', '50'), ('50', '50')]
('explicit', '1', '50', 'explicit', '50')
Error for relations: 33 , 3
[('1', '50'), ('1', '7'), ('3', '50'), ('3', '7')]
('1', '3', 'explicit', '50', 'explicit', '1', '3', '7')
Error for relations: 50 , 7
[('11', '13')]
('11', '11', 'explicit', '13')
Error for relations: 51 , 6
[('3', '50')]
('3', '3', 'explicit', '50')
Error for relations: 61 , 0
[('5', '7'), ('7', '50')]
('50', '50', 'explicit', '7', 'explicit', '5')
Error for relations: 68 , 1
[('6', '50'), ('6', '7')]
('6', 'explicit', '50', 'explicit', '6', 'explicit', '7')
Error for relations: 75 , 2
[('11', '12'), ('11', '13'), ('12', '13'), ('13', '14')]
('13', 'explicit', '11', 'explicit', '12', 'explicit', '13', 'explicit', '14')
Error for relations: 126 , 1


In [204]:
errors

89

In [205]:
crels     = [('11', '13')]
tag_seq   = ('11', '11', 'explicit', '13')
test_oracle(tag_seq, crels, Oracle, verbose=True)

DEPS
	('11', '13')

---------------------------------
11
---------------------------------
SHIFT   : Push 11    || STACK : root|11
---------------------------------
11
---------------------------------
SHIFT   : Push 11    || STACK : root|11|11
---------------------------------
explicit
---------------------------------
SKIP    : item explicit || STACK : root|11|11
---------------------------------
13
---------------------------------
L ARC   : 11->13     || STACK : root|11


AssertionError: Arc already processed ('11', '13')

In [119]:
crels = [('1', '3'), ('1', '50')]
tag_seq = ['1', 'explicit', '50', '1', 'explicit', '3']
test_oracle(tag_seq, crels, Oracle, verbose=True)

DEPS
	('1', '3')
	('1', '50')

-------------------------------------
1
-------------------------------------
SHIFT   : Push 1     || STACK : root|1
-------------------------------------
explicit
-------------------------------------
SKIP    : item explicit || STACK : root|1
-------------------------------------
50
-------------------------------------
R ARC   : 1<-50      || STACK : root|1|50
-------------------------------------
1
-------------------------------------


AssertionError: Arc already processed ('1', '50')

## <span style="color:red">NEED to determine if all errors are non-projective<span>

In [189]:
test_oracle(['5', '50'], [('5', '50')], Oracle2, verbose=True)

DEPS
	('5', '50')

-----------------------------
5
-----------------------------
SHIFT   : Push 5     || STACK : root|5
-----------------------------
50
-----------------------------
L ARC   : 5->50      || STACK : root
L ARC   : 5->50      || STACK : root

*****************************
Stack
	root
DEPS Actual
	('5', '50')
DEPS Pred
	('5', '50')
Actions
	SHIFT   : Push 5
	L ARC   : 5->50

Ordered Match?    True
Un Ordered Match? True


True

In [199]:
errors = 0
exs = []
for e_ix, s_ix, supported_causal, tag_seq in relations[:]:
    
    # GET INITIAL TAGS (ignore indexes)
    tag_seq = list(zip(*tag_seq))[0]
    
    supported_causal = sorted(supported_causal)
    crels = [normalize_cr(crel) for crel in supported_causal]

    try:
        success = test_oracle(tag_seq, crels, Oracle2, verbose=False)
    except Exception as e:
        exs.append(e)
        success = False
        
    if not success:
        errors += 1
        print("Error for relations:")
        pprint(crels)
        pprint(tag_seq)
        #print()
        #success = test_oracle(tag_seq, crels, Oracle, verbose=True)
        #break
errors

Error for relations:
[('3', '4')]
('explicit', '3', 'explicit', '4')
Error for relations:
[('1', '50'), ('11', '50'), ('13', '50')]
('explicit', '50', '13', '1', '11')
Error for relations:
[('7', '50')]
('50', '50', 'explicit', '7', '50')
Error for relations:
[('3', '4')]
('explicit', '3', 'explicit', '4')
Error for relations:
[('1', '50'), ('3', '1')]
('explicit', '50', '1', 'explicit', '3', 'explicit', '1')
Error for relations:
[('1', '3')]
('explicit', '1', '3')
Error for relations:
[('3', '4')]
('explicit', '3', 'explicit', '4')
Error for relations:
[('3', '50')]
('explicit', '50', '3')
Error for relations:
[('3', '50')]
('explicit', '3', 'explicit', '50')
Error for relations:
[('13', '50')]
('explicit', '13', 'explicit', '50')
Error for relations:
[('3', '7'), ('7', '50')]
('explicit', '3', 'explicit', '7', 'explicit', '50')
Error for relations:
[('11', '12'), ('12', '13'), ('13', '50')]
('11', '11', 'explicit', '12', 'explicit', '13', 'explicit', '50')
Error for relations:
[('3',

[('7', '50')]
('explicit', '7', 'explicit', '50')
Error for relations:
[('1', '50')]
('explicit', '1', '50')
Error for relations:
[('4', '50')]
('explicit', '4', 'explicit', '50')
Error for relations:
[('6', '7')]
('explicit', '7', 'explicit', '6')
Error for relations:
[('1', '50')]
('explicit', '50', '1')
Error for relations:
[('3', '50')]
('explicit', '3', 'explicit', '50')
Error for relations:
[('11', '50')]
('explicit', '50', '11')
Error for relations:
[('6', '7')]
('explicit', '6', 'explicit', '7')
Error for relations:
[('1', '2')]
('explicit', '1', '2')
Error for relations:
[('1', '3')]
('explicit', '1', 'explicit', '3')
Error for relations:
[('1', '3')]
('explicit', '1', '3')
Error for relations:
[('1', '3'), ('3', '50')]
('explicit', '1', '3', 'explicit', '50')
Error for relations:
[('1', '50'), ('7', '50')]
('explicit', '50', '1', '7')
Error for relations:
[('1', '50')]
('explicit', '50', '1')
Error for relations:
[('1', '2')]
('1', '1', 'explicit', '2')
Error for relations:
[

660

In [192]:
relations[0]

(1, 1, {'Causer:5->Result:50'}, [('5', 2), ('explicit', 9), ('50', 11)])