In [1]:
from collections import defaultdict
from typing import Any, List, Set, Tuple

import dill
import numpy as np
import pandas as pd

from Settings import Settings
from crel_helper import get_cr_tags
from results_procesor import is_a_regular_code
from load_data import load_process_essays
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries
from BrattEssay import ANAPHORA

# Dataset whose essays are analysed in this notebook (original author's note: SkinCancer).
DATASET = "CoralBleaching"

settings = Settings()

# Folder paths are built by plain string concatenation; every piece carries
# its own trailing "/".
root_folder = "".join([settings.data_directory, DATASET, "/Thesis_Dataset/"])
training_folder = root_folder + "Training/"
test_folder = root_folder + "Test/"

# NOTE: the predictions stored under the co-reference folders are generated by the
# "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb"
# notebook and are used as inputs to the parsing model.
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

# Echo the essay folders as the cell output.
training_folder, test_folder

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


('/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Training/',
 '/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Test/')

In [2]:
def _load_dill(path):
    """Return the object deserialized from the dill file at `path`."""
    # SECURITY: dill (like pickle) can execute arbitrary code while loading;
    # only point this at files produced by a trusted pipeline.
    with open(path, "rb") as fin:
        return dill.load(fin)


# Tagged essays (with anaphora tags) for the training and test partitions.
train_fname = coref_output_folder + "training_crel_anatagged_essays.dill"
pred_tagged_essays_train = _load_dill(train_fname)

test_fname = coref_output_folder + "test_crel_anatagged_essays.dill"
pred_tagged_essays_test = _load_dill(test_fname)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

902 226


In [3]:
# Pool the training and test essays so the statistics computed in the
# following cells cover the whole dataset; the cell output is the pooled size.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
len(all_essays)

1128

In [4]:
# Single pass over every essay -> sentence -> (word, tags) pair that fills all
# of the counters and tallies declared below.

# Corpus size totals.
num_essays = 0
num_sents = 0
num_words = 0

# Number of words / sentences / essays that carry the bare ANAPHORA tag.
# NOTE(review): ANAPHORA comes from BrattEssay; presumably the string "Anaphor" - confirm.
words_with_anaphora_tag = 0
sents_with_anaphora_tag = 0
essays_with_anaphora_tag = 0

# Word-level counts: words with a regular concept code, words with both a
# regular code and the ANAPHORA tag, words with any causal-relation ("->") tag,
# and words with a causal-relation tag that mentions ANAPHORA.
words_with_cc = 0
cc_with_anaphora_tag = 0
words_with_crel = 0
words_with_crel_ana = 0

# Sentence-level tallies: each key is incremented at most once per sentence,
# because they are driven by the per-sentence set `unique_tags`.
reg_code_sent_tally = defaultdict(int)
ana_code_sent_tally = defaultdict(int)

crel_tally = defaultdict(int)
ana_crel_tally = defaultdict(int)

# Raw occurrence count of every tag (one increment per word carrying it),
# including the tags that are filtered out of the other statistics.
all_tags = defaultdict(int)

for e in all_essays:
    num_essays +=1
    essay_has_ana = False
    
    for sent in e.sentences:
        sent_has_ana = False
        num_sents +=1
        # Distinct tags seen in this sentence after filtering.
        unique_tags = set()
        # Distinct non-causal tags in this sentence (collected but not read in this cell).
        all_non_cr_tags = set()
        
        for wd, tags in sent:
            num_words +=1
            
            has_cc_code = False
            has_crel_code = False
            has_crel_ana_code = False
            
            for t in tags:
                all_tags[t] +=1 
                t_lower = t.lower()
                # Tags mentioning "other", "rhet" or "change" (case-insensitive) are
                # excluded from everything except the raw `all_tags` count.
                if "other" in t_lower or "rhet" in t_lower or "change" in t_lower:
                    continue
                unique_tags.add(t)
                
                if is_a_regular_code(t):
                    has_cc_code = True
                    
                # "->" marks a causal-relation tag, e.g. "Causer:1->Result:11".
                if "->" in t:
                    has_crel_code = True
                    if ANAPHORA in t:
                        has_crel_ana_code = True
                else:
                    all_non_cr_tags.add(t)

            # Exact-membership test: the word carries the bare ANAPHORA tag itself.
            if ANAPHORA in tags:
                words_with_anaphora_tag +=1
                essay_has_ana = True
                sent_has_ana = True
            
            if has_cc_code:
                words_with_cc += 1
                
            if has_crel_code:
                words_with_crel +=1
            
            if has_crel_ana_code:
                words_with_crel_ana +=1
                
            if ANAPHORA in tags and has_cc_code:
                cc_with_anaphora_tag +=1

        # end for each word in sentence
        if sent_has_ana:
            sents_with_anaphora_tag +=1
        
        # NOTE: these two sets are filled below but not read again in this cell.
        unique_crels = set()
        unique_ccodes = set()
        for t in unique_tags:
            # Causal?
            if "->" in t:
                prefix = "Anaphor["
                if prefix in t:
                    # Strip the "Anaphor[...]" wrapper to recover the plain relation,
                    # e.g. "Causer:1->Result:Anaphor[3]" -> "Causer:1->Result:3".
                    k_fixed = t.replace(prefix, "").replace("]","")
                    # If ANAPHORA is still present, one side is an anaphor without a
                    # bracketed code; such relations are not tallied.
                    if ANAPHORA not in k_fixed:
                        l,r = k_fixed.split("->")
                        l_code = l.replace("Causer:","")
                        r_code = r.replace("Result:","")
                        # Sanity check: both sides must reduce to regular concept codes.
                        assert is_a_regular_code(l_code), l_code
                        assert is_a_regular_code(r_code), r_code
                        # Tallied under the original (anaphora-wrapped) tag.
                        ana_crel_tally[t] +=1
                        unique_crels.add(t)
                elif ANAPHORA not in t:
                    # Plain causal relation with no anaphora involved.
                    crel_tally[t] +=1
                    unique_crels.add(t)
            else: # "->" not in t
                assert "->" not in t
                if "Anaphor:[" in t:
                    # Anaphora concept code, e.g. "Anaphor:[11]".
                    ana_code_sent_tally[t] +=1
                    unique_ccodes.add(t)
                elif is_a_regular_code(t):
                    reg_code_sent_tally[t]+=1
                    unique_ccodes.add(t)
                
    # end for each sentence
    if essay_has_ana:
        essays_with_anaphora_tag +=1
                    
# Cell output: totals alongside the number of units carrying the ANAPHORA tag.
num_words, words_with_anaphora_tag, num_sents, sents_with_anaphora_tag, num_essays, essays_with_anaphora_tag

(167865, 377, 10210, 319, 1128, 237)

## Look at Anaphora Tags without Associated Codes

In [13]:
# Compare words carrying the bare "Anaphor" tag against words carrying an
# "Anaphor:[code]" tag, and count anaphors that have no associated code.
count_ana = count_ana_code = count_ana_no_code = 0

# Raw per-tag occurrence counts (one increment per word carrying the tag).
all_tags = defaultdict(int)

for essay in all_essays:
    for sentence in essay.sentences:
        for _word, word_tags in sentence:
            for tag in word_tags:
                all_tags[tag] += 1

            has_ana = "Anaphor" in word_tags
            has_ana_code = any("Anaphor:[" in tag for tag in word_tags)

            if has_ana_code:
                count_ana_code += 1
            if has_ana:
                count_ana += 1
                if not has_ana_code:
                    count_ana_no_code += 1

count_ana, count_ana_code, count_ana_no_code

(377, 376, 1)

In [12]:
# Display the raw per-tag counts: one increment for every word that carries the tag.
all_tags

defaultdict(int,
            {'1': 4834,
             '11': 1043,
             '12': 833,
             '13': 2055,
             '14': 2421,
             '2': 960,
             '3': 5704,
             '4': 2081,
             '5': 683,
             '50': 13592,
             '5b': 649,
             '6': 1466,
             '7': 3730,
             'Anaphor': 377,
             'Anaphor:[11]': 9,
             'Anaphor:[12]': 11,
             'Anaphor:[13]': 35,
             'Anaphor:[14]': 30,
             'Anaphor:[1]': 70,
             'Anaphor:[2]': 10,
             'Anaphor:[3]': 45,
             'Anaphor:[4]': 21,
             'Anaphor:[50]': 51,
             'Anaphor:[5]': 17,
             'Anaphor:[5b]': 7,
             'Anaphor:[6]': 19,
             'Anaphor:[7]': 58,
             'Anaphor:[other]': 13,
             'COMPILED': 1123,
             'Causer': 15954,
             'Causer:1': 3010,
             'Causer:1->Result:11': 13,
             'Causer:1->Result:13': 35,
           

In [7]:
# Display, for each regular concept code, the number of sentences it occurs in.
reg_code_sent_tally

defaultdict(int,
            {'1': 1172,
             '11': 371,
             '12': 122,
             '13': 483,
             '14': 334,
             '2': 158,
             '3': 1261,
             '4': 395,
             '5': 170,
             '50': 3847,
             '5b': 99,
             '6': 407,
             '7': 729})

In [32]:
# Display, for each "Anaphor:[code]" tag, the number of sentences it occurs in.
ana_code_sent_tally

defaultdict(int,
            {'Anaphor:[11]': 9,
             'Anaphor:[12]': 10,
             'Anaphor:[13]': 30,
             'Anaphor:[14]': 24,
             'Anaphor:[1]': 53,
             'Anaphor:[2]': 7,
             'Anaphor:[3]': 39,
             'Anaphor:[4]': 19,
             'Anaphor:[50]': 48,
             'Anaphor:[5]': 16,
             'Anaphor:[5b]': 7,
             'Anaphor:[6]': 15,
             'Anaphor:[7]': 44})

In [33]:
# Display, for each causal relation without anaphora, the number of sentences it occurs in.
crel_tally

defaultdict(int,
            {'Causer:1->Result:11': 1,
             'Causer:1->Result:13': 2,
             'Causer:1->Result:14': 3,
             'Causer:1->Result:2': 122,
             'Causer:1->Result:3': 279,
             'Causer:1->Result:4': 9,
             'Causer:1->Result:5': 6,
             'Causer:1->Result:50': 455,
             'Causer:1->Result:6': 3,
             'Causer:1->Result:7': 7,
             'Causer:11->Result:1': 1,
             'Causer:11->Result:11': 1,
             'Causer:11->Result:12': 94,
             'Causer:11->Result:13': 109,
             'Causer:11->Result:14': 25,
             'Causer:11->Result:3': 6,
             'Causer:11->Result:4': 2,
             'Causer:11->Result:50': 78,
             'Causer:11->Result:6': 3,
             'Causer:12->Result:11': 1,
             'Causer:12->Result:13': 89,
             'Causer:12->Result:14': 8,
             'Causer:12->Result:50': 9,
             'Causer:12->Result:5b': 1,
             'Causer:12->Result

In [34]:
# Display, for each causal relation with an "Anaphor[code]" side, the number of sentences it occurs in.
ana_crel_tally

defaultdict(int,
            {'Causer:1->Result:Anaphor[3]': 1,
             'Causer:1->Result:Anaphor[50]': 14,
             'Causer:11->Result:Anaphor[14]': 6,
             'Causer:11->Result:Anaphor[1]': 1,
             'Causer:11->Result:Anaphor[50]': 4,
             'Causer:13->Result:Anaphor[14]': 1,
             'Causer:13->Result:Anaphor[50]': 4,
             'Causer:2->Result:Anaphor[50]': 1,
             'Causer:3->Result:Anaphor[4]': 1,
             'Causer:3->Result:Anaphor[50]': 9,
             'Causer:3->Result:Anaphor[7]': 1,
             'Causer:4->Result:Anaphor[50]': 1,
             'Causer:4->Result:Anaphor[5]': 3,
             'Causer:5->Result:Anaphor[50]': 1,
             'Causer:5b->Result:Anaphor[14]': 1,
             'Causer:6->Result:Anaphor[50]': 2,
             'Causer:6->Result:Anaphor[7]': 3,
             'Causer:7->Result:Anaphor[50]': 10,
             'Causer:7->Result:Anaphor[6]': 1,
             'Causer:Anaphor[11]->Result:12': 6,
             'Causer:

# Counts by Document, Sentence, Word

In [57]:
def pct(cnt_cond, cnt):
    """Format `cnt_cond` as a percentage of `cnt`, rounded to 2 decimal places.

    Args:
        cnt_cond: Number of items that satisfy the condition of interest.
        cnt: Total number of items.

    Returns:
        A string such as "21.01%". When `cnt` is 0 the percentage is undefined,
        so "n/a" is returned instead of raising ZeroDivisionError.
    """
    if cnt == 0:
        return "n/a"
    return str(round(100 * (cnt_cond / cnt), 2)) + "%"

# One row per unit of text: (label, number with an anaphora tag, overall total).
anaphora_stats = [
    ("Essays", essays_with_anaphora_tag, num_essays),
    ("Sentences", sents_with_anaphora_tag, num_sents),
    ("Words", words_with_anaphora_tag, num_words),
]

# Each dict becomes one record; "_Type" labels which statistic the record holds.
cnts = {"_Type": "Cnts."}
totals = {"_Type": "Ttls."}
pcts = {"_Type": "Pct."}
for label, tagged, total in anaphora_stats:
    cnts[label] = tagged
    totals[label] = total
    pcts[label] = pct(tagged, total)

# Transposed so that Essays / Sentences / Words become the rows.
df_counts = pd.DataFrame([cnts, pcts, totals]).T

## Concept Codes (Sentences)

In [37]:
# Sentence-level concept-code totals: regular codes, anaphora codes, and both combined.
sum_reg_code, sum_ana_code = (
    sum(tally.values()) for tally in (reg_code_sent_tally, ana_code_sent_tally)
)
sum_cc_code = sum_reg_code + sum_ana_code
sum_reg_code, sum_ana_code, sum_cc_code

(9548, 321, 9869)

## Causal Relations (Sentences)

In [38]:
# Sentence-level causal-relation totals: without anaphora, with anaphora, and both combined.
sum_reg_crel, sum_ana_crel = (
    sum(tally.values()) for tally in (crel_tally, ana_crel_tally)
)
sum_crel = sum_ana_crel + sum_reg_crel
sum_reg_crel, sum_ana_crel, sum_crel

(3878, 276, 4154)

In [58]:
# One row per tag family: (label, anaphora total, overall total).
code_stats = [
    ("CCodes", sum_ana_code, sum_cc_code),
    ("CRels", sum_ana_crel, sum_crel),
]

# Each dict becomes one record; "_Type" labels which statistic the record holds.
cnts = {"_Type": "Cnts."}
totals = {"_Type": "Ttls."}
pcts = {"_Type": "Pct."}
for label, ana_total, overall_total in code_stats:
    cnts[label] = ana_total
    totals[label] = overall_total
    pcts[label] = pct(ana_total, overall_total)

# Transposed so that CCodes / CRels become the rows.
df_codes = pd.DataFrame([cnts, pcts, totals]).T

# Frequencies

In [59]:
# Display the anaphora counts, percentages and totals per essay / sentence / word.
df_counts

Unnamed: 0,0,1,2
Essays,237,21.01%,1128
Sentences,319,3.12%,10210
Words,377,0.22%,167865
_Type,Cnts.,Pct.,Ttls.


# Codes

In [61]:
# Display the anaphora counts, percentages and totals for concept codes and causal relations.
df_codes

Unnamed: 0,0,1,2
CCodes,321,3.25%,9869
CRels,276,6.64%,4154
_Type,Cnts.,Pct.,Ttls.


# Compute CRels that Cross Sentences

In [145]:
# Count non-anaphora causal relations for which at least one of the two codes is
# not tagged anywhere in the same sentence, and for which the sentence has no
# matching anaphora variant of the relation. These are treated as relations that
# cross sentence boundaries.
cross_sent_crel_count = 0
prefix = "Anaphor["

for e in all_essays:
    for sent in e.sentences:
        # Distinct tags in this sentence after filtering.
        unique_tags = set()
        # Distinct non-causal tags (collected but not read in this cell).
        all_non_cr_tags = set()
        
        for wd, tags in sent:            
            for t in tags:

                t_lower = t.lower()
                # Ignore tags mentioning "other", "rhet" or "change" (case-insensitive).
                if "other" in t_lower or "rhet" in t_lower or "change" in t_lower:
                    continue
                unique_tags.add(t)
                                    
                if "->" not in t:                
                    all_non_cr_tags.add(t)

        # Causal relations and concept codes present in this sentence.
        unique_crels = set()
        unique_ccodes = set()
        for t in unique_tags:
            # Causal?
            if "->" in t:               
                if prefix in t:
                    k_fixed = t.replace(prefix, "").replace("]","")
                    if ANAPHORA not in k_fixed:
                        unique_crels.add(t)
                    else:
                        # ignore ones like this - Causer:Anaphor->Result:Anaphor[50]
                        pass
                elif ANAPHORA not in t:
                    unique_crels.add(t)
            else: # "->" not in t
                assert "->" not in t
                if "Anaphor:[" in t:
                    unique_ccodes.add(t.replace(":","")) # Anaphor concept tags ("Anaphor:[x]") have a subtly different pattern from crel ones ("Anaphor[x]")
                elif is_a_regular_code(t):
                    unique_ccodes.add(t)
                elif "Causer:" in t:
                    # "Causer:<code>" tag: record the bare code if it is a regular one.
                    fixed = t.replace("Causer:","")
                    if is_a_regular_code(fixed):
                        unique_ccodes.add(fixed)
                elif "Result:" in t:
                    # "Result:<code>" tag: record the bare code if it is a regular one.
                    fixed = t.replace("Result:","")
                    if is_a_regular_code(fixed):
                        unique_ccodes.add(fixed)
        
        for crel in unique_crels:
            # Only plain relations are examined; anaphora relations are used for matching below.
            if prefix in crel:
                continue
            # l / r are the bare causer / result codes of the relation.
            l,r = crel.replace("Causer:","").replace("Result:","").split("->")  
            assert l
            assert r
            # MOST of them exist because there is also a crel including an anaphora. So ignore these when computing these counts
            if l not in unique_ccodes and r not in unique_ccodes:
                # Neither code is tagged in the sentence: skip if any anaphora variant
                # (left, right, or both sides wrapped) of the relation is present.
                lhs, rhs = crel.split("->")
                ana_lhs = lhs.replace(l, "Anaphor[{code}]".format(code=l))
                ana_rhs = rhs.replace(r, "Anaphor[{code}]".format(code=r))
                crel_ana_l = ana_lhs + "->" + rhs
                if crel_ana_l in unique_crels:
                    continue
                crel_ana_r = lhs + "->" + ana_rhs
                if crel_ana_r in unique_crels:
                    continue
                crel_ana_both = ana_lhs + "->" + ana_rhs
                if crel_ana_both in unique_crels:
                    continue
                # Report the relation together with the sentence's codes and relations.
                print("|{l},{r}|".format(l=l,r=r).ljust(10), "\t", crel.ljust(20), "\t", 
                      ("|".join(unique_ccodes)).ljust(30), "  ", " | ".join(unique_crels))
                cross_sent_crel_count +=1    
            elif l not in unique_ccodes:             
                # need to handle 5 and 5b
                crel_ana = crel.replace(l, "Anaphor[{code}]".format(code=l), 1) # only replace the first occurrence (the causer side)
                if crel_ana in unique_crels:
                    continue
#                 print("|{code}|".format(code=l).ljust(10), "\t", crel.ljust(20), "\t", ("|".join(unique_ccodes)).ljust(30), "  ", " | ".join(unique_crels))
                cross_sent_crel_count +=1
            elif r not in unique_ccodes: #elif - don't count twice if both
                # Only the result code is missing: skip if the result-side anaphora variant exists.
                lhs, rhs = crel.split("->")
                ana_rhs = rhs.replace(r, "Anaphor[{code}]".format(code=r))
                crel_ana = lhs + "->" + ana_rhs
                if crel_ana in unique_crels:
                    continue
#                 print("|{code}|".format(code=r).ljust(10), "\t", crel.ljust(20), "\t", ("|".join(unique_ccodes)).ljust(30), "  ", " | ".join(unique_crels), "\t\t", crel_ana)
                cross_sent_crel_count +=1

# Cell output: the count, the total relation count, and the count as a percentage of it.
# NOTE: sum_crel is computed in an earlier cell and includes the anaphora relations.
cross_sent_crel_count, sum_crel , str(round(100 * (cross_sent_crel_count / sum_crel),2)) + "% cross sentence non ana relations"

|11,50|    	 Causer:11->Result:50 	                                   Causer:11->Result:50


(159, 4154, '3.83% cross sentence non ana relations')