In [1]:
from collections import defaultdict
from typing import Any, List, Set, Tuple

import dill
import numpy as np
import pandas as pd

from Settings import Settings
from crel_helper import get_cr_tags
from results_procesor import is_a_regular_code
from load_data import load_process_essays
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries
from BrattEssay import ANAPHORA

# Dataset to analyse; the author's original note names CoralBleaching as the other option.
DATASET = "SkinCancer"

settings = Settings()

# All folders are plain strings that end with "/".
root_folder = "".join([settings.data_directory, DATASET, "/Thesis_Dataset/"])
training_folder, test_folder = (
    root_folder + split + "/" for split in ("Training", "Test")
)

# NOTE: the predictions stored under this folder are generated by the
# "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb"
# notebook and are used as inputs to the parsing model.
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

training_folder, test_folder

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


('/Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/Training/',
 '/Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/Test/')

In [2]:
def _load_dill(path):
    """Return the object deserialized from the dill file at `path`."""
    # dill, like pickle, can execute code while loading: only open trusted files.
    with open(path, "rb") as handle:
        return dill.load(handle)


train_fname = coref_output_folder + "training_crel_anatagged_essays.dill"
test_fname = coref_output_folder + "test_crel_anatagged_essays.dill"

pred_tagged_essays_train = _load_dill(train_fname)
pred_tagged_essays_test = _load_dill(test_fname)

print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

870 218


In [3]:
# Pool the training and test essays so the statistics below cover both splits.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
len(all_essays)

1088

In [4]:
# --- Corpus size ---
num_essays = num_sents = num_words = 0

# --- Words / sentences / essays that carry the bare ANAPHORA tag ---
words_with_anaphora_tag = sents_with_anaphora_tag = essays_with_anaphora_tag = 0

# --- Word-level counters (computed over the tags that survive the filter below) ---
words_with_cc = 0          # words with at least one tag accepted by is_a_regular_code
cc_with_anaphora_tag = 0   # ... of which the word also carries the bare ANAPHORA tag
words_with_crel = 0        # words with at least one "->" tag
words_with_crel_ana = 0    # words with at least one "->" tag that contains ANAPHORA

# --- Sentence-level tallies: a tag is counted at most once per sentence ---
reg_code_sent_tally = defaultdict(int)
ana_code_sent_tally = defaultdict(int)
crel_tally = defaultdict(int)
ana_crel_tally = defaultdict(int)

# Tags whose lower-cased text contains any of these fragments are ignored entirely.
skipped_fragments = ("other", "rhet", "change")
# Marks the anaphoric side of a relation tag, e.g. Causer:1->Result:Anaphor[2].
ana_crel_marker = "Anaphor["

for essay in all_essays:
    num_essays += 1
    essay_has_ana = False

    for sentence in essay.sentences:
        num_sents += 1
        sent_has_ana = False
        sentence_tags = set()

        for _word, word_tags in sentence:
            num_words += 1
            has_cc_code = has_crel_code = has_crel_ana_code = False

            for tag in word_tags:
                lowered = tag.lower()
                if any(fragment in lowered for fragment in skipped_fragments):
                    continue
                sentence_tags.add(tag)

                if is_a_regular_code(tag):
                    has_cc_code = True

                if "->" not in tag:
                    continue
                has_crel_code = True
                if ANAPHORA in tag:
                    has_crel_ana_code = True

            # The bare anaphora tag is looked up in the unfiltered tag collection.
            word_has_ana = ANAPHORA in word_tags
            if word_has_ana:
                words_with_anaphora_tag += 1
                essay_has_ana = sent_has_ana = True

            words_with_cc += int(has_cc_code)
            words_with_crel += int(has_crel_code)
            words_with_crel_ana += int(has_crel_ana_code)
            cc_with_anaphora_tag += int(word_has_ana and has_cc_code)

        # All words of the sentence have been seen.
        if sent_has_ana:
            sents_with_anaphora_tag += 1

        for tag in sentence_tags:
            if "->" not in tag:
                # Not a relation tag: either an "Anaphor:[code]" tag or a regular code.
                if "Anaphor:[" in tag:
                    ana_code_sent_tally[tag] += 1
                elif is_a_regular_code(tag):
                    reg_code_sent_tally[tag] += 1
                continue

            if ana_crel_marker not in tag:
                # Relation tag without an "Anaphor[code]" side; skip it if it still mentions ANAPHORA.
                if ANAPHORA not in tag:
                    crel_tally[tag] += 1
                continue

            # Relation tag with an "Anaphor[code]" side: strip the marker and make
            # sure that what remains is a relation between two regular codes.
            stripped = tag.replace(ana_crel_marker, "").replace("]", "")
            if ANAPHORA in stripped:
                continue
            causer_part, result_part = stripped.split("->")
            l_code = causer_part.replace("Causer:", "")
            r_code = result_part.replace("Result:", "")
            assert is_a_regular_code(l_code), l_code
            assert is_a_regular_code(r_code), r_code
            ana_crel_tally[tag] += 1

    # All sentences of the essay have been seen.
    if essay_has_ana:
        essays_with_anaphora_tag += 1

num_words, words_with_anaphora_tag, num_sents, sents_with_anaphora_tag, num_essays, essays_with_anaphora_tag

(180873, 585, 10670, 437, 1088, 324)

## Look at Anaphora Tags without Associated Codes

In [7]:
# Word-level check: does every bare "Anaphor" tag come with an "Anaphor:[code]" tag?
count_ana = count_ana_code = count_ana_no_code = 0

# Word-level frequency of every tag (nothing is filtered out here).
all_tags = defaultdict(int)

for essay in all_essays:
    for sentence in essay.sentences:
        for _word, word_tags in sentence:
            has_ana = "Anaphor" in word_tags
            has_ana_code = False
            for tag in word_tags:
                all_tags[tag] += 1
                has_ana_code = has_ana_code or "Anaphor:[" in tag

            if has_ana_code:
                count_ana_code += 1
            if has_ana:
                count_ana += 1
                if not has_ana_code:
                    count_ana_no_code += 1

count_ana, count_ana_code, count_ana_no_code

(585, 585, 0)

In [8]:
# Word-level frequency of every tag found in the essays (unfiltered).
all_tags

defaultdict(int,
            {'1': 5431,
             '11': 415,
             '12': 411,
             '2': 5528,
             '3': 3474,
             '4': 2740,
             '5': 4844,
             '50': 10948,
             '6': 2898,
             'Anaphor': 585,
             'Anaphor:[11]': 40,
             'Anaphor:[12]': 1,
             'Anaphor:[1]': 71,
             'Anaphor:[2]': 25,
             'Anaphor:[3]': 56,
             'Anaphor:[4]': 22,
             'Anaphor:[50]': 256,
             'Anaphor:[5]': 79,
             'Anaphor:[6]': 30,
             'Anaphor:[rhetorical]': 7,
             'COMPILED': 69,
             'Causer': 17970,
             'Causer:1': 5018,
             'Causer:1->Result:2': 7004,
             'Causer:1->Result:3': 1133,
             'Causer:1->Result:4': 69,
             'Causer:1->Result:5': 347,
             'Causer:1->Result:50': 6528,
             'Causer:1->Result:Anaphor': 394,
             'Causer:1->Result:Anaphor[2]': 6,
             'Cause

In [5]:
# Number of sentences containing each tag accepted by is_a_regular_code.
reg_code_sent_tally

defaultdict(int,
            {'1': 1342,
             '11': 176,
             '12': 169,
             '2': 1745,
             '3': 1146,
             '4': 718,
             '5': 2181,
             '50': 3024,
             '6': 631})

In [6]:
# Number of sentences containing each "Anaphor:[code]" tag.
ana_code_sent_tally

defaultdict(int,
            {'Anaphor:[11]': 36,
             'Anaphor:[12]': 1,
             'Anaphor:[1]': 37,
             'Anaphor:[2]': 25,
             'Anaphor:[3]': 41,
             'Anaphor:[4]': 21,
             'Anaphor:[50]': 188,
             'Anaphor:[5]': 73,
             'Anaphor:[6]': 23})

In [7]:
# Number of sentences containing each "->" relation tag that does not mention ANAPHORA.
crel_tally

defaultdict(int,
            {'Causer:1->Result:2': 638,
             'Causer:1->Result:3': 88,
             'Causer:1->Result:4': 5,
             'Causer:1->Result:5': 27,
             'Causer:1->Result:50': 589,
             'Causer:11->Result:12': 208,
             'Causer:11->Result:3': 14,
             'Causer:11->Result:4': 1,
             'Causer:11->Result:5': 5,
             'Causer:11->Result:50': 69,
             'Causer:12->Result:12': 1,
             'Causer:12->Result:2': 19,
             'Causer:12->Result:3': 293,
             'Causer:12->Result:4': 2,
             'Causer:12->Result:5': 5,
             'Causer:12->Result:50': 21,
             'Causer:2->Result:1': 3,
             'Causer:2->Result:11': 1,
             'Causer:2->Result:2': 2,
             'Causer:2->Result:3': 283,
             'Causer:2->Result:4': 83,
             'Causer:2->Result:5': 123,
             'Causer:2->Result:50': 709,
             'Causer:2->Result:6': 4,
             'Causer:3->Result:1

In [8]:
# Number of sentences containing each "->" relation tag with an "Anaphor[code]" side
# (tags that still mention ANAPHORA once that marker is stripped are not counted).
ana_crel_tally

defaultdict(int,
            {'Causer:1->Result:Anaphor[2]': 1,
             'Causer:1->Result:Anaphor[3]': 2,
             'Causer:1->Result:Anaphor[50]': 39,
             'Causer:11->Result:Anaphor[12]': 1,
             'Causer:11->Result:Anaphor[3]': 1,
             'Causer:11->Result:Anaphor[50]': 5,
             'Causer:12->Result:Anaphor[2]': 1,
             'Causer:12->Result:Anaphor[3]': 3,
             'Causer:12->Result:Anaphor[50]': 2,
             'Causer:2->Result:Anaphor[1]': 1,
             'Causer:2->Result:Anaphor[3]': 3,
             'Causer:2->Result:Anaphor[4]': 3,
             'Causer:2->Result:Anaphor[50]': 47,
             'Causer:2->Result:Anaphor[5]': 5,
             'Causer:3->Result:Anaphor[4]': 2,
             'Causer:3->Result:Anaphor[50]': 33,
             'Causer:3->Result:Anaphor[6]': 1,
             'Causer:4->Result:Anaphor[50]': 8,
             'Causer:4->Result:Anaphor[5]': 9,
             'Causer:5->Result:Anaphor[50]': 73,
             'Causer:5->R

In [9]:
# Sentence-level totals of regular codes and "Anaphor:[code]" tags, plus the
# anaphoric share of their sum, in percent.
sum_reg_code, sum_ana_code = (
    sum(tally.values()) for tally in (reg_code_sent_tally, ana_code_sent_tally)
)
ana_code_share = round(100 * sum_ana_code / (sum_ana_code + sum_reg_code), 4)
sum_reg_code, sum_ana_code, ana_code_share

(11132, 445, 3.8438)

# Counts by Document, Sentence and Word

In [15]:
def pct(cnt_cond, cnt):
    """Format `cnt_cond` as a percentage of `cnt`.

    Args:
        cnt_cond: number of items meeting the condition (the numerator).
        cnt: total number of items (the denominator).

    Returns:
        The percentage rounded to 2 decimal places followed by "%",
        e.g. "29.78%"; "n/a" when `cnt` is 0, because the ratio is undefined.
    """
    if cnt == 0:
        # Empty corpus / tally: report it rather than raise ZeroDivisionError.
        return "n/a"
    return str(round(100 * (cnt_cond / cnt), 2)) + "%"

# One record per kind of figure (counts, percentages, totals); each record has
# an entry for essays, sentences and words plus a "_Type" label.
unit_labels = ("Essays", "Sentences", "Words")
ana_unit_counts = (essays_with_anaphora_tag, sents_with_anaphora_tag, words_with_anaphora_tag)
unit_totals = (num_essays, num_sents, num_words)

cnts = {"_Type": "Cnts."}
cnts.update(zip(unit_labels, ana_unit_counts))

totals = {"_Type": "Ttls."}
totals.update(zip(unit_labels, unit_totals))

pcts = {"_Type": "Pct."}
pcts.update(
    (label, pct(with_ana, total))
    for label, with_ana, total in zip(unit_labels, ana_unit_counts, unit_totals)
)

# Transposed so that the units become rows and the three records become columns.
df_counts = pd.DataFrame([cnts, pcts, totals]).T

## Concept Codes (Sentences)

In [11]:
# Sentence-level totals: regular codes, "Anaphor:[code]" tags, and both combined.
sum_reg_code, sum_ana_code = (
    sum(tally.values()) for tally in (reg_code_sent_tally, ana_code_sent_tally)
)
sum_cc_code = sum_ana_code + sum_reg_code
sum_reg_code, sum_ana_code, sum_cc_code

(11132, 445, 11577)

## Causal Relations (Sentences)

In [12]:
# Sentence-level totals: non-anaphoric relations, anaphoric relations, and both combined.
sum_reg_crel, sum_ana_crel = (
    sum(tally.values()) for tally in (crel_tally, ana_crel_tally)
)
sum_crel = sum_reg_crel + sum_ana_crel
sum_reg_crel, sum_ana_crel, sum_crel

(6549, 470, 7019)

In [16]:
# Same three-record layout as the counts table, this time for the anaphoric
# concept codes and causal relations.
cnts = dict(_Type="Cnts.", CCodes=sum_ana_code, CRels=sum_ana_crel)
totals = dict(_Type="Ttls.", CCodes=sum_cc_code, CRels=sum_crel)
pcts = dict(
    _Type="Pct.",
    CCodes=pct(sum_ana_code, sum_cc_code),
    CRels=pct(sum_ana_crel, sum_crel),
)

# Transposed so that CCodes / CRels become rows and the three records become columns.
df_codes = pd.DataFrame([cnts, pcts, totals]).T

# Frequencies

In [17]:
# Anaphora counts, percentages and totals per essay, sentence and word.
df_counts

Unnamed: 0,0,1,2
Essays,324,29.78%,1088
Sentences,437,4.1%,10670
Words,585,0.32%,180873
_Type,Cnts.,Pct.,Ttls.


# Codes

In [18]:
# Anaphoric counts, percentages and totals for concept codes and causal relations.
df_codes

Unnamed: 0,0,1,2
CCodes,445,3.84%,11577
CRels,470,6.7%,7019
_Type,Cnts.,Pct.,Ttls.


# Compute CRels that Cross Sentences

In [21]:
# Count the non-anaphoric causal relations that cross a sentence boundary.
# A relation tagged in a sentence is counted when at least one of its two codes
# is absent from that sentence's codes AND the sentence holds no anaphoric
# variant of the same relation (which would already explain the missing code).
cross_sent_crel_count = 0
prefix = "Anaphor["  # marks the anaphoric side of a relation, e.g. Causer:1->Result:Anaphor[2]

for e in all_essays:
    for sent in e.sentences:
        # Distinct tags of the sentence, minus the tag families excluded from the analysis.
        unique_tags = set()
        for wd, tags in sent:
            for t in tags:
                t_lower = t.lower()
                if "other" in t_lower or "rhet" in t_lower or "change" in t_lower:
                    continue
                unique_tags.add(t)

        unique_crels = set()   # relation tags ("->") kept for this sentence
        unique_ccodes = set()  # codes present in this sentence
        for t in unique_tags:
            if "->" in t:
                if prefix in t:
                    k_fixed = t.replace(prefix, "").replace("]", "")
                    # Ignore tags such as Causer:Anaphor->Result:Anaphor[50].
                    if ANAPHORA not in k_fixed:
                        unique_crels.add(t)
                elif ANAPHORA not in t:
                    unique_crels.add(t)
            elif "Anaphor:[" in t:
                # Anaphor tags have a subtly different pattern: "Anaphor:[5]" is stored as "Anaphor[5]".
                unique_ccodes.add(t.replace(":", ""))
            elif is_a_regular_code(t):
                unique_ccodes.add(t)
            elif "Causer:" in t:
                fixed = t.replace("Causer:", "")
                if is_a_regular_code(fixed):
                    unique_ccodes.add(fixed)
            elif "Result:" in t:
                fixed = t.replace("Result:", "")
                if is_a_regular_code(fixed):
                    unique_ccodes.add(fixed)

        for crel in unique_crels:
            if prefix in crel:
                # Anaphoric relations are only used for the look-ups below.
                continue
            l, r = crel.replace("Causer:", "").replace("Result:", "").split("->")
            assert l
            assert r

            l_missing = l not in unique_ccodes
            r_missing = r not in unique_ccodes
            if not (l_missing or r_missing):
                continue  # both codes occur in the sentence

            # MOST missing codes are explained by a relation that includes an anaphor,
            # so build the anaphoric variants of this relation and skip it if one exists.
            # Replacing on each half separately keeps codes that are prefixes of one
            # another (e.g. 5 and 5b) from interfering.
            lhs, rhs = crel.split("->")
            ana_lhs = lhs.replace(l, "Anaphor[{code}]".format(code=l))
            ana_rhs = rhs.replace(r, "Anaphor[{code}]".format(code=r))
            ana_variants = []
            if l_missing:
                ana_variants.append(ana_lhs + "->" + rhs)
            if r_missing:
                ana_variants.append(lhs + "->" + ana_rhs)
            if l_missing and r_missing:
                ana_variants.append(ana_lhs + "->" + ana_rhs)

            if any(variant in unique_crels for variant in ana_variants):
                continue
            # Counted once per relation, even when both codes are missing.
            cross_sent_crel_count += 1

# NOTE(review): sum_crel also includes the anaphoric relations, whereas the count
# above covers non-anaphoric relations only - confirm this is the intended denominator.
cross_sent_crel_count, sum_crel, pct(cross_sent_crel_count, sum_crel) + " cross sentence non ana relations"

(378, 7019, '5.39% cross sentence non ana relations')