# Grab All Matching Files

In [161]:
from FindFiles import find_files

# NOTE(review): absolute, machine-specific path; the notebook only runs where this folder exists.
folder = "/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/"
# Raw string: `\.` is a regex escape, not a string escape. The pattern text is unchanged,
# but newer interpreters warn about the invalid escape in a non-raw literal.
# NOTE(review): the third argument presumably turns on recursive search - confirm in FindFiles.
files = find_files(folder, r".*\.ann", True)
#files

# How Many Essays and Lines Have Anaphors and Rhetoricals in them?

In [47]:
from collections import defaultdict

# Annotation labels to search for in every file.
to_match = ["rhetorical", "Anaphor"]
matching_files = defaultdict(set)   # label -> set of files with at least one matching line
matching_lines = defaultdict(list)  # label -> every matching line (across all files)

total_line_count = 0
for f in files:
    # Read-only mode: the scan never writes, and "r+" fails on files without write permission.
    with open(f, "r") as fin:
        # Iterate the handle directly instead of loading the whole file with readlines().
        for line in fin:
            total_line_count += 1
            for annot in to_match:
                # A hit is the label preceded by a tab and followed by a space.
                if "\t%s " % annot in line:
                    matching_files[annot].add(f)
                    matching_lines[annot].append(line)

NUM_DIGITS = 2
for annot in to_match:
    m_files = matching_files[annot]
    m_lines = matching_lines[annot]
    pct_essays = round(100.0 * len(m_files) / len(files), NUM_DIGITS)
    pct_lines = round(100.0 * len(m_lines) / total_line_count, NUM_DIGITS)
    # One formatted line; same text as the original trailing-comma print statements.
    print("%s %s%% of essays %s%% of lines" % (annot.ljust(10), pct_essays, pct_lines))

rhetorical 6.59% of essays 0.59% of lines
Anaphor    20.62% of essays 1.2% of lines


# How Many Sentences Have Cross-Sentence Causal Relations?
We can't compute this directly without manually analyzing the data. However, we can approximate it by counting the sentences that are tagged with a causal relation where one or both concept codes of that relation are not tagged on the sentence itself.

In [150]:
#Load output file - has original annotation data in it
import pandas as pd

#fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions.txt"
fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions-causal_relations.txt"


def _blank_if_nan(text):
    """Turn the literal text "nan" into an empty string; return anything else unchanged."""
    return "" if text == "nan" else text


# Pipe-delimited results file.
data = pd.read_csv(fname, sep="|")
# Both label columns are forced to str, then "nan" cells are blanked.
for column in ("Concept Codes", "Predictions"):
    data[column] = data[column].astype(str).apply(_blank_if_nan)
data[["Essay", "Concept Codes", "Predictions"]].head(10)

Unnamed: 0,Essay,Concept Codes,Predictions
0,EBA1415_AEKD_4_CB_ES-05568.ann,50,50
1,EBA1415_AEKD_4_CB_ES-05568.ann,,
2,EBA1415_AEKD_4_CB_ES-05568.ann,50,50
3,EBA1415_AEKD_4_CB_ES-05568.ann,50,50
4,EBA1415_AEKD_4_CB_ES-05572.ann,5,
5,EBA1415_AEKD_4_CB_ES-05572.ann,"5,50,_C->R,_CRel,_RRel,Causer,Result,explicit,...",50
6,EBA1415_AEKD_4_CB_ES-05572.ann,,
7,EBA1415_AEKD_4_CB_ES-05572.ann,4,"Causer,Result"
8,EBA1415_AEKD_4_CB_ES-05572.ann,,
9,EBA1415_AEKD_4_CB_ES-05572.ann,11,11


In [151]:
def to_concepts_only(s):
    """Extract the concept codes from a comma-separated label string.

    A label is kept when it is non-empty, starts with a digit and contains
    no "-" (so relation labels such as "Causer:5->Result:50" are dropped).

    :param s: comma-separated labels, e.g. "5,50,_CRel,Causer:5->Result:50"
    :return: set of the labels that qualify, e.g. {"5", "50"}; empty set if none
    """
    # str.split(",") always yields at least one item, so no empty-list guard is needed.
    return {
        label
        for label in s.split(",")
        if label and label[0].isdigit() and "-" not in label
    }

def is_causal(s):
    """Return True when the label starts with "Causer:" and contains "->Result:"."""
    if not s.startswith("Causer:"):
        return False
    return "->Result:" in s

def to_relations_only(s):
    """Extract the causal relations from a comma-separated label string.

    Labels accepted by is_causal() are kept, with the "Causer:" and "Result:"
    prefixes stripped, so "Causer:5->Result:50" becomes "5->50".

    :param s: comma-separated labels
    :return: set of "<causer>-><result>" strings; empty set if none
    """
    # str.split(",") never returns an empty list, and is_causal("") is False,
    # so neither an empty-list guard nor a separate truthiness test is needed.
    return set(
        label.replace("Causer:", "").replace("Result:", "")
        for label in s.split(",")
        if is_causal(label)
    )

def set_to_str(a_set):
    """Render a set as its sorted members joined by commas, or "-" when it is empty."""
    return "-" if len(a_set) == 0 else ",".join(sorted(a_set))

# For each extractor, store the resulting set and a printable string form of it.
# Both are derived from the raw "Concept Codes" column.
for set_col, str_col, extract in (
    ("Codes", "sCodes", to_concepts_only),
    ("Causal", "sCausal", to_relations_only),
):
    data[set_col] = data["Concept Codes"].apply(extract)
    data[str_col] = data[set_col].apply(set_to_str)
#del data["Concept Codes"]
#data["Flt_Preds"] = data["Predictions"].apply(to_concepts_only)
#del data["Predictions"]

In [152]:
# Preview the derived set columns next to their string forms.
derived_cols = ["Codes", "sCodes", "Causal", "sCausal"]
data[derived_cols].head()

Unnamed: 0,Codes,sCodes,Causal,sCausal
0,set([50]),50,set([]),-
1,set([]),-,set([]),-
2,set([50]),50,set([]),-
3,set([50]),50,set([]),-
4,set([5]),5,set([]),-


In [153]:
# Raw label strings of every row that has at least one causal relation ("-" marks none).
data.loc[data["sCausal"] != "-", "Concept Codes"].values

array(['5,50,_C->R,_CRel,_RRel,Causer,Result,explicit,Causer:5->Result:50',
       '50,_C->R,_CRel,_RRel,Causer,Result,explicit,Causer:7->Result:50',
       '3,4,_C->R,_CRel,_RRel,Causer,Result,explicit,Causer:3->Result:4',
       ...,
       '1,3,50,_C->R,_CRel,_RRel,Causer,Result,explicit,Causer:1->Result:50,Causer:3->Result:50',
       '6,14,50,_C->R,_CRel,_RRel,Causer,Result,explicit,Causer:6->Result:14,Causer:14->Result:50',
       '7,50,_C->R,_CRel,_RRel,Causer,Result,explicit,Causer:7->Result:50'], dtype=object)

In [154]:
# Rows whose sCausal is not "-", i.e. rows carrying at least one causal relation.
has_relation = data["sCausal"] != "-"
causal = data[has_relation]
causal.loc[:, ["sCodes", "sCausal"]].head()

Unnamed: 0,sCodes,sCausal
5,550,5->50
14,50,7->50
19,34,3->4
20,507,7->50
25,-,7->50


In [155]:
from __future__ import division
# (rows with a causal relation, all rows, percentage with a causal relation).
# The percentage needs true division, which the __future__ import in this cell provides.
n_causal, n_rows = len(causal), len(data)
(n_causal, n_rows, round(100 * n_causal / n_rows, 2))

(2724, 10209, 26.68)

In [156]:
#causal[causal["sCodes"] == "-"][["sCausal", "sCodes"]]
# Number of causal rows that have no concept code of their own (sCodes is "-").
int((causal["sCodes"] == "-").sum())

253

In [166]:
def _first_cross_sentence_relation(codes, s_causal_relns):
    """Find the first relation with an endpoint that is missing from codes.

    :param codes: set of concept codes tagged on the sentence
    :param s_causal_relns: comma-separated "<causer>-><result>" relations
    :return: (relation, [causer, result]) for the first such relation, else None
    :raises AssertionError: if a relation checked does not have exactly two codes,
        or one of its codes does not start with a digit
    """
    for rel in s_causal_relns.split(","):
        rcodes = rel.split("->")
        assert len(rcodes) == 2, "wrong number of codes: {0}, relation: {1}".format(len(rcodes), rel)
        assert all(c[0].isdigit() for c in rcodes), "not all codes are numeric {0}".format(rel)
        if any(c not in codes for c in rcodes):
            return rel, rcodes
    return None


cross_sent_count = 0
examplars = []
# Walk the two columns in lockstep; iterrows() builds a Series per row and
# yields an index that was never used.
for codes, s_causal_relns in zip(causal["Codes"], causal["sCausal"]):
    if not codes:
        # No concept code on the sentence at all: counted, but no example is recorded.
        cross_sent_count += 1
        continue
    found = _first_cross_sentence_relation(codes, s_causal_relns)
    if found is not None:
        # Each sentence counts at most once, however many of its relations cross.
        rel, rcodes = found
        examplars.append((codes, rel, rcodes))
        cross_sent_count += 1
cross_sent_count, len(examplars)

(550, 297)

In [167]:
#Sanity Check
# First 20 recorded (codes, relation, [causer, result]) examples.
examplars[:20]

[({'50'}, '7->50', ['7', '50']),
 ({'50'}, '7->50', ['7', '50']),
 ({'3'}, '3->5', ['3', '5']),
 ({'50'}, '5b->50', ['5b', '50']),
 ({'50', '7'}, '5b->50', ['5b', '50']),
 ({'50'}, '1->50', ['1', '50']),
 ({'1', '5', '6'}, '1->50', ['1', '50']),
 ({'1'}, '1->2', ['1', '2']),
 ({'2'}, '1->2', ['1', '2']),
 ({'6'}, '6->7', ['6', '7']),
 ({'7'}, '6->7', ['6', '7']),
 ({'3'}, '1->3', ['1', '3']),
 ({'50'}, '7->50', ['7', '50']),
 ({'3'}, '3->5', ['3', '5']),
 ({'1', '3', '50'}, '3->7', ['3', '7']),
 ({'50'}, '1->50', ['1', '50']),
 ({'1', '13'}, '1->50', ['1', '50']),
 ({'3'}, '3->5', ['3', '5']),
 ({'3'}, '3->5', ['3', '5']),
 ({'3'}, '3->5', ['3', '5'])]

In [168]:
# Share of the rows in `causal` that were counted as cross-sentence, as a percentage.
n_causal_rows = len(causal)
pctCrossing = cross_sent_count * 100.0 / n_causal_rows
pctCrossing

20.190895741556535

## **CONCLUSION:** Around 20% of the Sentences with a Causal Relation Have a Cross-Sentence Relation

## For Comparison with the Anaphora and Rhetorical Counts, We Need to Compute the % of Lines

In [173]:
# Float literal keeps this a true division even if the `from __future__ import division`
# cell has not been run (integer division would truncate the percentage).
# NOTE(review): cross_sent_count counts rows of `causal` (from the predictions file), while
# total_line_count counts lines of the .ann files - confirm the two are comparable.
pct_of_lines = round(100.0 * cross_sent_count / total_line_count, 2)
print("%s %% of lines have cross-sentence causal relations" % pct_of_lines)

1.97 % of lines have cross-sentence causal relations
