In [1]:
from BrattEssay import load_bratt_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings

settings = Settings()

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [2]:
# DATASET = "CoralBleaching"
DATASET = "SkinCancer"

tr_folder = settings.data_directory + DATASET + "/Thesis_Dataset/Training"
br_train_essays = load_bratt_essays(tr_folder)

test_folder = settings.data_directory + DATASET + "/Thesis_Dataset/Test"
br_test_essays = load_bratt_essays(test_folder)

essays = br_train_essays + br_test_essays

870 files found
870 essays processed
218 files found
218 essays processed


In [3]:
len(br_train_essays), len(br_test_essays), len(essays)

(870, 218, 1088)

In [4]:
from BrattEssay import load_bratt_essays
from collections import defaultdict
from IterableFP import flatten

wd_sent_freq = defaultdict(int)
all_codes = set()
#Stores all words for the spelling corrector
words = []
all_sentences = []
sentencesForCode = defaultdict(list)
for essay in essays:
    for sentence in essay.tagged_sentences:
        wdsInSent = set()
        codes4sentence = set()
        sent = []
        for w, tags in sentence:
            words.append(w)
            all_codes.update(tags)
            codes4sentence.update(tags)
            if w not in wdsInSent:
                wdsInSent.add(w)
                wd_sent_freq[w] += 1
            sent.append(w)
        all_sentences.append(sent)
        for code in codes4sentence:
            sentencesForCode[code].append(sentence)

### Compute Stats over the Essays

In [5]:
wd_counts = []
sent_counts = []
concept_codes = []
cr_concept_codes = []
sent_multi_word_tags = {}
sent_codes = []
sent_cr_codes = []

num_sents = 0
un_wd_counts = []
vocab = set()
for e_ix, essay in enumerate(essays):
    wds = 0
    un_words = set()
    for i, sentence in enumerate(essay.tagged_sentences):
        num_sents += 1
        sent_tags = set()
        sent_cr_tags = set()
        for w, tags in sentence:
            un_words.add(w)
            vocab.add(w)
            wds += 1
            ccodes = [t for t in tags if t[0].isdigit()]
            if ccodes:
                sent_tags.update(ccodes)
                concept_codes.append(ccodes)
                if len(ccodes) > 1:
                    sent_multi_word_tags[(e_ix, i)] = [(w,[tag for tag in t if tag[0].isdigit()]) for w,t in sentence]
            cr_codes = [t for t in tags if t[0].isdigit() or t == "Causer" or t == "Result" or t == "explicit"]
            if cr_codes:
                cr_concept_codes.append(cr_codes)
                sent_cr_tags.update(cr_codes)
        if len(sent_tags) > 0:
            sent_codes.append(sent_tags)
            if len(sent_cr_tags) > 0:
                sent_cr_codes.append(sent_cr_tags)
                
    un_wd_counts.append(len(un_words))
    sent_counts.append(len(essay.tagged_sentences))
    wd_counts.append(wds)

In [6]:
print(len(sent_multi_word_tags), num_sents)
list(sent_multi_word_tags.items())[0]

1 10670


((354, 9),
 [('Sunburn', ['5']),
  ('happens', []),
  ('when', []),
  ('the', []),
  ('body', []),
  ('directs', []),
  ('blood', []),
  ('to', []),
  ('try', []),
  ('to', []),
  ('repair', ['6']),
  ('or', ['6']),
  ('remove', ['6']),
  ('cells', ['6', '4']),
  ('damaged', ['6', '4']),
  ('by', []),
  ('the', []),
  ('sun', ['2']),
  ('.', [])])

In [7]:
len(sent_codes), len([tags for tags in sent_codes if len(tags) > 1])

(6671, 4555)

In [8]:
# sent_multi_word_tags

In [9]:
list(sent_multi_word_tags.items())[0]

((354, 9),
 [('Sunburn', ['5']),
  ('happens', []),
  ('when', []),
  ('the', []),
  ('body', []),
  ('directs', []),
  ('blood', []),
  ('to', []),
  ('try', []),
  ('to', []),
  ('repair', ['6']),
  ('or', ['6']),
  ('remove', ['6']),
  ('cells', ['6', '4']),
  ('damaged', ['6', '4']),
  ('by', []),
  ('the', []),
  ('sun', ['2']),
  ('.', [])])

### Essay Length Statistics

In [10]:
len(essays)

1088

In [11]:
148.76308784383318 * len(essays)

161854.2395740905

In [12]:
import numpy as np
print(np.mean(wd_counts), np.median(wd_counts), np.min(wd_counts), np.max(wd_counts), np.std(wd_counts))
print(np.mean(sent_counts), np.median(sent_counts), np.min(sent_counts), np.max(sent_counts), np.std(sent_counts))
# 166.26746323529412 157.0 4 479 82.44964577496842
# 9.806985294117647 9.0 1 36 5.026169284480411

166.26746323529412 157.0 4 479 82.44964577496842
9.806985294117647 9.0 1 36 5.026169284480411


### Vocabulary Size

In [13]:
print("Total Unique Words:" , len(vocab))

Total Unique Words: 4702


In [14]:
np.min(un_wd_counts), np.max(un_wd_counts), np.mean(un_wd_counts), np.median(un_wd_counts), np.std(un_wd_counts)
#(4, 215, 90.16360294117646, 88.0, 36.15236444087542)

(4, 215, 90.16360294117646, 88.0, 36.15236444087542)

In [15]:
from IterableFP import flatten
unique = set(flatten(concept_codes))
unique

{'1', '11', '12', '2', '3', '4', '5', '50', '6'}

### How Many Tagged Words Have Multiple Codes?

In [16]:
print(len(concept_codes), sum(wd_counts), round(len(concept_codes) / float(sum(wd_counts)),2))
multiple = [tags for tags in concept_codes if len(tags) > 1]
print(len(multiple), len(multiple) / float(len(concept_codes)))
multiple

43024 180899 0.24
2 4.6485682409817774e-05


[['6', '4'], ['6', '4']]

In [17]:
unique_cr = set(flatten(cr_concept_codes))
unique_cr

{'1',
 '11',
 '12',
 '2',
 '3',
 '4',
 '5',
 '50',
 '6',
 'Causer',
 'Result',
 'explicit'}

In [18]:
print(len(cr_concept_codes), sum(wd_counts), round(len(cr_concept_codes) / float(sum(wd_counts)),2))
multiple = [tags for tags in cr_concept_codes if len(tags) > 1]
print(len(multiple), len(multiple) / float(len(cr_concept_codes)))

50260 180899 0.28
34170 0.6798647035415838


### What Proportion of Sentences With Codes Have Multiple Codes?

In [19]:
num_sents = float(num_sents)
print(len(sent_codes) / num_sents)
num_multiple_codes = len([tags for tags in sent_codes if len(tags) > 1])
num_multiple_codes / float(len(sent_codes))

0.6252108716026242


0.6828061759856093

In [20]:
len(sent_cr_codes) / num_sents
num_multiple_codes = len([tags for tags in sent_cr_codes if len(tags) > 1])
num_multiple_codes / float(len(sent_cr_codes))

0.7507120371758357

### Conditional Probabilities of Codes

In [21]:
from collections import defaultdict
priors = defaultdict(float)
joints = defaultdict(float)

for sent in sent_codes:
    for a in sorted(sent):
        priors[a] += 1
        for b in sorted(sent):
            if b >= a:
                break
            joints[(b,a)] +=1

conditional = {}
for a, cnt in priors.items():
    for b in priors.keys():
        if a == b:
            continue
        """ p(A/B) = p(B/A)p(A) / p(B) """
        """ p(A/B) = p(B/A)p(A) """
        if a < b:
            joint = joints[(a,b)]
        else:
            joint = joints[(b,a)]
        conditional[(a,b)] = joint / priors[b]
    
lifts = {}
total = float(sum(joints.values()))
totalPrior = float(sum(priors.values()))
for (a,b),cnt in joints.items():
    joint = cnt / total
    pA = priors[a] / totalPrior
    pB = priors[b] / totalPrior
    lift = joint / (pA * pB)
    if lift:
        lifts[(a,b)] = lift

In [22]:
def get_num(a):
    s = ""
    for c in a:
        if c.isdigit():
            s += c
    return int(s)

consec_pmi = {}
for (a,b), lift in lifts.items():
    ia = get_num(a)
    ib = get_num(b)
    diff = abs(ia-ib)
    pmi = np.log(lift)
    if "Coral" in DATASET:
        if diff == 1 and b != "6" and b != "5b":
            consec_pmi[(a,b)] = pmi
        elif a =="5" and b =="5b":
            consec_pmi[(a,b)] = pmi
        elif a =="14" and b =="5b":
            consec_pmi[(a,b)] = pmi
        elif a =="14" and b =="6":
            consec_pmi[(a,b)] = pmi
        elif a =="50" and b =="7":
            consec_pmi[(a,b)] = pmi
    elif "Skin" in DATASET:
        if diff == 1:
            consec_pmi[(a,b)] = pmi
        elif a == "12" and b == "6":
            consec_pmi[(a,b)] = pmi
        elif a == "50" and b == "6":
            consec_pmi[(a,b)] = pmi        
            
for k,v in sorted(consec_pmi.items(), key = lambda tpl: (min(int(tpl[0][0]),int(tpl[0][1].replace("b",""))))):
    a = k[0]
    b = k[1]
    if len(a) > len(b.replace("b","")):
        a,b = b,a
    print("&", str(a).ljust(5), "&", str(b).ljust(5), "&", str(round(v,2)).rjust(5), "\\\\")

& 1     & 2     &  1.83 \\
& 2     & 3     &  1.19 \\
& 3     & 4     &  1.54 \\
& 4     & 5     &  1.39 \\
& 5     & 6     &  1.69 \\
& 6     & 50    &   1.1 \\
& 6     & 12    &  -1.5 \\
& 11    & 12    &  2.82 \\


In [23]:
np.mean(list(lifts.values()))
#sorted(lifts.items(), key=lambda (tpl,p):-p)

2.511176574779474

In [24]:
np.mean(list(map(lambda l: np.log(l), lifts.values())))

0.2957990343006834

## Compute Relative Class Frequencies At Word and Sentence Levels (Class Imbalances)

In [25]:
wd_cc_tally = defaultdict(int)
sent_cc_tally = defaultdict(int)
wd_cnt = 0
sent_cnt = 0

for e_ix, essay in enumerate(essays):

    for i, sentence in enumerate(essay.tagged_sentences):
        sent_cnt += 1
        sent_tags = set()
        
        for w, tags in sentence:
            wd_cnt += 1
            # Concept Codes
            ccodes = [t for t in tags if t[0].isdigit()]
            if ccodes:
                for cc in ccodes:
                    wd_cc_tally[cc] += 1
                    sent_tags.add(cc)
            #Causal Codes
            #This is wrong
            #cr_codes = [t for t in tags if t[0].isdigit() or t == "Causer" or t == "Result" or t == "explicit"]
            #if cr_codes:
                 
        if len(sent_tags) > 0:
            for tag in sent_tags:
                sent_cc_tally[tag] +=1            

In [26]:
print(wd_cnt, sent_cnt)
#Code, Number of Words, Proportion
for code, wd_tally in sorted(wd_cc_tally.items(), key = lambda tpl: int(tpl[0].replace("b",""))):
    wd_pct = 100* (float(wd_tally) / wd_cnt)
    sent_pct = 100*(float(sent_cc_tally[code]) / sent_cnt)
    #print(code.ljust(2), str(tally).ljust(6), "{0:.4f}".format(float(tally)/wd_cnt))
    # Print Latex Output (Near End of chapter 2)
    print("{code} & {wd_pct:.2f} & {sent_pct:.2f} \\\\".format(code=code.ljust(3),wd_pct=wd_pct, sent_pct=sent_pct))

180899 10670
1   & 3.38 & 13.46 \\
2   & 3.33 & 17.22 \\
3   & 2.30 & 12.69 \\
4   & 1.86 & 7.71 \\
5   & 2.82 & 21.94 \\
6   & 2.88 & 9.20 \\
11  & 0.33 & 2.98 \\
12  & 0.52 & 6.08 \\
50  & 6.36 & 29.29 \\


### Print Counts for Metrics By Class Notebook (See Mongo Queries)

In [27]:
from pprint import pprint
print(wd_cnt)
print("")
pprint(dict(wd_cc_tally))

180899

{'1': 6115,
 '11': 599,
 '12': 944,
 '2': 6025,
 '3': 4155,
 '4': 3366,
 '5': 5106,
 '50': 11514,
 '6': 5202}


## Compute Relative Frequencies of Each Causal Relation (Word and Sentence Level)

In [28]:
wd_cr_tally = defaultdict(int)
sent_cr_tally = defaultdict(int)
wd_cr_cnt = 0
sent_cr_cnt = 0

for e_ix, essay in enumerate(essays):

    for i, sentence in enumerate(essay.tagged_sentences):
        sent_cr_cnt += 1
        sent_tags = set()
        sent_cr_tags = set()
        
        for w, tags in sentence:
            wd_cr_cnt += 1
            # Concept Codes
            cr_codes = list((t for t in tags if ( "->" in t) and not "factor" in t and not "Anaphor" in t and not "other" in t and not "rhetorical" in t))
            if cr_codes:
                for cr in cr_codes:
                    wd_cr_tally[cr] += 1
                    sent_tags.add(cr)           
        if len(sent_tags) > 0:
            for tag in sent_tags:
                sent_cr_tally[tag] +=1            

In [29]:
def cc_to_float(cc):
    if "b" in cc:
        cc = cc.replace("b","")
        cc += ".1"
    return float(cc)

def sort_key(tpl):
    cr,cnt = tpl
    cr = cr.replace("Causer:","")
    cr = cr.replace("Result:","")
    l,r = cr.split("->")
    l,r = cc_to_float(l), cc_to_float(r)
    return (l,r)

srtd_cnts = sorted(wd_cr_tally.items(), key = sort_key)

prev = ""

lines = []
for cr, wd_tally in srtd_cnts:
    l,r = cr.split("->")
    if l != prev:
        #print("\cmidrule{1-3}")
        prev = l
    wd_pct = 100 * (float(wd_tally) / wd_cr_cnt)
    sent_pct = 100 *(float(sent_cr_tally[cr]) / sent_cnt)
    # Print Latex Output (Near End of chapter 2)
    cr = cr.replace("->","\\textrightarrow ")
    cr = cr.replace("Causer:","").replace("Result:","")
    lines.append("{cr} & {wd_pct:.2f} & {sent_pct:.2f}".format(cr=cr.ljust(25),wd_pct=wd_pct, sent_pct=sent_pct))

In [30]:
len(lines)

49

In [48]:
half = int(len(lines)+1)//2
if len(lines) % 2 != 0:
    half += 1

for i in range(0,half):
    left = lines[i]
    right = ""
    if i + half  < (len(lines)):
        right = lines[i+half]
    print(left + " & & " + right + " \\\\")

1\textrightarrow 2        & 3.77 & 5.78 & & 5\textrightarrow 4        & 0.29 & 0.63 \\
1\textrightarrow 3        & 0.57 & 0.73 & & 5\textrightarrow 5        & 0.04 & 0.07 \\
1\textrightarrow 4        & 0.03 & 0.04 & & 5\textrightarrow 6        & 3.19 & 5.03 \\
1\textrightarrow 5        & 0.19 & 0.25 & & 5\textrightarrow 12       & 0.01 & 0.01 \\
1\textrightarrow 50       & 3.37 & 5.11 & & 5\textrightarrow 50       & 5.31 & 8.76 \\
2\textrightarrow 1        & 0.01 & 0.02 & & 6\textrightarrow 3        & 0.00 & 0.01 \\
2\textrightarrow 2        & 0.01 & 0.02 & & 6\textrightarrow 4        & 0.01 & 0.02 \\
2\textrightarrow 3        & 1.36 & 2.61 & & 6\textrightarrow 5        & 0.05 & 0.07 \\
2\textrightarrow 4        & 0.44 & 0.74 & & 6\textrightarrow 50       & 2.24 & 4.01 \\
2\textrightarrow 5        & 0.61 & 1.03 & & 11\textrightarrow 3       & 0.09 & 0.12 \\
2\textrightarrow 6        & 0.03 & 0.03 & & 11\textrightarrow 4       & 0.01 & 0.01 \\
2\textrightarrow 11       & 0.00 & 0.01 & &

## Compute Aggregate Frequences of Concept Codes (Word and Sentence Level)

In [32]:
l = list(range(10))
print(l)
l.pop?
print(l.pop())
print(l)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
9
[0, 1, 2, 3, 4, 5, 6, 7, 8]


[0;31mDocstring:[0m
L.pop([index]) -> item -- remove and return item at index (default last).
Raises IndexError if list is empty or index is out of range.
[0;31mType:[0m      builtin_function_or_method


In [33]:
num_sents = 0
sents_with_codes = 0
# sents where one or more words has overlapping codes
sents_with_overlapping_codes = 0
# sents with >1 code
sents_with_multiple_codes = 0

num_words = 0
words_with_codes = 0
words_with_overlapping_codes = 0

seq_lens = defaultdict(list)
long_seqs = defaultdict(set)

for e_ix, essay in enumerate(essays):
    for i, sentence in enumerate(essay.tagged_sentences):    
        has_code_in_sent = False
        has_overlapping_codes_in_sent = False
        unique_ccodes = set()
        
        last_ccodes = set()
        for w, tags in sentence:
            # Concept Codes
            ccodes = set([t for t in tags if t[0].isdigit()])
            # update word stats
            if ccodes:
                unique_ccodes.update(ccodes)
                has_code_in_sent = True
                words_with_codes +=1
                
                if len(ccodes) > 1:
                    words_with_overlapping_codes +=1
                    has_overlapping_codes_in_sent = True
            num_words +=1
            # end update word stats
            
            # Update seq length stats
            for code in ccodes:
                if code in last_ccodes:
                    # get list
                    lst = seq_lens[code]
                    # remove last entry
                    last_item = lst.pop()
                    lst.append(last_item+1)
                    # store indexes for debugging
                    #if last_item > 15:
                    #    long_seqs[code].add((e_ix,i))
                else:
                    seq_lens[code].append(1)
            # reset last_ccodes
            last_ccodes = ccodes            
            # end update seq length stats
        # update sent stats
        if has_code_in_sent:
            sents_with_codes +=1
            if has_overlapping_codes_in_sent:
                sents_with_overlapping_codes +=1
            if len(unique_ccodes) > 1:
                sents_with_multiple_codes +=1
        num_sents +=1
        # end update sent       

In [34]:
num_words, words_with_codes, words_with_overlapping_codes

(180899, 43024, 2)

In [35]:
num_sents, sents_with_codes, sents_with_overlapping_codes, sents_with_multiple_codes

(10670, 6671, 1, 4555)

In [36]:
from __future__ import division

print("% words with codes: \t \t \t{pct:.3f}%".format(pct=100*words_with_codes/ num_words) )
print("% words with multiple codes: \t \t{pct:.4f}%".format(pct=100*words_with_overlapping_codes/ num_words) )
print("")
print("% sents with codes: \t \t \t{pct:.3f}%".format(pct=100*sents_with_codes/ num_sents) )
print("% sents with wds. with mult. codes: \t{pct:.3f}%".format(pct=100*sents_with_overlapping_codes/ num_sents) )
print("% sents with multiple codes: \t \t{pct:.3f}%".format(pct=100*sents_with_multiple_codes/ num_sents) )


% words with codes: 	 	 	23.783%
% words with multiple codes: 	 	0.0011%

% sents with codes: 	 	 	62.521%
% sents with wds. with mult. codes: 	0.009%
% sents with multiple codes: 	 	42.690%


### Sequence Lengths

In [46]:
all_lens = []
for code in sorted(seq_lens.keys()):
    print("{code} - {mean_len:.2f}".format(code=code.ljust(10), mean_len=np.mean(seq_lens[code])))
    all_lens.extend(seq_lens[code])
print
print("{code} - {mean_len:.2f}".format(code="All".ljust(10), mean_len=np.mean(all_lens)))

1          - 3.55
11         - 1.78
12         - 1.39
2          - 3.00
3          - 2.85
4          - 4.00
5          - 2.03
50         - 3.56
6          - 4.84
All        - 3.10


## Compute Aggregate Frequencies of Causal Relations (Word and Sentence Level)

In [38]:
num_sents = 0
sents_with_causal = 0
sents_with_overlapping_causal = 0
sents_with_multiple_causal = 0

num_words = 0
words_with_causal = 0
words_with_overlapping_causal = 0

cr_sents = []

cr_seq_lens = defaultdict(list)

for e_ix, essay in enumerate(essays):

    for i, sentence in enumerate(essay.tagged_sentences):    
        has_cr_in_sent = False
        has_overlapping_cr_in_sent = False
        unique_causal = set()
        cr_codes_seq = []
        
        for w, tags in sentence:
            # Concept Codes
            cr_codes = set((t for t in tags if ( "->" in t) 
                             and not "factor" in t 
                             and not "Anaphor" in t 
                             and not "other" in t 
                             and not "rhetorical" in t))
            # update word stats
            if cr_codes:
                unique_causal.update(cr_codes)
                has_cr_in_sent = True
                words_with_causal +=1
                
                if len(cr_codes) > 1:
                    words_with_overlapping_causal +=1
                    has_overlapping_cr_in_sent = True
            num_words +=1
            # end update word stats
            
            # Update seq length stats
            for code in cr_codes:
                # go upto 2 back as can be 1 word gaps for punct
                if (len(cr_codes_seq) > 0 and code in cr_codes_seq[-1]) \
                    or (len(cr_codes_seq) > 1 and code in cr_codes_seq[-2]):
                    # get list
                    lst = cr_seq_lens[code]
                    # remove last entry
                    last_item = lst.pop()
                    lst.append(last_item+1)
                else:
                    cr_seq_lens[code].append(1)
            # reset last_ccodes
            cr_codes_seq.append(cr_codes)
                    
        # update sent stats
        if has_cr_in_sent:
            cr_sents.append(sentence)
            sents_with_causal +=1
            if has_overlapping_cr_in_sent:
                sents_with_overlapping_causal +=1
            if len(unique_causal) > 1:
                sents_with_multiple_causal +=1
        num_sents +=1
        # end update sent       

In [39]:
num_words, words_with_causal/float(num_words)

(180899, 0.2970829026141659)

In [40]:
num_sents, sents_with_causal/ float(num_sents)

(10670, 0.4201499531396439)

In [41]:
num_words, words_with_causal, words_with_overlapping_causal, words_with_overlapping_causal/float(words_with_causal)

(180899, 53742, 7287, 0.13559227419895054)

In [42]:
num_sents, sents_with_causal, sents_with_overlapping_causal, sents_with_multiple_causal, sents_with_overlapping_causal/ float(sents_with_causal)

(10670, 4483, 1221, 1334, 0.27236225741690834)

In [43]:
from __future__ import division

print("% words with cr codes: \t \t \t {pct:.4f}%".format(pct=100*words_with_causal/ num_words) )
print("% words with multiple cr codes: \t {pct:.4f}%".format(pct=100*words_with_overlapping_causal/ num_words) )
print("")
print("% sents with cr codes: \t \t \t {pct:.4f}%".format(pct=100*sents_with_causal/ num_sents) )
print("% sents with multiple cr codes: \t {pct:.4f}%".format(pct=100*sents_with_multiple_causal/ num_sents) )


% words with cr codes: 	 	 	 29.7083%
% words with multiple cr codes: 	 4.0282%

% sents with cr codes: 	 	 	 42.0150%
% sents with multiple cr codes: 	 12.5023%


### Causal Relation Sequence Lengths

In [54]:
len(cr_seq_lens)

49

In [45]:
all_cr_lens = []
for code in sorted(cr_seq_lens.keys()):
    #print("{code} - {mean_len:.2f}".format(code=code.ljust(10), mean_len=np.mean(cr_seq_lens[code])))
    all_cr_lens.extend(cr_seq_lens[code])
print("{code} - {mean_len:.2f}".format(code="All".ljust(10), mean_len=np.mean(all_cr_lens)))

All        - 10.00
