In [1]:
import datetime
import logging
import pickle
from collections import defaultdict

import dill
import numpy as np
#from gensim.models import Word2Vec

from BrattEssay import load_bratt_essays
from CoRefHelper import parse_stanfordnlp_tagged_essays, bratt_essays_2_hash_map
from CrossValidation import cross_validation
from FindFiles import find_files
from IterableFP import flatten
from load_data import load_process_essays
from Rpfa import micro_rpfa
from Settings import Settings
from window_based_tagger_config import get_config

# Cross-validation fold count and dev-split fraction (not referenced by the cells visible here).
CV_FOLDS = 5
DEV_SPLIT = 0.1

# Settings() provides data_directory; every dataset path used below is built from root_folder.
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


## Load CoRef Parsed Essays

In [2]:
# Collect the ".tagged" files from the training co-reference folder.
coref_folder_train = root_folder + "CoReference/Training"
# Raw string: "\." inside a normal literal is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning); the pattern value itself is unchanged.
coref_files_train = find_files(coref_folder_train, r".*\.tagged")
len(coref_files_train)

902

In [3]:
# Collect the ".tagged" files from the test co-reference folder.
coref_folder_test = root_folder + "CoReference/Test"
# Raw string: "\." inside a normal literal is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning); the pattern value itself is unchanged.
coref_files_test = find_files(coref_folder_test, r".*\.tagged")
len(coref_files_test)

226

In [4]:
# Parse the tagged training files; the result is indexed by essay name and
# iterated with .items() in later cells.
essay2coref_train = parse_stanfordnlp_tagged_essays(coref_files_train)
# Number of parsed essays (compare against the number of files found).
print(len(essay2coref_train))

902


In [6]:
# Parse the tagged test files with the same parser used for the training set.
essay2coref_test = parse_stanfordnlp_tagged_essays(coref_files_test)
# Number of parsed essays (compare against the number of files found).
print(len(essay2coref_test))

226


### Examine Data Structure

In [5]:
# Pull one training essay by name and display its first sentence to show the
# (word, tag dict) structure.
e = essay2coref_train["EBA1415_AEKD_4_CB_ES-05568"]
e[0]

[('what', {'NER': 'O', 'POS': 'WDT'}),
 ('leads', {'NER': 'O', 'POS': 'VBZ'}),
 ('to', {'NER': 'O', 'POS': 'TO'}),
 ('differences', {'NER': 'O', 'POS': 'NNS'}),
 ('in', {'NER': 'O', 'POS': 'IN'}),
 ('the', {'NER': 'O', 'POS': 'DT'}),
 ('rates', {'NER': 'O', 'POS': 'NNS'}),
 ('of', {'NER': 'O', 'POS': 'IN'}),
 ('coral', {'COREF_ID': '2', 'NER': 'O', 'POS': 'NN'}),
 ('bleaching', {'COREF_ID': '2', 'NER': 'O', 'POS': 'NN'}),
 ('.', {'NER': 'O', 'POS': '.'})]

### Notes on CoRef Datastructure
- Dictionary of essays, keyed by name
- Each essay is a list of sentences
- Each sentence is a list of words
- words are mapped to a tag dict
  - tag dict - contains
    - NER tag (most are O - none)
    - POS tag
    - If the word is part of a co-reference, such as an anaphor (mostly pronouns):
      - COREF_PHRASE - phrase referred to by coref
      - COREF_REF - Id of referenced phrase
    - else if it is a phrase that is referenced:
      - COREF_ID - id of the co-reference, referenced in the COREF_REF tag

In [18]:
COREF_ID = "COREF_ID"
COREF_REF = "COREF_REF"

# (essay name, sentence index, referenced id) -> number of mentions of that id
# in that sentence. A "mention" is a run of consecutive words carrying the same
# COREF_REF value, so a multi-word mention is counted once.
id2coref_tally = {}
id2fwdrefs = defaultdict(int)  # declared here but not populated by this cell

# essay name -> set of COREF_ID values that start a new run after having
# already been seen earlier in the same essay
essays_with_dupes = defaultdict(set)

for ename, list_sent in essay2coref_train.items():
    seen_ids = set()
    for sentix, sent in enumerate(list_sent):
        # Fresh tally for every sentence. Previously the counts were accumulated
        # in a per-essay dict while an undefined `sent_ref_tally` was read below,
        # which raises NameError on a clean kernel.
        sent_ref_tally = defaultdict(int)

        # Value of the tag on the previous word ("" when the previous word had
        # none); used to detect where a new run begins.
        current_ref = ""
        current_id = ""

        for wd, tag_dict in sent:
            if COREF_ID in tag_dict:
                idval = tag_dict[COREF_ID]
                if idval != current_id:
                    # new/changed id: a referenced phrase starts here
                    current_id = idval
                    if idval in seen_ids:
                        essays_with_dupes[ename].add(idval)
                    seen_ids.add(idval)
            else:
                current_id = ""

            if COREF_REF in tag_dict:
                refval = tag_dict[COREF_REF]
                if refval != current_ref:
                    # new/changed reference: a mention starts here
                    sent_ref_tally[refval] += 1
                    current_ref = refval
            else:
                current_ref = ""

        # Every entry was created by an increment, so all counts are >= 1.
        for ref, cnt in sent_ref_tally.items():
            id2coref_tally[(ename, sentix, ref)] = cnt
        

## How Many Mentions on Average Per Sentence?

In [12]:
# Mean of the tallied counts, one value per (essay, sentence, referenced id) key,
# so ONLY sentences that have one or more co-refs contribute.
# numpy is imported in the notebook's top import cell rather than here.
vals = list(id2coref_tally.values())
np.mean(vals)

1.2058823529411764

In [15]:
def print_corefs_in_essay(essay):
    """Print every sentence of `essay` that contains a co-reference tag.

    `essay` is a sequence of sentences, each a sequence of (word, tag dict)
    pairs. A sentence is printed when any of its words has a "COREF_ID" or
    "COREF_REF" tag. Each word goes on its own line, left-justified to 20
    characters and followed by its co-reference tags (COREF_ID, COREF_REF,
    COREF_PHRASE) if it has any. A row of 80 asterisks closes each printed
    sentence. Returns None.
    """
    shown_keys = ("COREF_ID", "COREF_REF", "COREF_PHRASE")

    # First pass: indices of the sentences holding at least one id or reference.
    coref_sentence_ixs = {
        ix
        for ix, sentence in enumerate(essay)
        if any("COREF_ID" in tags or "COREF_REF" in tags for _, tags in sentence)
    }

    # Second pass: print the selected sentences, one word per line.
    for ix, sentence in enumerate(essay):
        if ix not in coref_sentence_ixs:
            continue
        for word, tags in sentence:
            coref_tags = {key: val for key, val in tags.items() if key in shown_keys}
            print(word.ljust(20), coref_tags or "")
        print("*" * 80)
                

In [20]:
# Essays in which a COREF_ID value starts a new run after already being seen,
# mapped to the set of those id values.
essays_with_dupes

defaultdict(set,
            {'EBA1415_BLRW_3_CB_ES-05173': {'3'},
             'EBA1415_BLRW_3_CB_ES-05185': {'1'},
             'EBA1415_BLRW_3_CB_ES-05480': {'8'},
             'EBA1415_BLRW_5_CB_ES-05198': {'2'},
             'EBA1415_KYLS_5_CB_ES-05647': {'5'},
             'EBA1415_KYNS_4_CB_ES-05393': {'1'},
             'EBA1415_RCGJ_4a_CB_ES-04684': {'5'},
             'EBA1415_SDLC_8_CB_ES-05920': {'9'},
             'EBA1415_SWCT_7_CB-04891': {'3'},
             'EBA1415_TWMD_45_CB_ES-04994': {'3'},
             'EBA1415_WSKT_6_CB_ES-05343': {'1'},
             'EBA1415post_TWDG_11_CB_ES-05461': {'1'}})

In [22]:
# Inspect one essay flagged as having a repeated COREF_ID: show the repeated
# ids, then print its co-reference-bearing sentences.
ename = "EBA1415_KYNS_4_CB_ES-05393"
print(essays_with_dupes[ename])
print_corefs_in_essay(essay2coref_train[ename])

{'1'}
looking              
at                   
the                  
graph                
of                   
trade                
winds                
and                  
coral                
bleaching            
it                   
helps                
show                 
what                 
leads                
to                   
differences          
in                   
the                  
rates                
of                   
coral                {'COREF_ID': '4'}
bleaching            {'COREF_ID': '4'}
.                    
********************************************************************************
in                   
0000                 
trade                
winds                
got                  
severly              
weaker               
going                
down                 
to                   
-                    
0                    {'COREF_ID': '7'}
.                    
************************************************

In [28]:
# Scan a slice of the training essays (up to 100 essays starting at position 400)
# and print, in full, the first essay that has a sentence containing both a
# COREF_ID and a COREF_REF tag.
for ename, list_sent in list(essay2coref_train.items())[400:][:100]:
    # Index of the first sentence holding both tags; stays -1 when none does.
    matching_ix = -1
    for ix, sent in enumerate(list_sent):
        found_id = False
        found_ref = False
        for wd, tag_dict in sent:
            if "COREF_ID" in tag_dict:
                found_id = True
            if "COREF_REF" in tag_dict:
                found_ref = True
        if found_id and found_ref:
            matching_ix = ix
            break

    # Decide on matching_ix, not on found_id/found_ref: those flags are only
    # assigned inside the sentence loop, so for an essay with no sentences they
    # would be undefined (NameError) or left over from the previous essay.
    if matching_ix >= 0:
        print(ename)
        print()
        for sent in list_sent: #[:matching_ix+1]:
            for wd, tag_dict in sent:
                copy = dict([(k,v) for k,v in tag_dict.items() if k in {"COREF_ID", "COREF_REF", "COREF_PHRASE"}])
                print(wd.ljust(20), copy if copy else "")
            print("*" * 80)
        break

EBA1415_SEAL_34_CB_ES-04796

what                 
leads                
to                   
differences          
in                   
the                  
rate                 
of                   
coral                {'COREF_REF': '5', 'COREF_PHRASE': 'the_coral_bleaching'}
bleaching            {'COREF_REF': '5', 'COREF_PHRASE': 'the_coral_bleaching'}
?                    
********************************************************************************
coral                {'COREF_REF': '5', 'COREF_PHRASE': 'the_coral_bleaching'}
bleaching            {'COREF_REF': '5', 'COREF_PHRASE': 'the_coral_bleaching'}
is                   
a                    
phenomenon           
in                   
which                
coral                {'COREF_REF': '3', 'COREF_PHRASE': 'the_coral'}
lose                 
it                   {'COREF_REF': '5', 'COREF_PHRASE': 'the_coral_bleaching'}
color                
.                    
****************************************************