In [1]:
import dill

#from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from Rpfa import micro_rpfa

import logging
import datetime
import pickle

from CoRefHelper import parse_stanfordnlp_tagged_essays, bratt_essays_2_hash_map
from CrossValidation import cross_validation
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings
from FindFiles import find_files

CV_FOLDS = 5
DEV_SPLIT = 0.1

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/SXH1M01/GitHub/NlpResearch/
Public Data: /Users/SXH1M01/GitHub/NlpResearch/Data/PublicDatasets/


## Load CoRef Parsed Essays

In [2]:
coref_folder_train = root_folder + "CoReference/Training"
coref_files_train = find_files(coref_folder_train, ".*\.tagged")
len(coref_files_train)

902

In [3]:
coref_folder_test = root_folder + "CoReference/Test"
coref_files_test = find_files(coref_folder_test, ".*\.tagged")
len(coref_files_test)

226

In [4]:
essay2coref_train = parse_stanfordnlp_tagged_essays(coref_files_train)
print(len(essay2coref_train))

902


In [5]:
essay2coref_test = parse_stanfordnlp_tagged_essays(coref_files_test)
print(len(essay2coref_test))

226


### Examine Data Structure

### Notes on CoRef Datastructure
- Dictionary of esssays, keyed by name
- Each essay is a list of sentences
- Each sentence is a list of words
- words are mapped to a tag dict
  - tag dict - contains
    - NER tag (most are O - none)
    - POS tag
    - If a Co-Reference such as an anaphor (mostly pronouns)
      - COREF_PHRASE - phrase referred to by coref
      - COREF_REF - Id of referenced phrase
    - else if it is a phrase that is referenced:
      - COREF_ID - id of the co-reference, referenced in the COREF_REF tag

In [8]:
COREF_ID = "COREF_ID"
COREF_REF = "COREF_REF"

id2coref_tally = {}
id2fwdrefs = defaultdict(int)

essays_with_dupes = defaultdict(set)

for ename, list_sent in list(essay2coref_train.items()):
    matching_ix = -1
    essay_ref_tally = defaultdict(int)
    seen_ids = set()
    for sentix, sent in enumerate(list_sent):
        
        current_ref = ""
        current_id = ""
        
        for wd, tag_dict in sent:
            if COREF_ID in tag_dict:
                idval = list(tag_dict[COREF_ID])[0]
                if idval != current_id:
                    # new/changed
                    current_id = idval
                    if idval in seen_ids:
                        essays_with_dupes[ename].add(idval)
                    seen_ids.add(idval)
            else:
                current_id = ""
                
            if COREF_REF in tag_dict:
                refval = list(tag_dict[COREF_REF])[0]
                if refval != current_ref:
                    essay_ref_tally[refval] += 1
                    # new/changed
                    current_ref = refval
            else:
                current_ref = ""
        
#         for ref, cnt in sent_ref_tally.items():
#             if cnt > 0:
#                 id2coref_tally[(ename,sentix,ref)] = cnt

## How Many Mentions on Average Per Sentence?

In [9]:
import numpy as np
# number of references per id, ONLY for sentences that have one or more co-refs
vals = list(id2coref_tally.values())
np.mean(vals)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


nan

In [10]:
def print_corefs_in_essay(essay):
    sent_ixs = set()
    for sentix, sent in enumerate(essay):      
        for wd, tag_dict in sent:
            if "COREF_ID" in tag_dict:
                sent_ixs.add(sentix)
            if "COREF_REF" in tag_dict:
                sent_ixs.add(sentix)
                
    for sentix, sent in enumerate(essay):
        if sentix in sent_ixs:
            for wd, tag_dict in sent:
            
                copy = dict([(k,v) for k,v in tag_dict.items() if k in {"COREF_ID", "COREF_REF", "COREF_PHRASE"}])
                print(wd.ljust(20),  str(copy if copy else "").ljust(20))
            print("*" * 80)
                

In [11]:
tag_dict

defaultdict(set, {'POS': {'.'}, 'NER': {'O'}})

In [12]:
essays_with_dupes

defaultdict(set,
            {'EBA1415_KYNS_3_CB_ES-05376': {'6'},
             'EBA1415_KYNS_4_CB_ES-05393': {'1'},
             'EBA1415_SEKL_1_CB-04820': {'2'},
             'EBA1415_SWCT_6_CB-04877': {'2'},
             'EBA1415_WSAL_1_CB_ES-05487': {'7'}})

In [13]:
ename = "EBA1415_KYNS_4_CB_ES-05393"
print(essays_with_dupes[ename])
print_corefs_in_essay(essay2coref_train[ename])

{'1'}
looking                                  
at                                       
the                                      
graph                                    
of                                       
trade                                    
winds                                    
and                                      
coral                {'COREF_ID': {'3'}} 
bleaching            {'COREF_ID': {'3'}} 
it                   {'COREF_REF': {'3'}, 'COREF_PHRASE': {'coral_bleaching'}}
helps                                    
show                                     
what                                     
leads                                    
to                                       
differences                              
in                                       
the                                      
rates                                    
of                                       
coral                                    
bleaching                        

In [14]:
for ename, list_sent in list(essay2coref_train.items())[400:][:100]:
    matching_ix = -1
    for ix, sent in enumerate(list_sent):
        found_id = False
        found_ref = False
        for wd, tag_dict in sent:
            if "COREF_ID" in tag_dict:
                found_id = True
            if "COREF_REF" in tag_dict:
                found_ref = True
        if found_id and found_ref:
            matching_ix = ix
            break
            
    if found_id and found_ref:
        print(ename)
        print()
        for sent in list_sent: #[:matching_ix+1]:
            for wd, tag_dict in sent:
                copy = dict([(k,v) for k,v in tag_dict.items() if k in {"COREF_ID", "COREF_REF", "COREF_PHRASE"}])
                print(wd.ljust(20), copy if copy else "")
            print("*" * 80)
        break

EBA1415_SEAL_34_CB_ES-04796

what                 
leads                
to                   
differences          
in                   
the                  
rate                 
of                   
coral                
bleaching            
?                    
********************************************************************************
coral                {'COREF_ID': {'2'}}
bleaching            {'COREF_ID': {'2'}}
is                   
a                    
phenomenon           
in                   
which                
coral                
lose                 
it                   {'COREF_REF': {'2'}, 'COREF_PHRASE': {'coral_bleaching'}}
color                
.                    
********************************************************************************
coral                {'COREF_REF': {'3'}, 'COREF_PHRASE': {'the_coral_bleaching'}}
bleaching            {'COREF_REF': {'3'}, 'COREF_PHRASE': {'the_coral_bleaching'}}
is                   
mostly              