# Train a Sequence-Based Classifier on the Coral Bleaching Data

Setup:
------

In [1]:
""" Imports """
from collections import defaultdict

import numpy as np
from numpy import random

from Metrics import rpf1a
from Rpfa import rpfa, weighted_mean_rpfa
from BrattEssay import load_bratt_essays
from WindowSplitter import split_into_windows

from IterableFP import flatten

from nltk import PorterStemmer

""" TODO 
    Try dependency parse features from this python dependency parser: https://github.com/syllog1sm/redshift
"""
None

In [2]:
""" Settings """
""" Start Script """
# Number of tokens in each feature window (target word plus its neighbours).
WINDOW_SIZE       = 7
# Words seen in fewer sentences than this are dropped when the corpus is built.
MIN_SENTENCE_FREQ = 1
# Fraction of the sentences held out for validation.
PCT_VALIDATION    = 0.25
# When True, words are stemmed before being turned into features.
STEM              = True

Load the Essays
---------------

In [5]:
""" Load Essays """
essays = load_bratt_essays("/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/")

all_codes = set()
all_words = []

for essay in essays:
    for sentence in essay.tagged_sentences:
        for w, tags in sentence:
            all_words.append(w)
            all_codes.update(tags)
                
# Correct miss-spellings
from SpellingCorrector import SpellingCorrector

print "Running Spelling Correction.."
corrector = SpellingCorrector(all_words)
corrections = defaultdict(int)
code_freq = defaultdict(int)
for essay in essays:
    for i, sentence in enumerate(essay.tagged_sentences):
        for j, (w, tags) in enumerate(sentence):
            for t in tags:
                code_freq[t] += 1
            # common error is ..n't and ..nt
            if w.endswith("n't") or w.endswith("n'"):
                cw = w[:-3] + "nt"
            elif w.endswith("'s"):
                cw = w[:-2]
            else:
                cw = corrector.correct(w)
            if cw != w:
                corrections[(w,cw)] += 1
                sentence[j] = (cw, tags)            
            
wd_sent_freq = defaultdict(int)
for essay in essays:
    for sentence in essay.tagged_sentences:
        wds, tag_list = zip(*sentence)
        unique_wds = set(wds)
        for w in unique_wds: 
            wd_sent_freq[w] += 1
            
numeric_codes = set([c for c in all_codes if not c.isalpha()])
print "Done"

1154 files found
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_AEKD_4_CB_ES-05571.ann file as .txt file is no essay'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_AEKD_4_CB_ES-05904.ann file as .txt file is no essay'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_BGJD_1_CB_ES-05733.ann file as .txt file is no essay //'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_ERSK_7_CB_ES-05798.ann file as .txt file is no essay //'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_KYLS_5_CB_ES-05671.ann file as .txt file is no essay //'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_LRJE_5_CB_ES-05128.ann file as .txt file is no essay'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/Coral

Verify Spelling Corrections
---------------------------

In [None]:
from DictionaryHelper import *
cor_srtd = sort_by_value(corrections, reverse = True)
print MIN_SENTENCE_FREQ
cor_srtd[0:20]

Create Corpus in CRF Format (list of list of tuples(word,tag))
--------------------------------------------------------------

In [106]:
tagged_sentences_by_code = defaultdict(list)

#store raw sentence
ix2sentence = {}
ix2tags = {}

#TODO Try IOBE encoding
out_tag = 'O'
ix = -1
#Keep untagged sentences for testing
sentences = []
tags4sentences = []
CODE_ALL_NUMERIC = "All_Numeric_Codes"
for essay in essays:
    for i, sentence in enumerate(essay.tagged_sentences):
        ix+= 1
        
        sent = [(w,t) for (w,t) in sentence if wd_sent_freq[w] >= MIN_SENTENCE_FREQ]
        if not sent:
            print "No words above minimum frequency", zip(*sentence)[0]
            continue
        words, tags = zip(*sent)
        
        ix2sentence[ix] = words
        sentences.append(words)
        
        utags = set(flatten(tags))
        tags4sentences.append(utags)
        
        for code in utags:
            tagged = []
            for w, tags in sentence:
                tagged.append((w, code if code in tags else out_tag))
            tagged_sentences_by_code[code].append(tagged)
            
        tagged_with_numeric = []
        for w, tags in sentence:
            intersect = tags & numeric_codes
            if intersect:
                code = list(intersect)[0]
                if len(intersect) != 1:
                    # Choose the least frequent
                    code = sorted(intersect, key = lambda c: code_freq[c])[0]
                    print "Error, more than 1 numeric code\n\t", intersect
                    print "\tChose:", code
                tagged_with_numeric.append((w, code))
            else:
                tagged_with_numeric.append((w, out_tag))
        tagged_sentences_by_code[CODE_ALL_NUMERIC].append(tagged_with_numeric)
            
        no_tags = [(w,out_tag) for w in words]
        missing_tags = all_codes - utags
        for code in missing_tags:
            tagged_sentences_by_code[code].append(no_tags)

#To numpy so we can filter using indexes
for code in list(all_codes) + [CODE_ALL_NUMERIC]:
    sents = tagged_sentences_by_code[code]
    tagged_sentences_by_code[code] = np.asarray(sents)

sentences = np.asarray(sentences)
tags4sentences = np.asarray(tags4sentences)

def tags_to_binary_matches(tags_lst, code):
    """Return a numpy array with 1 where `code` is in the tag collection, else 0.

    tags_lst -- iterable of tag collections (one per sentence)
    code     -- the code to look for in each collection
    """
    flags = [int(code in tag_set) for tag_set in tags_lst]
    return np.asarray(flags)

# code -> array of 0/1 sentence-level labels (1 when the sentence carries the code).
binary_lbls_for_code = dict(
    (code, tags_to_binary_matches(tags4sentences, code)) for code in all_codes
)

Error, more than 1 numeric code
	set(['50', '7'])
	Chose: 7


In [8]:
# Verify all codes same length, types are the same (np.array of lists of tuples of (word,tag) pairs)
for code in sorted(all_codes):
    print code.ljust(20), len(tagged_sentences_by_code[code]), \
      type(tagged_sentences_by_code[code]), \
            type(tagged_sentences_by_code[code][0]), \
                type(tagged_sentences_by_code[code][0][0]), \
                    type(tagged_sentences_by_code[code][0][0][0]),\
                        type(tagged_sentences_by_code[code][0][0][1])

1                    2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
11                   2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
12                   2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
13                   2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
14                   2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
2                    2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
3                    2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
4                    2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
5                    2144 <type 'numpy.ndarray'> <type 'list'> <type 'tuple'> <type 'str'> <type 'str'>
50                   2144 <type 'numpy.ndarray'> <type 'list'> <

Extract Features
----------------

In [114]:
""" TODO:
        Extract features for numbers
        Extract features for years
        Extract features that are temperatures (look for degree\degrees in subsequent words, along with C or F)
"""
# Stemmer applied to window words by extract_features when STEM is on.
stemmer = PorterStemmer()

# Number of window positions before the target word; also the amount of
# padding added at each end of a sentence.
WINDOW_OFFSET = int(WINDOW_SIZE / 2)
# Sentinel tokens that pad the sentence so every window is full length.
PAD_START = ["START"] * WINDOW_OFFSET
PAD_END = ["END"] * WINDOW_OFFSET
# Constant value stored against every feature key.
POS_TAG = 'x'

def extract_features(words, index):
    """Build the window features for the word at position `index` of `words`.

    The sentence is padded with PAD_START / PAD_END. Each of the WINDOW_SIZE
    tokens, beginning WINDOW_OFFSET positions before the target word, yields
    one key of the form "WD<offset>:<token>" (offset 0 is the target word).
    Tokens are stripped and, when STEM is set, stemmed.

    Returns a dict mapping every feature key to POS_TAG.
    """
    padded = PAD_START + list(words) + PAD_END
    # Position of the target word once the start padding is in place; the
    # window therefore starts at `index` in padded coordinates.
    centre = index + WINDOW_OFFSET

    feats = {}
    for position in range(index, index + WINDOW_SIZE):
        token = padded[position].strip()
        if STEM:
            token = stemmer.stem(token)
        feats["WD%d:%s" % (position - centre, token)] = POS_TAG

    # NOTE: sentence-level word features (extract_word_features) are disabled.
    return feats

Test Feature Extractor
----------------------

In [118]:
def comparator(kvp):
    """Sort key for a (feature_name, value) pair: the signed window offset.

    Feature names look like "WD-3:START"; the integer between the two-character
    prefix and the first ":" is returned.
    """
    feature_name, _value = kvp
    offset_text = feature_name[2:].partition(":")[0]
    return int(offset_text)

def test_feature_extractor_on_sentence(extractor, sent):
    sent = sent.split(" ")
    for i in range(len(sent)):
        print sent[i].ljust(10),
        s = sorted(extractor(sent, i).items(), key = comparator)
        print map(lambda item: str(item).ljust(10),zip(*s)[0])

sent1 = "the cat sat on the mat"
sent2 = "coral bleaching"
sent3 = "president obama approached the senate, ..."
test_feature_extractor_on_sentence(extract_features, sent1)
print ""
test_feature_extractor_on_sentence(extract_features, sent2)
print ""
test_feature_extractor_on_sentence(extract_features, sent3)
print ""

the        ['WD-3:START', 'WD-2:START', 'WD-1:START', 'WD0:the   ', 'WD1:cat   ', 'WD2:sat   ', 'WD3:on    ']
cat        ['WD-3:START', 'WD-2:START', 'WD-1:the  ', 'WD0:cat   ', 'WD1:sat   ', 'WD2:on    ', 'WD3:the   ']
sat        ['WD-3:START', 'WD-2:the  ', 'WD-1:cat  ', 'WD0:sat   ', 'WD1:on    ', 'WD2:the   ', 'WD3:mat   ']
on         ['WD-3:the  ', 'WD-2:cat  ', 'WD-1:sat  ', 'WD0:on    ', 'WD1:the   ', 'WD2:mat   ', 'WD3:END   ']
the        ['WD-3:cat  ', 'WD-2:sat  ', 'WD-1:on   ', 'WD0:the   ', 'WD1:mat   ', 'WD2:END   ', 'WD3:END   ']
mat        ['WD-3:sat  ', 'WD-2:on   ', 'WD-1:the  ', 'WD0:mat   ', 'WD1:END   ', 'WD2:END   ', 'WD3:END   ']

coral      ['WD-3:START', 'WD-2:START', 'WD-1:START', 'WD0:coral ', 'WD1:bleach', 'WD2:END   ', 'WD3:END   ']
bleaching  ['WD-3:START', 'WD-2:START', 'WD-1:coral', 'WD0:bleach', 'WD1:END   ', 'WD2:END   ', 'WD3:END   ']

president  ['WD-3:START', 'WD-2:START', 'WD-1:START', 'WD0:presid', 'WD1:obama ', 'WD2:approach', 'WD3:the   ']
obama 

Training
========

In [127]:
from nltk.tag.crf import MalletCRF
from nltk.tag.hmm import HiddenMarkovModelTagger

n = len(sentences)
ixs = range(n)
num_train = int((1 - PCT_VALIDATION) * n)
ix_train = ixs[:num_train]
ix_valid = ixs[num_train:]
print "#train", len(ix_train), "#validation", len(ix_valid), "#combined", len(ix_train + ix_valid), "#total", len(ixs)

def train(codes):
    code2tagger = {}
    for code in sorted(codes):

        print "Training on", code
        data = tagged_sentences_by_code[code]
        #Training Data
        train = data[ix_train]
        #tagger = HiddenMarkovModelTagger.train(train, verbose=True)
        tagger = MalletCRF.train(feature_detector= extract_features, corpus=train, 
                      filename=None, weight_groups=None, gaussian_variance=1, default_label='O',
                      transduction_type='VITERBI', max_iterations=500,
                      add_start_state=True, add_end_state=True, trace=1)
        code2tagger[code] = tagger
    return code2tagger

#train 1608 #validation 536 #combined 2144 #total 2144


In [128]:
from Metrics import rpf1a
from Rpfa import rpfa, weighted_mean_rpfa

def test_for_code(code, ixs, code2tagger):
    #Validation Data
    valid_sentences = sentences[ixs]
    act_ys = binary_lbls_for_code[code][ixs]
    
    tagger = code2tagger[code]
    
    #pred_tags = [tagger.tag(s) for s in valid_sentences]
    pred_tags = tagger.batch_tag(valid_sentences)
    
    tags_list = map(lambda tag_sent: set(zip(*tag_sent)[1]) , pred_tags)
    pred_ys = tags_to_binary_matches(tags_list, code)
    
    num_codes = len([y for y in act_ys if y == 1])
    r,p,f1,a = rpf1a(act_ys, pred_ys)
    print "code:      ", code
    print "recall:    ", r
    print "precision: ", p
    print "f1:        ", f1
    print "accuracy:  ", a
    print "sentences: ", num_codes
    print ""
    return rpfa(r,p,f1,a,num_codes)

print ""
print "total sent:", len(ix_valid)
print ""


total sent: 536



Training Data Performance
-------------------------

In [129]:
reg_codes = [c for c in all_codes if c.isdigit() or c == "explicit"]

code2tagger = train(reg_codes)

td_metrics = []
for c in sorted(reg_codes):
    td_metrics.append(test_for_code(c, ix_train, code2tagger))

print ""
td_wt_mn_prfa = weighted_mean_rpfa(td_metrics)
print "Training Data: "
print td_wt_mn_prfa

Training on 1
Training on 11
Training on 12
Training on 13
Training on 14
Training on 2
Training on 3
Training on 4
Training on 5
Training on 50
Training on 6
Training on 7
Training on explicit
code:       1
recall:     0.914634146341
precision:  0.920245398773
f1:         0.917431192661
accuracy:   0.983208955224
sentences:  164

code:       11
recall:     0.452380952381
precision:  0.904761904762
f1:         0.603174603175
accuracy:   0.984452736318
sentences:  42

code:       12
recall:     0.944444444444
precision:  0.944444444444
f1:         0.944444444444
accuracy:   0.998756218905
sentences:  18

code:       13
recall:     0.62962962963
precision:  0.871794871795
f1:         0.731182795699
accuracy:   0.984452736318
sentences:  54

code:       14
recall:     1.0
precision:  0.473684210526
f1:         0.642857142857
accuracy:   0.993781094527
sentences:  9

code:       2
recall:     0.678571428571
precision:  0.59375
f1:         0.633333333333
accuracy:   0.98631840796
sentences:

Validation Data Performance
---------------------------

In [131]:
vd_metrics = []
#for c in sorted(all_codes):
for c in sorted(reg_codes):
#for c in sorted(numeric_codes):
    vd_metrics.append(test_for_code(c, ix_valid, code2tagger))

vd_wt_mn_prfa = weighted_mean_rpfa(vd_metrics)
print "Validation Data:"
print vd_wt_mn_prfa

code:       1
recall:     0.879518072289
precision:  0.503448275862
f1:         0.640350877193
accuracy:   0.847014925373
sentences:  83

code:       11
recall:     0.153846153846
precision:  0.0363636363636
f1:         0.0588235294118
accuracy:   0.880597014925
sentences:  13

code:       12
recall:     0.818181818182
precision:  0.145161290323
f1:         0.246575342466
accuracy:   0.897388059701
sentences:  11

code:       13
recall:     0.411764705882
precision:  0.111111111111
f1:         0.175
accuracy:   0.876865671642
sentences:  17

code:       14
recall:     0.909090909091
precision:  0.243902439024
f1:         0.384615384615
accuracy:   0.940298507463
sentences:  11

code:       2
recall:     0.631578947368
precision:  0.328767123288
f1:         0.432432432432
accuracy:   0.882462686567
sentences:  38

code:       3
recall:     0.773913043478
precision:  0.640287769784
f1:         0.700787401575
accuracy:   0.858208955224
sentences:  115

code:       4
recall:     0.9
precis

**CRF - reg codes** - MIN_SENTENCE_FREQ = 1, STEM = False, PCT_VALIDATION = 0.25  
Validation Data:
Recall: 0.6892, Precision: 0.7949, **F1: 0.7212**, Accuracy: 0.9061, Codes:   798

**HMM - reg codes** - MIN_SENTENCE_FREQ = 1, STEM = True, PCT_VALIDATION = 0.25  
Validation Data:
Recall: 0.6905, Precision: 0.5425, **F1: 0.5790**, Accuracy: 0.8379, Codes:   798

**CRF - numeric codes** - MIN_SENTENCE_FREQ = 1, STEM = False, PCT_VALIDATION = 0.25  
Validation Data:
Recall: 0.7731, Precision: 0.8286, **F1: 0.7875**, Accuracy: 0.9507, Codes:   648

**TODO**  
Try training a single tagger on all codes at once rather than one code at a time, ignoring cases where a word has multiple codes (i.e. ignore the word codes and restrict to codes that contain a number).  
Try IOBE encoding  
Try wordnet synonyms, and word2vec learned synonyms (boost recall)

Test Tagger Trained on Numeric Concept Codes (Non-Overlapping)
---------------------------------------------------------------

In [121]:
def test_for_numeric_codes(code, ixs, tagger):
    """Score a single multi-code `tagger` for `code` on the sentences in `ixs`.

    The evaluation is the same sentence-level one as test_for_code (same
    printed report and same rpfa return value), so this delegates to it rather
    than repeating the body. The only difference is that the tagger is passed
    in directly instead of being looked up per code.

    code   -- the code whose sentence-level labels are scored
    ixs    -- indexes of the sentences to evaluate on
    tagger -- trained tagger exposing batch_tag
    """
    return test_for_code(code, ixs, {code: tagger})

In [122]:
c2tagger = train([CODE_ALL_NUMERIC])
tagger = c2tagger[CODE_ALL_NUMERIC]

td_metrics = []
for c in sorted(numeric_codes):
    td_metrics.append(test_for_numeric_codes(c, ix_train, tagger))

td_wt_mn_prfa = weighted_mean_rpfa(td_metrics)
print "Training Data: "
print td_wt_mn_prfa

Training on All_Numeric_Codes
[MalletCRF] Training a new CRF: /var/folders/97/_9xfnqln35vb6b1ng9czmr2d4x97nt/T/modelx2SzvL.crf
[MalletCRF] Calling mallet to train CRF...
[MalletCRF]   Number of weights = 14028
[MalletCRF]   CRF about to train with 500 iterations
[MalletCRF]   CRF finished one iteration of maximizer, i=0
[MalletCRF]   CRF finished one iteration of maximizer, i=1
[MalletCRF]   CRF finished one iteration of maximizer, i=2
[MalletCRF]   CRF finished one iteration of maximizer, i=3
[MalletCRF]   CRF finished one iteration of maximizer, i=4
[MalletCRF]   CRF finished one iteration of maximizer, i=5
[MalletCRF]   CRF finished one iteration of maximizer, i=6
[MalletCRF]   CRF finished one iteration of maximizer, i=7
[MalletCRF]   CRF finished one iteration of maximizer, i=8
[MalletCRF]   CRF finished one iteration of maximizer, i=9
[MalletCRF]   CRF finished one iteration of maximizer, i=10
[MalletCRF]   CRF finished one iteration of maximizer, i=11
[MalletCRF]   CRF finished 

In [123]:
tagger = c2tagger[CODE_ALL_NUMERIC]

vd_metrics = []
for c in sorted(numeric_codes):
    vd_metrics.append(test_for_numeric_codes(c, ix_valid, tagger))

vd_wt_mn_prfa = weighted_mean_rpfa(vd_metrics)
print "Validation Data: "
print vd_wt_mn_prfa

code:       1
recall:     0.903614457831
precision:  0.688073394495
f1:         0.78125
accuracy:   0.921641791045
sentences:  83

code:       11
recall:     0.615384615385
precision:  1.0
f1:         0.761904761905
accuracy:   0.990671641791
sentences:  13

code:       12
recall:     0.454545454545
precision:  1.0
f1:         0.625
accuracy:   0.988805970149
sentences:  11

code:       13
recall:     0.470588235294
precision:  0.727272727273
f1:         0.571428571429
accuracy:   0.977611940299
sentences:  17

code:       14
recall:     0.454545454545
precision:  0.714285714286
f1:         0.555555555556
accuracy:   0.985074626866
sentences:  11

code:       2
recall:     0.5
precision:  0.95
f1:         0.655172413793
accuracy:   0.962686567164
sentences:  38

code:       3
recall:     0.721739130435
precision:  0.741071428571
f1:         0.73127753304
accuracy:   0.886194029851
sentences:  115

code:       4
recall:     0.75
precision:  1.0
f1:         0.857142857143
accuracy:   0.9