# Train a Sequence-Based Classifier on the Coral Bleaching Data

Setup:
------

In [1]:
from Decorators import memoize_to_disk
from sent_feats_for_stacking import *
from load_data import load_process_essays, extract_features

from featurevectorizer import FeatureVectorizer
from featureextractionfunctions import *
from CrossValidation import cross_validation
from wordtagginghelper import *
from IterableFP import flatten
from predictions_to_file import predictions_to_file
from results_procesor import ResultsProcessor
# Classifiers

from window_based_tagger_config import get_config
from tag_frequency import get_tag_freq, regular_tag
# END Classifiers

import Settings
import logging
# Log at INFO level and above, formatted as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# getLogger() without a name returns the root logger
logger = logging.getLogger()

Load the Essays
---------------

In [2]:
# Create persister (mongo client) - fail fast if mongo service not initialized
processor = ResultsProcessor()

# Settings that are not part of the pickle key, as they don't affect persistence of
# feature processing.
# NOTE(review): of these, only MIN_FEAT_FREQ and CV_FOLDS are referenced by the
# cells visible in this notebook; SPARSE_WD_FEATS, MIN_TAG_FREQ and LOOK_BACK
# look unused here - confirm before relying on them.
SPARSE_WD_FEATS     = True

MIN_FEAT_FREQ       = 5        # 5 best so far
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling

# All data paths are built by appending to settings.data_directory
settings = Settings.Settings()
folder =                            settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"
processed_essay_filename_prefix =   settings.data_directory + "CoralBleaching/BrattData/Pickled/essays_proc_pickled_"
features_filename_prefix =          settings.data_directory + "CoralBleaching/BrattData/Pickled/feats_pickled_"
# Folder the CRF training cells write their (temporary) model files to
models_folder =                     settings.data_directory + "CoralBleaching/models/CRF"
out_metrics_file     =              settings.data_directory + "CoralBleaching/Results/metrics.txt"

# Essay processing options for this data folder; passed as keyword arguments to
# the essay loader and stored with the persisted results
config = get_config(folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [3]:
# Display the configuration returned by get_config
config

{'folder': '/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/',
 'include_normal': False,
 'include_vague': True,
 'lower_case': True,
 'min_df': 2,
 'min_sentence_length': 3,
 'remove_infrequent': False,
 'remove_punctuation': False,
 'remove_stop_words': False,
 'replace_nums': True,
 'spelling_correct': True,
 'stem': False,
 'window_size': 7}

In [4]:
# Wrap load_process_essays with memoize_to_disk, using the processed-essay
# filename prefix for the cache files.
# NOTE(review): the "Pickle Key" printed below is built from the config values,
# so results are presumably cached per distinct config - confirm in Decorators.
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays( **config )
logger.info("Essays loaded")
# Number of essays loaded
len(tagged_essays)

('Pickle Key:', 'folder_/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/_include_normal_False_include_vague_True_lower_case_True_min_df_2_min_sentence_length_3_remove_infrequent_False_remove_punctuation_False_remove_stop_words_False_replace_nums_True_spelling_correct_True_stem_False_window_size_7')


1128

Create Corpus in CRF Format (list of lists of (word, tag) tuples)
--------------------------------------------------------------

In [5]:
# Inspect the first sentence of the first essay: a list of (word, set of codes) pairs
tagged_essays[0].sentences[0]

[('what', set()),
 ('leads', set()),
 ('to', set()),
 ('differences', set()),
 ('in', set()),
 ('the', set()),
 ('rates', set()),
 ('of', set()),
 ('coral', {'50'}),
 ('bleaching', {'50'}),
 ('.', set())]

In [30]:
from collections import defaultdict
from IterableFP import flatten

def tally_code_frequencies(tagged_essays):
    """
    Count, for every code, the number of sentences in which it occurs.

    A code is counted at most once per sentence, no matter how many words
    of that sentence carry it.

    :param tagged_essays: iterable of essays; each exposes ``sentences``, an
        iterable of sentences, each a sequence of (word, tags) pairs where
        ``tags`` is an iterable of code strings
    :return : code -> number of sentences containing that code
    :rtype : defaultdict(int)
    """
    freq = defaultdict(int)
    for essay in tagged_essays:
        for sentence in essay.sentences:
            # Unique codes in this sentence. Collected pair by pair so that an
            # empty sentence simply contributes nothing (unpacking zip(*sentence)
            # raises on an empty sentence).
            sentence_codes = set()
            for _, tags in sentence:
                sentence_codes.update(tags)
            for code in sentence_codes:
                freq[code] += 1
    return freq

# Sentence-level frequency of every code; used later to pick the most common code per word
code_freq = tally_code_frequencies(tagged_essays)

In [19]:
# Regular codes are the ones that start with a digit. The candidate codes are
# the keys of code_freq (one key per code seen in the corpus): no cell defines a
# module-level `all_codes` - the set of that name is local to
# tally_code_frequencies - so reading it here fails on a fresh kernel.
# t[:1] rather than t[0] so an empty code string is skipped instead of raising.
regular_tags = [t for t in code_freq if t[:1].isdigit()]
# Display only (regular_tags itself stays unsorted): shorter codes first, ignoring
# a "b" when measuring length, then alphabetical.
sorted(regular_tags, key = lambda s: (len(s.replace("b","")), s))

['1', '2', '3', '4', '5', '5b', '6', '7', '11', '12', '13', '14', '50']

In [53]:
# Binary sequence labels: the word is / is not tagged with the code of interest
INSIDE = "I"
OUTSIDE = "O"

def to_tagged_sentences_by_code(essays, codes):
    """
    Build, for every code, a binary-tagged copy of every sentence.

    :param essays: iterable of essays exposing ``sentences``; each sentence is
        a sequence of (word, tags) pairs
    :param codes: the codes to build tagged sentences for
    :return : code -> list of sentences, each a list of (word, label) pairs
        where label is INSIDE when the word carries that code, else OUTSIDE
    :rtype : defaultdict(list)
    """
    # materialise once: codes is re-iterated for every sentence
    codes = list(codes)
    code2sents = defaultdict(list)
    for essay in essays:
        for sentence in essay.sentences:
            for code in codes:
                # Words are passed through unchanged for both labels. Wrapping only
                # the INSIDE words in unicode() made the token type depend on the
                # label and differ from the other corpus builders in this notebook.
                sent = [(wd, INSIDE if code in tags else OUTSIDE) for wd, tags in sentence]
                code2sents[code].append(sent)
    return code2sents

def to_most_common_code_tagged_sentences(essays, codes, code_freq):
    """
    Tag every word with a single label: its most frequent target code.

    :param essays: iterable of essays exposing ``sentences``; each sentence is
        a sequence of (word, tags) pairs
    :param codes: the target codes; a word's other tags are ignored
    :param code_freq: mapping of code -> frequency, used to rank a word's codes
    :return : one list per sentence of (word, label) pairs; label is the word's
        highest-frequency target code, or OUTSIDE when it has none
    :rtype : list(list(tuple))
    """
    codes = set(codes)
    tagged = []
    for essay in essays:
        for sentence in essay.sentences:
            sent = []
            for wd, tags in sentence:
                # filter to target codes only
                candidates = codes.intersection(tags)
                if candidates:
                    # Highest frequency wins; equal frequencies go to the
                    # alphabetically first code, so the label does not depend on
                    # set iteration order. .get() keeps the lookup from inserting
                    # keys when code_freq is a defaultdict.
                    most_common = min(candidates, key = lambda tag: (-code_freq.get(tag, 0), tag))
                    sent.append((wd, most_common))
                else:
                    sent.append((wd, OUTSIDE))
            tagged.append(sent)
    return tagged

def to_label_powerset_tagged_sentences(essays, codes):
    """
    Tag every word with a single label-powerset label.

    The label is the word's target codes, sorted and joined with commas;
    words with no target code are labelled OUTSIDE.

    :param essays: iterable of essays exposing ``sentences``; each sentence is
        a sequence of (word, tags) pairs
    :param codes: the target codes; a word's other tags are ignored
    :return : one list per sentence of (word, label) pairs
    :rtype : list(list(tuple))
    """
    target_codes = set(codes)

    def powerset_label(word_tags):
        # keep the target codes only, in a stable (sorted) order
        label = ",".join(sorted(target_codes.intersection(word_tags)))
        return label if len(label) > 0 else OUTSIDE

    return [
        [(wd, powerset_label(tags)) for wd, tags in sentence]
        for essay in essays
        for sentence in essay.sentences
    ]

def to_sentences(tagged_sentences):
    """
    Strip the tags from tagged sentences.

    :param tagged_sentences: iterable of sentences, each a non-empty sequence
        of (word, tag) pairs
    :return : one tuple of words per sentence, in the original order
    :rtype : list(tuple)
    """
    def words_of(tagged):
        tokens, _labels = zip(*tagged)
        return tokens

    return [words_of(tagged) for tagged in tagged_sentences]

# converts a list of tagged sentences into one flat list of 0/1 tags
def to_flattened_binary_tags(tagged_sentences):
    """
    Flatten binary-tagged sentences into a single list of 0/1 values.

    :param tagged_sentences: iterable of sentences, each a non-empty sequence
        of (word, label) pairs
    :return : for every word of every sentence, in order, 1 when its label
        equals INSIDE and 0 otherwise
    :rtype : list(int)
    """
    flat = []
    for tagged in tagged_sentences:
        _words, labels = zip(*tagged)
        flat += [1 if lbl == INSIDE else 0 for lbl in labels]
    return flat

def to_flattened_binary_tags_by_code(tagged_sentences, codes):
    """
    Flatten tagged sentences into one list of 0/1 values per code.

    A word's label may be a set of codes, or a string of comma separated
    codes (a powerset label, or a single label).

    :param tagged_sentences: iterable of sentences, each a sequence of
        (word, label) pairs
    :param codes: the codes to produce a binary sequence for
    :return : code -> flat list with, for every word of every sentence in
        order, 1 when the word's label contains the code and 0 otherwise
    :rtype : defaultdict(list)
    """
    # materialise once: codes is re-iterated for every word
    codes = list(codes)
    code2tags = defaultdict(list)
    for sentence in tagged_sentences:
        # iterate the pairs directly so an empty sentence adds nothing
        # (unpacking zip(*sentence) raises on an empty sentence)
        for _, lbl in sentence:
            # isinstance rather than an exact type comparison, so frozensets and
            # set subclasses are used as-is instead of being sent to .split()
            if isinstance(lbl, (set, frozenset)):
                word_codes = lbl
            else:
                word_codes = set(lbl.split(","))
            for code in codes:
                code2tags[code].append(1 if code in word_codes else 0)
    return code2tags

In [54]:
def get_word_features(tokens, idx):
    """
    Build the features for a single token: the word itself and nothing else.

    :param tokens: the sentence, as a sequence of word strings
    :param idx: position within ``tokens`` of the word to featurise
    :return : a one-element list holding 'WORD_' followed by the token
    :rtype : list(str)
    """
    return ['WORD_' + tokens[idx]]

## Train Tagger on Single Word Features Only

In [24]:
from nltk.tag.crf import CRFTagger
from wordtagginghelper import merge_dictionaries
from numpy.random import randint
import os

# Cross-validated training of one binary (INSIDE / OUTSIDE) CRF tagger per code,
# using only the word itself as a feature.

# One {code: model} dict per fold
fold_models = []
# Actual labels (ys) and predicted labels per code, accumulated over all folds.
# NOTE(review): TD / VD presumably stand for training / validation data - confirm.
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = cross_validation(tagged_essays, CV_FOLDS)
for fold,(essays_TD, essays_VD) in enumerate(folds):
    # code -> list of sentences tagged INSIDE / OUTSIDE for that code
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    # registered up front; filled in as each code's model is trained below
    code2model = dict()
    fold_models.append(code2model)

    # this fold's actual and predicted 0/1 labels, keyed by code
    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        # Random suffix so each model file gets its own name. The file is deleted
        # right after training, so the name only matters while train() runs.
        # NOTE(review): the "power_set" label looks copied from the label powerset
        # cell; these are per-code binary models.
        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "power_set", str(randint(0, 9999999)))

        model = CRFTagger(feature_func = get_word_features, verbose=False)
        model.train(td, model_filename)
        code2model[code] = model

        # the trained model object is kept in code2model; drop the on-disk copy
        os.remove(model_filename)

        #TODO - non binary
        # actual labels, flattened to one 0/1 value per word
        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        # tag the untagged words of both splits with the trained model
        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        # predicted labels, flattened the same way as the actual labels
        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    # NOTE(review): merge_dictionaries presumably appends this fold's per-code
    # lists onto the cross-validation totals - confirm in wordtagginghelper.
    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)


Fold 0 Training code: 4
Fold 0 Training code: 5
Fold 0 Training code: 5b
Fold 0 Training code: 50
Fold 0 Training code: 3
Fold 0 Training code: 7
Fold 0 Training code: 2
Fold 0 Training code: 6
Fold 0 Training code: 11
Fold 0 Training code: 13
Fold 0 Training code: 14
Fold 0 Training code: 1
Fold 0 Training code: 12
Fold 1 Training code: 4
Fold 1 Training code: 5
Fold 1 Training code: 5b
Fold 1 Training code: 50
Fold 1 Training code: 3
Fold 1 Training code: 7
Fold 1 Training code: 2
Fold 1 Training code: 6
Fold 1 Training code: 11
Fold 1 Training code: 13
Fold 1 Training code: 14
Fold 1 Training code: 1
Fold 1 Training code: 12
Fold 2 Training code: 4
Fold 2 Training code: 5
Fold 2 Training code: 5b
Fold 2 Training code: 50
Fold 2 Training code: 3
Fold 2 Training code: 7
Fold 2 Training code: 2
Fold 2 Training code: 6
Fold 2 Training code: 11
Fold 2 Training code: 13
Fold 2 Training code: 14
Fold 2 Training code: 1
Fold 2 Training code: 12
Fold 3 Training code: 4
Fold 3 Training code: 

In [25]:
# Sanity check for code "50": the number of actual labels (first line) should
# match the number of predicted labels (second line) for both TD and VD
print len(cv_wd_td_ys_by_tag["50"]), len(cv_wd_vd_ys_by_tag["50"])
print len(cv_wd_td_predictions_by_tag["50"]), len(cv_wd_vd_predictions_by_tag["50"])

671460 167865
671460 167865


In [27]:
logger.info("Training completed")

""" Persist Results to Mongo DB """

# Algorithm name stored with the results, and the suffix that makes the
# collection names below specific to this experiment
wd_algo = "CRF"
SUFFIX = "_CRF"
# NOTE(review): CB_SENT_TD / CB_SENT_VD are not used by the cells visible here
CB_TAGGING_TD, CB_TAGGING_VD, CB_SENT_TD, CB_SENT_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX, "CB_SENT_TD" + SUFFIX, "CB_SENT_VD" + SUFFIX
# copy of config, so the extra key below does not modify config itself
parameters = dict(config)
#parameters["extractors"] = map(lambda fn: fn.func_name, extractors)
parameters["min_feat_freq"] = MIN_FEAT_FREQ

# Store actual vs predicted labels for the TD and VD splits; the returned ids are
# used to look the results up again below
wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

# This outputs 0's for MEAN CONCEPT CODES as we aren't including those in the outputs

print processor.results_to_string(wd_td_objectid,   CB_TAGGING_TD,  wd_vd_objectid,     CB_TAGGING_VD,  "TAGGING")
logger.info("Results Processed")

TAGGING

TAG:       1                     
f1:        0.750773816348        0.727482678984        
recall:    0.67738932561         0.651634257344        
precision: 0.841990228851        0.823314166231        
accuracy:  0.98704911685         0.985941083609        
sentences:                       4834                  

TAG:       2                     
f1:        0.637118193891        0.591245791246        
recall:    0.499739583333        0.457291666667        
precision: 0.878663003663        0.83619047619         
accuracy:  0.996744407709        0.996383999047        
sentences:                       960                   

TAG:       3                     
f1:        0.729700828132        0.707602924972        
recall:    0.643013674614        0.619214586255        
precision: 0.843403276804        0.825426501519        
accuracy:  0.983812885354        0.982611026718        
sentences:                       5704                  

TAG:       4                     
f1:        0

## Train Tagger on Label Powerset Predictions (Slightly Better)

In [None]:
from nltk.tag.crf import CRFTagger
from wordtagginghelper import merge_dictionaries
from numpy.random import randint
import os

# Cross-validated training of a single CRF tagger per fold on label powerset
# labels (each word's label is the comma-joined set of its codes).

# One trained model per fold
fold_models = []
# Actual labels (ys) and predicted labels per code, accumulated over all folds
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = cross_validation(tagged_essays, CV_FOLDS)
for fold,(essays_TD, essays_VD) in enumerate(folds):

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    # Start Training
    print("Fold %i Training code" % (fold))

    # Random suffix so each model file gets its own name; the file is removed
    # again as soon as training has finished
    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "power_set", str(randint(0, 9999999)))

    model = CRFTagger(feature_func = get_word_features, verbose=False)
    model.train(td_sents, model_filename)
    # Keep the model trained for this fold. This used to append `code2model`, a
    # name this cell never assigns: on a fresh kernel that is a NameError, and
    # otherwise it stored a stale dict left over from the per-code training cell.
    fold_models.append(model)

    os.remove(model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    # merge results for fold
    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

Fold 0 Training code
Fold 1 Training code
Fold 2 Training code


In [None]:
logger.info("Training completed")

""" Persist Results to Mongo DB """

# Algorithm name stored with the results, and the suffix that makes the
# collection names below specific to this experiment
wd_algo = "CRF_lbl_powerset"
SUFFIX = "_CRF_LBL_POWERSET_TEST"
# NOTE(review): CB_SENT_TD / CB_SENT_VD are not used by the cells visible here
CB_TAGGING_TD, CB_TAGGING_VD, CB_SENT_TD, CB_SENT_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX, "CB_SENT_TD" + SUFFIX, "CB_SENT_VD" + SUFFIX
# copy of config, so the extra key below does not modify config itself
parameters = dict(config)
#parameters["extractors"] = map(lambda fn: fn.func_name, extractors)
parameters["min_feat_freq"] = MIN_FEAT_FREQ

# Store actual vs predicted labels for the TD and VD splits; the returned ids are
# used to look the results up again below
wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

# This outputs 0's for MEAN CONCEPT CODES as we aren't including those in the outputs

print processor.results_to_string(wd_td_objectid,   CB_TAGGING_TD,  wd_vd_objectid,     CB_TAGGING_VD,  "TAGGING")
logger.info("Results Processed")

## Train Tagger on Most Frequent Label (based on label priors) 

In [None]:
from nltk.tag.crf import CRFTagger
from wordtagginghelper import merge_dictionaries
from numpy.random import randint
import os

# Cross-validated training of a single CRF tagger per fold, where each word is
# labelled with only its most frequent code; evaluated against all of the word's codes.

# One trained model per fold
fold_models = []
#TD, act vs pred
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
#VD, act vs pred
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = cross_validation(tagged_essays, CV_FOLDS)
for fold,(essays_TD, essays_VD) in enumerate(folds):

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # Start Training
    print("Fold %i Training code" % (fold))

    # Random suffix so each model file gets its own name; the file is removed
    # again as soon as training has finished
    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func = get_word_features, verbose=False)
    model.train(td_sents, model_filename)
    # Keep the model trained for this fold. This used to append `code2model`, a
    # name this cell never assigns: on a fresh kernel that is a NameError, and
    # otherwise it stored a stale dict left over from the per-code training cell.
    fold_models.append(model)

    os.remove(model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL) - taken from the powerset labels so that every code a word
    # carries counts as an actual positive, not just its most frequent one
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    # merge results for fold
    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

In [None]:
logger.info("Training completed")

""" Persist Results to Mongo DB """

# Algorithm name stored with the results, and the suffix that makes the
# collection names below specific to this experiment
wd_algo = "CRF_MOST_FREQUENT_TAG"
# NOTE(review): this SUFFIX is identical to the one in the label powerset cell,
# so both experiments are persisted under the same collection names and are
# told apart only by wd_algo - looks like a copy/paste leftover; confirm.
SUFFIX = "_CRF_LBL_POWERSET_TEST"
# NOTE(review): CB_SENT_TD / CB_SENT_VD are not used by the cells visible here
CB_TAGGING_TD, CB_TAGGING_VD, CB_SENT_TD, CB_SENT_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX, "CB_SENT_TD" + SUFFIX, "CB_SENT_VD" + SUFFIX
# copy of config, so the extra key below does not modify config itself
parameters = dict(config)
#parameters["extractors"] = map(lambda fn: fn.func_name, extractors)
parameters["min_feat_freq"] = MIN_FEAT_FREQ

# Store actual vs predicted labels for the TD and VD splits; the returned ids are
# used to look the results up again below
wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

# This outputs 0's for MEAN CONCEPT CODES as we aren't including those in the outputs

print processor.results_to_string(wd_td_objectid,   CB_TAGGING_TD,  wd_vd_objectid,     CB_TAGGING_VD,  "TAGGING")
logger.info("Results Processed")

Extract Features
----------------

Test Feature Extractor
----------------------

In [118]:
def comparator(kvp):
    """
    Sort key for (feature name, value) pairs such as ('WD-3:START', 1).

    :param kvp: a (key, value) pair; the key is a two character prefix, an
        integer offset, then optionally ':' and further text
    :return : the integer offset found in the key
    :rtype : int
    """
    key, _ = kvp
    # drop the two character prefix, keep what precedes the first ':'
    offset = key[2:].partition(":")[0]
    return int(offset)

def test_feature_extractor_on_sentence(extractor, sent):
    sent = sent.split(" ")
    for i in range(len(sent)):
        print sent[i].ljust(10),
        s = sorted(extractor(sent, i).items(), key = comparator)
        print map(lambda item: str(item).ljust(10),zip(*s)[0])

# Example sentences of different lengths; for each word, print the feature names
# that extract_features produces
sent1 = "the cat sat on the mat"
sent2 = "coral bleaching"
sent3 = "president obama approached the senate, ..."
test_feature_extractor_on_sentence(extract_features, sent1)
print ""
test_feature_extractor_on_sentence(extract_features, sent2)
print ""
test_feature_extractor_on_sentence(extract_features, sent3)
print ""

the        ['WD-3:START', 'WD-2:START', 'WD-1:START', 'WD0:the   ', 'WD1:cat   ', 'WD2:sat   ', 'WD3:on    ']
cat        ['WD-3:START', 'WD-2:START', 'WD-1:the  ', 'WD0:cat   ', 'WD1:sat   ', 'WD2:on    ', 'WD3:the   ']
sat        ['WD-3:START', 'WD-2:the  ', 'WD-1:cat  ', 'WD0:sat   ', 'WD1:on    ', 'WD2:the   ', 'WD3:mat   ']
on         ['WD-3:the  ', 'WD-2:cat  ', 'WD-1:sat  ', 'WD0:on    ', 'WD1:the   ', 'WD2:mat   ', 'WD3:END   ']
the        ['WD-3:cat  ', 'WD-2:sat  ', 'WD-1:on   ', 'WD0:the   ', 'WD1:mat   ', 'WD2:END   ', 'WD3:END   ']
mat        ['WD-3:sat  ', 'WD-2:on   ', 'WD-1:the  ', 'WD0:mat   ', 'WD1:END   ', 'WD2:END   ', 'WD3:END   ']

coral      ['WD-3:START', 'WD-2:START', 'WD-1:START', 'WD0:coral ', 'WD1:bleach', 'WD2:END   ', 'WD3:END   ']
bleaching  ['WD-3:START', 'WD-2:START', 'WD-1:coral', 'WD0:bleach', 'WD1:END   ', 'WD2:END   ', 'WD3:END   ']

president  ['WD-3:START', 'WD-2:START', 'WD-1:START', 'WD0:presid', 'WD1:obama ', 'WD2:approach', 'WD3:the   ']
obama 