In [1]:
import datetime
import logging
from collections import defaultdict

import dill
import numpy as np
import pymongo
import pandas as pd
from sklearn.linear_model import LogisticRegression
from typing import Any

from CrossValidation import cross_validation
from Settings import Settings
from cost_functions import *
from crel_helper import get_cr_tags, get_tag_freq
from function_helpers import get_function_names, get_functions_by_name
from results_procesor import ResultsProcessor, __MICRO_F1__
from searn_parser import SearnModelTemplateFeatures
from template_feature_extractor import *
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries

In [2]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [3]:
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

len(pred_tagged_essays_train),len(pred_tagged_essays_test)

(902, 226)

In [4]:
EMPTY = "Empty"
from BrattEssay import ANAPHORA

def to_is_valid_crel(tags):
    filtered = set()
    for t in tags:
        t_lower = t.lower()
        if "rhetorical" in t_lower or "change" in t_lower or "other" in t_lower:
            continue
        if "->" in t and ANAPHORA not in t:
            filtered.add(t)
    return filtered

def get_crel_tags_by_sent(essays_a):
    crels_by_sent = []
    for ea in essays_a:
        for asent in ea.sentences:
            all_atags = set()
            for awd, atags in asent:
                all_atags.update(to_is_valid_crel(atags))
            crels_by_sent.append(all_atags)
    return crels_by_sent

In [5]:
tag_freq = get_tag_freq(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

In [6]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
set_cr_tags = set(cr_tags)
cr_tags[0:10]

['Causer:5->Result:50',
 'Causer:7->Result:50',
 'Causer:3->Result:4',
 'Causer:11->Result:50',
 'Causer:1->Result:50',
 'Causer:13->Result:50',
 'Causer:6->Result:50',
 'Causer:3->Result:5',
 'Causer:4->Result:14',
 'Causer:3->Result:1']

In [7]:
total = 0
for cr in cr_tags:
    l,r = cr.replace("Causer:","").replace("Result:","").split("->")
    total += tag_freq[cr]
    if l == r:
        print(cr, tag_freq[cr])
total

Causer:50->Result:50 19
Causer:11->Result:11 2


43227

In [22]:
def evaluate_model(
        folds: List[Tuple[Any, Any]],
        max_epochs: int) -> float:

    serial_results = [
        model_train_predict(essays_TD, essays_VD, max_epochs)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    cv_td_preds_by_sent = []
    cv_vd_preds_by_sent = []
    for (num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode, td_preds_by_sent, vd_preds_by_sent) in serial_results:
        number_of_feats.append(num_feats)

        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
        
        cv_td_preds_by_sent.append(td_preds_by_sent)
        cv_vd_preds_by_sent.append(vd_preds_by_sent)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    return cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent

def add_cr_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)

def get_label_data(tagged_essays):
    global set_cr_tags
    # outputs
    ys_bytag_sent = defaultdict(list)

    for essay in tagged_essays:
        for sentence in essay.sentences:
            unique_cr_tags = set()
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
            add_cr_labels(unique_cr_tags, ys_bytag_sent)
    return ys_bytag_sent

def get_label_data_essay_level(tagged_essays):
    global set_cr_tags
    # outputs
    ys_bytag_essay = defaultdict(list)

    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        add_cr_labels(unique_cr_tags, ys_bytag_essay)
    return ys_bytag_essay

def essay_to_crels(tagged_essays):
    global set_cr_tags
    # outputs
    name2crels = defaultdict(set)
    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
        name2crels[essay.name] = unique_cr_tags
    return name2crels

In [9]:
from featurevectorizer import FeatureVectorizer

def metrics_to_df(metrics):
    import Rpfa

    rows = []
    for k,val in metrics.items():
        if type(val) == Rpfa.rpfa:
            d = dict(val.__dict__) # convert obj to dict
        elif type(val) == dict:
            d = dict(val)
        else:
            d = dict()
        d["code"] = k
        rows.append(d)
    return pd.DataFrame(rows)

def get_micro_metrics(df):
    return df[df.code == "MICRO_F1"][["accuracy", "f1_score", "recall", "precision"]]

In [35]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

class DependencyClassifier(object):
    def __init__(self, classifier_fn=LogisticRegression, 
                 negative_label=0, sentence_span=2, 
                 min_feat_freq=10,
                 log_fn=lambda s: print(s), ):
        self.log = log_fn
        self.epoch = 0
        self.negative_label = negative_label
        self.sentence_span = sentence_span
        self.min_feat_freq=min_feat_freq
        self.vectorizer = FeatureVectorizer(min_feature_freq=min_feat_freq)
        self.clf = classifier_fn()
    
    def __fill_in_gaps__(self, tag_seq):
        new_tag_seq = []
        for i, tag in enumerate(tag_seq):
            if tag == EMPTY \
                and i > 0 \
                and tag_seq[i-1] != EMPTY \
                and i < len(tag_seq)-1 \
                and tag_seq[i-1] == tag_seq[i+1]:
                    tag = tag_seq[i-1]

            new_tag_seq.append(tag)
        return new_tag_seq

    def __compute_tag_2_spans__(self, essay):
        sent_tag2spans = []
        wd_ix = -1
        essay_words = []
        essay_ptags = []
        for sent_ix in range(len(essay.sentences)):
            words, tag_seq = zip(*essay.sentences[sent_ix])

            tag2spans = [] # maps to list of start and end spans for each tag
            sent_tag2spans.append(tag2spans)

            last_tag = EMPTY
            tag_start = None
            ptags_sent = self.__fill_in_gaps__(essay.pred_tagged_sentences[sent_ix])
            current_crel_tags = set()
            for i, ptag in enumerate(ptags_sent):
                wd_ix += 1
                essay_words.append(words[i])
                essay_ptags.append(ptag)
                # Tag changed
                if ptag != last_tag:
                    if last_tag != EMPTY:
                        tag2spans.append((last_tag, tag_start, wd_ix-1, sent_ix, current_crel_tags))                    
                    tag_start = wd_ix
                    current_crel_tags = set()
                current_crel_tags.update(to_is_valid_crel(tag_seq[i]))
                last_tag = ptag
            if last_tag != EMPTY:
                tag2spans.append((last_tag, tag_start, wd_ix, len(essay.sentences)-1, current_crel_tags))
        assert len(essay_words) == len(essay_ptags)
        return sent_tag2spans, essay_words, essay_ptags
    
    def __combine_feats__(self, ftsa, ftsb):
        fts = {}
        for a, aval in ftsa.items():
            for b, bval in ftsb.items():
                fts[a + "|" + b] = aval * bval
        return fts
    
    def create_features(self, causer_tag, result_tag, causer_words, between_words, result_words, 
                        causer_first, sentences_between, codes_between):
        feats = {}
        crel = "Causer:{a}->Result:{b}".format(a=causer_tag, b=result_tag)
        feats[crel] = 1
        feats["Causer:{tag}".format(tag=causer_tag)] = 1
        feats["Result:{tag}".format(tag=result_tag)] = 1
        cs_fts, res_fts = {},{}
        for wd in causer_words:
            cs_fts["Causer:{wd}".format(wd=wd)] = 1
        feats.update(cs_fts)
        for wd in result_words:
            res_fts["Result:{wd}".format(wd=wd)] = 1
        feats.update(res_fts)
        feats.update(self.__combine_feats__(cs_fts, res_fts))
        btwn_fts = {}
        for wd in between_words:
            btwn_fts["Between:{wd}".format(wd=wd)] = 1
        feats.update(btwn_fts)
#         feats.update(self.__combine_feats__(cs_fts, btwn_fts))
#         feats.update(self.__combine_feats__(res_fts, btwn_fts))
        if causer_first:
            feats["Left2Right"] = 1
        else:
            feats["Right2Left"] = 1

        if sentences_between == 0:
            feats["SameSentence"] = 1
        feats["SentBetween"] = sentences_between
        if sentences_between <= 1:
            feats["SentBetween<=1"] = 1
        if sentences_between <= 2:
            feats["SentBetween<=2"] = 1
        else:
            feats["SentBetween>2"] = 1
            
        feats["CodesBetween=" + str(codes_between)] = 1
        if codes_between <= 1:
            feats["CodesBetween<=1"] = 1
        if codes_between <= 2:
            feats["CodesBetween<=2"] = 1
        else:
            feats["CodesBetween>2"] = 1
        return feats, crel
    
    def __generate_training_data__(self, essays):
        xs, ys, essay_sent_crel = [],[],[]
        for essay_ix, essay in enumerate(essays):
            sent_tag2spans, essay_words, essay_ptags = self.__compute_tag_2_spans__(essay)
            for sent_ix in range(len(sent_tag2spans)):
                # tag 2 spans for sentence
                next_tag2spans = []
                # grab next few sentences' predicted tags
                for offset in range(0, self.sentence_span+1):
                    if (sent_ix+offset) < len(sent_tag2spans):
                        next_tag2spans.extend(sent_tag2spans[sent_ix+offset])
                
                for ltag_ix, (ltag, lstart_ix, lend_ix, lsent_ix, lcrels) in enumerate(sent_tag2spans[sent_ix]):
                    for codes_between, (rtag, rstart_ix, rend_ix, rsent_ix, rcrels) in enumerate(next_tag2spans[ltag_ix+1:]):
                        sent_between  = rsent_ix - lsent_ix

                        ltag_words    = essay_words[lstart_ix:lend_ix+1]
                        between_words = essay_words[lend_ix+1:rstart_ix]
                        rtag_words    = essay_words[rstart_ix:rend_ix+1]
                        
                        lbls = set(lcrels).union(rcrels)
                        x,ft_crel = self.create_features(
                                causer_tag=ltag, result_tag=rtag, 
                                causer_words=ltag_words, between_words=between_words, result_words=rtag_words, 
                                causer_first=True, sentences_between=sent_between, codes_between=codes_between)
                        xs.append(x)
                        ys.append(1 if ft_crel in lbls else self.negative_label)
                        essay_sent_crel.append((essay.name, lsent_ix, rsent_ix, ft_crel))
                        
                        x,ft_crel = self.create_features(
                                causer_tag=rtag, result_tag=ltag, 
                                causer_words=rtag_words, between_words=between_words, result_words=ltag_words, 
                                causer_first=False, sentences_between=sent_between, codes_between=codes_between)
                        xs.append(x)
                        ys.append(1 if ft_crel in lbls else self.negative_label)
                        essay_sent_crel.append((essay.name, lsent_ix, rsent_ix, ft_crel))
        return xs, ys, essay_sent_crel
    
    def train(self, train_essays, sent_span=2):
        # Note that there are a small number of crels that span 2 sentences
        xs, ys, essay_sent_crel = self.__generate_training_data__(essays=train_essays)
        xs_array = self.vectorizer.fit_transform(xs)
        self.clf.fit(X=xs_array, y=ys)
        preds = self.clf.predict(xs_array)
        
    def predict_probability(self, tagged_essays, min_prob=0.1):
        # Get predicted probabilities
        xs, _, essay_sent_crel = self.__generate_training_data__(essays=tagged_essays)
        xs_array = self.vectorizer.transform(xs)
        probs = self.clf.predict_proba(xs_array)[:,1]
        
        name2pred = defaultdict(lambda : defaultdict(list))
        for (name, lsent_ix, rsent_ix, crel), prob in zip(essay_sent_crel, probs):
            if prob >= min_prob:
                name2pred[name][crel].append(prob)
        return name2pred

    def evaluate(self, tagged_essays, print_classification_report=True):
        # Note that there are a small number of crels that span 2 sentences
        xs, ys, essay_sent_crel = self.__generate_training_data__(essays=tagged_essays)
        xs_array = self.vectorizer.transform(xs)
        preds = self.clf.predict(xs_array)
        if print_classification_report:
            print(classification_report(y_true=ys, y_pred=preds))

        #TODO - This doesn't work
        namesent2pred = defaultdict(set)
        for (name, lsent_ix, rsent_ix, crel), pred in zip(essay_sent_crel, preds):
            if pred == 1:
                namesent2pred[(name, lsent_ix)].add(crel)
                namesent2pred[(name, rsent_ix)].add(crel)

        pred_ys_bytag_sent = defaultdict(list)
        for essay in tagged_essays:
            for sent_ix, sentence in enumerate(essay.sentences):
                unique_cr_tags = namesent2pred[(essay.name, sent_ix)]
                add_cr_labels(unique_cr_tags, pred_ys_bytag_sent)
        return pred_ys_bytag_sent
    
    def evaluate_essay_level(self, tagged_essays, print_classification_report=True):
        # Note that there are a small number of crels that span 2 sentences
        xs, ys, essay_sent_crel = self.__generate_training_data__(essays=tagged_essays)
        xs_array = self.vectorizer.transform(xs)
        preds = self.clf.predict(xs_array)
        if print_classification_report:
            print(classification_report(y_true=ys, y_pred=preds))

        #TODO - This doesn't work
        namesent2pred = defaultdict(set)
        for (name, lsent_ix, rsent_ix, crel), pred in zip(essay_sent_crel, preds):
            if pred == 1:
                namesent2pred[name].add(crel)

        pred_ys_bytag_essay = defaultdict(list)
        for essay in tagged_essays:
            unique_cr_tags = namesent2pred[essay.name]
            add_cr_labels(unique_cr_tags, pred_ys_bytag_essay)
        return pred_ys_bytag_essay 


In [36]:
def compute_sentence_accuracy(parser, essays):
    ys_bytag_sent = get_label_data(tagged_essays=essays)
    pred_ys_bytag_sent = parser.evaluate(tagged_essays=essays, print_classification_report=False)
    mean_metrics = ResultsProcessor.compute_mean_metrics(ys_bytag_sent, pred_ys_bytag_sent)
    return get_micro_metrics(metrics_to_df(mean_metrics))

def compute_essay_accuracy(parser, essays):
    ys_bytag_essay = get_label_data_essay_level(tagged_essays=essays)
    pred_ys_bytag_essay = parser.evaluate_essay_level(tagged_essays=essays, print_classification_report=False)
    mean_metrics = ResultsProcessor.compute_mean_metrics(ys_bytag_essay, pred_ys_bytag_essay)
    return get_micro_metrics(metrics_to_df(mean_metrics))


## Sentence Level Accuracy is Equivalent to Parser Model (Or Very Close) When We Don't Look Across Sentences

In [37]:
parser = DependencyClassifier(sentence_span=0)
parser.train(pred_tagged_essays_train)
pred_ys_bytag_sent = parser.evaluate(pred_tagged_essays_train)
pred_ys_bytag_sent_test = parser.evaluate(pred_tagged_essays_test)

compute_sentence_accuracy(parser, pred_tagged_essays_test)

             precision    recall  f1-score   support

          0       0.97      0.96      0.97      9036
          1       0.88      0.91      0.89      2916

avg / total       0.95      0.95      0.95     11952

             precision    recall  f1-score   support

          0       0.93      0.91      0.92      1868
          1       0.73      0.78      0.75       586

avg / total       0.88      0.88      0.88      2454



Unnamed: 0,accuracy,f1_score,recall,precision
95,0.997886,0.691213,0.653481,0.73357


In [38]:
compute_essay_accuracy(parser, pred_tagged_essays_test)

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.985996,0.725191,0.694698,0.758483


In [24]:
from pprint import pprint

# look at the number of predicted items
lens = []
for ename, dct in probs.items():
    lens.append(len(dct))
    if len(dct) > 8:
#         pprint(dct)
        pass
        
np.mean(lens), np.median(lens), np.max(lens), np.percentile(lens, 75)

from itertools import combinations

def get_all_combos(items):
    cbos = []
    for i in range(1, len(items)+1):
        cbos.extend(combinations(items,i))
    return cbos

cbos = get_all_combos(range(3))
print(len(cbos)) # 2**len(items)-1
if len(cbos) < 1000:
    for cbo in sorted(cbos, key = lambda l: (len(l), l)):
        print(cbo)
        
probs = parser.predict_probability(pred_tagged_essays_test)

7
(0,)
(1,)
(2,)
(0, 1)
(0, 2)
(1, 2)
(0, 1, 2)


In [30]:
parser2 = DependencyClassifier(sentence_span=2)
parser2.train(pred_tagged_essays_train)
pred_ys_bytag_sent2 = parser2.evaluate(pred_tagged_essays_train)
pred_ys_bytag_sent_test2 = parser2.evaluate(pred_tagged_essays_test)

compute_sentence_accuracy(parser2, pred_tagged_essays_test)

             precision    recall  f1-score   support

          0       0.97      0.98      0.97     44627
          1       0.85      0.80      0.82      7487

avg / total       0.95      0.95      0.95     52114

             precision    recall  f1-score   support

          0       0.94      0.94      0.94      9280
          1       0.62      0.60      0.61      1484

avg / total       0.89      0.89      0.89     10764



Unnamed: 0,accuracy,f1_score,recall,precision
95,0.995256,0.498789,0.651899,0.403922


In [31]:
compute_essay_accuracy(parser2, pred_tagged_essays_test)

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.981183,0.670077,0.718464,0.627796


# Try Structured Perceptron

In [None]:
"""
Structured perceptron classifier. Implementation geared for simplicity rather than
efficiency.
"""
from collections import defaultdict
import pickle

class StructuredPerceptron(object):

    '''A structured perceptron, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self, learning_rate):
        # Each feature gets its own weight
        self.weights = defaultdict(float)
        self.learning_rate = learning_rate
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def rank(self, features_array):
        '''Dot-product the features and current weights and return the best label.'''
        scores2index = {}
        for i, feats in enumerate(features_array):
            scores2index[i] = self.decision_function(feats)
        # return a ranking of the scores, by best to worse

        return [ ix for ix, score in sorted(scores2index.items(), key = lambda tpl: -tpl[-1]) ]

    def train(self, best_feats, other_feats_array):
        best_ix = self.rank([best_feats] + list(other_feats_array))
        if best_ix != 0:
            predicted_feats = other_feats_array[best_ix-1]
            self.update(best_feats=best_feats, highest_ranked_feats=predicted_feats)

    def decision_function(self, features):
        '''Dot-product the features and current weights and return the score.'''
        score = 0.0
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            score += self.weights[feat] * value
        return score

    def update(self, best_feats, highest_ranked_feats):
        '''Update the feature weights.'''
        #TODO - weight the weight update by the difference in errors
        def upd_feat(feat, val):
            w = self.weights[feat]
            # update the totals by the number of timestamps the current value has survived * val
            self._totals[feat] += (self.i - self._tstamps[feat]) * w
            # store latest update timestamp
            self._tstamps[feat] = self.i
            # finally, update the current weight
            self.weights[feat] = w + (self.learning_rate * val)

        self.i += 1
        for feat, weight in self.weights.items():
            val = best_feats[feat] - highest_ranked_feats[feat]
            upd_feat(feat, val)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        new_feat_weights = defaultdict(float)
        for feat, weight in self.weights.items():
            total = self._totals[feat]
            total += (self.i - self._tstamps[feat]) * weight
            averaged = round(total / float(self.i), 5)
            if averaged != 0.0:
                new_feat_weights[feat] = averaged
        self.weights = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        return pickle.dump(dict(self.weights), open(path, 'w'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path))
        return None
