In [10]:
import datetime
import logging
from collections import defaultdict

import dill
import numpy as np
import pymongo
import pandas as pd
from sklearn.linear_model import LogisticRegression
from typing import Any

from CrossValidation import cross_validation
from Settings import Settings
from cost_functions import *
from crel_helper import get_cr_tags, get_tag_freq
from function_helpers import get_function_names, get_functions_by_name
from results_procesor import ResultsProcessor, __MICRO_F1__
from searn_parser import SearnModelTemplateFeatures
from template_feature_extractor import *
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries

In [2]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [3]:
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

len(pred_tagged_essays_train),len(pred_tagged_essays_test)

(902, 226)

In [89]:
EMPTY = "Empty"
from BrattEssay import ANAPHORA

def to_is_valid_crel(tags):
    filtered = set()
    for t in tags:
        t_lower = t.lower()
        if "rhetorical" in t_lower or "change" in t_lower or "other" in t_lower:
            continue
        if "->" in t and ANAPHORA not in t:
            filtered.add(t)
    return filtered

def get_crel_tags_by_sent(essays_a):
    crels_by_sent = []
    for ea in essays_a:
        for asent in ea.sentences:
            all_atags = set()
            for awd, atags in asent:
                all_atags.update(to_is_valid_crel(atags))
            crels_by_sent.append(all_atags)
    return crels_by_sent

In [11]:
tag_freq = get_tag_freq(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

In [17]:
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
set_cr_tags = set(cr_tags)
cr_tags[0:10]

['Causer:5->Result:50',
 'Causer:7->Result:50',
 'Causer:3->Result:4',
 'Causer:13->Result:50',
 'Causer:11->Result:50',
 'Causer:1->Result:50',
 'Causer:6->Result:50',
 'Causer:3->Result:5',
 'Causer:4->Result:14',
 'Causer:3->Result:1']

In [15]:
total = 0
for cr in cr_tags:
    l,r = cr.replace("Causer:","").replace("Result:","").split("->")
    total += tag_freq[cr]
    if l == r:
        print(cr, tag_freq[cr])
total

Causer:50->Result:50 19
Causer:11->Result:11 2


43227

In [21]:
def evaluate_model(
        folds: List[Tuple[Any, Any]],
        max_epochs: int) -> float:

    serial_results = [
        model_train_predict(essays_TD, essays_VD, max_epochs)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    cv_td_preds_by_sent = []
    cv_vd_preds_by_sent = []
    for (num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode, td_preds_by_sent, vd_preds_by_sent) in serial_results:
        number_of_feats.append(num_feats)

        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
        
        cv_td_preds_by_sent.append(td_preds_by_sent)
        cv_vd_preds_by_sent.append(vd_preds_by_sent)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    return cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent

def add_cr_labels(observed_tags, ys_bytag_sent):
    global set_cr_tags
    for tag in set_cr_tags:
        if tag in observed_tags:
            ys_bytag_sent[tag].append(1)
        else:
            ys_bytag_sent[tag].append(0)

def get_label_data(tagged_essays):
    global set_cr_tags
    # outputs
    ys_bytag_sent = defaultdict(list)

    for essay in tagged_essays:
        for sentence in essay.sentences:
            unique_cr_tags = set()
            for word, tags in sentence:
                unique_cr_tags.update(set_cr_tags.intersection(tags))
            add_cr_labels(unique_cr_tags, ys_bytag_sent)
    return ys_bytag_sent

In [140]:
from featurevectorizer import FeatureVectorizer

In [141]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

class DependencyClassifier(object):
    def __init__(self, classifier_fn=LogisticRegression, 
                 negative_label=0, sentence_span=2, 
                 min_feat_freq=10,
                 log_fn=lambda s: print(s), ):
        self.log = log_fn
        self.epoch = 0
        self.negative_label = negative_label
        self.sentence_span = sentence_span
        self.min_feat_freq=min_feat_freq
        self.vectorizer = FeatureVectorizer(min_feature_freq=min_feat_freq)
        self.clf = classifier_fn()
    
    def __fill_in_gaps__(self, tag_seq):
        new_tag_seq = []
        for i, tag in enumerate(tag_seq):
            if tag == EMPTY \
                and i > 0 \
                and tag_seq[i-1] != EMPTY \
                and i < len(tag_seq)-1 \
                and tag_seq[i-1] == tag_seq[i+1]:
                    tag = tag_seq[i-1]

            new_tag_seq.append(tag)
        return new_tag_seq

    def __compute_tag_2_spans__(self, essay):
        sent_tag2spans = []
        wd_ix = -1
        essay_words = []
        for sent_ix in range(len(essay.sentences)):
            words, tag_seq = zip(*essay.sentences[sent_ix])

            tag2spans = [] # maps to list of start and end spans for each tag
            sent_tag2spans.append(tag2spans)

            last_tag = EMPTY
            tag_start = None
            ptags_sent = self.__fill_in_gaps__(essay.pred_tagged_sentences[sent_ix])
            current_crel_tags = set()
            for i, ptag in enumerate(ptags_sent):
                wd_ix += 1
                essay_words.append(words[i])
                # Tag changed
                if ptag != last_tag:
                    if last_tag != EMPTY:
                        tag2spans.append((last_tag, tag_start, wd_ix-1, current_crel_tags))                    
                    tag_start = wd_ix
                    current_crel_tags = set()
                current_crel_tags.update(to_is_valid_crel(tag_seq[i]))
                last_tag = ptag
            if last_tag != EMPTY:
                tag2spans.append((last_tag, tag_start, wd_ix, current_crel_tags))
        return sent_tag2spans, essay_words
    
    def __combine_feats__(self, ftsa, ftsb):
        fts = {}
        for a, aval in ftsa.items():
            for b, bval in ftsb.items():
                fts[a + "|" + b] = aval * bval
        return fts
    
    def create_features(self, causer_tag, result_tag, causer_words, between_words, result_words, causer_first):
        feats = {}
        crel = "Causer:{a}->Result:{b}".format(a=causer_tag, b=result_tag)
        feats[crel] = 1
        feats["Causer:{tag}".format(tag=causer_tag)] = 1
        feats["Result:{tag}".format(tag=result_tag)] = 1
        cs_fts, res_fts = {},{}
        for wd in causer_words:
            cs_fts["Causer:{wd}".format(wd=wd)] = 1
        feats.update(cs_fts)
        for wd in result_words:
            res_fts["Result:{wd}".format(wd=wd)] = 1
        feats.update(res_fts)
        feats.update(self.__combine_feats__(cs_fts, res_fts))
        btwn_fts = {}
        for wd in between_words:
            btwn_fts["Between:{wd}".format(wd=wd)] = 1
        feats.update(btwn_fts)
        feats.update(self.__combine_feats__(cs_fts, btwn_fts))
        feats.update(self.__combine_feats__(res_fts, btwn_fts))
        if causer_first:
            feats["Left2Right"] = 1
        else:
            feats["Right2Left"] = 1
        return feats, crel
    
    def __generate_training_data__(self, essays):
        xs, ys, essay_sent_crel = [],[],[]
        for essay_ix, essay in enumerate(essays):
            sent_tag2spans, essay_words = self.__compute_tag_2_spans__(essay)
            for sent_ix in range(len(sent_tag2spans)):
                # tag 2 spans for sentence
                next_tag2spans = []
                # grab next few sentences' predicted tags
                for offset in range(0, self.sentence_span+1):
                    if (sent_ix+offset) < len(sent_tag2spans):
                        next_tag2spans.extend(sent_tag2spans[sent_ix+offset])
                
                for ltag_ix, (ltag, lstart_ix, lend_ix, lcrels) in enumerate(sent_tag2spans[sent_ix]):
                    for rtag, rstart_ix, rend_ix, rcrels in next_tag2spans[ltag_ix+1:]:
                        ltag_words    = essay_words[lstart_ix:lend_ix+1]
                        between_words = essay_words[lend_ix+1:rstart_ix]
                        rtag_words    = essay_words[rstart_ix:rend_ix+1]
                        
                        lbls = set(lcrels).union(rcrels)
                        x,ft_crel = self.create_features(
                                causer_tag=ltag, result_tag=rtag, 
                                causer_words=ltag_words, between_words=between_words, result_words=rtag_words, 
                                causer_first=True)
                        xs.append(x)
                        ys.append(1 if ft_crel in lbls else self.negative_label)
                        essay_sent_crel.append((e.name, sent_ix, ft_crel))
                        
                        x,ft_crel = self.create_features(
                                causer_tag=rtag, result_tag=ltag, 
                                causer_words=rtag_words, between_words=between_words, result_words=ltag_words, 
                                causer_first=False)
                        xs.append(x)
                        ys.append(1 if ft_crel in lbls else self.negative_label)
                        essay_sent_crel.append((e.name, sent_ix, ft_crel))
        return xs, ys, essay_sent_crel
    
    def train(self, train_essays, sent_span=2):
        # Note that there are a small number of crels that span 2 sentences
        xs, ys, essay_sent_crel = self.__generate_training_data__(essays=train_essays)
        xs_array = self.vectorizer.fit_transform(xs)
        self.clf.fit(X=xs_array, y=ys)
        preds = self.clf.predict(xs_array)

    def evaluate(self, tagged_essays):
        # Note that there are a small number of crels that span 2 sentences
        xs, ys, essay_sent_crel = self.__generate_training_data__(essays=tagged_essays)
        xs_array = self.vectorizer.transform(xs)
        preds = self.clf.predict(xs_array)
        print(classification_report(y_true=ys, y_pred=preds))

        #TODO - This doesn't work
        namesent2pred = defaultdict(set)
        for (name, sent_ix, crel), pred in zip(essay_sent_crel, preds):
            if pred == 1:
                namesent2pred[(name, sent_ix)].add(crel)

        pred_ys_bytag_sent = defaultdict(list)
        for essay in pred_tagged_essays_train:
            for sent_ix, sentence in enumerate(essay.sentences):
                unique_cr_tags = namesent2pred[(essay.name, sent_ix)]
                add_cr_labels(unique_cr_tags, pred_ys_bytag_sent)
        return pred_ys_bytag_sent
    
parser = DependencyClassifier()
parser.train(pred_tagged_essays_train)
pred_ys_bytag_sent = parser.evaluate(pred_tagged_essays_train)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     44627
          1       0.98      0.99      0.98      7487

avg / total       1.00      1.00      1.00     52114



In [142]:
pred_ys_bytag_sent_test = parser.evaluate(pred_tagged_essays_test)

             precision    recall  f1-score   support

          0       0.94      0.94      0.94      9280
          1       0.61      0.60      0.61      1484

avg / total       0.89      0.89      0.89     10764



In [139]:
from sklearn.ensemble import RandomForestClassifier
parser = DependencyClassifier(classifier_fn=RandomForestClassifier)
parser.train(pred_tagged_essays_train)
pred_ys_bytag_sent = parser.evaluate(pred_tagged_essays_train)
pred_ys_bytag_sent_test = parser.evaluate(pred_tagged_essays_test)

             precision    recall  f1-score   support

          0       0.99      1.00      1.00     44627
          1       0.99      0.95      0.97      7487

avg / total       0.99      0.99      0.99     52114

             precision    recall  f1-score   support

          0       0.92      0.97      0.94      9280
          1       0.72      0.44      0.54      1484

avg / total       0.89      0.90      0.89     10764



In [117]:
ys_bytag_sent = get_label_data(pred_tagged_essays_train)

In [121]:
mean_metrics = ResultsProcessor.compute_mean_metrics(ys_bytag_sent, pred_ys_bytag_sent)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.995669,0.000612,0.000308,0.041667


In [120]:
def model_train_predict(essays_TD, essays_VD, max_epochs):
    
    parse_model = ReRankingParser()
    parse_model.train(essays_TD, max_epochs=max_epochs)

    num_feats = template_feature_extractor.num_features()

    sent_td_ys_bycode = get_label_data(essays_TD)
    sent_vd_ys_bycode = get_label_data(essays_VD)

    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    td_preds_by_sent = predict_by_sent(essays_TD, parse_model)
    vd_preds_by_sent = predict_by_sent(essays_VD, parse_model)
    
    return num_feats, sent_td_ys_bycode, sent_vd_ys_bycode, sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode, td_preds_by_sent, vd_preds_by_sent

In [119]:
def metrics_to_df(metrics):
    import Rpfa

    rows = []
    for k,val in metrics.items():
        if type(val) == Rpfa.rpfa:
            d = dict(val.__dict__) # convert obj to dict
        elif type(val) == dict:
            d = dict(val)
        else:
            d = dict()
        d["code"] = k
        rows.append(d)
    return pd.DataFrame(rows)

def get_micro_metrics(df):
    return df[df.code == "MICRO_F1"][["accuracy", "f1_score", "recall", "precision"]]

### Note that these are different for Skin Cancer dataset

# Train for Test Set Eval

In [9]:
test_folds     = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]

In [53]:
%%time
result_test = evaluate_model(folds=test_folds, max_epochs=max_epochs)

CPU times: user 29.2 s, sys: 242 ms, total: 29.5 s
Wall time: 29.4 s


### Test Metrics (All Codes Inc. Ana)

#### Train

In [54]:
cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent = result_test
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.997865,0.741951,0.713494,0.772773


#### Test

In [55]:
cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent = result_test
    
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,recall,precision
95,0.99776,0.701754,0.727848,0.677467


In [56]:
pred_crels_by_sent = cv_vd_preds_by_sent[0]
crels_by_sent = get_crel_tags_by_sent(pred_tagged_essays_test)
len(pred_crels_by_sent), len(crels_by_sent)

(1918, 1918)