In [1]:
import datetime
import logging
from collections import defaultdict

import dill
import numpy as np
import pymongo
import pandas as pd
from sklearn.linear_model import LogisticRegression
from typing import Any

from CrossValidation import cross_validation
from Settings import Settings
from cost_functions import *
from crel_helper import get_cr_tags
from function_helpers import get_function_names, get_functions_by_name
from results_procesor import ResultsProcessor, __MICRO_F1__
from searn_parser import SearnModelTemplateFeatures
from template_feature_extractor import *
from window_based_tagger_config import get_config
from wordtagginghelper import merge_dictionaries

import sys
sys.path.append("/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Co-Reference Resolution/Results")

from results_common import get_essays

In [2]:
# Connect to MongoDB with the driver's default connection settings and select the "metrics" database
client = pymongo.MongoClient()
db = client.metrics

# Data Set Partition
CV_FOLDS = 5       # number of folds passed to cross_validation
MIN_FEAT_FREQ = 5  # passed to the parser as min_feature_freq

# Global settings

# All dataset folders are built by string concatenation, so each piece must end with "/"
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
# NOTE: The predictions read from the folders below are generated by the
# "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# and are used as inputs to the parsing model
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

# Tagger configuration built from the training folder
config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [3]:
# Load the training essays from a dill file in the CRel output folder.
# NOTE(review): dill.load can execute arbitrary code while unpickling - only load trusted files.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

# Load the matching test essays from the same folder
test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Display the size of each partition as the cell output
len(pred_tagged_essays_train),len(pred_tagged_essays_test)

(902, 226)

In [4]:
# doesn't matter with parser for this
# Folder that get_essays searches for the processed essay files (same path as coref_root)
stanford_coref_predictions_folder = root_folder + "CoReference/"

# Load the original training / test essays; later cells compare them against the
# pred_tagged_essays_* essays to isolate the tags that differ between the two
orig_pred_tagged_essays_train = get_essays(stanford_coref_predictions_folder, "Training")
orig_pred_tagged_essays_test  = get_essays(stanford_coref_predictions_folder, "Test")


Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/training_processed.dill
Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/test_processed.dill


In [5]:
from process_essays_coref import *

def get_predicted_tags(essays):
    """
    Collect the anaphor tags predicted for every sentence.

    The essays are first passed through processed_essays_predict_most_recent_tag
    (with format_ana_tags=True). For each entry of every resulting essay's
    pred_tagged_sentences, the tags that start with "Anaphor:[" are kept, with that
    prefix and every "]" removed and surrounding whitespace stripped.

    Returns:
        A list with one set of cleaned tags per sentence, in essay then sentence order.
    """
    ana_prefix = "Anaphor:["
    preds_by_sent = []
    for essay in processed_essays_predict_most_recent_tag(essays=essays, format_ana_tags=True):
        for sentence in essay.pred_tagged_sentences:
            preds_by_sent.append({
                tag.replace(ana_prefix, "").replace("]", "").strip()
                for word_tags in sentence
                for tag in word_tags
                if tag.startswith(ana_prefix)
            })
    return preds_by_sent

In [6]:
# Tag value that is dropped from per-sentence predicted tag sets before they are compared
EMPTY = "Empty"
# ANAPHORA: tags containing this marker are rejected by to_is_valid_crel
from BrattEssay import ANAPHORA

def get_different_predicted_tags_by_sent(essays_a, essays_b):
    """
    For each sentence, return the predicted tags present in essays_b but not in essays_a.

    Both essay lists must be the same length and aligned, with matching sentence and
    predicted-sentence counts per essay (enforced with assertions). The EMPTY tag is
    ignored on both sides.

    Returns:
        A list with one set per sentence, in essay then sentence order.
    """
    assert len(essays_a) == len(essays_b)
    diff_by_sent = []
    for essay_a, essay_b in zip(essays_a, essays_b):
        assert len(essay_a.sentences) == len(essay_b.sentences)
        assert len(essay_a.pred_tagged_sentences) == len(essay_b.pred_tagged_sentences)
        sentence_pairs = zip(essay_a.pred_tagged_sentences, essay_b.pred_tagged_sentences)
        for sent_a, sent_b in sentence_pairs:
            tags_a = set(sent_a)
            tags_a.discard(EMPTY)
            tags_b = set(sent_b)
            tags_b.discard(EMPTY)
            diff_by_sent.append(tags_b - tags_a)
    return diff_by_sent

def to_is_valid_crel(tags):
    """
    Filter a collection of tags down to the relation tags of interest.

    A tag is kept when it contains "->", does not contain the ANAPHORA marker, and its
    lower-cased form contains none of "rhetorical", "change" or "other".

    Returns:
        A set of the tags that pass the filter.
    """
    excluded_terms = ("rhetorical", "change", "other")
    return {
        tag
        for tag in tags
        if not any(term in tag.lower() for term in excluded_terms)
        and "->" in tag
        and ANAPHORA not in tag
    }

def get_different_crel_tags_by_sent(essays_a, essays_b):
    """
    For each sentence, return the valid relation tags found in essays_b but not in essays_a.

    Both essay lists must be the same length and aligned, with equal sentence counts per
    essay and identical words at each paired position (enforced with assertions).
    Sentences are sequences of (word, tags) pairs; tags are filtered by to_is_valid_crel.

    Returns:
        A list with one set per sentence, in essay then sentence order.
    """
    assert len(essays_a) == len(essays_b)
    diff_by_sent = []
    for essay_a, essay_b in zip(essays_a, essays_b):
        assert len(essay_a.sentences) == len(essay_b.sentences)
        for sent_a, sent_b in zip(essay_a.sentences, essay_b.sentences):
            crels_a = set()
            crels_b = set()
            for (word_a, word_tags_a), (word_b, word_tags_b) in zip(sent_a, sent_b):
                assert word_a == word_b
                crels_a |= to_is_valid_crel(word_tags_a)
                crels_b |= to_is_valid_crel(word_tags_b)

            diff_by_sent.append(crels_b - crels_a)
    return diff_by_sent

def get_crel_tags_by_sent(essays_a):
    """
    Gather the valid relation tags of every sentence.

    Sentences are sequences of (word, tags) pairs; each word's tags are filtered by
    to_is_valid_crel and pooled per sentence.

    Returns:
        A list with one set per sentence, in essay then sentence order.
    """
    return [
        set().union(*(to_is_valid_crel(word_tags) for _word, word_tags in sentence))
        for essay in essays_a
        for sentence in essay.sentences
    ]

In [7]:
# Relation tags collected from both partitions; presumably the full set of causal relation
# codes (verify against get_cr_tags). Used as the code list by the parser and the metrics.
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
# Split the training essays into CV_FOLDS folds; evaluate_model consumes each fold as a
# (training essays, validation essays) pair.
# Alternative - evaluate on the single train / test split instead of cross-validating:
# cv_folds  = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)

In [8]:
def evaluate_model(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> Tuple[Any, Any, Any, Any, Any, Any]:
    """
    Train and evaluate the parser on every fold, pooling the results across folds.

    Args:
        collection_prefix: not used by this function; kept so existing callers still work.
        folds: (training essays, validation essays) pairs, one per fold.
        extractor_fn_names_lst: names of the feature extractor functions to use.
        cost_function_name: name of the cost function to use.
        beta: forwarded to model_train_predict.
        ngrams: forwarded to model_train_predict.
        stemmed: forwarded to model_train_predict.
        max_epochs: forwarded to model_train_predict.
        down_sample_rate: when below 1.0, only the leading fraction of each fold's
            training and validation essays is used (for faster smoke tests).

    Returns:
        A 6-tuple of
        (training labels by tag, training predictions by tag, training sentence predictions
        per fold, validation labels by tag, validation predictions by tag, validation
        sentence predictions per fold). The "by tag" items are dicts merged across folds;
        the "per fold" items are lists holding one entry per fold.
    """
    if down_sample_rate < 1.0:
        # Keep only the first N essays of each partition; slicing never mutates the inputs
        folds = [
            (essays_TD[:int(down_sample_rate * len(essays_TD))],
             essays_VD[:int(down_sample_rate * len(essays_VD))])
            for essays_TD, essays_VD in folds
        ]

    # Folds are trained one after another (serially)
    serial_results = [
        model_train_predict(essays_TD, essays_VD, extractor_fn_names_lst, cost_function_name, ngrams, stemmed, beta, max_epochs)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    cv_td_preds_by_sent = []
    cv_vd_preds_by_sent = []
    # The per-fold feature count is returned by model_train_predict but is not needed here
    for (_num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode, td_preds_by_sent, vd_preds_by_sent) in serial_results:

        # Pool the per-tag labels and predictions of this fold into the cross-fold dicts
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

        # Sentence-level predictions stay grouped by fold
        cv_td_preds_by_sent.append(td_preds_by_sent)
        cv_vd_preds_by_sent.append(vd_preds_by_sent)

    return cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent

In [9]:
def predict_by_sent(tagged_essays, model):
    """
    Run the model on every sentence of every essay.

    For each sentence, model.predict_sentence is called with the sentence and the entry
    of the essay's pred_tagged_sentences at the same index.

    Returns:
        A flat list of the model's outputs, in essay then sentence order.
    """
    return [
        model.predict_sentence(sentence, essay.pred_tagged_sentences[sent_ix])
        for essay in tagged_essays
        for sent_ix, sentence in enumerate(essay.sentences)
    ]

def model_train_predict(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta, max_epochs):
    """
    Train one parser on essays_TD and produce labels and predictions for both partitions.

    Reads the notebook-level names all_extractor_fns, all_cost_functions, MIN_FEAT_FREQ,
    cr_tags and BASE_LEARNER_FACT at call time, so those cells must have been run first.

    Returns:
        A 7-tuple of (number of features, training labels by code, validation labels by
        code, training predictions by code, validation predictions by code, training
        predictions by sentence, validation predictions by sentence).
    """
    # Resolve the extractor and cost functions from their names
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    ngram_extractor_cls = NgramExtractorStemmed if stemmed else NgramExtractor
    ngram_extractor = ngram_extractor_cls(max_ngram_len=ngrams)

    parse_model = SearnModelTemplateFeatures(
        feature_extractor=template_feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=BASE_LEARNER_FACT,
        beta=beta,
        # swap in `lambda s: print(s)` to see the training log
        log_fn=lambda s: None,
    )
    parse_model.train(essays_TD, max_epochs=max_epochs)

    num_feats = template_feature_extractor.num_features()

    # Labels first, then predictions, each for training then validation
    sent_td_ys_bycode = parse_model.get_label_data(essays_TD)
    sent_vd_ys_bycode = parse_model.get_label_data(essays_VD)

    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    td_preds_by_sent = predict_by_sent(essays_TD, parse_model)
    vd_preds_by_sent = predict_by_sent(essays_VD, parse_model)

    return (
        num_feats,
        sent_td_ys_bycode,
        sent_vd_ys_bycode,
        sent_td_pred_ys_bycode,
        sent_vd_pred_ys_bycode,
        td_preds_by_sent,
        vd_preds_by_sent,
    )

In [17]:
def metrics_to_df(metrics):
    """
    Convert a mapping of code -> metrics into a DataFrame with one row per code.

    Values that are exactly Rpfa.rpfa instances contribute their attributes as columns,
    values that are exactly dicts contribute their items, and any other value contributes
    no metric columns. Every row also gets a "code" column holding the mapping key.
    """
    import Rpfa

    def to_row(code, value):
        # Exact type checks: subclasses fall through to the empty row
        if type(value) == Rpfa.rpfa:
            row = dict(value.__dict__)
        elif type(value) == dict:
            row = dict(value)
        else:
            row = dict()
        row["code"] = code
        return row

    return pd.DataFrame([to_row(code, value) for code, value in metrics.items()])

def get_metrics_from_dict(act_ys_bycode, pred_ys_bycode, expected_tags):
    """
    Compute mean metrics from per-code labels and predictions, as a DataFrame.

    Sanity checks (assertions): both dicts must have as many keys as there are expected
    tags, and for the first and last expected tag the label and prediction lists must be
    the same length.

    Returns:
        The DataFrame built by metrics_to_df from ResultsProcessor.compute_mean_metrics.
    """
    assert len(act_ys_bycode.keys()) == len(pred_ys_bycode.keys()) == len(expected_tags), "Miss-matched codes"

    # Only the two boundary tags are checked
    tags = list(expected_tags)
    for boundary_tag in (tags[0], tags[-1]):
        assert len(act_ys_bycode[boundary_tag]) == len(pred_ys_bycode[boundary_tag]), "Different numbers of words"

    return metrics_to_df(ResultsProcessor.compute_mean_metrics(act_ys_bycode, pred_ys_bycode))

def compute_metrics_from_essays(tagged_esssays, expected_tags):
    """
    Compute the metrics DataFrame for a set of tagged essays.

    Labels come from get_wd_level_lbs and predictions from get_wd_level_preds, both
    restricted to expected_tags; the pair is scored by get_metrics_from_dict.
    """
    return get_metrics_from_dict(
        get_wd_level_lbs(tagged_esssays, expected_tags),
        get_wd_level_preds(tagged_esssays, expected_tags),
        expected_tags,
    )

def get_micro_metrics(df):
    """
    Select the summary metrics of the "MICRO_F1" row(s) of a metrics DataFrame.

    Returns:
        The accuracy, f1_score, precision and recall columns of the rows whose "code"
        column equals "MICRO_F1".
    """
    metric_columns = ["accuracy", "f1_score", "precision", "recall"]
    is_micro_row = df["code"] == "MICRO_F1"
    return df.loc[is_micro_row, metric_columns]

In [18]:
LINE_WIDTH = 80

# other settings
DOWN_SAMPLE_RATE = 1.0  # For faster smoke testing the algorithm
# Placeholder: model_train_predict reads this name when it runs, and it is rebound to a
# LogisticRegression factory in the training cell before evaluate_model is called
BASE_LEARNER_FACT = None
COLLECTION_PREFIX = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_MOST_RECENT_CODE"

# some of the other extractors aren't functional if the system isn't able to do a basic parse
# so the base extractors are the MVP for getting to a basic parser, then additional 'meta' parse
# features from all_extractors can be included
base_extractors = [
    single_words,
    word_pairs,
    three_words,
    between_word_features
]

# Every extractor that can be selected by name in model_train_predict
all_extractor_fns = base_extractors + [
    word_distance,
    valency,
    unigrams,
    third_order,
    label_set,
    size_features
]

# Every cost function that can be selected by name in model_train_predict
all_cost_functions = [
    micro_f1_cost,
    micro_f1_cost_squared,
    micro_f1_cost_plusone,
    micro_f1_cost_plusepsilon,
    binary_cost,
    inverse_micro_f1_cost,
    uniform_cost
]

# Name lists matching the function lists above
all_extractor_fn_names = get_function_names(all_extractor_fns)
base_extractor_fn_names = get_function_names(base_extractors)
all_cost_fn_names = get_function_names(all_cost_functions)

### Note that these are different for the Skin Cancer dataset

In [20]:
# Hyper-parameters for the run; all are read by the training cells further down
ngrams = 1          # max_ngram_len of the n-gram extractor
stemmed = True      # selects NgramExtractorStemmed over NgramExtractor
cost_function_name = micro_f1_cost_plusepsilon.__name__
dual = True         # LogisticRegression(dual=...)
fit_intercept = True  # LogisticRegression(fit_intercept=...)
beta = 0.5          # passed to the parser as beta
max_epochs = 2      # passed to parse_model.train
C = 0.5             # LogisticRegression(C=...)
penalty = "l2"      # LogisticRegression(penalty=...)

# Train CV

In [32]:
# Note these also differ for SC dataset
# Factory for the base learner; it reads dual / C / penalty / fit_intercept when called.
# The solver is pinned to liblinear because the dual formulation is only implemented for
# liblinear with an l2 penalty. scikit-learn >= 0.22 defaults to lbfgs, which raises a
# ValueError for dual=True; liblinear was the default solver before that release.
BASE_LEARNER_FACT = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept,
                                                solver="liblinear")
best_extractor_names = ['single_words', 'between_word_features', 'label_set',
                                    'three_words', 'third_order', 'unigrams'] # type: List[str]
# Cross-validated training and evaluation over cv_folds
result = evaluate_model(
    collection_prefix=COLLECTION_PREFIX,
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    cost_function_name=cost_function_name,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=DOWN_SAMPLE_RATE,
    max_epochs=max_epochs)

In [33]:
# Unpack the cross-validation result: labels by tag, predictions by tag and per-fold
# sentence predictions, for the training (td) partitions and then the validation (vd) ones
cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent = result

### CV Metrics - All CRel Codes (Inc. Anaphora)

#### CV Train

In [34]:
# "MICRO_F1" summary row for the training partitions, pooled over the CV folds
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.997857,0.741991,0.769721,0.716189


#### CV Test

In [35]:
# "MICRO_F1" summary row for the validation partitions, pooled over the CV folds
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.997475,0.698909,0.717624,0.681146


# Compute Anaphora CRel Accuracy

In [36]:
# Fold the original essays with the same fold count used for cv_folds (which was built as
# cross_validation(pred_tagged_essays_train, CV_FOLDS)), so both can be zipped fold-by-fold.
# NOTE(review): assumes cross_validation splits deterministically by position, so both
# fold lists contain the same essays in the same order - TODO confirm
orig_cv_folds = cross_validation(orig_pred_tagged_essays_train, CV_FOLDS)

In [37]:
def build_metrics_by_code(td_preds_by_sent, diffs_train, diffs_crel_train, ys_by_code_train, preds_by_code_train):
    """
    Append per-sentence 0/1 labels and predictions for every relation tag in cr_tags.

    The three per-sentence sequences are walked in parallel:
        td_preds_by_sent: predicted relations ("left->right" strings) per sentence.
        diffs_train: concept codes per sentence; a predicted relation only counts when
            the code after the last ":" on its left or right side is in this collection.
        diffs_crel_train: the relations treated as the true labels per sentence.

    ys_by_code_train and preds_by_code_train are updated in place, gaining one entry per
    sentence for each tag in the notebook-level cr_tags. Despite the parameter names, the
    function is used for validation / test data as well. Returns None.
    """
    for sent_pred_crels, sent_codes, sent_true_crels in zip(td_preds_by_sent, diffs_train, diffs_crel_train):
        for tag in cr_tags:
            ys_by_code_train[tag].append(int(tag in sent_true_crels))

        # filter predictions to only those that were produced as a result of the ana tags
        ana_preds = set()
        for pred_crel in sent_pred_crels:
            lhs, rhs = pred_crel.split("->")
            left_code = lhs.split(":")[-1].strip()
            right_code = rhs.split(":")[-1].strip()
            if left_code in sent_codes or right_code in sent_codes:
                ana_preds.add(pred_crel)

        for tag in cr_tags:
            preds_by_code_train[tag].append(int(tag in ana_preds))

In [38]:
# Predicted Concept Codes - correspond to Ana tags
# Per-code 0/1 predictions and labels, accumulated across all folds
preds_by_code_train = defaultdict(list)
ys_by_code_train    = defaultdict(list)

preds_by_code_test = defaultdict(list)
ys_by_code_test    = defaultdict(list)

# Walk the original folds, the ana-tagged folds and the per-fold sentence predictions together.
# NOTE(review): relies on orig_cv_folds and cv_folds holding the same essays in the same order
for (orig_td, orig_vd), (td, vd), td_preds_by_sent, vd_preds_by_sent in zip(orig_cv_folds, cv_folds, cv_td_preds_by_sent, cv_vd_preds_by_sent):
    
    # Anaphor codes predicted per sentence (despite the "diffs" name, these are not set differences)
    diffs_train = get_predicted_tags(essays=orig_td)
    diffs_test  = get_predicted_tags(essays=orig_vd)

    # Actual Ana CRels
    # i.e. relation tags present in the ana-tagged essays but absent from the original essays
    diffs_crel_train = get_different_crel_tags_by_sent(essays_a=orig_td, essays_b=td)
    diffs_crel_test  = get_different_crel_tags_by_sent(essays_a=orig_vd, essays_b=vd)
    
    # Append this fold's labels / predictions to the accumulators
    build_metrics_by_code(td_preds_by_sent, diffs_train, diffs_crel_train, ys_by_code_train, preds_by_code_train)
    build_metrics_by_code(vd_preds_by_sent, diffs_test,  diffs_crel_test,  ys_by_code_test,  preds_by_code_test)

### CV Metrics - Ana Codes Only

#### Train

In [39]:
# Anaphora-only metrics on the CV training partitions; show the "MICRO_F1" summary row
df_train = get_metrics_from_dict(ys_by_code_train, preds_by_code_train, cr_tags)
get_micro_metrics(df_train)

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.999642,0.084746,0.227273,0.052083


#### Test

In [40]:
# Anaphora-only metrics on the CV validation partitions; show the "MICRO_F1" summary row
df_test = get_metrics_from_dict(ys_by_code_test, preds_by_code_test, cr_tags)
get_micro_metrics(df_test)

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.999638,0.086957,0.220339,0.054167


# Train for Test Set Eval

In [41]:
# Wrap the full train / test split as a single "fold" so evaluate_model and the fold-wise
# anaphora evaluation loop can be reused unchanged for the held-out test set
orig_test_folds  = [(orig_pred_tagged_essays_train, orig_pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
test_folds     = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]

In [42]:
# Train on the full training set and evaluate on the test set, using the same settings as
# the cross-validation run
result_test = evaluate_model(
    collection_prefix=COLLECTION_PREFIX,
    folds=test_folds,
    extractor_fn_names_lst=best_extractor_names,
    cost_function_name=cost_function_name,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=DOWN_SAMPLE_RATE,
    max_epochs=max_epochs)

### Test Metrics (All Codes Inc. Ana)

#### Train

In [43]:
# Rebind the cv_* names to the results of the train / test run; from here on they no
# longer hold the cross-validation results
cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent = result_test

# "MICRO_F1" summary row on the training set
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.99788,0.743672,0.7747,0.715034


#### Test

In [45]:
# Unpack the train / test run again so this cell does not depend on the previous one
cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, \
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, cv_vd_preds_by_sent = result_test

# "MICRO_F1" summary row on the held-out test set
mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
get_micro_metrics(metrics_to_df(mean_metrics))

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.99784,0.711994,0.688331,0.737342


In [46]:
# Predicted Concept Codes - correspond to Ana tags
# Fresh accumulators: these names are re-used from the cross-validation evaluation
preds_by_code_train = defaultdict(list)
ys_by_code_train    = defaultdict(list)

preds_by_code_test = defaultdict(list)
ys_by_code_test    = defaultdict(list)

# Same procedure as the cross-validation evaluation, applied to the single train / test "fold".
# cv_td_preds_by_sent / cv_vd_preds_by_sent must already hold the values unpacked from result_test
for (orig_td, orig_vd), (td, vd), td_preds_by_sent, vd_preds_by_sent in zip(orig_test_folds, test_folds, cv_td_preds_by_sent, cv_vd_preds_by_sent):
    
    # Anaphor codes predicted per sentence (despite the "diffs" name, these are not set differences)
    diffs_train = get_predicted_tags(essays=orig_td)
    diffs_test  = get_predicted_tags(essays=orig_vd)

    # Actual Ana CRels
    # i.e. relation tags present in the ana-tagged essays but absent from the original essays
    diffs_crel_train = get_different_crel_tags_by_sent(essays_a=orig_td, essays_b=td)
    diffs_crel_test  = get_different_crel_tags_by_sent(essays_a=orig_vd, essays_b=vd)
    
    # Append the labels / predictions to the accumulators
    build_metrics_by_code(td_preds_by_sent, diffs_train, diffs_crel_train, ys_by_code_train, preds_by_code_train)
    build_metrics_by_code(vd_preds_by_sent, diffs_test,  diffs_crel_test,  ys_by_code_test,  preds_by_code_test)

### Test Metrics - Ana Codes Only

#### Train

In [50]:
# Anaphora-only metrics on the training set of the train / test run; show the "MICRO_F1" row
df_train2 = get_metrics_from_dict(ys_by_code_train, preds_by_code_train, cr_tags)
get_micro_metrics(df_train2)

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.999642,0.087838,0.232143,0.054167


#### Test

In [51]:
# Anaphora-only metrics on the held-out test set; show the "MICRO_F1" row
df_test2 = get_metrics_from_dict(ys_by_code_test, preds_by_code_test, cr_tags)
get_micro_metrics(df_test2)

Unnamed: 0,accuracy,f1_score,precision,recall
95,0.999788,0.051282,0.166667,0.030303
