In [99]:
# %load_ext autoreload
# %autoreload 2

In [100]:
import os
import sys

# NOTE(review): absolute, machine-specific location of the "Causal Model"
# notebooks folder; its "src" sub-folder holds the project modules imported
# by the next cell. Adjust when running on another machine.
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
src_path = os.path.join(cm_folder, "src")
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [101]:
from collections import defaultdict
from random import shuffle
from typing import Any

import dill
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from CrossValidation import cross_validation
from structured_perceptron import StructuredPerceptron
from Settings import Settings

from window_based_tagger_config import get_config
from crel_helper import get_cr_tags
from crel_processing import essay_to_crels_cv
from evaluation import evaluate_model_essay_level, get_micro_metrics, metrics_to_df
from feature_normalization import min_max_normalize_feats
from function_helpers import get_function_names
from results_procesor import ResultsProcessor
from train_parser import essay_to_crels, create_extractor_functions
from cost_functions import micro_f1_cost_plusepsilon
from train_reranker import train_model, train_instance, get_essays_for_data, evaluate_ranker
from searn_parser_breadth_first import SearnModelBreadthFirst
from causal_model_features import CausalModelType
from feature_extraction import get_features_from_probabilities
from results_procesor import ResultsProcessor
from filter_features import filter_feats

from wordtagginghelper import merge_dictionaries
from results_procesor import ResultsProcessor, __MICRO_F1__
from evaluation import add_cr_labels

In [102]:
# Global settings
# Project-wide settings object; `settings.data_directory` is read by the
# dataset-loading cell below.
# NOTE(review): the directory listing shown under this cell is presumably
# printed by Settings() itself — confirm in Settings.py.
settings = Settings()
# Selects which dataset branch the loading cell takes. Switch the active line
# to run the notebook for the other causal model.
CAUSAL_MODEL_TYPE = CausalModelType.CORAL_BLEACHING
# CAUSAL_MODEL_TYPE = CausalModelType.SKIN_CANCER
# Folder of the re-ranker scripts; the "crels/<dataset>" folder is resolved
# relative to it. NOTE(review): absolute, machine-specific path.
root = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/Re-Ranker Final Scripts"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [103]:
# Global settings
# The two causal models differ only in the dataset directory name, the crels
# sub-folder and (for Skin Cancer) two Mongo collection names. Those are
# chosen in the branch; every path and the data loading are then built once
# below, so the two configurations cannot drift apart.
if CAUSAL_MODEL_TYPE == CausalModelType.CORAL_BLEACHING:
    dataset_name = "CoralBleaching"
    crels_subfolder = "CB"

    # first and second were with initial_weight set to 1.0
    # third is with set to 0.001
    # NOTE(review): the two lines above no longer refer to anything visible in
    # this cell — confirm whether they can be dropped.

else: # SC
    dataset_name = "SkinCancer"
    crels_subfolder = "SC"

    # Only defined when the Skin Cancer model is selected.
    MONGO_COLLECTION = "SC_STR_PCPTRN_RE-RANKER_HYPER_PARAM_TD"
    MONGO_TEST_COLLECTION = "TEST_SC_STR_PCPTRN_RE-RANKER_TD"

root_folder = settings.data_directory + dataset_name + "/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"

crels_folder = root + "/crels/" + crels_subfolder
coref_root = root_folder + "CoReference/"
coref_output_folder = coref_root + "CRel/"

config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_model_reranker")

# The tagged essays are stored as dill files. dill, like pickle, can execute
# arbitrary code on load, so only load files produced by this project.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Number of training and test essays loaded.
print(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

902 226


In [104]:
# Tags obtained from get_cr_tags over both the training and the test essays.
# NOTE(review): presumably the causal-relation tag vocabulary — confirm in
# crel_helper.get_cr_tags.
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)

# Set form of the same tags; this is what is handed to essay_to_crels.
set_cr_tags = set(cr_tags)
print(len(cr_tags))

91


In [105]:
# Training and test essays combined into one list (training essays first).
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
# Lookup from essay name to essay object. If two essays shared a name, the
# later one would silently replace the earlier one here.
name2essay = {}
for essay in all_essays:
    name2essay[essay.name] = essay

# NOTE(review): presumably maps essay name -> set of causal relations (a later
# cell annotates it as Dict[str, Set[str]]) — confirm in
# train_parser.essay_to_crels.
name2crels = essay_to_crels(all_essays, set_cr_tags)
# Sanity check: one entry per essay.
assert len(name2crels) == len(all_essays)

In [106]:
# Mean number of crels per essay, counting only essays that have at least one
# crel (entries with an empty collection are skipped by the `if crels` test).
c_lens = []
for name, crels in name2crels.items():
#     print(len(crels))
    if crels:
        c_lens.append(len(crels))
np.mean(c_lens)

3.568946796959826

In [107]:
# Show the folder that load_rerank reads the re-ranker files from.
crels_folder

'/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/Re-Ranker Final Scripts/crels/CB'

In [108]:
def load_rerank(top_n):
    """Load the two re-ranker files stored for a given `top_n`.

    Reads "xs_rerank_<top_n>.dill" and then "xs_rerank_test<top_n>.dill" from
    the module-level `crels_folder`.

    :param top_n: value embedded in the two file names (converted with str()).
    :return: tuple (xs_rerank, xs_test_rerank) with the unpickled contents of
             the first and second file respectively.
    """
    def _read_dill(file_name):
        # Files live directly under crels_folder and are opened in binary mode.
        with open(os.path.join(crels_folder, file_name), "rb") as fin:
            return dill.load(fin)

    train_file = "xs_rerank_" + str(top_n) + ".dill"
    test_file = "xs_rerank_test" + str(top_n) + ".dill"

    xs_rerank = _read_dill(train_file)
    xs_test_rerank = _read_dill(test_file)
    return xs_rerank, xs_test_rerank

In [109]:
def get_actual_crels_counts():
    """Return the number of crels stored for each entry of `name2crels`.

    Reads the module-level `name2crels` mapping (annotated in the notebook as
    Dict[str, Set[str]]) and returns one count per entry, in the mapping's
    iteration order. Entries with no crels contribute a 0.
    """
    return [len(essay_crels) for essay_crels in name2crels.values()]

def get_predicted_crels_counts(top_n):
    """Return, per essay, how many distinct keys the re-ranker data holds.

    Loads both mappings for `top_n` via load_rerank, merges the second into
    the first (on a shared essay name the second one wins), and returns the
    number of keys of each essay's inner dict, in the merged mapping's
    iteration order. The notebook annotates the mappings as
    Dict[str, Dict[str, List[float]]].

    :param top_n: forwarded to load_rerank as a keyword argument.
    :return: list of int counts, one per essay.
    """
    xs_train, xs_test = load_rerank(top_n=top_n)
    # In-place merge, exactly like the original: the first mapping is extended.
    xs_train.update(xs_test)
    return [len(per_essay.keys()) for per_essay in xs_train.values()]

# Per-essay predicted crel counts for each top_n value that has re-ranker
# files; later cells label this value "Beam Size". Each iteration loads two
# dill files from crels_folder.
topn2lens = {}
for topn in [1,2,3,5,7,10]:
    topn2lens[topn] = get_predicted_crels_counts(topn)

In [222]:
# Number of decimal places df2latex_table uses when formatting numbers.
DP = 3 # decimal points
# Columns, in order, that are selected from the stats frame and printed by
# df2latex_table. list2stats also computes "Min.", "90%" and "98%", which are
# not shown.
cols = ["Beam Size", "Max.", "Mean", "25%", "50%", "75%", 
        #"90%", 
        "95%", "99%"]

def list2stats(l, n):
    """Summarise a sequence of numbers as a flat dict of statistics.

    :param l: non-empty sequence of numbers (np.max / np.min raise on empty input).
    :param n: label stored unchanged under "Beam Size".
    :return: dict with keys, in order: "Beam Size", "Mean", "Max.", "Min.",
             then the 25/50/75/90/95/98/99 percentiles keyed as "<p>%".
    """
    stats = {
        "Beam Size": n,
        "Mean": np.mean(l),
        "Max.": np.max(l),
        "Min.": np.min(l),
    }
    percentile_columns = (
        ("25%", 25),
        ("50%", 50),
        ("75%", 75),
        ("90%", 90),
        ("95%", 95),
        ("98%", 98),
        ("99%", 99),
    )
    for label, pct in percentile_columns:
        stats[label] = np.percentile(l, pct)
    return stats

def df2latex_table(df):
    """Print `df` as the header and rows of a LaTeX table.

    Uses the module-level `cols` (columns to print, in order) and `DP`
    (number of decimal places). Output goes to stdout: a bold header line,
    "\\midrule", one line per row, then a blank line.

    Cell formatting:
      * a str or None cell is printed as "Human Labels" (the summary cells
        pass "-" as the beam size for the human-annotated row);
      * a number that is whole after rounding to DP places is printed without
        decimals, except in columns whose name contains "99", which always
        show DP decimals;
      * every other number is printed with exactly DP decimals.

    :param df: DataFrame containing at least the columns listed in `cols`.
    :return: None
    """
    # Header: "%" must be escaped for LaTeX.
    header_cells = [
        "\\textbf{{ {col} }}".format(col=col.replace("%", "\\%")) for col in cols
    ]
    print(" & ".join(header_cells) + " \\\\")
    print("\\midrule")

    decimal_fmt = "{val:." + str(DP) + "f}"
    for _, row in df.iterrows():
        cells = []
        for col in cols:
            oval = row[col]
            if isinstance(oval, str) or oval is None:
                val = "Human Labels"
            else:
                # iterrows() yields all-numeric rows as floats but mixed rows
                # as objects, so an integer can arrive as 15 or 15.0. Test the
                # numeric value, not the string suffix, so both print as "15".
                rounded = round(float(oval), DP)
                if rounded.is_integer() and "99" not in col:
                    val = str(int(rounded))
                else:
                    val = decimal_fmt.format(val=oval)
            cells.append(val)
        print(" & ".join(cells) + " \\\\")
    print()
        
# Summary statistics of the human-annotated crel counts as a one-row table.
# The beam size is passed as the string "-", which df2latex_table prints as
# "Human Labels".
l = get_actual_crels_counts()
df = pd.DataFrame([list2stats(l, "-")])[cols]

df2latex_table(df)

\textbf{ Beam Size } & \textbf{ Max. } & \textbf{ Mean } & \textbf{ 25\% } & \textbf{ 50\% } & \textbf{ 75\% } & \textbf{ 95\% } & \textbf{ 99\% }  \\
\midrule
Human Labels & 15.000 & 2.914 & 1 & 2 & 4 & 8 & 10.000 \\



## Actual Crels (CB)

In [223]:
# Same human-label summary as computed at the end of the previous cell; here
# the frame is also displayed after the LaTeX rows are printed.
l = get_actual_crels_counts()
df = pd.DataFrame([list2stats(l, "-")])[cols]

df2latex_table(df)
df

\textbf{ Beam Size } & \textbf{ Max. } & \textbf{ Mean } & \textbf{ 25\% } & \textbf{ 50\% } & \textbf{ 75\% } & \textbf{ 95\% } & \textbf{ 99\% }  \\
\midrule
Human Labels & 15.000 & 2.914 & 1 & 2 & 4 & 8 & 10.000 \\



Unnamed: 0,Beam Size,Max.,Mean,25%,50%,75%,95%,99%
0,-,15,2.914007,1.0,2.0,4.0,8.0,10.0


## Predicted Crels (CB)

In [224]:
# One summary row per top_n ("Beam Size") over the predicted per-essay crel
# counts, in the insertion order of topn2lens.
# Note: the loop rebinds `l`, and `df` is replaced by the predicted-counts
# frame, so both names no longer refer to the human-label data afterwards.
dicts = []
for n, l in topn2lens.items():    
    dicts.append(list2stats(l, n))
df = pd.DataFrame(dicts)[cols]

df2latex_table(df)

df

\textbf{ Beam Size } & \textbf{ Max. } & \textbf{ Mean } & \textbf{ 25\% } & \textbf{ 50\% } & \textbf{ 75\% } & \textbf{ 95\% } & \textbf{ 99\% }  \\
\midrule
1 & 13 & 2.958 & 1 & 2 & 4 & 8 & 10.000 \\
2 & 13 & 2.966 & 1 & 2 & 4 & 8 & 10.000 \\
3 & 15 & 3.643 & 1 & 3 & 5 & 10 & 12.730 \\
5 & 22 & 5.353 & 2 & 4 & 8 & 14 & 17.000 \\
7 & 22 & 5.358 & 2 & 4 & 8 & 14 & 17.000 \\
10 & 22 & 5.358 & 2 & 4 & 8 & 14 & 17.000 \\



Unnamed: 0,Beam Size,Max.,Mean,25%,50%,75%,95%,99%
0,1,13,2.958333,1.0,2.0,4.0,8.0,10.0
1,2,13,2.966312,1.0,2.0,4.0,8.0,10.0
2,3,15,3.64273,1.0,3.0,5.0,10.0,12.73
3,5,22,5.352837,2.0,4.0,8.0,14.0,17.0
4,7,22,5.358156,2.0,4.0,8.0,14.0,17.0
5,10,22,5.358156,2.0,4.0,8.0,14.0,17.0


In [227]:
# df2latex_table(df)

## Differences by Top N?

In [113]:
# Are the counts different?
# CB: 5 and 10, yes, marginally. 7 and 10 - Nope
# SC: 1-3 and 5 diff, 5, 7 and 10 are the same

# The comparisons below pair the count lists position by position, so they
# assume every top_n file lists the essays in the same order.
# Each printed line is: position, count for the smaller top_n, count for the
# larger top_n.

# for i, (a,b) in enumerate(zip(topn2lens[3],topn2lens[5])):
#     if a != b:
#         print(i,a,b)

# top_n 5 vs 7
for i, (a,b) in enumerate(zip(topn2lens[5],topn2lens[7])):
    if a != b:
        print(i,a,b)
# Separator between the two comparisons.
print("." * 10)
# top_n 7 vs 10
for i, (a,b) in enumerate(zip(topn2lens[7],topn2lens[10])):
    if a != b:
        print(i,a,b)

401 3 4
503 5 6
551 2 3
585 12 13
806 5 6
996 7 8
..........
