## NOTE - Use Python 3.6

In [87]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pymongo
import dill
import os

from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from window_based_tagger_config import get_config
from FindFiles import find_files
from DirUtils import dir_exists
from results_procesor import ResultsProcessor, __MICRO_F1__
from Settings import Settings

In [88]:
# Directory holding the serialized prediction / label (.dill) files.
# NOTE(review): machine-specific absolute path. The trailing "/" matters
# because code in this notebook builds paths with `FOLDER + file_name`.
FOLDER = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/Predictions/"

In [89]:
def filter_by_str(s, files, exclude=False):
    """Select entries of `files` by whether they contain the substring `s`.

    With `exclude=False` (the default) the entries containing `s` are kept;
    with `exclude=True` the entries that do not contain `s` are kept.
    Input order is preserved and a new list is returned.
    """
    selected = []
    for name in files:
        contains = s in name
        # Keep matches normally, and non-matches when excluding.
        if contains != exclude:
            selected.append(name)
    return selected

In [119]:
# List every file in the predictions folder, then narrow the list in two
# steps, printing the number of files that remain after each step.
all_files = os.listdir(FOLDER)
print(len(all_files))
# Keep only the files whose name contains "_VD_".
files = filter_by_str("_VD_", all_files)
print(len(files))
# Drop the files whose name contains "2019".
files = filter_by_str("2019", files, exclude=True)
print(len(files))

118
58
20


In [91]:
# Display the remaining file names in sorted order.
sorted(files)

['TEST_CB_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_PREDS_.dill',
 'TEST_CB_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_YS_.dill',
 'TEST_CB_TAGGING_VD_CRF_MOST_COMMON_TAG_FIXED_PREDS_.dill',
 'TEST_CB_TAGGING_VD_CRF_MOST_COMMON_TAG_FIXED_YS_.dill',
 'TEST_CB_TAGGING_VD_HMM_MOST_COMMON_TAG_MULTICLASS_FIXED_PREDS_.dill',
 'TEST_CB_TAGGING_VD_HMM_MOST_COMMON_TAG_MULTICLASS_FIXED_YS_.dill',
 'TEST_CB_TAGGING_VD_RNN_MOST_COMMON_TAG_PREDS_.dill',
 'TEST_CB_TAGGING_VD_RNN_MOST_COMMON_TAG_YS_.dill',
 'TEST_CB_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS_PREDS_.dill',
 'TEST_CB_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS_YS_.dill',
 'TEST_SC_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_PREDS_.dill',
 'TEST_SC_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_YS_.dill',
 'TEST_SC_TAGGING_VD_CRF_MOST_COMMON_TAG_PREDS_.dill',
 'TEST_SC_TAGGING_VD_CRF_MOST_COMMON_TAG_YS_.dill',
 'TEST_SC_TAGGING_VD_HMM_MOST_COMMON_TAG_MULTICLASS_PREDS_.dill',
 'TEST_SC_TAGGING_VD_HMM_MOST_COMMON_TAG_MULT

In [92]:
# Split the filtered files by the dataset marker in the file name
# (presumably SC = Skin Cancer and CB = Coral Bleaching — TODO confirm).
sc_files = filter_by_str("_SC_", files)
cb_files = filter_by_str("_CB_", files)
# Display how many files matched each marker.
len(cb_files), len(sc_files)

(10, 10)

In [106]:
from collections import OrderedDict
# Maps a display name for each algorithm to the substring that identifies
# that algorithm's files. Built from a sequence of pairs so the ordering is
# defined by OrderedDict itself rather than inherited from an intermediate
# plain dict literal.
filters = OrderedDict([
    ("WINDOW_CLASSIFIER", "WINDOW_CLASSIFIER"),
    ("CRF", "_CRF_"),
    ("HMM", "_HMM_"),
    ("PERCEPTRON", "PERCEPTRON"),
    ("RNN", "_RNN_"),
])

In [114]:
from pprint import pprint

def load_predictions(input_files):
    """Load the prediction file of every algorithm listed in `filters`.

    Args:
        input_files: file names (relative to FOLDER) to search.

    Returns:
        dict mapping each key of the module-level `filters` to the object
        deserialized from that algorithm's "_PREDS" file.

    Raises:
        AssertionError: if an algorithm does not match exactly one "_PREDS"
            file among `input_files`.
    """
    algo2preds = dict()
    pred_files = filter_by_str("_PREDS", input_files)
    for algo_name, fltr in filters.items():
        f_files = filter_by_str(fltr, pred_files)
        # Report the algorithm and its candidate files so that a missing or
        # ambiguous match can be diagnosed from the assertion alone.
        assert len(f_files) == 1, (algo_name, f_files)
        fname = os.path.join(FOLDER, f_files[0])
        # Read-only binary mode: the file is only read, so write access to
        # it is neither needed nor requested.
        # NOTE: dill.load can execute arbitrary code; only load trusted files.
        with open(fname, "rb") as f:
            algo2preds[algo_name] = dill.load(f)
    return algo2preds

In [115]:
# Load one prediction object per algorithm for each of the two file groups.
cb_preds = load_predictions(cb_files)
sc_preds = load_predictions(sc_files)

In [108]:
# Load the "YS" files saved alongside the window classifier predictions;
# they are passed as the labels when comparing every algorithm below.
# Opened read-only ("rb"): the files are only read, so write access is
# neither needed nor requested.
# NOTE: dill.load can execute arbitrary code; only load trusted files.
with open(FOLDER + "TEST_CB_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS_YS_.dill", "rb") as f:
    cb_ysbytag = dill.load(f)

with open(FOLDER + "TEST_SC_TAGGING_VD_WINDOW_CLASSIFIER_MOST_COMMON_TAG_MULTICLASS_YS_.dill", "rb") as f:
    sc_ysbytag = dill.load(f)

In [109]:
# sorted(files)

In [110]:
from scipy import stats

In [111]:
def get_all_preds(predsbytag):
    """Flatten a tag -> sequence mapping into a single list.

    Tags are visited in sorted key order, and each tag's values are appended
    in their existing order.
    """
    flattened = []
    for tag in sorted(predsbytag):
        flattened.extend(predsbytag[tag])
    return flattened

In [112]:
def compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb, alternative='two-sided'):
    """Binomial test on the items where two sets of predictions disagree.

    The per-tag sequences are flattened in sorted tag order. Every position
    where A and B differ and exactly one of them equals the label counts as
    one trial; the trial is a success when A is the one that is right.
    Positions where A and B differ but neither equals the label are ignored.

    Args:
        ysbytag: mapping tag -> sequence of labels.
        predsbytaga: mapping tag -> sequence of predictions for A.
        predsbytagb: mapping tag -> sequence of predictions for B.
        alternative: forwarded to the SciPy binomial test.

    Returns:
        The p-value of the test with success probability 0.5, or 1.0 when
        there are no trials at all.

    Raises:
        AssertionError: if the three mappings do not share the same tags, or
            if the sequences for any tag differ in length.
    """
    tags = set(ysbytag.keys())
    assert tags == set(predsbytaga.keys()), (len(ysbytag.keys()), len(predsbytaga.keys()))
    assert tags == set(predsbytagb.keys()), (len(ysbytag.keys()), len(predsbytagb.keys()))

    # Flattening concatenates tag by tag, so every tag (not just the first)
    # must line up for the positional comparison below to be meaningful.
    for tag in tags:
        assert len(ysbytag[tag]) == len(predsbytaga[tag]), tag
        assert len(ysbytag[tag]) == len(predsbytagb[tag]), tag

    ys = get_all_preds(ysbytag)
    aas = get_all_preds(predsbytaga)
    bbs = get_all_preds(predsbytagb)

    assert len(ys) == len(aas) == len(bbs)

    a_only_correct = 0
    b_only_correct = 0
    for y, a, b in zip(ys, aas, bbs):
        if a != b:
            if a == y:
                a_only_correct += 1
            elif b == y:
                # Only credit B when B is actually right; with non-binary
                # values both predictions can be wrong at once.
                b_only_correct += 1

    n_trials = a_only_correct + b_only_correct
    if n_trials == 0:
        # No informative disagreements: no evidence of any difference.
        return 1.0

    # `binom_test` is deprecated and absent from newer SciPy releases, while
    # `binomtest` is absent from older ones; use whichever is available.
    binomtest = getattr(stats, "binomtest", None)
    if binomtest is not None:
        return binomtest(a_only_correct, n_trials, p=0.5, alternative=alternative).pvalue
    return stats.binom_test(a_only_correct, n_trials, p=0.5, alternative=alternative)

## Get Predicted Tags from Labelled Essays

In [130]:
def get_label(tag, expected_tag_set):
    """Return 1 when `tag` is a member of `expected_tag_set`, otherwise 0."""
    return 1 if tag in expected_tag_set else 0

def get_wd_level_lbs(essays, expected_tags):
    """Build per-tag 0/1 label lists with one entry per word.

    Each essay's `sentences` are walked in order; every sentence is iterated
    as (word, tag_set) pairs. For each word, every tag in `expected_tags`
    receives 1 if it is in that word's tag set and 0 otherwise.

    Returns:
        defaultdict(list) mapping each expected tag to its list of labels.
    """
    tag_vocab = set(expected_tags)
    labels_by_tag = defaultdict(list)
    # Stream the tag set of every word, essay by essay, sentence by sentence.
    word_tag_sets = (
        tag_set
        for essay in essays
        for sentence in essay.sentences
        for _word, tag_set in sentence
    )
    for tag_set in word_tag_sets:
        for tag in tag_vocab:
            labels_by_tag[tag].append(get_label(tag, tag_set))
    return labels_by_tag

# for pred tags
def get_wd_level_preds(essays, expected_tags):
    """Build per-tag 0/1 prediction lists with one entry per predicted word tag.

    For every sentence index of an essay, the matching entry of the essay's
    `pred_tagged_sentences` is read. Each predicted tag in it yields, for
    every tag in `expected_tags`, a 1 if the two are the same and 0 otherwise.

    Returns:
        defaultdict(list) mapping each expected tag to its list of 0/1 values.
    """
    tag_vocab = set(expected_tags)
    preds_by_tag = defaultdict(list)
    for essay in essays:
        for sent_ix, _sentence in enumerate(essay.sentences):
            for predicted_tag in essay.pred_tagged_sentences[sent_ix]:
                # A single predicted tag per word, wrapped as a set so it can
                # be checked the same way as a gold tag set.
                predicted = {predicted_tag}
                # NOTE: always true, since the set is built from one tag.
                assert len(predicted) >= 1, "No tags found"
                for tag in tag_vocab:
                    preds_by_tag[tag].append(get_label(tag, predicted))
    return preds_by_tag

In [21]:
# Make the "Causal Model" notebook's `src` folder importable.
# NOTE(review): machine-specific absolute path. Presumably this must run
# before any import that lives in that folder — TODO confirm cell order
# on a fresh kernel.
import sys
cm_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Causal Model/"
src_path = os.path.join(cm_folder, "src")
sys.path.append(src_path)

In [56]:
# Load the serialized test essays for the CoralBleaching dataset from the
# Bi-LSTM predictions folder under the configured data directory.
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"
test_fname = rnn_predictions_folder + "essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
# NOTE: dill.load can execute arbitrary code; only load trusted files.
with open(test_fname, "rb") as f:
    cb_essays = dill.load(f)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [57]:
# Load the serialized test essays for the SkinCancer dataset from the
# Bi-LSTM predictions folder under the configured data directory.
# NOTE: this reassigns settings, root_folder, rnn_predictions_folder and
# test_fname, so after this cell they refer to the SkinCancer paths.
settings = Settings()
root_folder = settings.data_directory + "SkinCancer/Thesis_Dataset/"
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"
test_fname = rnn_predictions_folder + "essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
# NOTE: dill.load can execute arbitrary code; only load trusted files.
with open(test_fname, "rb") as f:
    sc_essays = dill.load(f)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [62]:
# cb_preds["RNN"] = get_wd_level_preds(cb_essays, cb_ysbytag.keys())
# sc_preds["RNN"] = get_wd_level_preds(sc_essays, sc_ysbytag.keys())

In [102]:
def print_comparison(ysbytag, algo2preds):
    """Print F1 scores and a pairwise binomial-test p-value for the algorithms.

    For every ordered pair of distinct algorithms (A, B) one line is printed
    with A's name and F1 score, B's name and F1 score, and the p-value from
    `compute_p_value_binomial_test`. A blank line follows each A.

    Args:
        ysbytag: mapping tag -> labels, used for both metrics and the test.
        algo2preds: mapping algorithm name -> (mapping tag -> predictions).

    Returns:
        None; the comparison is only printed.
    """
    # Micro-averaged metrics for each algorithm, computed once up front.
    algo2metrics = {}
    for algo, preds in algo2preds.items():
        mean_metrics = ResultsProcessor.compute_mean_metrics(ysbytag, preds)
        algo2metrics[algo] = mean_metrics[__MICRO_F1__]

    for algo_name_a, predsbytaga in algo2preds.items():
        for algo_name_b, predsbytagb in algo2preds.items():
            # An algorithm is never compared against itself.
            if algo_name_a == algo_name_b:
                continue
            f1_a = algo2metrics[algo_name_a]["f1_score"]
            f1_b = algo2metrics[algo_name_b]["f1_score"]
            pval = compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb)
            print(f"{algo_name_a.ljust(20)} {f1_a:.4f}\t{algo_name_b.ljust(20)} {f1_b:.4f}\t {pval}")
        print()

In [116]:
# NOTE: the Perceptron and CRF results differ somewhat from the prior run.
print_comparison(cb_ysbytag, cb_preds)

WINDOW_CLASSIFIER    0.8413	CRF                  0.8380	 0.16554440323794242
WINDOW_CLASSIFIER    0.8413	HMM                  0.7471	 6.539263608822287e-216
WINDOW_CLASSIFIER    0.8413	PERCEPTRON           0.8399	 0.5990715121670374
WINDOW_CLASSIFIER    0.8413	RNN                  0.8422	 0.23461093020583862

CRF                  0.8380	WINDOW_CLASSIFIER    0.8413	 0.16554440323794242
CRF                  0.8380	HMM                  0.7471	 3.4081766479273455e-186
CRF                  0.8380	PERCEPTRON           0.8399	 0.3953317058722892
CRF                  0.8380	RNN                  0.8422	 0.9570199756080899

HMM                  0.7471	WINDOW_CLASSIFIER    0.8413	 6.539263608822287e-216
HMM                  0.7471	CRF                  0.8380	 3.4081766479273455e-186
HMM                  0.7471	PERCEPTRON           0.8399	 1.840272414942264e-204
HMM                  0.7471	RNN                  0.8422	 1.265374305879553e-188

PERCEPTRON           0.8399	WINDOW_CLASSIFIER    0.8413	

In [117]:
# NOTE: only the RNN results differ (presumably versus the prior run — TODO confirm).
print_comparison(sc_ysbytag, sc_preds)

WINDOW_CLASSIFIER    0.8144	CRF                  0.8043	 0.003608884280891862
WINDOW_CLASSIFIER    0.8144	HMM                  0.6754	 0.0
WINDOW_CLASSIFIER    0.8144	PERCEPTRON           0.8148	 0.45153921840005645
WINDOW_CLASSIFIER    0.8144	RNN                  0.8268	 0.001295431695570259

CRF                  0.8043	WINDOW_CLASSIFIER    0.8144	 0.003608884280891862
CRF                  0.8043	HMM                  0.6754	 3.4009044173226684e-281
CRF                  0.8043	PERCEPTRON           0.8148	 0.0007131169075950804
CRF                  0.8043	RNN                  0.8268	 7.617716382544424e-08

HMM                  0.6754	WINDOW_CLASSIFIER    0.8144	 0.0
HMM                  0.6754	CRF                  0.8043	 3.4009044173226684e-281
HMM                  0.6754	PERCEPTRON           0.8148	 0.0
HMM                  0.6754	RNN                  0.8268	 5e-324

PERCEPTRON           0.8148	WINDOW_CLASSIFIER    0.8144	 0.45153921840005645
PERCEPTRON           0.8148	CRF           

## Find Closest Perceptron Results for CB

In [154]:
# Re-list the predictions folder and keep, in listing order, only the file
# names that contain every one of the markers below.
all_files = os.listdir(FOLDER)
perc_files = [
    name
    for name in all_files
    if all(
        marker in name
        for marker in ("_CB_", "_VD_", "_PREDS_", "_PERCEPTRON_", "_2019")
    )
]
perc_files

['TEST_CB_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_PREDS__2019-10-6_22-6.dill',
 'TEST_CB_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_PREDS__2019-10-6_22-22.dill',
 'TEST_CB_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_PREDS__2019-10-5_9-43.dill',
 'TEST_CB_TAGGING_VD_AVG_PERCEPTRON_MOST_COMMON_TAG_PREDS__2019-10-5_9-55.dill']

In [156]:
import pandas as pd

# Score every file in `perc_files` against the CB labels and tabulate the
# micro-F1 metrics, one row per file, sorted by F1 score.
ysbytag = cb_ysbytag
# NOTE(review): f2preds is never populated in this cell.
f2preds = dict()
rows = []
for f in perc_files:
    # Read-only binary mode: the file is only read, so write access to it
    # is neither needed nor requested.
    # NOTE: dill.load can execute arbitrary code; only load trusted files.
    with open(FOLDER + f, "rb") as fin:
        preds = dill.load(fin)
    metrics = ResultsProcessor.compute_mean_metrics(ysbytag, preds)[__MICRO_F1__]
    # Label the row with the part of the file name that follows "2019-".
    metrics["fname"] = f.split("2019-")[-1]
    rows.append(metrics)
df = pd.DataFrame(rows).sort_values("f1_score")
df # 0.837

Unnamed: 0,accuracy,data_points,f1_score,fname,num_codes,precision,recall
3,0.994813,399087.0,0.839932,10-5_9-55.dill,6792.0,0.884528,0.799617
2,0.994848,399087.0,0.841529,10-5_9-43.dill,6792.0,0.883048,0.80374
0,0.994881,399087.0,0.842786,10-6_22-6.dill,6792.0,0.882799,0.806243
1,0.994898,399087.0,0.843409,10-6_22-22.dill,6792.0,0.882931,0.807273
