## NOTE - Use Python 3.6

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pymongo
import dill
import os
from scipy import stats

from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from window_based_tagger_config import get_config
from FindFiles import find_files
from DirUtils import dir_exists
from results_procesor import ResultsProcessor, __MICRO_F1__
from Settings import Settings
from statsmodels.stats.contingency_tables import mcnemar

In [2]:
# result = mcnemar([[1,2],[3,1]], exact=True)
# result.statistic, result.pvalue

In [2]:
# Directory that holds the ".dill" prediction / label files read by this notebook.
# Hard-coded absolute path: change it to match your own checkout before running.
FOLDER = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/Predictions/"

In [3]:
def filter_by_str(filters, files, exclude=False):
    """Keep (or drop) the names in ``files`` that contain the given substrings.

    Args:
        filters: A single substring, or an iterable (list, tuple, set, ...) of
            substrings.
        files: Iterable of strings to filter.
        exclude: When False (the default) a name is kept only if it contains
            every substring. When True a name is dropped if it contains any
            of the substrings.

    Returns:
        A list with the surviving names, in their original order.
    """
    # Only a bare string needs wrapping; any other iterable of substrings
    # (list, tuple, set, ...) is used as given.
    if isinstance(filters, str):
        filters = [filters]
    files = list(files)
    for fltr in filters:
        files = [f for f in files if (fltr in f) != exclude]
    return files

In [4]:
# List every file in the predictions folder and report how many there are.
all_files = os.listdir(FOLDER)
print(len(all_files))
files = list(all_files)

# Keep only the file names that contain "COREF".
# NOTE(review): the trailing remark about the "sentence CREL parser" may be left
# over from another notebook - confirm that it applies to the COREF files.
files = filter_by_str("COREF", files) # this is the sentence CREL parser (sentence level)
# Alternative filters, currently disabled:
# files = filter_by_str("_CR_", files)
# files = filter_by_str("TEST_", files)
# files = filter_by_str("_VD_", files)
print(len(files))
# Drop every file whose name contains "2019".
files = filter_by_str("2019", files, exclude=True)
print(len(files))

1202
48
24


In [5]:
# Display the remaining file names in sorted order.
sorted(files)

['COREF_CB_GRID_BERKELEY_TD_PREDS_.dill',
 'COREF_CB_GRID_BERKELEY_TD_YS_.dill',
 'COREF_CB_GRID_BERKELEY_VD_PREDS_.dill',
 'COREF_CB_GRID_BERKELEY_VD_YS_.dill',
 'COREF_CB_GRID_STANFORD_TD_PREDS_.dill',
 'COREF_CB_GRID_STANFORD_TD_YS_.dill',
 'COREF_CB_GRID_STANFORD_VD_PREDS_.dill',
 'COREF_CB_GRID_STANFORD_VD_YS_.dill',
 'COREF_CB_NEAREST_TAG_TD_PREDS_.dill',
 'COREF_CB_NEAREST_TAG_TD_YS_.dill',
 'COREF_CB_NEAREST_TAG_VD_PREDS_.dill',
 'COREF_CB_NEAREST_TAG_VD_YS_.dill',
 'COREF_SC_GRID_BERKELEY_TD_PREDS_.dill',
 'COREF_SC_GRID_BERKELEY_TD_YS_.dill',
 'COREF_SC_GRID_BERKELEY_VD_PREDS_.dill',
 'COREF_SC_GRID_BERKELEY_VD_YS_.dill',
 'COREF_SC_GRID_STANFORD_TD_PREDS_.dill',
 'COREF_SC_GRID_STANFORD_TD_YS_.dill',
 'COREF_SC_GRID_STANFORD_VD_PREDS_.dill',
 'COREF_SC_GRID_STANFORD_VD_YS_.dill',
 'COREF_SC_NEAREST_TAG_TD_PREDS_.dill',
 'COREF_SC_NEAREST_TAG_TD_YS_.dill',
 'COREF_SC_NEAREST_TAG_VD_PREDS_.dill',
 'COREF_SC_NEAREST_TAG_VD_YS_.dill']

In [6]:
# Split the "_TD_" files by their "_SC_" / "_CB_" marker. These subsets feed the
# "*_tr" variables analysed in the "Training Data" section further down.
sc_files_tr = filter_by_str(["_SC_","_TD_"], files)
cb_files_tr = filter_by_str(["_CB_","_TD_"], files)
# Show how many files ended up in each subset.
len(cb_files_tr), len(sc_files_tr)

(6, 6)

In [7]:
# Split the "_VD_" files by their "_SC_" / "_CB_" marker. These subsets feed the
# "*_test" variables analysed in the "Test Data" section further down.
sc_files_test = filter_by_str(["_SC_","_VD_"], files)
cb_files_test = filter_by_str(["_CB_","_VD_"], files)
# Show how many files ended up in each subset.
len(cb_files_test), len(sc_files_test)

(6, 6)

In [8]:
# Display the "_CB_" / "_TD_" file names in sorted order.
sorted(cb_files_tr)

['COREF_CB_GRID_BERKELEY_TD_PREDS_.dill',
 'COREF_CB_GRID_BERKELEY_TD_YS_.dill',
 'COREF_CB_GRID_STANFORD_TD_PREDS_.dill',
 'COREF_CB_GRID_STANFORD_TD_YS_.dill',
 'COREF_CB_NEAREST_TAG_TD_PREDS_.dill',
 'COREF_CB_NEAREST_TAG_TD_YS_.dill']

In [9]:
# Display the "_CB_" / "_VD_" file names in sorted order.
sorted(cb_files_test)

['COREF_CB_GRID_BERKELEY_VD_PREDS_.dill',
 'COREF_CB_GRID_BERKELEY_VD_YS_.dill',
 'COREF_CB_GRID_STANFORD_VD_PREDS_.dill',
 'COREF_CB_GRID_STANFORD_VD_YS_.dill',
 'COREF_CB_NEAREST_TAG_VD_PREDS_.dill',
 'COREF_CB_NEAREST_TAG_VD_YS_.dill']

In [10]:
# Display the "_SC_" / "_TD_" file names in sorted order.
sorted(sc_files_tr)

['COREF_SC_GRID_BERKELEY_TD_PREDS_.dill',
 'COREF_SC_GRID_BERKELEY_TD_YS_.dill',
 'COREF_SC_GRID_STANFORD_TD_PREDS_.dill',
 'COREF_SC_GRID_STANFORD_TD_YS_.dill',
 'COREF_SC_NEAREST_TAG_TD_PREDS_.dill',
 'COREF_SC_NEAREST_TAG_TD_YS_.dill']

In [11]:
# Display the "_SC_" / "_VD_" file names in sorted order.
sorted(sc_files_test)

['COREF_SC_GRID_BERKELEY_VD_PREDS_.dill',
 'COREF_SC_GRID_BERKELEY_VD_YS_.dill',
 'COREF_SC_GRID_STANFORD_VD_PREDS_.dill',
 'COREF_SC_GRID_STANFORD_VD_YS_.dill',
 'COREF_SC_NEAREST_TAG_VD_PREDS_.dill',
 'COREF_SC_NEAREST_TAG_VD_YS_.dill']

In [12]:
from collections import OrderedDict

# Algorithm name -> substring that identifies that algorithm's files.
# Built from a list of pairs so the iteration order is fixed by construction
# rather than inherited from a plain dict literal.
filters = OrderedDict([
    ("STANFORD", "STANFORD"),
    ("BERKELEY", "BERKELEY"),
    ("NEAREST_TAG", "NEAREST_TAG"),
])

In [13]:
from pprint import pprint

def load_predictions(input_files):
    """Load the pickled predictions of every algorithm listed in ``filters``.

    Args:
        input_files: File names (relative to ``FOLDER``). For each algorithm,
            exactly one name containing "_PREDS" must also contain that
            algorithm's filter string.

    Returns:
        dict mapping algorithm name -> the object un-pickled from its "_PREDS"
        file (presumably a tag -> predictions mapping - TODO confirm against
        the code that wrote the files).

    Raises:
        AssertionError: if an algorithm matches zero or several "_PREDS" files.
    """
    algo2preds = dict()
    pred_files = filter_by_str("_PREDS", input_files)
    for algo_name, fltr in filters.items():
        f_files = filter_by_str(fltr, pred_files)
        assert len(f_files) == 1, (algo_name, f_files)
        fname = os.path.join(FOLDER, f_files[0])
        # The file is only read, so open it read-only ("rb+" would also demand
        # write permission).
        # NOTE: dill.load can execute arbitrary code - only load trusted files.
        with open(fname, "rb") as f:
            algo2preds[algo_name] = dill.load(f)
    return algo2preds

In [14]:
# Per-algorithm predictions read from the "_TD_" files, one dict per CB / SC subset.
cb_preds_tr = load_predictions(cb_files_tr)
sc_preds_tr = load_predictions(sc_files_tr)

In [15]:
# Per-algorithm predictions read from the "_VD_" files, one dict per CB / SC subset.
cb_preds_test = load_predictions(cb_files_test)
sc_preds_test = load_predictions(sc_files_test)

In [16]:
# Labels ("YS") for the "_TD_" files; later cells pass them as the ground truth
# for all three algorithms.
# NOTE(review): only the BERKELEY label file is read - this assumes the STANFORD
# and NEAREST_TAG label files hold the same labels; confirm.
# Opened read-only: nothing is written back to these files.
with open(os.path.join(FOLDER, "COREF_CB_GRID_BERKELEY_TD_YS_.dill"), "rb") as f:
    cb_ysbytag_tr = dill.load(f)

with open(os.path.join(FOLDER, "COREF_SC_GRID_BERKELEY_TD_YS_.dill"), "rb") as f:
    sc_ysbytag_tr = dill.load(f)

In [17]:
# Labels ("YS") for the "_VD_" files; later cells pass them as the ground truth
# for all three algorithms.
# NOTE(review): only the BERKELEY label file is read - this assumes the STANFORD
# and NEAREST_TAG label files hold the same labels; confirm.
# Opened read-only: nothing is written back to these files.
with open(os.path.join(FOLDER, "COREF_CB_GRID_BERKELEY_VD_YS_.dill"), "rb") as f:
    cb_ysbytag_test = dill.load(f)

with open(os.path.join(FOLDER, "COREF_SC_GRID_BERKELEY_VD_YS_.dill"), "rb") as f:
    sc_ysbytag_test = dill.load(f)

In [18]:
def get_all_preds(predsbytag, k_filter=None):
    """Flatten a tag -> values mapping into a single list.

    Tags are visited in sorted order and each tag's values are appended in
    turn. When ``k_filter`` is given, tags not contained in it are skipped;
    when it is None every tag is used.
    """
    wanted = set(predsbytag.keys()) if k_filter is None else k_filter

    flattened = []
    for tag in sorted(predsbytag):
        if tag in wanted:
            flattened.extend(predsbytag[tag])
    return flattened

In [19]:
def compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb, alternative='two-sided'):
    """Return the exact McNemar p-value comparing two algorithms' predictions.

    Only tags present in all three mappings are compared. For those tags the
    labels and both prediction lists are flattened (tags in sorted order) and
    each position is classified as: both correct, only A correct, only B
    correct, or both wrong. The resulting 2x2 table is passed to
    ``mcnemar(..., exact=True)``.

    Args:
        ysbytag: tag -> sequence of true labels.
        predsbytaga: tag -> sequence of algorithm A's predictions.
        predsbytagb: tag -> sequence of algorithm B's predictions.
        alternative: Accepted for backward compatibility; not used by this
            function.

    Returns:
        The ``pvalue`` attribute of the McNemar result.

    Raises:
        AssertionError: if, for any shared tag, the number of predictions does
            not match the number of labels.
    """
    lbls = set(ysbytag.keys())
    lbls = lbls.intersection(predsbytaga.keys())
    lbls = lbls.intersection(predsbytagb.keys())

    # Check every shared tag, not just one: equal totals can hide per-tag
    # mismatches, which would pair labels with the wrong predictions below.
    for lbl in lbls:
        assert len(ysbytag[lbl]) == len(predsbytaga[lbl]), (lbl, len(ysbytag[lbl]), len(predsbytaga[lbl]))
        assert len(ysbytag[lbl]) == len(predsbytagb[lbl]), (lbl, len(ysbytag[lbl]), len(predsbytagb[lbl]))

    ys = get_all_preds(ysbytag, k_filter=lbls)
    aas = get_all_preds(predsbytaga, k_filter=lbls)
    bbs = get_all_preds(predsbytagb, k_filter=lbls)

    assert len(ys) == len(aas) == len(bbs)

    both_correct, both_wrong, a_correct_only, b_correct_only = 0, 0, 0, 0
    for y, a, b in zip(ys, aas, bbs):
        a_ok = a == y
        b_ok = b == y
        if a_ok and b_ok:
            both_correct += 1
        elif a_ok:
            a_correct_only += 1
        elif b_ok:
            b_correct_only += 1
        else:
            # Also covers a != b with neither equal to y (possible when the
            # labels are not binary); that must not count as a win for B.
            both_wrong += 1

    # Rows: A correct / A wrong. Columns: B correct / B wrong.
    mcn_result = mcnemar([[both_correct, a_correct_only], [b_correct_only, both_wrong]], exact=True)
    return mcn_result.pvalue

In [39]:
def print_comparison(ysbytag, algo2preds, alternative="two-sided", stats_only=False):
    """Print a pairwise significance comparison for every ordered algorithm pair.

    For each algorithm the micro-F1 metrics are obtained from
    ``ResultsProcessor.compute_mean_metrics``. Then, for every ordered pair
    (A, B) with A != B, one line is printed holding the p-value returned by
    ``compute_p_value_binomial_test``. A blank line follows each group of
    lines that share the same A.

    Args:
        ysbytag: tag -> true labels, shared by all algorithms.
        algo2preds: algorithm name -> (tag -> predictions).
        alternative: Forwarded to ``compute_p_value_binomial_test``.
        stats_only: When True print only the two names and the p-value. When
            False (default) print, for A and then for B, the name followed by
            F1, recall and precision (3 decimals), then the p-value in
            square brackets.

    Returns:
        None; the results are only printed.
    """
    algo2metrics = {}
    for algo, preds in algo2preds.items():
        mean_metrics = ResultsProcessor.compute_mean_metrics(ysbytag, preds)
        algo2metrics[algo] = mean_metrics[__MICRO_F1__]

    for algo_name_a, predsbytaga in algo2preds.items():
        for algo_name_b, predsbytagb in algo2preds.items():
            if algo_name_a == algo_name_b:
                continue
            f1_a = algo2metrics[algo_name_a]["f1_score"]
            rec_a = algo2metrics[algo_name_a]["recall"]
            prec_a = algo2metrics[algo_name_a]["precision"]

            f1_b = algo2metrics[algo_name_b]["f1_score"]
            rec_b = algo2metrics[algo_name_b]["recall"]
            prec_b = algo2metrics[algo_name_b]["precision"]

            pval = compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb, alternative)
            if stats_only:
                print(f"{algo_name_a.ljust(20)} \t{algo_name_b.ljust(20)}\t {pval}")
            else:
                print(f"{algo_name_a.ljust(20)} {f1_a:.3f} {rec_a:.3f} {prec_a:.3f} {algo_name_b.ljust(20)} {f1_b:.3f} {rec_b:.3f} {prec_b:.3f}\t [{pval}]")
        # Blank line between the groups of rows for each first algorithm.
        print()

In [40]:
import mlxtend
from mlxtend.evaluate import cochrans_q

def run_cochrans_test(ysbytag, predsbyalgo):
    """Run Cochran's Q test over the predictions of all algorithms.

    Labels and predictions are flattened with ``get_all_preds`` (tags in
    sorted order), restricted to the tags present in ``ysbytag`` and in every
    algorithm's predictions so that all arrays line up position by position.
    This is the same restriction ``compute_p_value_binomial_test`` applies.

    Args:
        ysbytag: tag -> true labels.
        predsbyalgo: algorithm name -> (tag -> predictions).

    Returns:
        Tuple ``(qvalue, pvalue)`` as returned by ``cochrans_q``.

    Raises:
        AssertionError: if the flattened arrays differ in length.
    """
    # Tags shared by the labels and every algorithm.
    lbls = set(ysbytag.keys())
    for preds in predsbyalgo.values():
        lbls = lbls.intersection(preds.keys())

    ground_truth = np.asarray(get_all_preds(ysbytag, k_filter=lbls))
    all_preds = [
        np.asarray(get_all_preds(preds, k_filter=lbls))
        for preds in predsbyalgo.values()
    ]

    lens = {arr.shape[0] for arr in all_preds}
    lens.add(ground_truth.shape[0])
    assert len(lens) == 1, "different length arrays"

    qvalue, pvalue = cochrans_q(ground_truth, *all_preds)
    return qvalue, pvalue

# Training Data

In [41]:
# Cochran's Q test across all algorithms on the CB training predictions; shows (Q, p-value).
run_cochrans_test(cb_ysbytag_tr, cb_preds_tr)

(89.53583617747441, 3.610274366089721e-20)

In [42]:
# Pairwise comparison of the algorithms on the CB training predictions.
# Original note: "perceptron and CRF differ somewhat from prior run".
# NOTE(review): this notebook compares only STANFORD, BERKELEY and NEAREST_TAG,
# so the note above looks left over from another notebook - confirm or remove.
print_comparison(cb_ysbytag_tr, cb_preds_tr)

STANFORD             0.038 0.020 0.292 BERKELEY             0.059 0.032 0.379	 [0.690037965774536]
STANFORD             0.038 0.020 0.292 NEAREST_TAG          0.262 0.241 0.287	 [1.7826345232844652e-11]

BERKELEY             0.059 0.032 0.379 STANFORD             0.038 0.020 0.292	 [0.690037965774536]
BERKELEY             0.059 0.032 0.379 NEAREST_TAG          0.262 0.241 0.287	 [2.1103320310110507e-12]

NEAREST_TAG          0.262 0.241 0.287 STANFORD             0.038 0.020 0.292	 [1.7826345232844652e-11]
NEAREST_TAG          0.262 0.241 0.287 BERKELEY             0.059 0.032 0.379	 [2.1103320310110507e-12]



In [43]:
# Cochran's Q test across all algorithms on the SC training predictions; shows (Q, p-value).
run_cochrans_test(sc_ysbytag_tr, sc_preds_tr)

(21.758928571428573, 1.884120585702624e-05)

In [44]:
# Pairwise comparison of the algorithms on the SC training predictions.
# Original note: "RNN only differs".
# NOTE(review): no RNN is among the algorithms compared here (STANFORD, BERKELEY,
# NEAREST_TAG), so the note above looks left over from another notebook - confirm or remove.
print_comparison(sc_ysbytag_tr, sc_preds_tr)

STANFORD             0.126 0.070 0.673 BERKELEY             0.036 0.019 0.300	 [0.00033670934946623804]
STANFORD             0.126 0.070 0.673 NEAREST_TAG          0.235 0.167 0.399	 [2.7302648267492683e-05]

BERKELEY             0.036 0.019 0.300 STANFORD             0.126 0.070 0.673	 [0.00033670934946623804]
BERKELEY             0.036 0.019 0.300 NEAREST_TAG          0.235 0.167 0.399	 [0.058441136475943914]

NEAREST_TAG          0.235 0.167 0.399 STANFORD             0.126 0.070 0.673	 [2.7302648267492683e-05]
NEAREST_TAG          0.235 0.167 0.399 BERKELEY             0.036 0.019 0.300	 [0.058441136475943914]



# Test Data

In [45]:
# Cochran's Q test across all algorithms on the CB test predictions; shows (Q, p-value).
run_cochrans_test(cb_ysbytag_test, cb_preds_test)

(2.074074074074074, 0.3545035081397576)

In [46]:
# Pairwise comparison of the algorithms on the CB test predictions.
# Original note: "perceptron and CRF differ somewhat from prior run".
# NOTE(review): this notebook compares only STANFORD, BERKELEY and NEAREST_TAG,
# so the note above looks left over from another notebook - confirm or remove.
print_comparison(cb_ysbytag_test, cb_preds_test)

STANFORD             0.048 0.026 0.333 BERKELEY             0.045 0.026 0.200	 [0.5]
STANFORD             0.048 0.026 0.333 NEAREST_TAG          0.324 0.282 0.379	 [0.32693958282470703]

BERKELEY             0.045 0.026 0.200 STANFORD             0.048 0.026 0.333	 [0.5]
BERKELEY             0.045 0.026 0.200 NEAREST_TAG          0.324 0.282 0.379	 [0.5571970939636235]

NEAREST_TAG          0.324 0.282 0.379 STANFORD             0.048 0.026 0.333	 [0.32693958282470703]
NEAREST_TAG          0.324 0.282 0.379 BERKELEY             0.045 0.026 0.200	 [0.5571970939636235]



In [47]:
# Cochran's Q test across all algorithms on the SC test predictions; shows (Q, p-value).
run_cochrans_test(sc_ysbytag_test, sc_preds_test)

(15.878048780487806, 0.0003565541663988249)

In [48]:
# Pairwise comparison of the algorithms on the SC test predictions.
# Original note: "RNN only differs".
# NOTE(review): no RNN is among the algorithms compared here (STANFORD, BERKELEY,
# NEAREST_TAG), so the note above looks left over from another notebook - confirm or remove.
print_comparison(sc_ysbytag_test, sc_preds_test)

STANFORD             0.142 0.084 0.450 BERKELEY             0.129 0.075 0.471	 [0.9999999999999998]
STANFORD             0.142 0.084 0.450 NEAREST_TAG          0.295 0.262 0.337	 [0.005870707889737204]

BERKELEY             0.129 0.075 0.471 STANFORD             0.142 0.084 0.450	 [0.9999999999999998]
BERKELEY             0.129 0.075 0.471 NEAREST_TAG          0.295 0.262 0.337	 [0.0025475648982539578]

NEAREST_TAG          0.295 0.262 0.337 STANFORD             0.142 0.084 0.450	 [0.005870707889737204]
NEAREST_TAG          0.295 0.262 0.337 BERKELEY             0.129 0.075 0.471	 [0.0025475648982539578]

