## NOTE - Use Python 3.6

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pymongo
import dill
import os
from scipy import stats

from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from window_based_tagger_config import get_config
from FindFiles import find_files
from DirUtils import dir_exists
from results_procesor import ResultsProcessor, __MICRO_F1__
from Settings import Settings
from statsmodels.stats.contingency_tables import mcnemar

In [2]:
# result = mcnemar([[1,2],[3,1]], exact=True)
# result.statistic, result.pvalue

In [3]:
# Directory holding the serialized (.dill) prediction and gold-label files.
# The trailing slash matters: file paths in this notebook are built with
# plain string concatenation (FOLDER + file_name).
FOLDER = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/Predictions/"

In [4]:
def filter_by_str(filters, files, exclude=False):
    """Keep (or drop) the file names that contain the given substring(s).

    Args:
        filters: A single substring, or a list/tuple of substrings. Several
            substrings are applied one after another, so with
            ``exclude=False`` a name must contain all of them, and with
            ``exclude=True`` a name must contain none of them.
        files: Iterable of file names (strings) to filter.
        exclude: When True, drop the names that match instead of keeping them.

    Returns:
        A new list with the surviving names, in their original order. The
        input is never modified.
    """
    if not isinstance(filters, (list, tuple)):
        filters = [filters]
    # Copy up front so an empty filter list still returns a fresh list rather
    # than the caller's own object.
    remaining = list(files)
    for fltr in filters:
        # (fltr in name) != exclude  ->  keep matches normally, keep
        # non-matches when excluding.
        remaining = [name for name in remaining if (fltr in name) != exclude]
    return remaining

In [5]:
# Start from every file in FOLDER and narrow the list down step by step,
# printing the count at the same checkpoints as before.
all_files = os.listdir(FOLDER)
print(len(all_files))
files = list(all_files)

# Drop any name containing one of these markers (filter_by_str applies a list
# of filters one after another).
# NOTE(review): the original cell labelled each of these four exclusions
# "this is the sentence CREL parser (sentence level)"; that label looks
# copy-pasted - confirm what each marker actually denotes.
files = filter_by_str(["_FINAL_RUN_", "_TAGGING_", "_STACKED_", "_PA_"], files, exclude=True)

# Optional extra restriction, currently disabled:
# files = filter_by_str("_CR_", files)

# Keep only names that contain both "TEST_" and "_VD_".
files = filter_by_str(["TEST_", "_VD_"], files)
print(len(files))

# Finally drop names containing "2019".
files = filter_by_str("2019", files, exclude=True)
print(len(files))

1202
163
15


In [6]:
sorted(files)

['SENT_TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FIXED_VD_PREDS__.dill',
 'SENT_TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FIXED_VD_YS_.dill',
 'SENT_TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM_VD_2_PREDS_.dill',
 'SENT_TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM_VD_2_YS_.dill',
 'TEST_CB_STR_PCPTRN_RE-RANKER_VD_PREDS.dill',
 'TEST_CB_STR_PCPTRN_RE-RANKER_VD_YS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_PREDS__.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_YS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD_PREDS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD_YS_.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_PREDS__.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_YS_.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_YS__.dill',
 'TEST_SC_STR_PCPTRN_RE-RANKER_VD_2_PREDS__.dill',
 'TEST_SC_STR_PCPTRN_RE-RANKER_VD_2_YS_.dill']

In [7]:
# Partition the selected files by the marker in their name: "_CB_" vs "_SC_".
cb_files = filter_by_str("_CB_", files)
sc_files = filter_by_str("_SC_", files)
# Displayed as (number of CB files, number of SC files).
len(cb_files), len(sc_files)

(8, 7)

In [8]:
cb_files

['TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD_YS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD_PREDS_.dill',
 'SENT_TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FIXED_VD_YS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_PREDS__.dill',
 'SENT_TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FIXED_VD_PREDS__.dill',
 'TEST_CB_STR_PCPTRN_RE-RANKER_VD_YS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_YS_.dill',
 'TEST_CB_STR_PCPTRN_RE-RANKER_VD_PREDS.dill']

In [9]:
sc_files

['SENT_TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM_VD_2_PREDS_.dill',
 'SENT_TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM_VD_2_YS_.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_YS_.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_YS__.dill',
 'TEST_SC_STR_PCPTRN_RE-RANKER_VD_2_YS_.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2_VD_PREDS__.dill',
 'TEST_SC_STR_PCPTRN_RE-RANKER_VD_2_PREDS__.dill']

In [10]:
from collections import OrderedDict
# Algorithm display name -> substring (or list of substrings) that identifies
# that algorithm's prediction file name.
# Built from a list of pairs rather than a dict literal: on Python 3.6 the
# ordering of a plain dict literal is only an implementation detail, so
# OrderedDict({...}) does not formally guarantee this order.
filters = OrderedDict([
    ("SENT_PARSER",  ["SENT_", "_PARSER_"]),
    ("ESSAY_PARSER", "SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM2"),
    ("RE-RANKER",    "STR_PCPTRN_RE-RANKER"),
])

In [11]:
from pprint import pprint

def load_predictions(input_files):
    """Load the prediction file of every algorithm listed in ``filters``.

    Args:
        input_files: File names (relative to ``FOLDER``) to search.

    Returns:
        dict mapping each algorithm name in the module-level ``filters`` to
        the object deserialized from its prediction file.

    Raises:
        AssertionError: If an algorithm's filter does not match exactly one
            "_PREDS" file; the message carries the algorithm name and the
            matches found.
    """
    algo2preds = dict()
    pred_files = filter_by_str("_PREDS", input_files)
    for algo_name, fltr in filters.items():
        f_files = filter_by_str(fltr, pred_files)
        assert len(f_files) == 1, (algo_name, f_files)
        fname = FOLDER + f_files[0]
        # Read-only mode: the file is only deserialized, so "rb+" needlessly
        # required write permission on it.
        with open(fname, "rb") as f:
            # NOTE: dill.load can execute arbitrary code during
            # deserialization - only load files from a trusted source.
            algo2preds[algo_name] = dill.load(f)
    return algo2preds

In [12]:
# Per-algorithm predictions (algorithm name -> loaded object) for the CB and
# SC file sets respectively.
cb_preds = load_predictions(cb_files)
sc_preds = load_predictions(sc_files)

In [13]:
# Gold labels for CB and SC, each read from one hard-coded "_YS_" file.
# NOTE(review): both come from the SENT_ parser's YS files and are reused for
# all three algorithms - presumably every run shares the same gold labels;
# confirm.
# Opened read-only ("rb"): the files are only deserialized, so "rb+" needlessly
# required write permission. dill.load can execute arbitrary code, so only
# load trusted files.
with open(FOLDER + "SENT_TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FIXED_VD_YS_.dill", "rb") as f:
    cb_ysbytag = dill.load(f)

with open(FOLDER + "SENT_TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_HYPER_PARAM_VD_2_YS_.dill", "rb") as f:
    sc_ysbytag = dill.load(f)

In [14]:
def get_all_preds(predsbytag, k_filter=None):
    """Flatten a tag -> values mapping into one list, ordered by sorted tag.

    Args:
        predsbytag: Mapping from tag to a sequence of values.
        k_filter: Optional collection of tags to include. ``None`` means
            every tag in ``predsbytag``.

    Returns:
        A single list holding the values of each included tag, concatenated
        in ascending tag order.
    """
    wanted = set(predsbytag.keys()) if k_filter is None else k_filter

    flattened = []
    # Sorting the tags fixes the concatenation order, so separately flattened
    # mappings line up position by position.
    for tag in sorted(predsbytag):
        if tag in wanted:
            flattened.extend(predsbytag[tag])
    return flattened

In [15]:
def compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb, alternative='two-sided'):
    """Compare two models' predictions against the gold labels with McNemar's exact test.

    Only labels present in all three mappings are compared. For each of those
    labels the values are flattened (in sorted label order) and every position
    is classified as: both models correct, only A correct, only B correct, or
    both wrong. The resulting 2x2 table is passed to statsmodels'
    ``mcnemar(..., exact=True)``.

    Args:
        ysbytag: Mapping label -> sequence of gold values.
        predsbytaga: Mapping label -> sequence of model A's predictions.
        predsbytagb: Mapping label -> sequence of model B's predictions.
        alternative: Unused. Kept so existing callers keep working; the
            ``mcnemar`` call below takes no sidedness argument.

    Returns:
        The p-value reported by the McNemar test.

    Raises:
        AssertionError: If, for any shared label, the three sequences do not
            have the same length.
    """
    lbls = set(ysbytag.keys())
    lbls = lbls.intersection(predsbytaga.keys())
    lbls = lbls.intersection(predsbytagb.keys())

    # Check alignment label by label over the shared labels. Checking only the
    # first gold label could hit a label missing from the predictions, and
    # equal totals alone would not catch per-label misalignment.
    for lbl in lbls:
        expected = len(ysbytag[lbl])
        assert expected == len(predsbytaga[lbl]), (lbl, expected, len(predsbytaga[lbl]))
        assert expected == len(predsbytagb[lbl]), (lbl, expected, len(predsbytagb[lbl]))

    ys = get_all_preds(ysbytag, k_filter=lbls)
    aas = get_all_preds(predsbytaga, k_filter=lbls)
    bbs = get_all_preds(predsbytagb, k_filter=lbls)

    assert len(ys) == len(aas) == len(bbs)

    both_correct, both_wrong, a_correct_only, b_correct_only = 0, 0, 0, 0
    for y, a, b in zip(ys, aas, bbs):
        a_ok = (a == y)
        b_ok = (b == y)
        if a_ok and b_ok:
            both_correct += 1
        elif a_ok:
            a_correct_only += 1
        elif b_ok:
            b_correct_only += 1
        else:
            # Includes the case where A and B disagree with each other and
            # both miss the gold value; that must not be credited to B.
            both_wrong += 1

    # Rows: A correct / A wrong; columns: B correct / B wrong.
    mcn_result = mcnemar([[both_correct, a_correct_only], [b_correct_only, both_wrong]], exact=True)
    return mcn_result.pvalue

## Get Predicted Tags from Labelled Essays

In [16]:
def print_comparison(ysbytag, algo2preds, alternative="two-sided", stats_only=False):
    """Print the pairwise significance results for every ordered pair of algorithms.

    One line is printed per ordered pair (A, B) with A != B, followed by a
    blank line after each A. Every unordered pair therefore appears twice.

    Args:
        ysbytag: Mapping label -> gold values.
        algo2preds: Mapping algorithm name -> (mapping label -> predictions).
        alternative: Forwarded to ``compute_p_value_binomial_test``.
        stats_only: When True, print only the two names and the p-value;
            otherwise also print each algorithm's micro-F1 score.

    Returns:
        None. Results are only printed.
    """
    # Micro-averaged metrics per algorithm, computed once up front.
    algo2metrics = {
        algo: ResultsProcessor.compute_mean_metrics(ysbytag, preds)[__MICRO_F1__]
        for algo, preds in algo2preds.items()
    }

    for algo_name_a, predsbytaga in algo2preds.items():
        f1_a = algo2metrics[algo_name_a]["f1_score"]
        for algo_name_b, predsbytagb in algo2preds.items():
            if algo_name_a == algo_name_b:
                continue
            f1_b = algo2metrics[algo_name_b]["f1_score"]
            pval = compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb, alternative)
            if stats_only:
                print(f"{algo_name_a.ljust(20)} \t{algo_name_b.ljust(20)}\t {pval}")
            else:
                print(f"{algo_name_a.ljust(20)} {f1_a:.4f}\t{algo_name_b.ljust(20)} {f1_b:.4f}\t {pval}")
        # Blank line between the groups of rows for each algorithm A.
        print()

In [17]:
import mlxtend
from mlxtend.evaluate import cochrans_q

def run_cochrans_test(ysbytag, predsbyalgo):
    """Run Cochran's Q test across all algorithms' predictions.

    Args:
        ysbytag: Mapping label -> gold values.
        predsbyalgo: Mapping algorithm name -> (mapping label -> predictions).

    Returns:
        Tuple ``(qvalue, pvalue)`` as returned by mlxtend's ``cochrans_q``.
    """
    # Use only labels present in the gold data and in every algorithm's
    # predictions, the same rule compute_p_value_binomial_test applies.
    # Without it, a label missing from one mapping would shift or shorten
    # that flattened array relative to the others.
    shared_labels = set(ysbytag.keys())
    for preds in predsbyalgo.values():
        shared_labels = shared_labels.intersection(preds.keys())

    all_preds = [
        np.asarray(get_all_preds(preds, k_filter=shared_labels))
        for preds in predsbyalgo.values()
    ]
    ground_truth = np.asarray(get_all_preds(ysbytag, k_filter=shared_labels))
    qvalue, pvalue = cochrans_q(ground_truth, *all_preds)
    return qvalue, pvalue

# CB

In [18]:
run_cochrans_test(cb_ysbytag, cb_preds)

(7.0, 0.0301973834223185)

In [19]:
# Pairwise McNemar p-values, with micro-F1 scores, for the CB predictions.
# Original note: "perceptron and CRF differ somewhat from prior run".
# NOTE(review): "CRF" does not obviously correspond to any of the algorithms
# compared here (SENT_PARSER, ESSAY_PARSER, RE-RANKER) - this note may be
# left over from an earlier version; confirm or remove.
print_comparison(cb_ysbytag, cb_preds)

SENT_PARSER          0.7366	ESSAY_PARSER         0.7393	 0.1453655497472726
SENT_PARSER          0.7366	RE-RANKER            0.7500	 0.0021949089077805944

ESSAY_PARSER         0.7393	SENT_PARSER          0.7366	 0.1453655497472726
ESSAY_PARSER         0.7393	RE-RANKER            0.7500	 0.43699054908597207

RE-RANKER            0.7500	SENT_PARSER          0.7366	 0.0021949089077805944
RE-RANKER            0.7500	ESSAY_PARSER         0.7393	 0.43699054908597207



# SC

In [21]:
run_cochrans_test(sc_ysbytag, sc_preds) # not surprising, results are virtually identical

(4.130232558139535, 0.12680354511244885)

In [23]:
# Pairwise McNemar p-values, with micro-F1 scores, for the SC predictions.
# Original note: "RNN only differs".
# NOTE(review): no algorithm named RNN is among those compared here
# (SENT_PARSER, ESSAY_PARSER, RE-RANKER) - this note may be left over from
# an earlier version; confirm or remove.
print_comparison(sc_ysbytag, sc_preds)

SENT_PARSER          0.8271	ESSAY_PARSER         0.8210	 0.15188924140345142
SENT_PARSER          0.8271	RE-RANKER            0.8292	 0.9049745264594615

ESSAY_PARSER         0.8210	SENT_PARSER          0.8271	 0.15188924140345142
ESSAY_PARSER         0.8210	RE-RANKER            0.8292	 0.12135284705943229

RE-RANKER            0.8292	SENT_PARSER          0.8271	 0.9049745264594615
RE-RANKER            0.8292	ESSAY_PARSER         0.8210	 0.12135284705943229



## Stats Only

In [57]:
print_comparison(cb_ysbytag, cb_preds, stats_only=True)

SENT_PARSER          	ESSAY_PARSER        	 0.1453655497472726
SENT_PARSER          	RE-RANKER           	 0.0021949089077805944

ESSAY_PARSER         	SENT_PARSER         	 0.1453655497472726
ESSAY_PARSER         	RE-RANKER           	 0.43699054908597207

RE-RANKER            	SENT_PARSER         	 0.0021949089077805944
RE-RANKER            	ESSAY_PARSER        	 0.43699054908597207



In [58]:
print_comparison(sc_ysbytag, sc_preds, stats_only=True)

SENT_PARSER          	ESSAY_PARSER        	 0.15188924140345142
SENT_PARSER          	RE-RANKER           	 0.9049745264594615

ESSAY_PARSER         	SENT_PARSER         	 0.15188924140345142
ESSAY_PARSER         	RE-RANKER           	 0.12135284705943229

RE-RANKER            	SENT_PARSER         	 0.9049745264594615
RE-RANKER            	ESSAY_PARSER        	 0.12135284705943229

