## NOTE - Use Python 3.6

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pymongo
import dill
import os

from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from window_based_tagger_config import get_config
from FindFiles import find_files
from DirUtils import dir_exists
from results_procesor import ResultsProcessor, __MICRO_F1__
from Settings import Settings
from statsmodels.stats.contingency_tables import mcnemar

In [2]:
# result = mcnemar([[1,2],[3,1]], exact=True)
# result.statistic, result.pvalue

In [3]:
# Directory that is listed for the pickled prediction (*_PREDS_*) and label (*_YS_*) .dill files.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
FOLDER = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/Predictions/"

In [4]:
def filter_by_str(s, files, exclude=False):
    """Select the entries of `files` according to whether they contain `s`.

    With `exclude` false (the default) only names containing the substring
    `s` are kept; with `exclude` true only names that do NOT contain it are
    kept. The input order is preserved and a new list is returned.
    """
    kept = []
    for name in files:
        contains = s in name
        # Keep the name when "contains s" disagrees with the exclude flag.
        if contains != exclude:
            kept.append(name)
    return kept

In [5]:
# Narrow the directory listing down to the files compared below, printing
# the remaining count at selected steps.
all_files = os.listdir(FOLDER)
print(len(all_files))
# Work on a copy so the unfiltered listing stays available in `all_files`.
files = list(all_files)
# files = filter_by_str("_TAGGING_", files, exclude=True)
files = filter_by_str("_HYPER_PARAM2_", files, exclude=True) # this is the essay parser
# Keep only names containing "_CR_" ...
files = filter_by_str("_CR_", files) 
# files = filter_by_str("_RNN_", files)
# ... and "_VD_".
files = filter_by_str("_VD_", files)
print(len(files))
# Drop every file whose name contains "2019".
files = filter_by_str("2019", files, exclude=True)
print(len(files))

988
76
14


In [6]:
# Display the remaining file names in sorted order for a visual check.
sorted(files)

['TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FINAL_RUN_VD_PREDS__.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_FINAL_RUN_VD_YS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD_PREDS_.dill',
 'TEST_CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD_YS_.dill',
 'TEST_CR_CB_STACKED_VD_PREDS_.dill',
 'TEST_CR_CB_STACKED_VD_YS_.dill',
 'TEST_CR_CB_TAGGING_VD_MOST_COMMON_TAG_RNN_PREDS_.dill',
 'TEST_CR_CB_TAGGING_VD_MOST_COMMON_TAG_RNN_YS_.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_FINAL_RUN_VD_PREDS_.dill',
 'TEST_CR_SC_SHIFT_REDUCE_PARSER_TEMPLATED_FINAL_RUN_VD_YS_.dill',
 'TEST_CR_SC_STACKED_VD_PREDS_.dill',
 'TEST_CR_SC_STACKED_VD_YS_.dill',
 'TEST_CR_SC_TAGGING_VD_MOST_COMMON_TAG_RNN_PREDS_.dill',
 'TEST_CR_SC_TAGGING_VD_MOST_COMMON_TAG_RNN_YS_.dill']

In [7]:
# Split the remaining files into two groups by the "_SC_" / "_CB_" marker in
# the file name, then show how many files fall into each group.
sc_files = filter_by_str("_SC_", files)
cb_files = filter_by_str("_CB_", files)
len(cb_files), len(sc_files)

(8, 6)

In [8]:
from collections import OrderedDict

# Display name of each algorithm -> substring that identifies its files.
# Built from a list of pairs rather than a dict literal: the order of a plain
# dict literal is only an implementation detail on Python 3.6, while a list of
# pairs gives OrderedDict a guaranteed insertion order on every version.
filters = OrderedDict([
    ("STACKED_CLASSIFIER", "STACKED"),
    ("RNN", "RNN"),
    # "_FINAL_RUN" is included so only the final-run parser files match.
    ("PARSER", "PARSER_TEMPLATED_FINAL_RUN"),
])

In [9]:
from pprint import pprint

def load_predictions(input_files):
    """Load the pickled predictions of every algorithm listed in `filters`.

    Only file names containing "_PREDS" are considered. For each
    (algorithm name, substring) pair in the module-level `filters`, exactly
    one of those files must contain the substring; it is read from `FOLDER`
    and un-pickled with dill.

    Args:
        input_files: iterable of file names (not full paths) inside FOLDER.

    Returns:
        dict mapping algorithm name -> the object stored in its file.

    Raises:
        AssertionError: if an algorithm matches zero or several files.
    """
    algo2preds = dict()
    pred_files = filter_by_str("_PREDS", input_files)
    for algo_name, fltr in filters.items():
        f_files = filter_by_str(fltr, pred_files)
        assert len(f_files) == 1, (algo_name, f_files)
        fname = os.path.join(FOLDER, f_files[0])
        # Open read-only: "rb+" would demand write permission for a pure read.
        # NOTE: dill.load executes arbitrary code from the file - only use it
        # on prediction files you produced yourself.
        with open(fname, "rb") as f:
            algo2preds[algo_name] = dill.load(f)
    return algo2preds

In [10]:
# Load one predictions object per algorithm in `filters`, separately for the
# "_CB_" and the "_SC_" file groups.
cb_preds = load_predictions(cb_files)
sc_preds = load_predictions(sc_files)

In [11]:
# Load the label ("_YS_") objects saved with the STACKED runs; they are used
# below as the reference that every algorithm's predictions are scored against.
# Opened read-only ("rb"): "rb+" would needlessly require write permission.
# NOTE: dill.load executes arbitrary code from the file - trusted files only.
with open(FOLDER + "TEST_CR_CB_STACKED_VD_YS_.dill", "rb") as f:
    cb_ysbytag = dill.load(f)

with open(FOLDER + "TEST_CR_SC_STACKED_VD_YS_.dill", "rb") as f:
    sc_ysbytag = dill.load(f)

In [12]:
# sorted(files)

In [13]:
from scipy import stats

In [14]:
def get_all_preds(predsbytag, k_filter=None):
    """Flatten a {tag: sequence} mapping into one list, tag by tag.

    Tags are visited in sorted order so that different mappings with the
    same tags are flattened in the same, alignable order. When `k_filter`
    is given, only tags contained in it contribute; when it is None every
    tag is used.
    """
    allowed = predsbytag.keys() if k_filter is None else k_filter
    flattened = []
    for tag in sorted(predsbytag):
        if tag in allowed:
            flattened += predsbytag[tag]
    return flattened

In [15]:
def compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb, alternative='two-sided'):
    """Return the exact McNemar p-value for two classifiers' predictions.

    Despite its name, this function runs statsmodels' `mcnemar` with
    `exact=True` on the 2x2 agreement table of classifiers A and B.

    Only tags present in all three mappings are compared. Their values are
    flattened in sorted-tag order (see `get_all_preds`) and scored item by
    item against the labels.

    Args:
        ysbytag: mapping tag -> sequence of reference labels.
        predsbytaga: mapping tag -> sequence of classifier A's predictions.
        predsbytagb: mapping tag -> sequence of classifier B's predictions.
        alternative: accepted for backward compatibility but NOT used; the
            McNemar call takes no such option.

    Returns:
        The `pvalue` attribute of the McNemar result.

    Raises:
        AssertionError: if the label and prediction sequences differ in length.
    """
    # Cheap sanity check on one tag before flattening everything.
    first = list(ysbytag.keys())[0]
    assert len(ysbytag[first]) == len(predsbytaga[first])
    assert len(ysbytag[first]) == len(predsbytagb[first])

    # Restrict the comparison to the tags all three mappings share.
    lbls = set(ysbytag.keys())
    lbls = lbls.intersection(predsbytaga.keys())
    lbls = lbls.intersection(predsbytagb.keys())

    ys = get_all_preds(ysbytag, k_filter=lbls)
    aas = get_all_preds(predsbytaga, k_filter=lbls)
    bbs = get_all_preds(predsbytagb, k_filter=lbls)

    assert len(ys) == len(aas) == len(bbs)

    both_correct = both_wrong = a_correct_only = b_correct_only = 0
    for y, a, b in zip(ys, aas, bbs):
        a_ok = a == y
        b_ok = b == y
        if a_ok and b_ok:
            both_correct += 1
        elif a_ok:
            a_correct_only += 1
        elif b_ok:
            b_correct_only += 1
        else:
            # Covers disagreeing predictions that are BOTH wrong (possible
            # with more than two label values); B gets no credit for these.
            both_wrong += 1

    # Rows: A correct / A wrong; columns: B correct / B wrong.
    mcn_result = mcnemar(
        [[both_correct, a_correct_only], [b_correct_only, both_wrong]],
        exact=True,
    )
    return mcn_result.pvalue

## Get Predicted Tags from Labelled Essays

In [16]:
def print_comparison(ysbytag, algo2preds, alternative="two-sided", stats_only=False):
    """Print a pairwise significance comparison of all algorithms.

    For every ordered pair (A, B) of distinct algorithms one line is
    printed containing both names and the p-value returned by
    `compute_p_value_binomial_test`; unless `stats_only` is true the line
    also shows each algorithm's micro-averaged F1 score. A blank line is
    printed after each algorithm A's group of lines.

    Args:
        ysbytag: mapping tag -> reference labels.
        algo2preds: mapping algorithm name -> (mapping tag -> predictions).
        alternative: forwarded unchanged to `compute_p_value_binomial_test`.
        stats_only: when true, omit the F1 columns.

    Returns:
        None; the results are only printed.
    """
    # Micro-averaged metrics per algorithm, computed once up front.
    algo2metrics = {}
    for algo, preds in algo2preds.items():
        mean_metrics = ResultsProcessor.compute_mean_metrics(ysbytag, preds)
        algo2metrics[algo] = mean_metrics[__MICRO_F1__]

    for algo_name_a, predsbytaga in algo2preds.items():
        for algo_name_b, predsbytagb in algo2preds.items():
            if algo_name_a == algo_name_b:
                continue
            f1_a = algo2metrics[algo_name_a]["f1_score"]
            f1_b = algo2metrics[algo_name_b]["f1_score"]
            pval = compute_p_value_binomial_test(ysbytag, predsbytaga, predsbytagb, alternative)
            if stats_only:
                print(f"{algo_name_a.ljust(20)} \t{algo_name_b.ljust(20)}\t {pval}")
            else:
                print(f"{algo_name_a.ljust(20)} {f1_a:.4f}\t{algo_name_b.ljust(20)} {f1_b:.4f}\t {pval}")
        # Blank line separating the groups of rows.
        print()

In [17]:
#perceptron and CRF differ somewhat from prior run
# NOTE(review): the note above names algorithms that are not among the three
# compared here - possibly left over from an earlier version; confirm.
# Pairwise comparison (micro F1 and p-value) on the "_CB_" files.
print_comparison(cb_ysbytag, cb_preds)

STACKED_CLASSIFIER   0.7081	RNN                  0.6730	 1.6427816964126646e-05
STACKED_CLASSIFIER   0.7081	PARSER               0.7264	 0.6931012792713278

RNN                  0.6730	STACKED_CLASSIFIER   0.7081	 1.6427816964126646e-05
RNN                  0.6730	PARSER               0.7264	 0.0003808264095775129

PARSER               0.7264	STACKED_CLASSIFIER   0.7081	 0.6931012792713278
PARSER               0.7264	RNN                  0.6730	 0.0003808264095775129



In [18]:
# RNN only differs
# Pairwise comparison (micro F1 and p-value) on the "_SC_" files.
print_comparison(sc_ysbytag, sc_preds)

STACKED_CLASSIFIER   0.7674	RNN                  0.7851	 0.4932963607808494
STACKED_CLASSIFIER   0.7674	PARSER               0.7908	 0.11321794589246366

RNN                  0.7851	STACKED_CLASSIFIER   0.7674	 0.4932963607808494
RNN                  0.7851	PARSER               0.7908	 0.4148572810310714

PARSER               0.7908	STACKED_CLASSIFIER   0.7674	 0.11321794589246366
PARSER               0.7908	RNN                  0.7851	 0.4148572810310714



## Stats Only

In [19]:
# Same "_CB_" comparison, printing only the p-values (no F1 columns).
print_comparison(cb_ysbytag, cb_preds, stats_only=True)

STACKED_CLASSIFIER   	RNN                 	 1.6427816964126646e-05
STACKED_CLASSIFIER   	PARSER              	 0.6931012792713278

RNN                  	STACKED_CLASSIFIER  	 1.6427816964126646e-05
RNN                  	PARSER              	 0.0003808264095775129

PARSER               	STACKED_CLASSIFIER  	 0.6931012792713278
PARSER               	RNN                 	 0.0003808264095775129



In [20]:
# Same "_SC_" comparison, printing only the p-values (no F1 columns).
print_comparison(sc_ysbytag, sc_preds, stats_only=True)

STACKED_CLASSIFIER   	RNN                 	 0.4932963607808494
STACKED_CLASSIFIER   	PARSER              	 0.11321794589246366

RNN                  	STACKED_CLASSIFIER  	 0.4932963607808494
RNN                  	PARSER              	 0.4148572810310714

PARSER               	STACKED_CLASSIFIER  	 0.11321794589246366
PARSER               	RNN                 	 0.4148572810310714

