# Performance figures

In [3]:
import numpy as np
import pandas as pd
from itertools import islice
from ast import literal_eval
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Rectangle
%matplotlib inline
# Set the default font size used by every figure in this notebook
plt.rcParams['font.size'] = 14

## Baseline model performance
There were 3 baselines:
1. GPT3
    * Without relation embedding
    * With relation embedding
2. OpenIE
    * Without relation embedding
    * With relation embedding
3. My heuristic model
    * Model relies on relation embedding, so only includes relation embedding

Each was evaluated according to the following:
1. GPT3
    * Without relation embedding: evaluated without checking relation labels
    * With relation embedding: evaluated both with and without checking relation labels
2. OpenIE
    * Without relation embedding: evaluated without checking relation labels
    * With relation embedding: evaluated both with and without checking relation labels
3. My heuristic model
    * With relation embedding: evaluated with and without checking relation labels

### Reading in the data

In [1]:
# Performance CSV for each baseline, keyed by '<model>_<evaluation setting>'.
# The OpenIE entries have not been filled in yet:
#   openIE_no_embed, openIE_embed_check, openIE_embed_no_check
data_paths = dict(
    gpt3_no_embed='../data/baselines/gpt3/10Mar_FINAL_TEST_no_rel_embed_performance.csv',
    gpt3_embed_check='../data/baselines/gpt3/10Mar_FINAL_TEST_embedded_rel_yes_check_rels_performance.csv',
    gpt3_embed_no_check='../data/baselines/gpt3/10Mar_FINAL_TEST_embedded_rel_no_check_rels_performance.csv',
    heuristic_check='../data/distant_sup_output/09Mar_TEST_check_rels_performance.csv',
    heuristic_no_check='../data/distant_sup_output/09Mar_TEST_no_check_rels_performance.csv',
)

In [6]:
# Load each performance CSV into its own DataFrame, keyed like data_paths
perf_dfs = {}
for name, csv_path in data_paths.items():
    frame = pd.read_csv(csv_path)
    # Tag every row with the model/evaluation setting it came from (appended as the last column)
    frame['model_eval'] = name
    perf_dfs[name] = frame

# Stack the per-model frames, in data_paths order, into a single table.
# NOTE(review): the displayed result has an 'Unnamed: 0' column and a repeated index of 0;
# presumably the CSVs were written with their index -- confirm before relying on either.
perf_df_list = list(perf_dfs.values())
overall_perfs = pd.concat(perf_df_list)
overall_perfs

Unnamed: 0,precision,precision_CI,recall,recall_CI,F1,F1_CI,model_eval
0,0.011923,"(0.0, 0.023809523809523808)",0.010914,"(0.0, 0.02631578947368421)",0.011292,"(0.0, 0.024691358024691357)",gpt3_no_embed
0,0.0,"(0.0, 0.0)",0.0,"(0.0, 0.0)",0.0,"(0.0, 0.0)",gpt3_embed_check
0,0.012005,"(0.0, 0.023255813953488372)",0.011105,"(0.0, 0.02631578947368421)",0.011423,"(0.0, 0.023676470588235285)",gpt3_embed_no_check
0,0.042277,"(0.0, 0.1)",0.011388,"(0.0, 0.02564102564102564)",0.017627,"(0.0, 0.03541666666666665)",heuristic_check
0,0.041411,"(0.0, 0.1)",0.010674,"(0.0, 0.02300475687103593)",0.016738,"(0.0, 0.036111111111111066)",heuristic_no_check


#### NOTE
I find these results very suspicious, for two reasons:
1. The performance of the heuristic algorithm is higher when checking the relation labels than when not checking them. This shouldn't be possible, as anything evaluated as correct when checking relation labels should still be correct when evaluated without checking them.
2. The performance of the heuristic algorithm is *drastically* worse than it was the first time I ran it, before making any of the changes that helped me drop far fewer sentences. Again, the performance should be at least the same, even if having access to more sentences didn't help, unless sentences that weren't dropped before are now being dropped, which I find hard to believe, as a huge number of sentences aren't dropped now. However, that original performance was evaluated on all documents, not just the test set, so this observation could potentially be due to the specific documents in the test set. **TODO:** evaluate the old predictions on just the current test set to test that hypothesis.

### Generating the plot

In [None]:
# Draw one shape per model entry (ellipse, or a thin rectangle for zero-valued entries), then add
# a colour legend, a line-style legend and three explanatory annotations.
#
# NOTE(review): this cell reads `full_laydown_order`, `f1_full_entries` and `full_entry_colors`,
# none of which are defined in the cells visible above; they must exist before this cell can run
# on a fresh kernel -- TODO confirm where they are defined.
# NOTE(review): the labels and annotation texts below refer to PICKLE / SciERC / GENIA / ACE05,
# not to the GPT3 / OpenIE / heuristic baselines loaded above -- confirm they match the plotted data.

# With a single subplot, plt.subplots returns one Axes object rather than an array,
# so it is unpacked directly instead of being indexed with [0].
fig, ax = plt.subplots(1, subplot_kw={'aspect': 'equal'}, figsize=(20, 20), sharex=True)
axs = [ax]  # keep the original name usable for any other cell that indexes it
fig.subplots_adjust(hspace=0.05)

######################### Make 2D plot for models with both values ##############################
for label in full_laydown_order:

    # Entry is indexed below as (center x, center y, width, height)
    ell_tup = f1_full_entries[label]
    # Colour is keyed by the label minus its last ' | '-separated field;
    # that last field selects the line style (dotted for 'Original', solid otherwise)
    color_label = ' | '.join(label.split(' | ')[:-1])
    color = full_entry_colors[color_label]
    if label.split(' | ')[-1] == 'Original':
        linestyle = ':'
    else:
        linestyle = '-'

    # If the relation performance and CI are zero, need to make a thin rectangle instead of an ellipse
    # NOTE(review): `0 in ell_tup` is true when ANY of the four values is zero, not only the
    # relation (y) values -- confirm that is intended.
    if 0 in ell_tup:
        # Rectangle is anchored at its lower-left corner, so shift left by half the width to centre it
        rect_x = ell_tup[0] - 0.5*ell_tup[2]
        shape = Rectangle((rect_x, 0), ell_tup[2], 0.02, edgecolor='black', linewidth=2,
                          linestyle=linestyle, facecolor=color, label=color_label, alpha=0.5)
    else:
        shape = Ellipse((ell_tup[0], ell_tup[1]), ell_tup[2], ell_tup[3], edgecolor='black', linewidth=2,
                        linestyle=linestyle, facecolor=color, label=color_label, alpha=0.5)

    # Add shape to axis. add_patch is the dedicated method for patches and guarantees they are
    # picked up by get_legend_handles_labels below; it does not rescale the view on its own.
    ax.add_patch(shape)

ax.set_ylabel('Relation Performance')

# Keep only the solid line version of each model/training set combination
# (handles come back in the order the shapes were added, i.e. aligned with full_laydown_order)
ellipses, labels = ax.get_legend_handles_labels()
keep_labs, keep_ells = [], []
for lab, ell in zip(full_laydown_order, ellipses):
    if lab not in keep_labs and 'PICKLE' in lab:
        keep_labs.append(lab)
        keep_ells.append(ell)
legend1 = ax.legend(handles=keep_ells, loc='upper left')
# Dummy lines with NO entries, just to create the black style legend
dummy_lines = []
for linestyle, label in zip([':', '-'], ['Original', 'PICKLE']):
    dummy_lines.append(Ellipse((0, 0), 0, 0, edgecolor="black", linewidth=2, linestyle=linestyle, label=label,
                              facecolor='white'))
legend2 = ax.legend(dummy_lines, ['Original', 'PICKLE'])
# `legendHandles` was renamed to `legend_handles` in newer Matplotlib; support both spellings
if hasattr(legend2, 'legend_handles'):
    style_handles = legend2.legend_handles
else:
    style_handles = legend2.legendHandles
for legobj in style_handles:
    if legobj.get_label() == 'PICKLE':
        legobj.set_linewidth(1)
# The second ax.legend() call replaced legend1 as the Axes' legend, so legend1 has to be re-added
# by hand; legend2 is already attached and adding it again would only draw it twice.
ax.add_artist(legend1)

############################################### Annotate plot #######################################
orig_text = 'Performance on original\ndomain for all models\nis nearly identical'
pick_text = 'On PICKLE, SciERC models\noutperform GENIA models\nfor entity extraction'
ace_text = 'ACE05 models extract 0\nrelations, and also have\npoor entity extraction'

# Vertical double-headed arrow at x=0.8 with its text box to the left
ax.annotate('', xy=(0.8, 0), xytext=(0.8,0.45), arrowprops=dict(arrowstyle='<->', lw=2))
ax.text(
    0.64, 0.4, orig_text, ha="center", va="center", size=12,
    bbox=dict(boxstyle='round', fc="white", ec="black", lw=2))

# Bracket-style connector between (0.3, 0) and (0.55, 0.1) with its text box above
ax.annotate('', xy=(0.3, 0), xycoords='data',
            xytext=(0.55, 0.1), textcoords='data',
            arrowprops=dict(arrowstyle="<->",
                            connectionstyle="bar,angle=360,fraction=-0.2", lw=2))
ax.text(
    0.43, 0.22, pick_text, ha="center", va="center", size=12,
    bbox=dict(boxstyle='round', fc="white", ec="black", lw=2))

# Single-headed arrow pointing down to (0.1, 0.03) from its text box
ax.annotate('', xy=(0.1, 0.03), xytext=(0.1,0.45), arrowprops=dict(arrowstyle='->', lw=2))
ax.text(
    0.175, 0.5, ace_text, ha="center", va="center", size=12,
    bbox=dict(boxstyle='round', fc="white", ec="black", lw=2));
