In [None]:
import pandas as pd
import pickle
import numpy as  np
from sklearn.metrics import classification_report, f1_score
import seaborn as sns
from matplotlib import pyplot as plt
# Global seaborn theme applied to the figures drawn later in the notebook.
sns.set(style='ticks', palette='Set2')
# NOTE(review): no figure or axes has been created yet at this point, so this
# call presumably has nothing to strip spines from — confirm, and call
# sns.despine() after each plot if spine removal is wanted.
sns.despine()

In [None]:
# Load the whole dataset; its 'src' column is used below to pick the train and
# test splits.
dataset = pd.read_pickle('whole7.pickle')

In [None]:
# Load a pickled classifier; below it is only used through predict_proba.
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load model files that come from a trusted source.
with open('train_new_out_ablation_1001/aida_under_all_max_stats10_ner_wiki_model.pickle', 'rb') as fd:
    clf = pickle.load(fd)

In [None]:
# Training split: rows whose 'src' is the AIDA-YAGO2 train file.
train_df = dataset.loc[
    dataset['src'].isin(['dataset_and_preds/AIDA-YAGO2_train.csv'])
]

In [None]:
# Test split: rows coming from either of the two AIDA-YAGO2 test files.
test_df = dataset.loc[
    dataset['src'].isin([
        'dataset_and_preds/AIDA-YAGO2_testa.csv',
        'dataset_and_preds/AIDA-YAGO2_testb.csv',
    ])
]

In [None]:
# Names of the dataset columns fed to the classifier, in the order in which
# they appear in the feature matrix.
features = [
    # score statistics with the 'cross_stats_10_' prefix
    'cross_stats_10_max', 'cross_stats_10_mean',
    'cross_stats_10_median', 'cross_stats_10_stdev',
    # score statistics with the 'bi_stats_10_' prefix
    'bi_stats_10_max', 'bi_stats_10_mean',
    'bi_stats_10_median', 'bi_stats_10_stdev',
    # 'ner_*' columns
    'ner_per', 'ner_loc', 'ner_org', 'ner_misc',
    # 'wiki_*_cross' columns
    'wiki_per_cross', 'wiki_loc_cross', 'wiki_org_cross', 'wiki_misc_cross',
]

In [None]:
# Name of the target column: used to split the classes for undersampling and
# as ground truth (y_train / y_test) below.
y_whom = 'y_cross'

In [None]:
# Record the split sizes before any filtering so the size report can show how
# many rows each step removes.
train_df_shape_original = train_df.shape[0]
test_df_shape_original = test_df.shape[0]

# Keep only rows in which every feature is present. `.copy()` detaches the
# result from `dataset`, so that columns can be added to test_df later without
# pandas emitting SettingWithCopyWarning.
train_df = train_df[train_df[features].notna().all(axis=1)].copy()
test_df = test_df[test_df[features].notna().all(axis=1)].copy()

train_df_shape_notna = train_df.shape[0]
test_df_shape_notna = test_df.shape[0]

print('undersampling...')

# Fixed seed so the balanced training sample is identical on every run.
UNDERSAMPLE_SEED = 1244

train_df_0 = train_df.query(f'{y_whom} == 0')
train_df_1 = train_df.query(f'{y_whom} == 1')

# Shuffle class 1 and keep at most as many rows as class 0 has (class 1 is left
# complete when it is already the smaller class), then shuffle the union.
train_df_1 = train_df_1.sample(frac=1, random_state=UNDERSAMPLE_SEED).iloc[:train_df_0.shape[0]]
train_df = pd.concat([train_df_0, train_df_1]).sample(frac=1, random_state=UNDERSAMPLE_SEED)

In [None]:
# Row counts after undersampling (only the train split is resampled).
train_df_shape_actual = len(train_df)
test_df_shape_actual = len(test_df)

# Markdown table with one row per processing stage and one column per split.
df_size_report = pd.DataFrame(
    data={
        'train': [train_df_shape_original, train_df_shape_notna, train_df_shape_actual],
        'test': [test_df_shape_original, test_df_shape_notna, test_df_shape_actual],
    },
    index=['original', 'notna', 'actual'],
).to_markdown()
print(df_size_report)

# Class balance of the (undersampled) training split.
print(train_df[y_whom].value_counts().to_frame().to_markdown())

# Feature matrices and target vectors as plain arrays.
X_train, y_train = train_df[features].values, train_df[y_whom].values
X_test, y_test = test_df[features].values, test_df[y_whom].values


In [None]:
# Probability of the positive class: column 1 of predict_proba, taken with
# array slicing instead of a Python-level map over the rows.
y_pred = np.asarray(clf.predict_proba(X_test))[:, 1]
# Hard 0/1 decision. np.round rounds halves to the nearest even value, so a
# probability of exactly 0.5 becomes 0.
y_pred_round = np.round(y_pred)

# assign() returns a new frame instead of writing into a filtered slice of
# `dataset`, which avoids SettingWithCopyWarning.
test_df = test_df.assign(y_pred_round=y_pred_round, y_pred=y_pred)

# Baselines: number of rows whose best candidate equals the label or whose best
# candidate title equals the Wikipedia title, regardless of the classifier.
bi_baseline = test_df.query('bi_labels == bi_best_candidate or Wikipedia_title == bi_best_candidate_title').shape[0]
cross_baseline = test_df.query('cross_labels == cross_best_candidate or Wikipedia_title == cross_best_candidate_title').shape[0]

# With the classifier: a row counts when it is accepted (y_pred_round == 1) and
# the best candidate matches, or when it is rejected (y_pred_round == 0) and the
# label is -1 (presumably "no entity" — TODO confirm).
bi_acc = test_df.query('(y_pred_round == 1 and (bi_labels == bi_best_candidate or Wikipedia_title == bi_best_candidate_title)) or (bi_labels == -1 and y_pred_round == 0)').shape[0]
cross_acc = test_df.query('(y_pred_round == 1 and (cross_labels == cross_best_candidate or Wikipedia_title == cross_best_candidate_title)) or (cross_labels == -1 and y_pred_round == 0)').shape[0]

# Text report of the rounded predictions against the ground truth.
_classification_report = classification_report(y_test, y_pred_round)

In [None]:
# Classification report of the rounded predictions on the full test split.
print(_classification_report)

In [None]:
# Identifier stored in the 'name' field of every csv_report row.
name = 'oracle_1'
# Label of the evaluated test set; in the cells shown here it is only printed.
test = 'aida_test'

In [None]:
# Empty report frame; it is re-initialised in a later cell, right before the
# threshold loop that fills it.
csv_report = pd.DataFrame()

In [None]:
# Scratch cell: shows intervals centred on 0.5 that widen by 0.05 on each side
# per step. The result is only displayed, not assigned.
[(0.5-i/20, 0.5+i/20) for i in range(11)]

In [None]:
# (tl, th) pairs. The evaluation loop keeps rows with y_pred < tl or
# y_pred > th for automatic scoring, so the interval widens from top to bottom.
intervals = [
    (0.60, 0.40),  # no human validation: tl > th, so every row satisfies the filter
    (0.45, 0.55),
    (0.40, 0.60),
    (0.35, 0.65),
    (0.30, 0.70),
    (0.25, 0.75),
    (0.20, 0.80),
    (0.15, 0.85),
    (0.10, 0.90),
    (0.05, 0.95),
    # (0.0, 1.0)  # all validated
]
intervals

In [None]:
# Sanity check: print every (tl, th) pair the evaluation loop will iterate over.
for tl, th in intervals:
    print(tl ,th)

In [None]:
# Start from an empty report so the threshold loop below fills it from scratch.
csv_report = pd.DataFrame()

In [None]:
# Inspect which columns are available on the test split.
test_df.columns

In [None]:
# Expose the target column under the fixed name 'y_test', which the threshold
# loop reads. assign() returns a new frame instead of writing into a filtered
# slice, which avoids SettingWithCopyWarning.
test_df = test_df.assign(y_test=test_df[y_whom])

In [None]:
# Simulated human-in-the-loop (HITL) evaluation.
# For each (tl, th) pair, rows with y_pred < tl or y_pred > th are scored
# automatically; the remaining rows are treated as sent to a human validator
# (the "oracle") and counted as correct in the "*_overall" figures.
# TODO maybe look for a better way to get the thresholds
#      (e.g. correct-error kde intersections ?)


def _acc_query(encoder, rejected_ok):
    """Build the pandas query selecting the rows that are handled correctly.

    A row is selected when the prediction is accepted (y_pred_round == 1) and
    the best candidate of `encoder` ('bi' or 'cross') equals the label or its
    title equals the Wikipedia title, or when the prediction is rejected
    (y_pred_round == 0) and the `rejected_ok` condition holds.
    """
    return (
        f'(y_pred_round == 1 and ({encoder}_labels == {encoder}_best_candidate'
        f' or Wikipedia_title == {encoder}_best_candidate_title))'
        f' or ({rejected_ok} and y_pred_round == 0)'
    )


# One query per reported accuracy. The "correcting_nel" variants accept a
# rejection whenever the best candidate differs from the label, not only when
# the label is -1.
_ACC_QUERIES = {
    'bi_acc': _acc_query('bi', 'bi_labels == -1'),
    'cross_acc': _acc_query('cross', 'cross_labels == -1'),
    'bi_acc_correcting_nel': _acc_query('bi', 'bi_labels != bi_best_candidate'),
    'cross_acc_correcting_nel': _acc_query('cross', 'cross_labels != cross_best_candidate'),
}


def _subset_metrics(df):
    """Return the correct-row counts and the per-class / macro F1 for `df`.

    `df` must contain the 'y_test' and 'y_pred_round' columns plus the columns
    referenced by the accuracy queries.
    """
    metrics = {key: df.query(query).shape[0] for key, query in _ACC_QUERIES.items()}
    metrics['f1_0'] = f1_score(df['y_test'], df['y_pred_round'], pos_label=0)
    metrics['f1_1'] = f1_score(df['y_test'], df['y_pred_round'], pos_label=1)
    metrics['macro_f1'] = (metrics['f1_0'] + metrics['f1_1']) / 2
    return metrics


# Figures for the whole test split do not depend on the interval, so they are
# computed once instead of in every iteration.
oracle_original_shape = test_df.shape[0]
_f1_0 = f1_score(y_test, y_pred_round, pos_label=0)
_f1_1 = f1_score(y_test, y_pred_round, pos_label=1)
_macro_avg_f1 = (_f1_0 + _f1_1) / 2

# Rows are collected in a list and turned into a frame once after the loop:
# DataFrame.append was removed in pandas 2.0, and rebuilding the report here
# keeps the cell idempotent when it is re-run.
report_rows = []

for tl, th in intervals:
    # Rows that stay with the automatic pipeline for this interval.
    test_df_oracle = test_df.query(f'y_pred < {tl} or y_pred > {th}')
    n_auto = test_df_oracle.shape[0]
    # Rows handed to the human validator; counted as correct in "*_overall".
    n_human = oracle_original_shape - n_auto
    oracle_ratio = 1 - (n_auto / oracle_original_shape)

    # Control group: the same number of rows, drawn at random with a fixed seed.
    test_df_oracle_random = test_df.sample(n=n_auto, random_state=1244)
    # The original assert compared test_df_oracle with itself; the intended
    # check is that both subsets have the same size.
    assert test_df_oracle_random.shape[0] == test_df_oracle.shape[0]

    _classification_report_oracle = classification_report(test_df_oracle['y_test'], test_df_oracle['y_pred_round'])

    auto = _subset_metrics(test_df_oracle)
    rand = _subset_metrics(test_df_oracle_random)

    report_rows.append({
        'name': name,
        'th': th,
        'tl': tl,
        'bi_baseline': bi_baseline / test_df_shape_actual,
        'cross_baseline': cross_baseline / test_df_shape_actual,
        'bi_acc': bi_acc / test_df_shape_actual,
        'cross_acc': cross_acc / test_df_shape_actual,
        'bi_acc_adjusted': bi_acc / test_df_shape_original,
        'cross_acc_adjusted': cross_acc / test_df_shape_original,
        '0-f1': _f1_0,
        '1-f1': _f1_1,
        'macro-avg-f1': _macro_avg_f1,
        'oracle_ratio': oracle_ratio,
        'bi_acc_oracle': auto['bi_acc'] / n_auto,
        'cross_acc_oracle': auto['cross_acc'] / n_auto,
        'bi_acc_oracle_overall': (auto['bi_acc'] + n_human) / oracle_original_shape,
        'cross_acc_oracle_overall': (auto['cross_acc'] + n_human) / oracle_original_shape,
        'bi_acc_oracle_correcting_nel': auto['bi_acc_correcting_nel'] / n_auto,
        'cross_acc_oracle_correcting_nel': auto['cross_acc_correcting_nel'] / n_auto,
        'bi_acc_oracle_correcting_nel_overall': (auto['bi_acc_correcting_nel'] + n_human) / oracle_original_shape,
        'cross_acc_oracle_correcting_nel_overall': (auto['cross_acc_correcting_nel'] + n_human) / oracle_original_shape,
        '0-f1-oracle': auto['f1_0'],
        '1-f1-oracle': auto['f1_1'],
        'macro-avg-f1-oracle': auto['macro_f1'],
        'bi_acc_oracle_random': rand['bi_acc'] / test_df_oracle_random.shape[0],
        'cross_acc_oracle_random': rand['cross_acc'] / test_df_oracle_random.shape[0],
        'bi_acc_oracle_random_overall': (rand['bi_acc'] + n_human) / oracle_original_shape,
        'cross_acc_oracle_random_overall': (rand['cross_acc'] + n_human) / oracle_original_shape,
        'bi_acc_oracle_random_correcting_nel_overall': (rand['bi_acc_correcting_nel'] + n_human) / oracle_original_shape,
        'cross_acc_oracle_random_correcting_nel_overall': (rand['cross_acc_correcting_nel'] + n_human) / oracle_original_shape,
        'bi_acc_oracle_random_correcting_nel': rand['bi_acc_correcting_nel'] / test_df_oracle_random.shape[0],
        'cross_acc_oracle_random_correcting_nel': rand['cross_acc_correcting_nel'] / test_df_oracle_random.shape[0],
        '0-f1-oracle_random': rand['f1_0'],
        '1-f1-oracle_random': rand['f1_1'],
        'macro-avg-f1-oracle_random': rand['macro_f1'],
    })

    print(_classification_report)

    print('-- Performances over test set:', test, '--')
    print('Bi baseline:', bi_baseline / test_df_shape_actual)
    print('Cross baseline:', cross_baseline / test_df_shape_actual)
    print('Bi acc:', bi_acc / test_df_shape_actual)
    print('Cross acc:', cross_acc / test_df_shape_actual)
    print('Bi acc adjusted:', bi_acc / test_df_shape_original)
    print('Cross acc adjusted:', cross_acc / test_df_shape_original)

    print(f'-- Oracle HITL evaluation when y_pred in [{tl}, {th}]')
    print('Ratio to human validator:', oracle_ratio)
    print(_classification_report_oracle)

    print('Bi acc oracle:', auto['bi_acc'] / n_auto)
    print('Cross acc oracle:', auto['cross_acc'] / n_auto)

# One row per interval, in the order of `intervals`.
csv_report = pd.DataFrame(report_rows)

In [None]:
# Add the end point of the curves: a row in which every column is set to 1
# (ratio to human = 1, every score = 1). The non-metric columns
# ('name', 'tl', 'th') are set to 1 as well.
# The row label is derived from the number of intervals instead of a
# hard-coded 10, so it always lands right after the rows produced by the loop
# and the cell stays idempotent when re-run.
csv_report.loc[len(intervals)] = [1]*csv_report.shape[1]

In [None]:
# Display the complete report, one row per interval plus the row added above.
csv_report

In [None]:
# Quick look: cross accuracy and per-class F1 on the automatically handled
# rows, plotted against the share of rows sent to the human validator.
csv_report.plot(
    x='oracle_ratio',
    y=['cross_acc_oracle', 'cross_acc_oracle_correcting_nel', '0-f1-oracle', '1-f1-oracle'],
)

In [None]:
# Tabular view of the cross-encoder columns of the report.
csv_report.loc[:, [
    'oracle_ratio',
    'cross_acc_oracle',
    'cross_acc_oracle_overall',
    'cross_acc_oracle_random_overall',
    'cross_acc_oracle_correcting_nel',
    '0-f1-oracle',
    '1-f1-oracle',
]]

In [None]:
# Independent copy of the cross-encoder columns used for the final tables and
# plots. The column order below is the order of the resulting frame.
oracle_report = csv_report.loc[:, [
    'oracle_ratio',
    'cross_acc_oracle',
    'cross_acc_oracle_overall',
    'cross_acc_oracle_random',
    'cross_acc_oracle_random_overall',
    'cross_acc_oracle_correcting_nel',
    'cross_acc_oracle_correcting_nel_overall',
    '0-f1-oracle',
    '1-f1-oracle',
    'cross_acc_oracle_random_correcting_nel',
    'cross_acc_oracle_random_correcting_nel_overall',
]].copy()

In [None]:
# Express every value as a percentage. This cell is not idempotent: running it
# twice multiplies by 100 twice, so re-create oracle_report before re-running.
oracle_report = oracle_report.mul(100)

In [None]:
# Human-readable "[tl, th]" label for each report row, with two decimals.
oracle_report['interval to HITL'] = [
    f'[{low:.2f}, {high:.2f}]'
    for low, high in zip(csv_report['tl'], csv_report['th'])
]

In [None]:
# Short display names for the tables and plot legends.
oracle_report = oracle_report.rename(
    columns={
        'oracle_ratio': 'Ratio to HITL',
        # accuracy on the automatically handled rows / on the random control
        'cross_acc_oracle': 'Acc',
        'cross_acc_oracle_random': 'Acc Random',
        'cross_acc_oracle_overall': 'Acc Overall',
        'cross_acc_oracle_random_overall': 'Acc Random Overall',
        # "correcting_nel" variants
        'cross_acc_oracle_correcting_nel': 'Acc-mit',
        'cross_acc_oracle_random_correcting_nel': 'Acc-mit Random',
        'cross_acc_oracle_correcting_nel_overall': 'Acc-mit Overall',
        'cross_acc_oracle_random_correcting_nel_overall': 'Acc-mit Random Overall',
        # per-class F1
        '0-f1-oracle': '0-f1',
        '1-f1-oracle': '1-f1',
    }
)

In [None]:
# Overall accuracies (selected rows vs. random control) against the share of
# rows sent to the human validator, drawn on a fresh high-resolution figure.
plt.figure(dpi=200)
oracle_report.plot(
    x='Ratio to HITL',
    y=['Acc-mit Overall', 'Acc-mit Random Overall', 'Acc Overall', 'Acc Random Overall'],
    ax=plt.gca(),
)

In [None]:
# Per-subset metrics against the share of rows sent to the human validator.
# The "correcting_nel" accuracy is called 'Acc-mit' after the rename cell; the
# former 'Acc**' label is not a column of oracle_report and raised a KeyError.
plt.figure(dpi = 100)
oracle_report.plot(x='Ratio to HITL', y=['1-f1', 'Acc-mit', '0-f1',  'Acc','Acc Random',], ax = plt.gca())

In [None]:
# LaTeX table of the report, rounded to one decimal. The columns are selected
# by the names produced in the rename cell ('Acc-mit ...'); the former
# 'Acc** ...' labels do not exist in oracle_report and raised a KeyError.
print(oracle_report.round(decimals=1)[[
    'interval to HITL', 'Ratio to HITL', 'Acc Random Overall', 'Acc Random',
    'Acc Overall', 'Acc', 'Acc-mit Random Overall', 'Acc-mit Random',
    'Acc-mit Overall', 'Acc-mit', '0-f1', '1-f1']
                               ].to_latex())