In [1]:
import os
# Move the working directory up one level; every './data/...' path below is relative to it.
# NOTE(review): not idempotent — re-running this cell moves up one more directory each time.
os.chdir("..")

In [2]:
import json
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score

In [3]:
# Register tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()

In [4]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised while computing the reports below.
warnings.filterwarnings('ignore')

## Helper functions

In [5]:
def get_median(distr, start, end, step):
    """Estimate the median of a discretised distribution given as log-weights.

    The distribution is defined on the grid ``np.arange(start, end, step)``;
    ``distr`` holds one (unnormalised) log-weight per grid point.

    Two estimates are averaged:
    * the largest grid point whose cumulative mass from the left is <= half
      of the total mass (``start`` if no grid point qualifies);
    * the smallest grid point whose cumulative mass from the right is < half
      of the total mass (``end`` if no grid point qualifies).

    Raises:
        ValueError: if ``distr`` does not have one entry per grid point.
            Previously the resulting NumPy IndexError was swallowed and the
            function silently returned ``(start + end) / 2``.
    """
    x = np.arange(start, end, step)
    exp_f = np.exp(np.asarray(distr, dtype=float))
    if exp_f.shape != x.shape:
        raise ValueError(
            "distribution has shape %s but the grid has shape %s"
            % (exp_f.shape, x.shape)
        )
    alpha = exp_f.sum() * 0.5

    # Grid points whose left-cumulative mass has not yet exceeded half the total.
    left_side = x[exp_f.cumsum() <= alpha]
    median_low = left_side[-1] if left_side.size else start

    # Same scan from the right-hand end of the grid (strict inequality here).
    right_side = x[::-1][exp_f[::-1].cumsum() < alpha]
    median_high = right_side[-1] if right_side.size else end

    return (median_low + median_high) / 2

def get_thresholds_from_record(record, start=0, end=1, step=0.01):
    """Build ``{claim: {'threshold': median}}`` from a bisection record.

    For every claim in ``record`` the threshold is ``get_median`` applied to
    the last entry of that claim's ``'distributions'`` list, on the grid
    described by ``start``, ``end`` and ``step``.
    """
    return {
        claim: {'threshold': get_median(entry['distributions'][-1], start, end, step)}
        for claim, entry in record.items()
    }

def get_multi_pred(row, column, claims_dict, thresholds=None, black_list=None, min_length=1, classes=None):
    """Turn per-claim scores into one binary prediction per class.

    Args:
        row: mapping-like record (e.g. a DataFrame row); ``row[column]`` is a
            mapping from claim text to score, or a list whose first element
            is such a mapping.
        column: key of the score mapping inside ``row``.
        claims_dict: maps claim id -> claim text (the key used in the scores).
        thresholds: ``{claim id: {'threshold': value}}``; defaults to 0.0 for
            every claim in ``claims_dict``.
        black_list: claim ids to ignore; defaults to none.
        min_length: minimum number of claims of a class that must score
            strictly above their threshold for the class to be predicted.
        classes: class labels, each matched as a prefix of the claim ids.
            Defaults to the module-level ``CLASSES`` so existing calls keep
            working.

    Returns:
        A list of 0/1 values, one per class, in the order of ``classes``.
    """
    if thresholds is None:
        thresholds = {claim: {'threshold': 0.0} for claim in claims_dict}
    if black_list is None:
        black_list = []
    if classes is None:
        classes = CLASSES

    scores = row[column]
    if isinstance(scores, list):
        scores = scores[0]
    zsl_scores = {claim: scores[text] for claim, text in claims_dict.items()}

    result = []
    for label in classes:
        # Count the non-blacklisted claims of this class that clear their threshold.
        hits = sum(
            1
            for claim, score in zsl_scores.items()
            if claim.startswith(label)
            and claim not in black_list
            and score > thresholds[claim]['threshold']
        )
        result.append(1 if hits >= min_length else 0)
    return result

## Data

In [6]:
# Test set; later cells read its 'multi_annot', 'FSL_BART', 'FSL_BART_neg' and 'FSL_BART_extras' columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
test_df = pd.read_pickle('./data/climate_change/testing.pkl')

In [7]:
# Class labels: get_multi_pred matches each one as a prefix of the claim ids, and the
# reports use the same list (and order) as target names.
CLASSES = ['1_1', '1_2', '1_3', '1_4', '1_6', '1_7']

In [8]:
with open("./data/climate_change/claims.json") as file:
    claims = json.load(file)

# Split the class descriptions off so that `claims` holds only the claim entries.
claims_descr = claims.pop("class_descr")

In [9]:
# Reverse lookup: value stored in `claims` -> its key.
inverse_claims = {text: claim_id for claim_id, text in claims.items()}

## Experiment 1: Changing the value of p for the Probabilistic Bisection

### p = 0.6

In [10]:
# Bisection record for the p = 0.6 run.
with open('./data/bisection_records/BART_0-6.json') as file:
    bart_record = json.load(file)

In [11]:
# One threshold per claim: the median of the last distribution stored in the record.
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [12]:
# One 0/1 entry per class for every test row, using the p = 0.6 thresholds.
test_df['pred_multi'] = test_df.apply(
    lambda row: get_multi_pred(row, 'FSL_BART', claims, thresholds=bart_thresholds),
    axis=1,
)

In [13]:
# Per-class precision / recall / F1 of the predictions against the annotations.
y_true = test_df['multi_annot'].to_list()
y_pred = test_df['pred_multi'].to_list()
print(classification_report(y_true, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.68      0.64      0.66        36
         1_2       0.46      0.92      0.61        12
         1_3       0.61      0.77      0.68        48
         1_4       0.76      0.67      0.71        70
         1_6       0.83      0.62      0.71        24
         1_7       0.10      0.65      0.17        34

   micro avg       0.37      0.69      0.48       224
   macro avg       0.57      0.71      0.59       224
weighted avg       0.60      0.69      0.61       224
 samples avg       0.05      0.06      0.05       224



### p = 0.8

In [14]:
# Bisection record for the p = 0.8 run (rebinds `bart_record`).
with open('./data/bisection_records/BART_0-8.json') as file:
    bart_record = json.load(file)

In [15]:
# One threshold per claim: the median of the last distribution stored in the record.
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [16]:
# One 0/1 entry per class for every test row, using the p = 0.8 thresholds.
test_df['pred_multi'] = test_df.apply(
    lambda row: get_multi_pred(row, 'FSL_BART', claims, thresholds=bart_thresholds),
    axis=1,
)

In [17]:
# Per-class precision / recall / F1 of the predictions against the annotations.
y_true = test_df['multi_annot'].to_list()
y_pred = test_df['pred_multi'].to_list()
print(classification_report(y_true, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.64      0.64      0.64        36
         1_2       0.83      0.83      0.83        12
         1_3       0.61      0.77      0.68        48
         1_4       0.70      0.64      0.67        70
         1_6       0.88      0.62      0.73        24
         1_7       0.50      0.56      0.53        34

   micro avg       0.65      0.67      0.66       224
   macro avg       0.69      0.68      0.68       224
weighted avg       0.67      0.67      0.66       224
 samples avg       0.05      0.05      0.05       224



### p = 0.9

In [18]:
# Bisection record for the p = 0.9 run (rebinds `bart_record`).
with open('./data/bisection_records/BART_0-9.json') as file:
    bart_record = json.load(file)

In [19]:
# One threshold per claim: the median of the last distribution stored in the record.
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [20]:
# One 0/1 entry per class for every test row, using the p = 0.9 thresholds.
test_df['pred_multi'] = test_df.apply(
    lambda row: get_multi_pred(row, 'FSL_BART', claims, thresholds=bart_thresholds),
    axis=1,
)

In [21]:
# Per-class precision / recall / F1 of the predictions against the annotations.
y_true = test_df['multi_annot'].to_list()
y_pred = test_df['pred_multi'].to_list()
print(classification_report(y_true, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.64      0.69      0.67        36
         1_2       0.89      0.67      0.76        12
         1_3       0.60      0.75      0.67        48
         1_4       0.84      0.61      0.71        70
         1_6       0.88      0.62      0.73        24
         1_7       0.51      0.68      0.58        34

   micro avg       0.68      0.67      0.67       224
   macro avg       0.73      0.67      0.69       224
weighted avg       0.71      0.67      0.68       224
 samples avg       0.05      0.05      0.05       224



## Experiment 2: Threshold-tuning on separate folds of the data

In [22]:
# Load the bisection record of each of the three folds.
fold_paths = (
    './data/bisection_records/BART_FOLD_1.json',
    './data/bisection_records/BART_FOLD_2.json',
    './data/bisection_records/BART_FOLD_3.json',
)
fold_records = []
for fold_path in fold_paths:
    with open(fold_path) as file:
        fold_records.append(json.load(file))

record_1, record_2, record_3 = fold_records

In [23]:
# Grid on which the recorded distributions are defined.
start = 0
stop = 1
step = 0.01
x = np.arange(start, stop, step)

def find_CI(distrib, low_mass=0.025, high_mass=0.975):
    """Return the (left, right) bounds of an interval of a discretised distribution.

    ``distrib`` holds one (unnormalised) log-weight per point of the
    module-level grid ``x``.  With ``alpha_low`` / ``alpha_high`` the
    ``low_mass`` / ``high_mass`` fractions of the total mass:

    * the left bound averages the largest grid point whose cumulative mass
      from the left is <= ``alpha_low`` and the smallest grid point whose
      cumulative mass from the right is <= ``alpha_high``;
    * the right bound averages the largest grid point whose cumulative mass
      from the left is <= ``alpha_high`` and the smallest grid point whose
      cumulative mass from the right is <= ``alpha_low``.

    Whenever no grid point qualifies, ``start`` is used for a left-to-right
    scan and ``stop`` for a right-to-left scan.  When ``low_mass`` equals
    ``high_mass`` both returned values are identical.

    The cumulative "weight" values the previous version computed were never
    returned or used, so they are no longer computed.

    Raises:
        ValueError: if ``distrib`` does not have one entry per grid point.
            Previously the resulting NumPy IndexError was swallowed and the
            fallback bounds were returned silently.
    """
    exp_f = np.exp(np.asarray(distrib, dtype=float))
    if exp_f.shape != x.shape:
        raise ValueError(
            "distribution has shape %s but the grid has shape %s"
            % (exp_f.shape, x.shape)
        )

    total_mass = exp_f.sum()
    alpha_low = total_mass * low_mass
    alpha_high = total_mass * high_mass

    mass_from_left = exp_f.cumsum()
    mass_from_right = exp_f[::-1].cumsum()
    x_desc = x[::-1]

    def last_or(points, fallback):
        # Last selected grid point, or the fallback when the selection is empty.
        return points[-1] if points.size else fallback

    l_bound_low = last_or(x[mass_from_left <= alpha_low], start)
    l_bound_high = last_or(x_desc[mass_from_right <= alpha_high], stop)
    l_bound_avg = (l_bound_low + l_bound_high) / 2

    r_bound_low = last_or(x[mass_from_left <= alpha_high], start)
    r_bound_high = last_or(x_desc[mass_from_right <= alpha_low], stop)
    r_bound_avg = (r_bound_low + r_bound_high) / 2

    return l_bound_avg, r_bound_avg

In [24]:
# For each fold, summarise the final distribution of every claim (claims visited in
# sorted order): median estimate, distance from the median to each interval bound,
# and number of annotations.
nb_annots_whole = list()

per_fold = []
for fold_record in (record_1, record_2, record_3):
    lower_errors, upper_errors, fold_medians, annot_counts = [], [], [], []
    for trait in sorted(claims):
        final_distribution = fold_record[trait]['distributions'][-1]

        # With equal masses, the two values returned by find_CI are averaged into the median.
        median = sum(find_CI(final_distribution, low_mass=0.5, high_mass=0.5)) / 2
        l_bound, r_bound = find_CI(final_distribution, low_mass=0.05, high_mass=0.95)

        lower_errors.append(median - l_bound)
        upper_errors.append(r_bound - median)
        fold_medians.append(median)
        annot_counts.append(len(fold_record[trait]['annot']))
    per_fold.append((lower_errors, upper_errors, fold_medians, annot_counts))

CI_low_1, CI_high_1, medians_1, nb_annots_1 = per_fold[0]
CI_low_2, CI_high_2, medians_2, nb_annots_2 = per_fold[1]
CI_low_3, CI_high_3, medians_3, nb_annots_3 = per_fold[2]

# [lower errors, upper errors] per fold.
asymmetric_error_1 = [CI_low_1, CI_high_1]
asymmetric_error_2 = [CI_low_2, CI_high_2]
asymmetric_error_3 = [CI_low_3, CI_high_3]

In [25]:
# A claim counts as converged on a fold when the total width of its interval
# (lower error + upper error) is below 0.2.
#
# The error lists were filled while iterating over sorted(claims), so the claim ids
# must be paired with them in that same order; zipping with `claims` directly would
# mislabel the values whenever the dict is not already in sorted order.
claim_order = sorted(claims)

converged_1 = [
    t for t, low_b, high_b in zip(claim_order, *asymmetric_error_1)
    if (high_b + low_b) < 0.2
]
converged_2 = [
    t for t, low_b, high_b in zip(claim_order, *asymmetric_error_2)
    if (high_b + low_b) < 0.2
]
converged_3 = [
    t for t, low_b, high_b in zip(claim_order, *asymmetric_error_3)
    if (high_b + low_b) < 0.2
]

In [26]:
# Spread (max - min) of the three per-fold medians for every claim, grouped by on how
# many folds the claim converged.
all_diffs = list()
all_diffs_CLAIM = list()

conv_diffs = list()
conv_diffs_CLAIM = list()

not_conv_diffs = list()
not_conv_diffs_CLAIM = list()

mixed_diffs = list()
mixed_diffs_CLAIM = list()

# The median lists were filled while iterating over sorted(claims); pair the claim ids
# with them in that same order so each value is attributed to the right claim.
for t, m_1, m_2, m_3 in zip(sorted(claims), medians_1, medians_2, medians_3):
    fold_medians = [m_1, m_2, m_3]
    diff = max(fold_medians) - min(fold_medians)
    all_diffs.append(diff)
    all_diffs_CLAIM.append(t)

    n_converged = sum(t in fold for fold in (converged_1, converged_2, converged_3))
    if n_converged == 3:
        print(t, "\t%0.3f" % diff, "\tCONVERGED")
        conv_diffs.append(diff)
        conv_diffs_CLAIM.append(t)
    elif n_converged == 0:
        print(t, "\t%0.3f" % diff)
        not_conv_diffs.append(diff)
        not_conv_diffs_CLAIM.append(t)
    else:
        print(t, "\t%0.3f" % diff)
        mixed_diffs.append(diff)
        mixed_diffs_CLAIM.append(t)

1_1_0_0 	0.220
1_1_0_1 	0.110
1_1_0_2 	0.080
1_1_1_0 	0.110
1_1_1_1 	0.180
1_1_2_0 	0.130
1_1_2_1 	0.210
1_1_3_0 	0.170
1_1_3_1 	0.120
1_1_4_0 	0.170
1_1_4_1 	0.120
1_2_0_0 	0.170
1_2_0_1 	0.080
1_3_0_0 	0.110
1_3_0_1 	0.050 	CONVERGED
1_3_0_2 	0.240
1_3_0_3 	0.250
1_3_0_4 	0.080 	CONVERGED
1_3_0_5 	0.220
1_4_0_0 	0.070
1_4_0_1 	0.130
1_6_0_0 	0.160
1_6_0_1 	0.100 	CONVERGED
1_6_0_2 	0.200
1_6_0_3 	0.050
1_6_0_4 	0.280
1_7_0_0 	0.120 	CONVERGED
1_7_0_1 	0.010 	CONVERGED
1_7_0_2 	0.130
1_7_0_3 	0.020


### All claims

In [27]:
# Summary of the median spread over all claims.
print(
    "MAX:\t %.2f" % max(all_diffs),
    "MIN:\t %.2f" % min(all_diffs),
    "AVG:\t %.2f" % (sum(all_diffs) / len(all_diffs)),
    "STD:\t %.2f" % np.std(all_diffs),
    sep="\n",
)

MAX:	 0.28
MIN:	 0.01
AVG:	 0.14
STD:	 0.07


### Claims for which threshold-tuning is "complete" on all three folds

In [28]:
# Summary of the median spread over the claims that converged on every fold.
print(
    "MAX:\t %.2f" % max(conv_diffs),
    "MIN:\t %.2f" % min(conv_diffs),
    "AVG:\t %.2f" % (sum(conv_diffs) / len(conv_diffs)),
    "STD:\t %.2f" % np.std(conv_diffs),
    sep="\n",
)

MAX:	 0.12
MIN:	 0.01
AVG:	 0.07
STD:	 0.04


### Claims for which threshold-tuning is "complete" on none of the folds

In [29]:
# Summary of the median spread over the claims that converged on no fold.
print(
    "MAX:\t %.2f" % max(not_conv_diffs),
    "MIN:\t %.2f" % min(not_conv_diffs),
    "AVG:\t %.2f" % (sum(not_conv_diffs) / len(not_conv_diffs)),
    "STD:\t %.2f" % np.std(not_conv_diffs),
    sep="\n",
)

MAX:	 0.28
MIN:	 0.05
AVG:	 0.16
STD:	 0.06


### Claims for which threshold-tuning is "complete" on some but not all of the three folds

In [30]:
# Summary of the median spread over the claims that converged on some folds only.
print(
    "MAX:\t %.2f" % max(mixed_diffs),
    "MIN:\t %.2f" % min(mixed_diffs),
    "AVG:\t %.2f" % (sum(mixed_diffs) / len(mixed_diffs)),
    "STD:\t %.2f" % np.std(mixed_diffs),
    sep="\n",
)

MAX:	 0.11
MIN:	 0.02
AVG:	 0.07
STD:	 0.04


## Experiment 3: Using negated claims

In [31]:
# Negated claims: normalise_nli_scores looks up claims_neg[k] for each key k and uses
# the value to index row['FSL_BART_neg'].
with open('./data/climate_change/claims_neg.json') as file:
    claims_neg = json.load(file)

In [32]:
def normalise_nli_scores(row):
    """Zero out the score of every claim whose negated version scores higher.

    Reads ``row['FSL_BART']`` through ``claims`` and ``row['FSL_BART_neg']``
    through ``claims_neg`` (both module-level).  Returns a dict keyed by the
    values of ``claims``: 0.0 where the negated score is strictly greater
    than the positive one, the positive score otherwise.
    """
    pos_scores = {key: row['FSL_BART'][claims[key]] for key in claims}
    neg_scores = {key: row['FSL_BART_neg'][claims_neg[key]] for key in claims_neg}
    return {
        claims[key]: (0.0 if neg_scores[key] > score else score)
        for key, score in pos_scores.items()
    }

In [33]:
# Per-row scores with every claim zeroed when its negation scores higher.
test_df['FSL_BART_pos_and_neg'] = test_df.apply(normalise_nli_scores, axis=1)

In [34]:
# Bisection record used for this experiment's thresholds (rebinds `bart_record`).
with open('./data/bisection_records/CCC_BART.json') as file:
    bart_record = json.load(file)

In [35]:
# One threshold per claim: the median of the last distribution stored in the record.
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [36]:
# One 0/1 entry per class for every test row, computed from the negation-filtered scores.
test_df['pred_multi'] = test_df.apply(
    lambda row: get_multi_pred(row, 'FSL_BART_pos_and_neg', claims, thresholds=bart_thresholds),
    axis=1,
)

In [37]:
# Per-class precision / recall / F1 of the predictions against the annotations.
y_true = test_df['multi_annot'].to_list()
y_pred = test_df['pred_multi'].to_list()
print(classification_report(y_true, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.70      0.58      0.64        36
         1_2       0.82      0.75      0.78        12
         1_3       0.60      0.75      0.67        48
         1_4       0.83      0.64      0.73        70
         1_6       0.94      0.62      0.75        24
         1_7       0.52      0.65      0.58        34

   micro avg       0.69      0.66      0.68       224
   macro avg       0.74      0.67      0.69       224
weighted avg       0.73      0.66      0.68       224
 samples avg       0.05      0.05      0.05       224



## Experiment 4: Adding more claims

In [38]:
with open('./data/climate_change/claims_extras.json') as file:
    claims_extras = json.load(file)

# Original claims plus the extra ones; on a key clash the entry from the extras file wins.
claims_w_extras = {**claims, **claims_extras}

# "class_descr" was already removed from `claims` when it was loaded, so the key is
# only present here if the extras file carries it. pop() with a default drops it when
# present instead of raising KeyError when it is absent.
claims_w_extras.pop("class_descr", None)

In [39]:
with open('./data/bisection_records/CCC_BART.json') as file:
    bart_record = json.load(file)

with open('./data/bisection_records/BART_extras.json') as file:
    bart_record_extras = json.load(file)

# Combined record; on a key clash the entry from the extras record wins.
bart_record_w_extras = dict(bart_record)
bart_record_w_extras.update(bart_record_extras)

In [40]:
# One threshold per claim (original and extra): the median of the last distribution in the combined record.
bart_extras_thresholds = get_thresholds_from_record(bart_record_w_extras, start=0, end=1)

In [41]:
# One 0/1 entry per class for every test row, using the original and the extra claims.
test_df['pred_multi'] = test_df.apply(
    lambda row: get_multi_pred(row, 'FSL_BART_extras', claims_w_extras, thresholds=bart_extras_thresholds),
    axis=1,
)

In [42]:
# Per-class precision / recall / F1 of the predictions against the annotations.
y_true = test_df['multi_annot'].to_list()
y_pred = test_df['pred_multi'].to_list()
print(classification_report(y_true, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.66      0.69      0.68        36
         1_2       0.48      0.92      0.63        12
         1_3       0.62      0.85      0.72        48
         1_4       0.60      0.79      0.68        70
         1_6       0.77      0.71      0.74        24
         1_7       0.37      0.68      0.47        34

   micro avg       0.57      0.77      0.65       224
   macro avg       0.58      0.77      0.65       224
weighted avg       0.59      0.77      0.66       224
 samples avg       0.06      0.06      0.06       224

