In [1]:
import os
# Work from the notebook's parent directory so that the "./data/..." paths
# used below resolve (presumably the local `annotator` import relies on this
# too -- confirm).
# The starting directory is remembered the first time this cell runs, so that
# re-running the cell always lands in the same directory instead of climbing
# one level further on every execution.
if '_NOTEBOOK_DIR' not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir))

In [2]:
import json
import pandas as pd
from annotator import BissectionAnnotator

# Active Learning annotation using the Probabilistic Bisection Algorithm

In [3]:
# Experiment configurations, one row per run.
#   PATH        - directory holding the training pickle(s) and claims JSON file(s)
#   NAME        - identifier used to select the run and to name the output file
#   COLUMN_NAME - DataFrame column handed to the annotator as `column=`
#   START, END  - second and third positional arguments of BissectionAnnotator
#   P           - value handed to BissectionAnnotator as `p=`
config_list = [
    {
        'PATH': path,
        'NAME': name,
        'COLUMN_NAME': column_name,
        'START': start,
        'END': end,
        'P': p,
    }
    for path, name, column_name, start, end, p in (
        # BART-based columns
        ("./data/climate_change", 'CCC_BART', 'FSL_BART', 0, 1, 0.7),
        ("./data/topic_stance", 'TS_BART_topic', 'FSL_BART_topic', 0, 1, 0.7),
        ("./data/topic_stance", 'TS_BART_stance', 'FSL_BART_stance', 0, 1, 0.7),
        ("./data/depression", 'D_BART', 'FSL_BART', 0, 1, 0.7),
        # SBERT cosine columns (START is -1 here)
        ("./data/climate_change", 'CCC_SBERT', 'sbert_cosine', -1, 1, 0.7),
        ("./data/topic_stance", 'TS_SBERT_topic', 'sbert_cosine_topic', -1, 1, 0.7),
        ("./data/topic_stance", 'TS_SBERT_stance', 'sbert_cosine_stance', -1, 1, 0.7),
        ("./data/depression", 'D_SBERT', 'sbert_cosine', -1, 1, 0.7),
        # Climate-change BART runs with other values of P
        ("./data/climate_change", 'BART_0-6', 'FSL_BART', 0, 1, 0.6),
        ("./data/climate_change", 'BART_0-8', 'FSL_BART', 0, 1, 0.8),
        ("./data/climate_change", 'BART_0-9', 'FSL_BART', 0, 1, 0.9),
        # Climate-change BART runs on the three training folds
        ("./data/climate_change", 'BART_fold_1', 'FSL_BART', 0, 1, 0.7),
        ("./data/climate_change", 'BART_fold_2', 'FSL_BART', 0, 1, 0.7),
        ("./data/climate_change", 'BART_fold_3', 'FSL_BART', 0, 1, 0.7),
        # Climate-change BART variants using other score columns
        ("./data/climate_change", 'BART_neg', 'FSL_BART_pos_and_neg', 0, 1, 0.7),
        ("./data/climate_change", 'BART_extras', 'FSL_BART_extras', 0, 1, 0.7),
    )
]

In [4]:
# Print the position of every configuration so one can be picked by index.
# The loop variable is deliberately not named `config`: that name holds the
# selected configuration, and a `for` target stays bound after the loop, so
# re-running this cell would otherwise silently switch the selection to the
# last entry of config_list.
for i, listed_config in enumerate(config_list):
    print('\t*', i, ':\t', listed_config['NAME'])

	* 0 :	 CCC_BART
	* 1 :	 TS_BART_topic
	* 2 :	 TS_BART_stance
	* 3 :	 D_BART
	* 4 :	 CCC_SBERT
	* 5 :	 TS_SBERT_topic
	* 6 :	 TS_SBERT_stance
	* 7 :	 D_SBERT
	* 8 :	 BART_0-6
	* 9 :	 BART_0-8
	* 10 :	 BART_0-9
	* 11 :	 BART_fold_1
	* 12 :	 BART_fold_2
	* 13 :	 BART_fold_3
	* 14 :	 BART_neg
	* 15 :	 BART_extras


In [47]:
# Choose the configuration to run by its position in config_list; every cell
# below reads its settings from `config`.
config_index = 12
config = config_list[config_index]
# Echo the selected name as a sanity check.
print(config['NAME'])

BART_fold_2


## Load data

In [48]:
# The three fold configurations each read their own training pickle; every
# other configuration reads the default 'training.pkl' from its PATH.
fold_training_files = {
    'BART_fold_1': 'training_fold_1.pkl',
    'BART_fold_2': 'training_fold_2.pkl',
    'BART_fold_3': 'training_fold_3.pkl',
}
training_file = fold_training_files.get(config['NAME'], 'training.pkl')
train_df = pd.read_pickle(os.path.join(config['PATH'], training_file))

In [49]:
# Number of rows in the loaded training set.
len(train_df)

7812

In [50]:
# Number of rows for each distinct value of the "claim" column.
train_df["claim"].value_counts()

0_0    5420
5_1     456
5_2     330
2_1     252
1_4     170
1_7     141
2_3     124
3_2     123
1_1     115
4_1     115
3_3     106
4_4      91
1_3      74
3_1      64
1_2      62
4_2      59
1_6      55
4_5      55
Name: claim, dtype: int64

In [51]:
# Work out which claims file belongs to the selected configuration.
selected_name = config['NAME']
if selected_name == 'BART_extras':
    claims_filename = 'claims_extras.json'
elif selected_name in ['TS_BART_topic', 'TS_SBERT_topic']:
    claims_filename = 'claims_topic.json'
elif selected_name in ['TS_BART_stance', 'TS_SBERT_stance']:
    claims_filename = 'claims_stance.json'
else:
    claims_filename = 'claims.json'

with open(os.path.join(config['PATH'], claims_filename)) as file:
    claims = json.load(file)

# BART_neg uses the default claims file plus a second file, claims_neg.json.
if selected_name == 'BART_neg':
    with open(os.path.join(config['PATH'], 'claims_neg.json')) as file:
        claims_neg = json.load(file)

# The claims file carries a "class_descr" entry alongside the claims themselves.
class_descr = claims["class_descr"]

## Normalise entailment scores for BART_neg

In [39]:
def normalise_nli_scores(row):
    """Combine the 'FSL_BART' and 'FSL_BART_neg' scores of one row.

    Reads the module-level ``claims`` and ``claims_neg`` mappings (claim id ->
    text). For every claim id in ``claims`` except "class_descr", the row's
    'FSL_BART' score for the claim text is kept unless the row's 'FSL_BART_neg'
    score for the same claim id is strictly greater, in which case it becomes 0.0.

    Args:
        row: mapping with 'FSL_BART' and 'FSL_BART_neg' entries, each indexable
            by claim text.

    Returns:
        dict mapping each claim text from ``claims`` to its resulting score.
    """
    pos_by_id = {}
    for claim_id, claim_text in claims.items():
        if claim_id == "class_descr":
            continue
        pos_by_id[claim_id] = row['FSL_BART'][claim_text]

    neg_by_id = {}
    for claim_id, neg_text in claims_neg.items():
        neg_by_id[claim_id] = row['FSL_BART_neg'][neg_text]

    # A tie keeps the positive score; only a strictly larger negative zeroes it.
    return {
        claims[claim_id]: (0.0 if neg_by_id[claim_id] > pos_score else pos_score)
        for claim_id, pos_score in pos_by_id.items()
    }

In [40]:
# Only the BART_neg configuration needs the combined score column; it is
# computed row by row and stored under the configured column name.
if config['NAME'] == 'BART_neg':
    normalised_scores = train_df.apply(normalise_nli_scores, axis=1)
    train_df[config['COLUMN_NAME']] = normalised_scores

## Annotate

In [41]:
# Build the annotator for the selected configuration. START, END and P come
# from the configuration; the remaining settings are fixed for every run.
# NOTE(review): START/END presumably bound the score range being searched and
# `step` its resolution -- confirm against the annotator module.
annotator = BissectionAnnotator(
    train_df,
    config['START'],
    config['END'],
    step=0.01,
    p=config['P'],
    level='class',
    sleep_time=1,
    verbose=0,
)

In [42]:
# Thresholds collected so far, keyed by claim id.
record = {}

In [43]:
from tqdm import tqdm

In [44]:
def _annotation_rule(config_name):
    """Return ``(class_key_of, extra_kwargs)`` for a configuration name.

    ``class_key_of`` maps a claim id to the key used to look up ``class_descr``.
    ``extra_kwargs`` holds the extra keyword arguments for the annotator call;
    it is empty when no ``text_column`` is passed.

    Raises:
        ValueError: if ``config_name`` matches no known group of configurations.
            Without this, no annotator call would run and the loop would record
            an undefined or stale threshold.
    """
    if config_name in ["CCC_BART", "CCC_SBERT", "BART_0-6", "BART_0-8", "BART_0-9",
                       "BART_fold_1", "BART_fold_2", "BART_fold_3", "BART_neg", "BART_extras"]:
        return (lambda claim_idx: claim_idx[:3]), {}
    if config_name in ["TS_BART_topic", "TS_SBERT_topic"]:
        return (lambda claim_idx: claim_idx[:1]), {'text_column': "Tweet"}
    if config_name in ["TS_BART_stance", "TS_SBERT_stance"]:
        return (lambda claim_idx: claim_idx[:2]), {'text_column': "Tweet"}
    if config_name in ["D_BART", "D_SBERT"]:
        return (lambda claim_idx: claim_idx.split("_")[0]), {'text_column': "Sentence"}
    raise ValueError("No annotation rule defined for config {!r}".format(config_name))


# Resolve the rule once, before any annotation work starts, so an unsupported
# configuration fails immediately rather than part-way through the loop.
class_key_of, extra_kwargs = _annotation_rule(config['NAME'])

for claim_idx in tqdm(claims):
    # Skip claims that already have a recorded threshold (so the cell can be
    # re-run to resume) and the "class_descr" entry, which is not a claim.
    if claim_idx in record or claim_idx == "class_descr":
        continue
    claims_text = claims[claim_idx]
    class_key = class_key_of(claim_idx)
    df, threshold, prob_distr = annotator(
        claim_idx,
        claims_text,
        class_key,
        class_descr[class_key],
        column=config['COLUMN_NAME'],
        **extra_kwargs
    )
    record[claim_idx] = threshold

100%|███████████████████████████████████████████████████████████████████████████████| 31/31 [06:07<00:00, 11.84s/it]


In [46]:
# Persist the collected thresholds, one JSON file per configuration name.
records_dir = "./data/bisection_records"
# Create the output directory if it is missing; otherwise open(..., "w") raises
# FileNotFoundError and the thresholds gathered above are not saved.
os.makedirs(records_dir, exist_ok=True)
with open(os.path.join(records_dir, config['NAME'] + '.json'), "w") as file:
    json.dump(record, file)