In [1]:
import os
os.chdir("..")

In [2]:
import json
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score

In [3]:
tqdm.pandas() 

In [4]:
import warnings
warnings.filterwarnings('ignore')

# Experiments from paper

## Helper functions

In [5]:
def get_median(distr, start, end, step):
    x = np.arange(start, end, step)
    exp_f = np.exp(distr)
    alpha = exp_f.sum() * 0.5
    try:
        median_low = x[exp_f.cumsum() <= alpha][-1]
    except IndexError:
        median_low = start
    try:
        median_high = x[::-1][exp_f[::-1].cumsum() < alpha][-1]
    except IndexError:
        median_high = end
    median_avg = (median_low + median_high) / 2
    return median_avg

def get_thresholds_from_record(record, start=0, end=1, step=0.01):
    thresholds = dict()
    for claim in record:
        thresholds[claim] = {'threshold': get_median(record[claim]['distributions'][-1], start, end, step)}
    return thresholds

def keep_only_top_claim(scores_dict):
    new_dict = dict()
    
    for cl in CLASSES:
        rel_claims = [t for t in claims if t[:3] == cl]
        cl_dict = {t: scores_dict[t] for t in rel_claims}
        highest_t = max(cl_dict, key = cl_dict.get)
        for t in rel_claims:
            if t == highest_t:
                new_dict[t] = scores_dict[t]
            else:
                new_dict[t] = 0
    return new_dict

def get_multi_pred(row, column, claims_dict, thresholds=None, black_list=None, min_length=1):
    if thresholds is None:
        thresholds = {t: {'threshold': 0.0} for t in claims_dict}
    if black_list is None:
        black_list = []
    result = list()
    if isinstance(row[column], list):
        zsl_scores = {k:row[column][0][claims_dict[k]] for k in claims_dict}
    else:
        zsl_scores = {k:row[column][claims_dict[k]] for k in claims_dict}
    for c in CLASSES:
        sub_pred = {k: zsl_scores[k] for k in zsl_scores if k[:3] == c and not k in black_list}
        if len([k for k in sub_pred if sub_pred[k] > thresholds[k]['threshold']]) >= min_length:
            result.append(1)
        else:
            result.append(0)
    return result


def get_binary_pred(row, claim_class, column, claims_dict, thresholds=None, black_list=None, min_length=1):
    if thresholds is None:
        thresholds = {t: {'threshold': 0.0} for t in claims_dict}
    if black_list is None:
        black_list = []
    result = list()
    if isinstance(row[column], list):
        zsl_scores = {k:row[column][0][claims_dict[k]] for k in claims_dict}
    else:
        zsl_scores = {k:row[column][claims_dict[k]] for k in claims_dict}

    sub_pred = {k: zsl_scores[k] for k in zsl_scores if k[:3] == claim_class and not k in black_list}
    if len([k for k in sub_pred if sub_pred[k] > thresholds[k]['threshold']]) >= min_length:
        return 1
    else:
        return 0

## Climate Change Contrarianism detection

In [6]:
test_df = pd.read_pickle('./data/climate_change/testing.pkl')

In [7]:
CLASSES = ['1_1', '1_2', '1_3', '1_4', '1_6', '1_7']

In [8]:
with open("./data/climate_change/claims.json") as file:
    claims = json.load(file)

claims_descr = claims["class_descr"]
del claims["class_descr"]

In [9]:
inverse_claims = {claims[k]: k for k in claims}

### Baseline 1: Fine-tuned BERT

In [10]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['BERT'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.67      0.86      0.76        36
         1_2       0.56      0.42      0.48        12
         1_3       0.63      0.35      0.45        48
         1_4       0.54      0.89      0.67        70
         1_6       0.69      0.92      0.79        24
         1_7       0.40      0.47      0.43        34

   micro avg       0.57      0.68      0.62       224
   macro avg       0.58      0.65      0.60       224
weighted avg       0.58      0.68      0.60       224
 samples avg       0.06      0.06      0.06       224



### Baseline 2: SBERT cosine similarity with threshold-tuning

In [11]:
with open('./data/bisection_records/CCC_SBERT.json') as file:
    sbert_record = json.load(file)

In [12]:
sbert_thresholds = get_thresholds_from_record(sbert_record, start=-1, end=1)

In [13]:
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'sbert_cosine', claims, thresholds=sbert_thresholds), axis=1)

In [14]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.43      0.89      0.58        36
         1_2       0.20      0.08      0.12        12
         1_3       0.64      0.15      0.24        48
         1_4       0.40      0.44      0.42        70
         1_6       0.52      0.96      0.68        24
         1_7       0.50      0.50      0.50        34

   micro avg       0.45      0.50      0.47       224
   macro avg       0.45      0.50      0.42       224
weighted avg       0.47      0.50      0.43       224
 samples avg       0.04      0.04      0.04       224



In [15]:
datapoints = list()
for k in sbert_record:
    datapoints += sbert_record[k]['texts']
len(set(datapoints))

680

### Baseline 3: BART MNLI model with unique threshold (Zero-Shot approach)

In [16]:
artificial_thresholds = {k: {'threshold': 0.5} for k in claims}

In [17]:
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=artificial_thresholds), axis=1)

In [18]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.22      0.69      0.34        36
         1_2       0.06      1.00      0.11        12
         1_3       0.03      0.98      0.06        48
         1_4       0.47      0.80      0.59        70
         1_6       0.29      0.96      0.45        24
         1_7       0.01      0.97      0.03        34

   micro avg       0.05      0.88      0.09       224
   macro avg       0.18      0.90      0.26       224
weighted avg       0.23      0.88      0.31       224
 samples avg       0.03      0.07      0.04       224



### Baseline 4: BART MNLI model with Temperature Scaling

In [19]:
sample_sizes = [5, 10, 20, 40, 80, 160]

In [20]:
artificial_thresholds = {k: {'threshold': 0.5} for k in claims}

In [21]:
for samp_size in sample_sizes:
    test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'Temp_Scaling_BART_' + str(samp_size), claims, thresholds=artificial_thresholds), axis=1)
    print(samp_size, ': %.3f' % f1_score(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), average="micro"))

5 : 0.161
10 : 0.220
20 : 0.205
40 : 0.154
80 : 0.193
160 : 0.188


### Baseline 5: Llama 2 70b model with prompting

In [22]:
def get_llama_pred(row, column, claims_dict, black_list=None, min_length=1):
    if black_list is None:
        black_list = []
    result = list()
    zsl_scores = {k:row[column][claims_dict[k]] for k in claims_dict}
    for c in CLASSES:
        sub_pred = {k: zsl_scores[k] for k in zsl_scores if k[:3] == c and not k in black_list}
        if len([k for k in sub_pred if sub_pred[k].lower().strip() == "yes"]) >= min_length:
            result.append(1)
        else:
            result.append(0)
    return result

In [23]:
test_df['pred_multi'] = test_df.apply(lambda x: get_llama_pred(x, 'Llama-2', claims), axis=1)

In [24]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.03      0.97      0.06        36
         1_2       0.04      0.92      0.08        12
         1_3       0.10      0.75      0.18        48
         1_4       0.07      0.94      0.13        70
         1_6       0.05      0.88      0.09        24
         1_7       0.03      0.65      0.05        34

   micro avg       0.05      0.85      0.09       224
   macro avg       0.05      0.85      0.10       224
weighted avg       0.06      0.85      0.11       224
 samples avg       0.03      0.07      0.04       224



### Ours 1: Few-shots NLI approach using BART MNLI

In [25]:
with open('./data/bisection_records/CCC_BART.json') as file:
    bart_record = json.load(file)

In [26]:
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [27]:
test_df['pred_multi'] = test_df.apply(lambda x: get_multi_pred(x, 'FSL_BART', claims, thresholds=bart_thresholds), axis=1)

In [28]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

         1_1       0.68      0.64      0.66        36
         1_2       0.82      0.75      0.78        12
         1_3       0.61      0.77      0.68        48
         1_4       0.80      0.64      0.71        70
         1_6       0.88      0.62      0.73        24
         1_7       0.51      0.65      0.57        34

   micro avg       0.68      0.67      0.68       224
   macro avg       0.72      0.68      0.69       224
weighted avg       0.71      0.67      0.68       224
 samples avg       0.05      0.05      0.05       224



In [29]:
datapoints = list()
for k in bart_record:
    if k in ["1_4_0_1"]:
        continue
    datapoints += bart_record[k]['texts']
len(set(datapoints))

457

## Topic and stance classification

In [30]:
test_df = pd.read_pickle('./data/topic_stance/testing.pkl')

In [31]:
CLASSES_TOPIC = ['1', '2', '3', '4', '5']
CLASSES_STANCE = ['1A', '1F', '1N', '2A', '2F', '2N', '3A', '3F', '3N', '4A', '4F', '4N', '5A', '5F', '5N']

In [32]:
with open("./data/topic_stance/claims_topic.json") as file:
    claims_topic = json.load(file)
    
with open("./data/topic_stance/claims_stance.json") as file:
    claims_stance = json.load(file)
    
claims = {**claims_topic, **claims_stance}

claims_descr = claims["class_descr"]
del claims["class_descr"]

In [33]:
inverse_claims = {claims[k]: k for k in claims}

In [34]:
def custom_transformation(x, threshold):
    if x <= threshold:
        return x * (0.5 / threshold)
    else:
        return 0.5 + (x - threshold) * (0.5 / (1 - threshold))
    
    
def get_normed_scores(zsl_dict, thresholds, inverse_claims):
    new_results = dict()
    for trait in zsl_dict:
        if not trait in inverse_claims:
            continue
        new_results[trait] = custom_transformation(zsl_dict[trait], thresholds[inverse_claims[trait]]['threshold'])
    return new_results


def get_avg_normed_topic_scores(normed_zsl_dict, classes_list, inverse_claims):
    new_results = dict()
    for c in classes_list:
        temp = [normed_zsl_dict[trait] for trait in normed_zsl_dict if inverse_claims[trait].split("_")[0] == c]
        if len(temp) > 0:
            new_results[c] = sum(temp) / len(temp)
    return new_results
    

def get_topic_pred_multiclass(avg_normed_ZSL_scores):
    return max(avg_normed_ZSL_scores, key=avg_normed_ZSL_scores.get)

def get_stance_pred(normed_zsl_dict, topic, traits):
    traits_anti = [traits[t] for t in traits if t[:2] == (topic + "A")]
    detected_anti = [normed_zsl_dict[t] for t in traits_anti if normed_zsl_dict[t] > 0.5]
    traits_favor = [traits[t] for t in traits if t[:2] == (topic + "F")]
    detected_favor = [normed_zsl_dict[t] for t in traits_favor if normed_zsl_dict[t] > 0.5]
    if len(detected_anti) > len(detected_favor):
        return topic + "A"
    elif len(detected_favor) > len(detected_anti):
        return topic + "F"
    if len(detected_favor) == 0:
        return topic + "N"
    avg_scores_anti = sum(detected_anti) / len(detected_anti)
    avg_scores_favour = sum(detected_favor) / len(detected_favor)
    if avg_scores_anti > (1.05 * avg_scores_favour):
        return topic + "A"
    elif avg_scores_favour > (1.05 * avg_scores_anti):
        return topic + "F"
    return topic + "N"
    return max(normed_zsl_dict, key=normed_zsl_dict.get)

### Baseline 1: Fine-tuned BERT

In [35]:
print(classification_report(test_df['topic_stance_annot'].to_list(), test_df['BERT_topic_stance'].to_list(), target_names=CLASSES_STANCE))

              precision    recall  f1-score   support

          1A       0.79      0.74      0.76       160
          1F       0.29      0.19      0.23        32
          1N       0.23      0.32      0.26        28
          2A       0.00      0.00      0.00        11
          2F       0.85      0.91      0.88       123
          2N       0.26      0.20      0.23        35
          3A       0.68      0.54      0.60       183
          3F       0.26      0.60      0.36        58
          3N       0.32      0.16      0.21        44
          4A       0.66      0.76      0.71       172
          4F       0.58      0.67      0.62        45
          4N       0.32      0.36      0.34        78
          5A       0.69      0.47      0.56       189
          5F       0.55      0.24      0.33        46
          5N       0.10      0.20      0.13        45

    accuracy                           0.55      1249
   macro avg       0.44      0.42      0.41      1249
weighted avg       0.59   

### Baseline 2: SBERT cosine similarity with threshold-tuning

In [36]:
with open('./data/bisection_records/TS_SBERT_topic.json') as file:
    sbert_record_topic = json.load(file)
    
with open('./data/bisection_records/TS_SBERT_stance.json') as file:
    sbert_record_stance = json.load(file)

In [37]:
sbert_topic_thresholds = get_thresholds_from_record(sbert_record_topic, start=-1, end=1)
sbert_stance_thresholds = get_thresholds_from_record(sbert_record_stance, start=-1, end=1)

In [38]:
test_df['normed_sbert_topic'] = test_df["sbert_cosine_topic"].apply(lambda x: get_normed_scores(x, sbert_topic_thresholds, inverse_claims))
test_df['normed_sbert_stance'] = test_df["sbert_cosine_stance"].apply(lambda x: get_normed_scores(x, sbert_stance_thresholds, inverse_claims))
test_df['avg_normed_sbert_topic'] = test_df["normed_sbert_topic"].apply(lambda x: get_avg_normed_topic_scores(x, CLASSES_TOPIC, inverse_claims))
test_df['pred_topic'] = test_df['avg_normed_sbert_topic'].apply(get_topic_pred_multiclass)
test_df['pred_stance'] = test_df.apply(lambda x: get_stance_pred(x['normed_sbert_stance'], x['pred_topic'], claims_stance), axis=1)

In [39]:
print(classification_report(test_df['topic_stance_annot'].to_list(), test_df['pred_stance'].to_list(), target_names=CLASSES_STANCE))

              precision    recall  f1-score   support

          1A       0.72      0.76      0.74       160
          1F       0.80      0.12      0.22        32
          1N       0.05      0.14      0.07        28
          2A       0.00      0.00      0.00        11
          2F       0.84      0.92      0.88       123
          2N       0.25      0.49      0.33        35
          3A       0.62      0.57      0.60       183
          3F       0.00      0.00      0.00        58
          3N       0.12      0.43      0.18        44
          4A       0.73      0.51      0.60       172
          4F       0.50      0.02      0.04        45
          4N       0.27      0.53      0.35        78
          5A       0.71      0.52      0.60       189
          5F       0.00      0.00      0.00        46
          5N       0.07      0.07      0.07        45

    accuracy                           0.49      1249
   macro avg       0.38      0.34      0.31      1249
weighted avg       0.54   

In [40]:
datapoints = list()
for k in sbert_record_topic:
    datapoints += sbert_record_topic[k]['texts']
    
for k in sbert_record_stance:
    datapoints += sbert_record_stance[k]['texts']
    
len(set(datapoints))

1271

### Baseline 3: BART MNLI model with unique threshold (Zero-Shot approach)

In [41]:
artificial_thresholds = {k: {'threshold': 0.5} for k in claims}

In [42]:
test_df['normed_BART_topic'] = test_df["FSL_BART_topic"].apply(lambda x: get_normed_scores(x, artificial_thresholds, inverse_claims))
test_df['normed_BART_stance'] = test_df["FSL_BART_stance"].apply(lambda x: get_normed_scores(x, artificial_thresholds, inverse_claims))
test_df['avg_normed_BART_topic'] = test_df["normed_BART_topic"].apply(lambda x: get_avg_normed_topic_scores(x, CLASSES_TOPIC, inverse_claims))
test_df['pred_topic'] = test_df['avg_normed_BART_topic'].apply(get_topic_pred_multiclass)
test_df['pred_stance'] = test_df.apply(lambda x: get_stance_pred(x['normed_BART_stance'], x['pred_topic'], claims_stance), axis=1)

In [43]:
print(classification_report(test_df['topic_stance_annot'].to_list(), test_df['pred_stance'].to_list(), target_names=CLASSES_STANCE))

              precision    recall  f1-score   support

          1A       0.76      0.76      0.76       160
          1F       0.58      0.56      0.57        32
          1N       0.05      0.07      0.06        28
          2A       0.67      0.18      0.29        11
          2F       0.73      0.98      0.84       123
          2N       0.21      0.09      0.12        35
          3A       0.71      0.25      0.37       183
          3F       0.20      0.88      0.32        58
          3N       0.14      0.39      0.20        44
          4A       0.89      0.51      0.65       172
          4F       0.76      0.71      0.74        45
          4N       0.32      0.40      0.35        78
          5A       0.89      0.16      0.28       189
          5F       0.18      0.43      0.26        46
          5N       0.25      0.02      0.04        45

    accuracy                           0.47      1249
   macro avg       0.49      0.43      0.39      1249
weighted avg       0.63   

### Baseline 4: BART MNLI model with Temperature Scaling

In [44]:
sample_sizes = [5, 10, 20, 40, 80, 160]

In [45]:
artificial_thresholds = {k: {'threshold': 0.5} for k in claims}

In [46]:
test_df.columns

Index(['ID', 'Target', 'Tweet', 'Stance', 'FSL_BART_stance', 'FSL_BART_topic',
       'sbert_cosine_topic', 'sbert_cosine_stance', 'topic_stance_annot',
       'topic_annot', '1_annot', '2_annot', '3_annot', '4_annot', '5_annot',
       '1A_annot', '1F_annot', '2A_annot', '2F_annot', '3A_annot', '3F_annot',
       '4A_annot', '4F_annot', '5A_annot', '5F_annot', 'BERT_topic',
       'BERT_topic_stance', 'Temp_Scaling_BART_5_topic',
       'Temp_Scaling_BART_10_topic', 'Temp_Scaling_BART_20_topic',
       'Temp_Scaling_BART_40_topic', 'Temp_Scaling_BART_80_topic',
       'Temp_Scaling_BART_160_topic', 'Temp_Scaling_BART_5_stance',
       'Temp_Scaling_BART_10_stance', 'Temp_Scaling_BART_20_stance',
       'Temp_Scaling_BART_40_stance', 'Temp_Scaling_BART_80_stance',
       'Temp_Scaling_BART_160_stance', 'Llama-2', 'Llama-2_STANCE',
       'normed_sbert_topic', 'normed_sbert_stance', 'avg_normed_sbert_topic',
       'pred_topic', 'pred_stance', 'normed_BART_topic', 'normed_BART_stance',


In [47]:
for samp_size in sample_sizes:
    test_df['normed_TempScal_topic'] = test_df["Temp_Scaling_BART_" + str(samp_size) + "_topic"].apply(lambda x: get_normed_scores(x, artificial_thresholds, inverse_claims))
    test_df['normed_TempScal_stance'] = test_df["Temp_Scaling_BART_" + str(samp_size) + "_stance"].apply(lambda x: get_normed_scores(x, artificial_thresholds, inverse_claims))
    test_df['avg_normed_TempScal_topic'] = test_df["normed_TempScal_topic"].apply(lambda x: get_avg_normed_topic_scores(x, CLASSES_TOPIC, inverse_claims))
    test_df['pred_topic'] = test_df['avg_normed_TempScal_topic'].apply(get_topic_pred_multiclass)
    test_df['pred_stance'] = test_df.apply(lambda x: get_stance_pred(x['normed_TempScal_stance'], x['pred_topic'], claims_stance), axis=1)
    print(samp_size, ': %.3f' % f1_score(test_df['topic_stance_annot'].to_list(), test_df['pred_stance'].to_list(), average="micro"))

5 : 0.420
10 : 0.420
20 : 0.418
40 : 0.415
80 : 0.417
160 : 0.416


### Baseline 5: Llama 2 70b model with prompting

In [48]:
test_df.columns

Index(['ID', 'Target', 'Tweet', 'Stance', 'FSL_BART_stance', 'FSL_BART_topic',
       'sbert_cosine_topic', 'sbert_cosine_stance', 'topic_stance_annot',
       'topic_annot', '1_annot', '2_annot', '3_annot', '4_annot', '5_annot',
       '1A_annot', '1F_annot', '2A_annot', '2F_annot', '3A_annot', '3F_annot',
       '4A_annot', '4F_annot', '5A_annot', '5F_annot', 'BERT_topic',
       'BERT_topic_stance', 'Temp_Scaling_BART_5_topic',
       'Temp_Scaling_BART_10_topic', 'Temp_Scaling_BART_20_topic',
       'Temp_Scaling_BART_40_topic', 'Temp_Scaling_BART_80_topic',
       'Temp_Scaling_BART_160_topic', 'Temp_Scaling_BART_5_stance',
       'Temp_Scaling_BART_10_stance', 'Temp_Scaling_BART_20_stance',
       'Temp_Scaling_BART_40_stance', 'Temp_Scaling_BART_80_stance',
       'Temp_Scaling_BART_160_stance', 'Llama-2', 'Llama-2_STANCE',
       'normed_sbert_topic', 'normed_sbert_stance', 'avg_normed_sbert_topic',
       'pred_topic', 'pred_stance', 'normed_BART_topic', 'normed_BART_stance',


In [49]:
target2idx = {
    "Atheism": "1",
    "Climate Change is a Real Concern": "2",
    "Feminist Movement": "3",
    "Hillary Clinton": "4",
    "Legalization of Abortion": "5"
}

In [50]:
def get_llama_pred(stance_preds, stance_annots, topic_annot):
    result = list()
    
    for topic in target2idx:
        if topic in stance_preds and topic == topic_annot: 
            if stance_preds[topic].lower() == "anti":
                result += [1, 0, 0]
            elif stance_preds[topic].lower() == "pro":
                result += [0, 1, 0]
            else:
                result += [0, 0, 1]
        else:
            result += [0, 0, 0]
    
    sub_results = list()
    
    for p,a in zip(result, stance_annots):
        if a == 0:
            sub_results.append(0)
        else:
            sub_results.append(p)
            
    return sub_results

In [51]:
test_df['topic_stance_annot_multi'] = test_df['topic_stance_annot'].apply(lambda x: [1 if k == x else 0 for k in CLASSES_STANCE])

In [52]:
test_df['pred_stance'] = test_df.apply(lambda x: get_llama_pred(x["Llama-2_STANCE"], x["topic_stance_annot_multi"], x["Target"]), axis=1)

In [53]:
print(classification_report(test_df['topic_stance_annot_multi'].to_list(), test_df['pred_stance'].to_list(), target_names=CLASSES_STANCE))

              precision    recall  f1-score   support

          1A       0.00      0.00      0.00       160
          1F       1.00      0.03      0.06        32
          1N       0.00      0.00      0.00        28
          2A       0.00      0.00      0.00        11
          2F       1.00      0.07      0.14       123
          2N       1.00      0.34      0.51        35
          3A       0.00      0.00      0.00       183
          3F       1.00      0.14      0.24        58
          3N       1.00      0.09      0.17        44
          4A       1.00      0.01      0.01       172
          4F       1.00      0.04      0.09        45
          4N       1.00      0.04      0.07        78
          5A       1.00      0.01      0.02       189
          5F       1.00      0.13      0.23        46
          5N       1.00      0.16      0.27        45

   micro avg       1.00      0.04      0.08      1249
   macro avg       0.73      0.07      0.12      1249
weighted avg       0.69   

### Ours 1: Few-shots NLI approach using BART MNLI

In [54]:
with open('./data/bisection_records/TS_BART_topic.json') as file:
    bart_record_topic = json.load(file)
    
with open('./data/bisection_records/TS_BART_stance.json') as file:
    bart_record_stance = json.load(file)

In [55]:
bart_topic_thresholds = get_thresholds_from_record(bart_record_topic, start=0, end=1)
bart_stance_thresholds = get_thresholds_from_record(bart_record_stance, start=0, end=1)

In [56]:
bart_topic_thresholds

{'1_0': {'threshold': 0.44},
 '1_1': {'threshold': 0.37},
 '1_2': {'threshold': 0.21000000000000002},
 '1_3': {'threshold': 0.16999999999999998},
 '1_4': {'threshold': 0.14},
 '2_0': {'threshold': 0.46},
 '2_1': {'threshold': 0.46},
 '2_2': {'threshold': 0.41000000000000003},
 '2_3': {'threshold': 0.31},
 '2_4': {'threshold': 0.19},
 '3_0': {'threshold': 0.53},
 '3_1': {'threshold': 0.44},
 '3_2': {'threshold': 0.65},
 '3_3': {'threshold': 0.6},
 '3_4': {'threshold': 0.27},
 '4_0': {'threshold': 0.47},
 '4_1': {'threshold': 0.63},
 '4_2': {'threshold': 0.11},
 '4_3': {'threshold': 0.16999999999999998},
 '4_4': {'threshold': 0.26},
 '5_0': {'threshold': 0.12},
 '5_1': {'threshold': 0.5700000000000001},
 '5_2': {'threshold': 0.1},
 '5_3': {'threshold': 0.16},
 '5_4': {'threshold': 0.05}}

In [57]:
test_df['normed_BART_topic'] = test_df["FSL_BART_topic"].apply(lambda x: get_normed_scores(x, bart_topic_thresholds, inverse_claims))
test_df['normed_BART_stance'] = test_df["FSL_BART_stance"].apply(lambda x: get_normed_scores(x, bart_stance_thresholds, inverse_claims))
test_df['avg_normed_BART_topic'] = test_df["normed_BART_topic"].apply(lambda x: get_avg_normed_topic_scores(x, CLASSES_TOPIC, inverse_claims))
test_df['pred_topic'] = test_df['avg_normed_BART_topic'].apply(get_topic_pred_multiclass)
test_df['pred_stance'] = test_df.apply(lambda x: get_stance_pred(x['normed_BART_stance'], x['pred_topic'], claims_stance), axis=1)

In [58]:
print(classification_report(test_df['topic_stance_annot'].to_list(), test_df['pred_stance'].to_list(), target_names=CLASSES_STANCE))

              precision    recall  f1-score   support

          1A       0.77      0.74      0.75       160
          1F       0.61      0.59      0.60        32
          1N       0.15      0.39      0.22        28
          2A       1.00      0.18      0.31        11
          2F       0.78      0.92      0.85       123
          2N       0.28      0.31      0.29        35
          3A       0.76      0.35      0.48       183
          3F       0.45      0.53      0.49        58
          3N       0.13      0.55      0.21        44
          4A       0.91      0.53      0.67       172
          4F       0.76      0.78      0.77        45
          4N       0.34      0.42      0.38        78
          5A       0.75      0.53      0.62       189
          5F       0.42      0.17      0.25        46
          5N       0.16      0.27      0.20        45

    accuracy                           0.54      1249
   macro avg       0.55      0.48      0.47      1249
weighted avg       0.66   

In [59]:
datapoints = list()
for k in bart_record:
    if k in ["1_4_0_1"]:
        continue
    datapoints += bart_record[k]['texts']
len(set(datapoints))

457

## Depressive symptoms detection

In [60]:
test_df = pd.read_pickle('./data/depression/testing.pkl')

In [61]:
test_df = test_df.drop_duplicates(subset=["Sentence"])

In [62]:
CLASSES = [
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', 
    '17', '18', '19', '20', '21'
]

In [63]:
with open("./data/depression/claims.json") as file:
    claims = json.load(file)

claims_descr = claims["class_descr"]
del claims["class_descr"]

In [64]:
inverse_claims = {claims[k]: k for k in claims}

In [65]:
test_df["multi_annot"] = test_df.apply(lambda x: [x[C + "_annot"] for C in CLASSES], axis=1)

In [66]:
def get_detected_claims(zsl_dict, thresholds, claims):
    detected = list()
    for claim_idx in claims:
        if zsl_dict[claims[claim_idx]] >= thresholds[claim_idx]['threshold']:
            detected.append(claim_idx)
    return detected 


def get_pred(detected, classes):
    preds = list()
    for group in classes:
        rel = [d for d in detected if d.split("_")[0] == group]
        if len(rel) > 0:
            preds.append(1)
        else:
            preds.append(0)
    return preds

### Baseline 1: Fine-tuned BERT

In [67]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['BERT_pred'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

           1       0.68      0.56      0.62        64
           2       0.00      0.00      0.00        28
           3       1.00      0.19      0.32        31
           4       0.28      0.76      0.40        29
           5       0.00      0.00      0.00        16
           6       0.00      0.00      0.00        13
           7       0.83      0.16      0.26        32
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         7
          10       0.00      0.00      0.00        15
          11       0.00      0.00      0.00        10
          12       0.00      0.00      0.00        11
          13       0.00      0.00      0.00        10
          14       1.00      0.06      0.12        16
          15       0.00      0.00      0.00         4
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00        20
          18       0.00    

### Baseline 2: SBERT cosine similarity with threshold-tuning

In [68]:
with open('./data/bisection_records/D_SBERT.json') as file:
    sbert_record = json.load(file)

In [69]:
sbert_thresholds = get_thresholds_from_record(sbert_record, start=-1, end=1)

In [70]:
test_df['detected'] = test_df['sbert_cosine'].apply(lambda x: get_detected_claims(x, sbert_thresholds, claims))
test_df['pred_multi'] = test_df['detected'].apply(lambda x: get_pred(x, CLASSES))

In [71]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

           1       0.70      0.22      0.33        64
           2       1.00      0.04      0.07        28
           3       0.33      0.06      0.11        31
           4       0.33      0.03      0.06        29
           5       1.00      0.19      0.32        16
           6       0.00      0.00      0.00        13
           7       1.00      0.12      0.22        32
           8       0.00      0.00      0.00         3
           9       0.25      0.86      0.39         7
          10       1.00      0.47      0.64        15
          11       0.00      0.00      0.00        10
          12       0.00      0.00      0.00        11
          13       0.00      0.00      0.00        10
          14       0.00      0.00      0.00        16
          15       0.33      0.25      0.29         4
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00        20
          18       0.25    

In [72]:
datapoints = list()
for k in sbert_record:
    datapoints += sbert_record[k]['texts']
len(set(datapoints))

241

### Baseline 3: BART MNLI model with unique threshold (Zero-Shot approach)

In [73]:
artificial_thresholds = {k: {'threshold': 0.5} for k in claims}

In [74]:
test_df['detected'] = test_df['FSL_BART'].apply(lambda x: get_detected_claims(x, artificial_thresholds, claims))
test_df['pred_multi'] = test_df['detected'].apply(lambda x: get_pred(x, CLASSES))

In [75]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

           1       0.15      1.00      0.26        64
           2       0.06      0.89      0.11        28
           3       0.05      0.97      0.10        31
           4       0.01      0.28      0.03        29
           5       0.04      0.94      0.08        16
           6       0.02      1.00      0.04        13
           7       0.07      1.00      0.13        32
           8       0.00      1.00      0.01         3
           9       0.16      0.43      0.23         7
          10       0.08      0.87      0.14        15
          11       0.06      0.80      0.12        10
          12       0.08      0.73      0.14        11
          13       0.01      0.80      0.03        10
          14       0.07      1.00      0.12        16
          15       0.00      0.50      0.01         4
          16       0.05      0.50      0.09         2
          17       0.12      0.80      0.20        20
          18       0.00    

### Baseline 4: BART MNLI model with Temperature Scaling

In [76]:
sample_sizes = [5, 10, 20, 40, 80, 160]

In [77]:
artificial_thresholds = {k: {'threshold': 0.5} for k in claims}

In [78]:
for samp_size in sample_sizes:
    test_df['detected'] = test_df["Temp_Scaling_BART_" + str(samp_size)].apply(lambda x: get_detected_claims(x, artificial_thresholds, claims))
    test_df['pred_multi'] = test_df['detected'].apply(lambda x: get_pred(x, CLASSES))
    print(samp_size, ': %.3f' % f1_score(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), average="micro"))

5 : 0.153
10 : 0.179
20 : 0.194
40 : 0.218
80 : 0.252
160 : 0.268


### Baseline 5: Llama 2 (70b model) with prompting

In [79]:
def get_detected_claims_LLAMA(zsl_dict, claims):
    detected = list()
    for claim_idx in claims:
        if zsl_dict[claims[claim_idx]].lower().strip() == "yes":
            detected.append(claim_idx)
    return detected 

In [80]:
test_df['detected'] = test_df['Llama-2'].apply(lambda x: get_detected_claims_LLAMA(x, claims))
test_df['pred_multi'] = test_df['detected'].apply(lambda x: get_pred(x, CLASSES))

In [81]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

           1       0.41      0.83      0.55        64
           2       0.14      0.86      0.24        28
           3       0.28      0.61      0.38        31
           4       0.05      0.21      0.08        29
           5       0.13      0.69      0.21        16
           6       0.07      0.23      0.11        13
           7       0.36      0.81      0.50        32
           8       0.01      0.33      0.02         3
           9       0.08      0.57      0.14         7
          10       0.10      0.67      0.17        15
          11       0.05      0.80      0.09        10
          12       0.07      0.45      0.12        11
          13       0.02      0.20      0.04        10
          14       0.14      0.94      0.24        16
          15       0.02      0.50      0.04         4
          16       0.02      1.00      0.05         2
          17       0.09      0.65      0.16        20
          18       0.05    

### Ours 1: Few-shots NLI approach using BART MNLI

In [82]:
with open('./data/bisection_records/D_BART.json') as file:
    bart_record = json.load(file)

In [83]:
bart_thresholds = get_thresholds_from_record(bart_record, start=0, end=1)

In [84]:
test_df['detected'] = test_df['FSL_BART'].apply(lambda x: get_detected_claims(x, bart_thresholds, claims))
test_df['pred_multi'] = test_df['detected'].apply(lambda x: get_pred(x, CLASSES))

In [85]:
print(classification_report(test_df['multi_annot'].to_list(), test_df['pred_multi'].to_list(), target_names=CLASSES))

              precision    recall  f1-score   support

           1       0.62      0.48      0.54        64
           2       0.36      0.32      0.34        28
           3       0.39      0.52      0.44        31
           4       0.08      0.03      0.05        29
           5       1.00      0.31      0.48        16
           6       0.18      0.15      0.17        13
           7       0.26      0.72      0.38        32
           8       0.10      0.33      0.15         3
           9       0.43      0.43      0.43         7
          10       0.75      0.20      0.32        15
          11       0.14      0.10      0.12        10
          12       0.27      0.27      0.27        11
          13       0.29      0.20      0.24        10
          14       0.39      0.75      0.51        16
          15       0.09      0.25      0.13         4
          16       0.00      0.00      0.00         2
          17       0.22      0.35      0.27        20
          18       0.00    

In [86]:
datapoints = list()
for k in bart_record:
    if k in ["1_4_0_1"]:
        continue
    datapoints += bart_record[k]['texts']
len(set(datapoints))

277