In [12]:
from sentence_transformers import CrossEncoder

model = CrossEncoder('cross-encoder/nli-deberta-v3-large', device='cuda:7')
# model = CrossEncoder('cross-encoder/nli-deberta-v3-base', device='cuda:7')
# model = CrossEncoder('cross-encoder/nli-deberta-v3-small')

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



In [13]:
import json
import pandas as pd
from pathlib import Path
from typing import Any, List

def read_jsonl(path: Path):
    with path.open(encoding='utf-8') as f:
        return [json.loads(line) for line in f]

data = read_jsonl(Path("../atomic_facts.jsonl"))

In [14]:
import torch

def get_ce_score(pairs):
    scores = model.predict(pairs)
    label_names = ['contradiction', 'entailment', 'neutral']
    predictions = []
    for pred in scores:
        predictions.append({name: round(float(pred), 2) for pred, name in zip(pred, label_names)})
    return predictions

def get_custom_scores(input):
    pairs = list(itertools.combinations(input, r=2))
    
    formatted_pairs = [(pair[0], pair[1]) for pair in pairs]
    scores_1 = get_ce_score(formatted_pairs)

    formatted_pairs = [(pair[1], pair[0]) for pair in pairs]
    scores_2 = get_ce_score(formatted_pairs)
    return scores_1, scores_2

In [15]:
def weighted_agg(input, ent_w, cont_w, neutral_w):
    ent = input['entailment']
    cont = input['contradiction']
    neutral = input['neutral']

    weighted_sum = ent_w * ent + neutral_w * neutral + cont_w * cont
    return weighted_sum

In [16]:
from scipy.stats import hmean
from sklearn.cluster import KMeans

def aggregation(scores_1, scores_2, ent_w=1, cont_w=-1, neutral_w=0):
    agg_1 = [weighted_agg(s, ent_w, cont_w, neutral_w) for s in scores_1]
    agg_2 = [weighted_agg(s, ent_w, cont_w, neutral_w) for s in scores_2]

    agg = np.array(agg_1 + agg_2)

    kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(agg.reshape(-1, 1))
    centroids = kmeans.cluster_centers_
    return np.min(centroids)

    # abs_max = np.argmax(np.absolute(agg)) 
    # return agg[abs_max]

    # return np.min(agg)

In [17]:
from tqdm import tqdm
import itertools

normal_scores = []
strange_scores = []

for sample in tqdm(data):
    normal_scores.append(get_custom_scores(sample['normal']))
    strange_scores.append(get_custom_scores(sample['strange']))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:20<00:00,  5.00it/s]


In [18]:
true_classes = [0] * len(normal_scores) + [1] * len(strange_scores)
scores = normal_scores + strange_scores

In [19]:
def eval_weights_on_test(test_scores, test_labels, ent_w, cont_w, neutral_w, thr=0):
    agg_scores = []

    for score in test_scores:
        agg_scores.append(aggregation(*score, ent_w=ent_w, cont_w=cont_w, neutral_w=neutral_w))
    
    our_outputs = [el < thr for el in agg_scores]

    return accuracy_score(test_labels, our_outputs)

In [20]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=3, shuffle=True)
kf.get_n_splits(scores)

fold_accuracies = []

for i, (train_index, test_index) in enumerate(kf.split(scores)):
    best_acc = 0
    print(f"FOLD {i}")
    train_scores = np.array(scores)[train_index]
    train_labels = np.array(true_classes)[train_index]
    test_scores = np.array(scores)[test_index]
    test_labels = np.array(true_classes)[test_index]
    
    for ent_w in tqdm(np.arange(0, 2, 0.25)):
        for cont_w in np.arange(-2, 0, 0.25):
            # for neutral_w in np.arange(-0.25, 0.25, 0.25):
                neutral_w = 0.0
                if ent_w == 0 and cont_w == 0 and neutral_w == 0:
                    continue
                agg_scores = []

                for score in train_scores:
                    agg_scores.append(aggregation(*score, ent_w=ent_w, cont_w=cont_w, neutral_w=neutral_w))

                thr = 0.0
                our_outputs = [el < thr for el in agg_scores]
                acc = accuracy_score(train_labels, our_outputs)
        
                if acc >= best_acc:
                    test_acc = eval_weights_on_test(test_scores, test_labels, ent_w, cont_w, neutral_w, 0.0)
                    best_acc = acc
                    best_ent_w = ent_w
                    best_cont_w = cont_w
                    best_neutral_w = neutral_w
    print(f"last fold acc={round(best_acc * 100, 2)}% on {thr=} {ent_w=}, {cont_w=}, {neutral_w=}, {test_acc=}")
    fold_accuracies.append(test_acc)

print(f"Mean acc: {(np.mean(fold_accuracies) * 100).round(2)}")
print(f"std: {(np.std(fold_accuracies) * 100).round(2)}")
print(f"var: {(np.var(fold_accuracies) * 100).round(2)}")
print(f"accs: {fold_accuracies}")

FOLD 0


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:14<00:00,  1.86s/it]


last fold acc=74.26% on thr=0.0 ent_w=1.75, cont_w=-0.25, neutral_w=0.0, test_acc=0.6911764705882353
FOLD 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:14<00:00,  1.87s/it]


last fold acc=72.06% on thr=0.0 ent_w=1.75, cont_w=-0.25, neutral_w=0.0, test_acc=0.7352941176470589
FOLD 2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:15<00:00,  1.88s/it]

last fold acc=71.32% on thr=0.0 ent_w=1.75, cont_w=-0.25, neutral_w=0.0, test_acc=0.75
Mean acc: 72.55
std: 2.5
var: 0.06
accs: [0.6911764705882353, 0.7352941176470589, 0.75]





In [21]:
eval_weights_on_test(scores, true_classes, ent_w=1.75, cont_w=-2.0, neutral_w=0.0, thr=0.0)

0.7254901960784313