In [1]:
%cd ../

/home/qwj/code/HDInstruct


In [2]:
import os
import pandas as pd
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr
import pandas as pd
from prettytable import PrettyTable

In [3]:
data_dir = "./outputs/completion_few_shot_full_2024-03-11_13-27"
file_suffix = "_uncertainty.jsonl"


In [6]:
def print_result(title, df):
    table = PrettyTable()
    table.title = title
    table.field_names = ["Uncertainty Methods",  "AUC_R", "AUC_S","PCC"]

    pos_or_neg = {
        'perplexity': 'neg',
        'energy_score': 'pos',
        'ln_entropy': 'neg',
        'lexical_similarity': 'pos',
        'eigen_score': 'neg',
    }

    for uncertainty, flag in pos_or_neg.items():
        if flag == 'pos':
            aus_s = roc_auc_score(df['similarity_correctness'], df[uncertainty])
            aus_r = roc_auc_score(df['rouge_correctness'], df[uncertainty])
        else:
            aus_s = roc_auc_score(df['similarity_correctness'], -df[uncertainty])
            aus_r = roc_auc_score(df['rouge_correctness'], -df[uncertainty])
        pearson_r = abs(pearsonr(df[uncertainty], df['rouge_correctness'])[0])
        table.add_row([uncertainty, f"{aus_r*100:.2f}", f"{aus_s*100:.2f}", f"{pearson_r*100:.2f}"])
    table.reversesort = True
    print(table)    


## CoQA

In [6]:
df = pd.read_json(os.path.join(data_dir, "coqa" + file_suffix), lines=True, orient="records")
df.head()

Unnamed: 0,id,perplexity,ln_entropy,energy_score,eigen_score,lexical_similarity,rouge_correctness,similarity_correctness
0,coqa_3dr23u6we5exclen4th8uq9rb42tel_0,1000.0,1.564141,1.564141,-1.879919,0.522871,False,False
1,coqa_3dr23u6we5exclen4th8uq9rb42tel_1,1000.0,0.904711,0.904711,-2.903833,0.555896,False,False
2,coqa_3dr23u6we5exclen4th8uq9rb42tel_2,0.860375,0.978429,0.978429,-5.009766,1.0,True,True
3,coqa_3dr23u6we5exclen4th8uq9rb42tel_3,0.809057,1.167674,1.167674,-3.112764,0.429333,False,False
4,coqa_3dr23u6we5exclen4th8uq9rb42tel_4,1000.0,1.129405,1.129405,-0.82846,0.377347,False,False


In [7]:
print_result("CoQA", df)

+---------------------------------------------+
|                     CoQA                    |
+---------------------+-------+-------+-------+
| Uncertainty Methods | AUC_R | AUC_S |  PCC  |
+---------------------+-------+-------+-------+
|      perplexity     | 69.01 | 68.99 | 50.07 |
|     energy_score    | 32.98 | 33.04 | 29.58 |
|      ln_entropy     | 67.02 | 66.96 | 29.58 |
|  lexical_similarity | 80.50 | 80.89 | 51.55 |
|     eigen_score     | 80.08 | 82.20 | 52.20 |
+---------------------+-------+-------+-------+


## Squad

In [12]:
df = pd.read_json(os.path.join(data_dir, "squad" + file_suffix), lines=True)
df.head()

Unnamed: 0,id,perplexity,ln_entropy,energy_score,eigen_score,lexical_similarity,rouge_correctness,similarity_correctness
0,squad_56ddde6b9a695914005b9628,0.128348,0.128348,0.128348,-6.251751,1.0,True,True
1,squad_56ddde6b9a695914005b9629,0.060648,0.101328,0.101328,-5.675595,0.966667,True,True
2,squad_56ddde6b9a695914005b962a,0.121977,0.182097,0.182097,-4.786143,0.817253,True,True
3,squad_56ddde6b9a695914005b962b,0.046122,0.046122,0.046122,-6.250949,1.0,True,True
4,squad_56ddde6b9a695914005b962c,0.238863,0.193989,0.193989,-5.537917,0.840351,True,True


In [13]:
print_result("SQuAD", df)

+---------------------------------------------+
|                    SQuAD                    |
+---------------------+-------+-------+-------+
| Uncertainty Methods | AUC_R | AUC_S |  PCC  |
+---------------------+-------+-------+-------+
|      perplexity     | 65.77 | 67.91 | 15.30 |
|     energy_score    | 26.82 | 25.96 | 37.59 |
|      ln_entropy     | 73.18 | 74.04 | 37.59 |
|  lexical_similarity | 82.55 | 81.64 | 54.20 |
|     eigen_score     | 80.20 | 81.58 | 52.86 |
+---------------------+-------+-------+-------+


## NQ

In [8]:
df = pd.read_json(os.path.join(data_dir, "nq" + file_suffix), lines=True)
df.head()

Unnamed: 0,id,perplexity,ln_entropy,energy_score,eigen_score,lexical_similarity,rouge_correctness,similarity_correctness
0,nq_0,0.046152,0.046152,0.046152,-6.249629,1.0,True,False
1,nq_1,0.889976,0.953212,0.953212,-3.919195,0.292481,True,True
2,nq_2,0.098977,0.098977,0.098977,-6.245945,1.0,False,False
3,nq_3,0.39373,0.647165,0.647165,-4.312772,0.163158,False,False
4,nq_4,1.04526,1.262186,1.262186,-1.205522,0.128574,False,False


In [9]:
print_result("Natural Questions", df)

+---------------------------------------------+
|              Natural Questions              |
+---------------------+-------+-------+-------+
| Uncertainty Methods | AUC_R | AUC_S |  PCC  |
+---------------------+-------+-------+-------+
|      perplexity     | 73.12 | 72.56 | 29.52 |
|     energy_score    | 24.99 | 25.46 | 35.91 |
|      ln_entropy     | 75.01 | 74.54 | 35.91 |
|  lexical_similarity | 80.88 | 80.73 | 48.18 |
|     eigen_score     | 77.09 | 79.67 | 39.82 |
+---------------------+-------+-------+-------+


## TriviaQA

In [10]:
df = pd.read_json(os.path.join(data_dir, "triviaqa" + file_suffix), lines=True)
df.head()

Unnamed: 0,id,perplexity,ln_entropy,energy_score,eigen_score,lexical_similarity,rouge_correctness,similarity_correctness
0,triviaqa_tc_2,0.147896,0.373409,0.373409,-4.590501,0.654737,False,False
1,triviaqa_tc_33,0.51965,0.767348,0.767348,-3.41702,0.273544,False,False
2,triviaqa_tc_40,0.422261,0.847624,0.847624,-1.023559,0.138583,False,False
3,triviaqa_tc_49,0.244038,0.329107,0.329107,-5.603721,0.9,False,False
4,triviaqa_tc_56,0.447769,0.807017,0.807017,-4.255697,0.475088,False,False


In [11]:
print_result("TriviaQA", df)

+---------------------------------------------+
|                   TriviaQA                  |
+---------------------+-------+-------+-------+
| Uncertainty Methods | AUC_R | AUC_S |  PCC  |
+---------------------+-------+-------+-------+
|      perplexity     | 79.86 | 79.61 | 47.02 |
|     energy_score    | 17.21 | 18.28 | 54.57 |
|      ln_entropy     | 82.79 | 81.72 | 54.57 |
|  lexical_similarity | 88.00 | 86.49 | 64.26 |
|     eigen_score     | 85.20 | 86.26 | 60.65 |
+---------------------+-------+-------+-------+


---


In [4]:
df = pd.read_json(os.path.join(data_dir, "triviaqa_correctness_logits.jsonl"), lines=True, orient="records")
df.head()

Unnamed: 0,id,rouge_correctness,similarity_correctness,logits
0,triviaqa_tc_2,False,False,"[9.3197774887, -7.8463964462]"
1,triviaqa_tc_33,False,False,"[9.523311615, -7.3078994751]"
2,triviaqa_tc_40,False,False,"[9.3080377579, -7.3941273688999996]"
3,triviaqa_tc_49,False,False,"[9.3530950546, -8.2431402206]"
4,triviaqa_tc_56,False,False,"[9.4434719086, -8.2656526566]"


In [6]:
probs = df['logits'].apply(lambda x: x[0]/(x[0]+x[1]))
probs = probs.to_list()
probs[:5]

[6.32543600051105,
 4.298663640720983,
 4.863361321092657,
 8.426554638168279,
 8.017759849454395]

In [7]:
roc_auc_score(df['rouge_correctness'], probs)

0.5188627743030793

In [8]:
roc_auc_score(df['similarity_correctness'], probs)

0.5300154702186318

In [9]:
abs(pearsonr(probs, df['rouge_correctness'])[0])

0.0050030740489649325

In [11]:
import numpy as np
preds = df['logits'].apply(lambda x: np.argmax(x))
preds = preds.to_list()
preds[:5]

[0, 0, 0, 0, 0]

In [12]:
sum(preds)/len(preds)

0.0