In [1]:
cd /homes/bussotti/XFC2/code

/homes/bussotti/XFC2/code


In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from explainable_fact_checking.result_presentation.notebook_utility import *


[W095] Model 'en_core_web_sm' (3.1.0) was trained with spaCy v3.1.0 and may not be 100% compatible with the current version (3.7.3). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate



/home/bussotti/.conda/envs/feverous2/bin/python


# ROC curve noise

In [3]:
df = xfc.load_preprocess_explanations(experiment_code_list=[
    'fbs_np_1.0',
    'fbs_np_2.0',
    'lla_np_1.0',
    'lla_np_2.0',
])

In [4]:
df['model_id'].unique()

array(['LLAMA3_1', 'feverous_verdict_predictor'], dtype=object)

KeyError: 'model_path'

In [6]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import auc

tdf = df.copy()
tdf = tdf[tdf['type'] == 'evidence']

In [7]:
tdf['model_id'].unique()

array(['LLAMA3_1', 'feverous_verdict_predictor'], dtype=object)

In [8]:
def get_best_f1(y_true, y_pred_proba):
    precision, recall, f1_thresholds = precision_recall_curve(y_true, y_pred_proba)
    numerator = 2 * recall * precision
    denom = recall + precision
    f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))
    max_f1 = np.max(f1_scores)
    max_f1_thresh = f1_thresholds[np.argmax(f1_scores)]
    return max_f1_thresh, max_f1, f1_scores


In [9]:
# calculate the auc, accuracy and best threshold for each model_id and explainer_name
res_list = []
index_cols = ['model_id', 'explainer_name',  #'predicted_label'
              ]
for keys, group_df in tdf.groupby(index_cols):
    # model_id, explainer_name, predicted_label = keys
    model_id, explainer_name = keys
    tdict = dict(zip(index_cols, keys))
    usefull_vs_noise_ground_truth = group_df['noisetag'] == 0
    useful_score = group_df[['SUPPORTS', 'REFUTES']].abs().sum(axis=1)
    fpr, tpr, thresholds = roc_curve(usefull_vs_noise_ground_truth, useful_score)
    auc_score = auc(fpr, tpr)
    best_threshold = thresholds[np.argmax(tpr - fpr)]
    best_accuracy = accuracy_score(usefull_vs_noise_ground_truth, useful_score > best_threshold)

    best_f1_th, best_f1_useful, f1_scores_useful = get_best_f1(usefull_vs_noise_ground_truth, useful_score)
    best_f1_th_noise, best_f1_noise, f1_scores_noise = get_best_f1(~usefull_vs_noise_ground_truth, -useful_score)
    # precision, recall, f1_thresholds = precision_recall_curve(y_true, y_pred_proba)
    tdict.update(auc=auc_score, best_threshold=best_threshold, best_accuracy=best_accuracy, fpr=fpr, tpr=tpr,
                 # precision=precision, recall=recall,
                 thresholds=thresholds, best_f1_useful=best_f1_useful, best_f1_th=best_f1_th, f1_scores_useful=f1_scores_useful,
                 best_f1_noise=best_f1_noise, best_f1_th_noise=best_f1_th_noise, f1_scores_noise=f1_scores_noise
                 )
    res_list.append(tdict)
roc_df = pd.DataFrame(res_list)

In [10]:
avg_score = roc_df[['auc', 'best_accuracy']].mean().to_frame().T
avg_score['model_id'] = 'average'
avg_score['explainer_name'] = 'average'
with_avg = pd.concat([roc_df,
                      # avg_score
                      ]).sort_values(by=index_cols)
with_avg.drop(columns=['fpr', 'tpr', 'thresholds']).to_latex(os.path.join(save_path, 'roc_df.latex'), index=False,
                                                             float_format='%.2f',
                                                             bold_rows=True,
                                                             caption='ROC curve results',
                                                             label='tab:roc_df'
                                                             )
tcols = index_cols + ['best_f1_useful',
                      'best_f1_th', 'best_f1_noise', 'best_f1_th_noise', 'auc', 'best_threshold', 'best_accuracy', ]
roc_df[tcols].sort_values(by=index_cols).to_csv(os.path.join(save_path, 'roc_df.csv'))
roc_df[tcols].drop(columns=['best_threshold', 'best_f1_th', 'best_f1_th_noise'])

Unnamed: 0,model_id,explainer_name,best_f1_useful,best_f1_noise,auc,best_accuracy
0,LLAMA3_1,lime,0.143006,0.97034,0.548724,0.792472
1,LLAMA3_1,shap,0.133636,0.97034,0.561789,0.734125
2,feverous_verdict_predictor,lime,0.396832,0.971695,0.761034,0.84881
3,feverous_verdict_predictor,shap,0.402715,0.97178,0.78105,0.805593


In [None]:
avg_score

In [None]:
# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=useful_score, color=usefull_vs_noise_ground_truth, nbins=50,
    labels=dict(color='True Labels', x='Score')
)
fig_hist.show()

In [None]:
fig = go.Figure()
for i, row in roc_df.iterrows():
    fig.add_trace(go.Scatter(x=row['fpr'], y=row['tpr'],
                             mode='lines',
                             name=f'{" ".join([row[k] for k in index_cols])} auc={row["auc"]:.2f}',
                             line=dict(width=2),
                             ))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                         mode='lines',
                         name='Random',
                         line=dict(color='black', width=2, dash='dash')
                         ))

fig.update_xaxes(title='False Positive Rate')
fig.update_yaxes(title='True Positive Rate')

fig.update_layout(title='ROC curve',
                  **layout_dict, **h_legend_dict)
fig = end_fig_func(fig)
fig.show()
save_fig(fig, 'noise_detection_roc_curve')

In [None]:
fig = go.Figure()
for i, row in roc_df.iterrows():
    fig.add_trace(go.Scatter(x=row['precision'], y=row['recall'],
                             mode='lines',
                             name=f'{" ".join([row[k] for k in index_cols])} auc={row["auc"]:.2f}',
                             line=dict(width=2),
                             ))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                         mode='lines',
                         name='Random',
                         line=dict(color='black', width=2, dash='dash')
                         ))

fig.update_xaxes(title='False Positive Rate')
fig.update_yaxes(title='True Positive Rate')

fig.update_layout(title='ROC curve',
                  **layout_dict, **h_legend_dict)
fig = end_fig_func(fig)
fig.show()

# F1 models on SUPPORTS and REFUTES

In [None]:
# take the first element for each id, model_id, dataset_file_name, for explainer_name 'LIME'
# filter explainer_name 'LIME'
LIME_mask = df['explainer_name'] == 'lime'
# xclude dataset_file_name 'ex_AB_00.jsonl'
normal_dataset_mask = df['dataset_file_name'] != 'ex_AB_00.jsonl'
first_elements = df[LIME_mask & normal_dataset_mask].copy().groupby(['id', 'dataset_file_name', 'model_id'],
                                                                    as_index=False).first()

In [None]:
# define a function to compute the f1 score to be used in the groupby
def f1_score_func(x):
    predicted_dummies = pd.get_dummies(x['predicted_label'])
    # if NEI is not present in the predicted_dummies add it with all zeros
    if 'NEI' not in predicted_dummies.columns:
        predicted_dummies['NEI'] = 0
    true_dummies = pd.get_dummies(x['label'])
    f1_score_list = []
    for class_ in xfc.xfc_utils.class_names:
        if class_ in true_dummies.columns:
            f1_score_list.append(f1_score(true_dummies[class_], predicted_dummies[class_]))
        else:
            f1_score_list.append(np.nan)
    return pd.Series(f1_score_list, index=xfc.xfc_utils.class_names)



In [None]:
# compute the f1 score for each id, model_id, dataset_file_name
f1_score_df = first_elements.groupby(['model_id']).apply(f1_score_func)


In [None]:
f1_score_df

# 