In [10]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from explainable_fact_checking.plot.notebook_utility import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# ROC curve noise

In [11]:
df = xfc.load_preprocess_explanations(experiment_code_list=[
    'fbs_np_1.0',
    'fbs_np_2.0',
])

In [12]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import auc

tdf = df.copy()
tdf = tdf[tdf['type'] == 'evidence']

In [13]:
def get_best_f1(y_true, y_pred_proba):
    precision, recall, f1_thresholds = precision_recall_curve(y_true, y_pred_proba)
    numerator = 2 * recall * precision
    denom = recall + precision
    f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))
    max_f1 = np.max(f1_scores)
    max_f1_thresh = f1_thresholds[np.argmax(f1_scores)]
    return max_f1_thresh, max_f1, f1_scores


In [14]:
# calculate the auc, accuracy and best threshold for each model_id and explainer_name
res_list = []
index_cols = ['model_id', 'explainer_name',  #'predicted_label'
              ]
for keys, group_df in tdf.groupby(index_cols):
    # model_id, explainer_name, predicted_label = keys
    model_id, explainer_name = keys
    tdict = dict(zip(index_cols, keys))
    usefull_vs_noise_ground_truth = group_df['noisetag'] == 0
    useful_score = group_df[['SUPPORTS', 'REFUTES']].abs().sum(axis=1)
    fpr, tpr, thresholds = roc_curve(usefull_vs_noise_ground_truth, useful_score)
    auc_score = auc(fpr, tpr)
    best_threshold = thresholds[np.argmax(tpr - fpr)]
    best_accuracy = accuracy_score(usefull_vs_noise_ground_truth, useful_score > best_threshold)

    best_f1_th, best_f1, f1_scores = get_best_f1(usefull_vs_noise_ground_truth, useful_score)
    best_f1_th_noise, best_f1_noise, f1_scores_noise = get_best_f1(~usefull_vs_noise_ground_truth, -useful_score)
    # precision, recall, f1_thresholds = precision_recall_curve(y_true, y_pred_proba)
    tdict.update(auc=auc_score, best_threshold=best_threshold, best_accuracy=best_accuracy, fpr=fpr, tpr=tpr,
                 # precision=precision, recall=recall,
                 thresholds=thresholds, best_f1=best_f1, best_f1_th=best_f1_th, f1_scores=f1_scores,
                 best_f1_noise=best_f1_noise, best_f1_th_noise=best_f1_th_noise, f1_scores_noise=f1_scores_noise
                 )
    res_list.append(tdict)
roc_df = pd.DataFrame(res_list)

In [15]:
avg_score = roc_df[['auc', 'best_accuracy']].mean().to_frame().T
avg_score['model_id'] = 'average'
avg_score['explainer_name'] = 'average'
with_avg = pd.concat([roc_df,
                      # avg_score
                      ]).sort_values(by=index_cols)
with_avg.drop(columns=['fpr', 'tpr', 'thresholds']).to_latex(os.path.join(save_path, 'roc_df.latex'), index=False,
                                                             float_format='%.2f',
                                                             bold_rows=True,
                                                             caption='ROC curve results',
                                                             label='tab:roc_df'
                                                             )
tcols = index_cols + ['auc', 'best_threshold', 'best_accuracy', 'best_f1',
                      'best_f1_th', 'best_f1_noise', 'best_f1_th_noise']
roc_df[tcols].sort_values(by=index_cols).to_csv(os.path.join(save_path, 'roc_df.csv'))
roc_df[tcols].drop(columns=['best_threshold', 'best_f1_th', 'best_f1_th_noise'])

Unnamed: 0,model_id,explainer_name,auc,best_accuracy,best_f1,best_f1_noise
0,feverous_verdict_predictor,lime,0.761034,0.84881,0.396832,0.971695
1,feverous_verdict_predictor,shap,0.78105,0.805593,0.402715,0.97178


In [16]:
avg_score

Unnamed: 0,auc,best_accuracy,model_id,explainer_name
0,0.771042,0.827201,average,average


In [17]:
# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=useful_score, color=usefull_vs_noise_ground_truth, nbins=50,
    labels=dict(color='True Labels', x='Score')
)

fig_hist.show()



In [18]:
fig = go.Figure()
for i, row in roc_df.iterrows():
    fig.add_trace(go.Scatter(x=row['fpr'], y=row['tpr'],
                             mode='lines',
                             name=f'{" ".join([row[k] for k in index_cols])} auc={row["auc"]:.2f}',
                             line=dict(width=2),
                             ))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                         mode='lines',
                         name='Random',
                         line=dict(color='black', width=2, dash='dash')
                         ))

fig.update_xaxes(title='False Positive Rate')
fig.update_yaxes(title='True Positive Rate')

fig.update_layout(title='ROC curve',
                  **layout_dict, **h_legend_dict)
fig = end_fig_func(fig)
fig.show()
save_fig(fig, 'noise_detection_roc_curve')

In [19]:
fig = go.Figure()
for i, row in roc_df.iterrows():
    fig.add_trace(go.Scatter(x=row['precision'], y=row['recall'],
                             mode='lines',
                             name=f'{" ".join([row[k] for k in index_cols])} auc={row["auc"]:.2f}',
                             line=dict(width=2),
                             ))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                         mode='lines',
                         name='Random',
                         line=dict(color='black', width=2, dash='dash')
                         ))

fig.update_xaxes(title='False Positive Rate')
fig.update_yaxes(title='True Positive Rate')

fig.update_layout(title='ROC curve',
                  **layout_dict, **h_legend_dict)
fig = end_fig_func(fig)
fig.show()

KeyError: 'precision'

# F1 models on SUPPORTS and REFUTES

In [None]:
# take the first element for each id, model_id, dataset_file_name, for explainer_name 'LIME'
# filter explainer_name 'LIME'
LIME_mask = df['explainer_name'] == 'lime'
# xclude dataset_file_name 'ex_AB_00.jsonl'
normal_dataset_mask = df['dataset_file_name'] != 'ex_AB_00.jsonl'
first_elements = df[LIME_mask & normal_dataset_mask].copy().groupby(['id', 'dataset_file_name', 'model_id'],
                                                                    as_index=False).first()

In [None]:
# define a function to compute the f1 score to be used in the groupby
def f1_score_func(x):
    predicted_dummies = pd.get_dummies(x['predicted_label'])
    # if NEI is not present in the predicted_dummies add it with all zeros
    if 'NEI' not in predicted_dummies.columns:
        predicted_dummies['NEI'] = 0
    true_dummies = pd.get_dummies(x['label'])
    f1_score_list = []
    for class_ in xfc.xfc_utils.class_names:
        if class_ in true_dummies.columns:
            f1_score_list.append(f1_score(true_dummies[class_], predicted_dummies[class_]))
        else:
            f1_score_list.append(np.nan)
    return pd.Series(f1_score_list, index=xfc.xfc_utils.class_names)



In [None]:
# compute the f1 score for each id, model_id, dataset_file_name
f1_score_df = first_elements.groupby(['model_id']).apply(f1_score_func)


In [None]:
f1_score_df