# ImpPres Baseline

This notebook illustrates how to use the DeBERTa-v3-base-mnli-fever-anli model to perform specialized inference on the ImpPres dataset.

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [2]:
label_names = ["entailment", "neutral", "contradiction"]
def evaluate(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

In [3]:
evaluate("The weather is nice today.", "It is sunny outside.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entailment': 0.1, 'neutral': 99.8, 'contradiction': 0.0}

In [3]:
def get_prediction(pred_dict):
    if pred_dict["entailment"] > pred_dict["contradiction"]  and pred_dict["entailment"] > pred_dict["neutral"]:
        return "entailment"
    elif pred_dict["contradiction"] > pred_dict["entailment"]:
        return "contradiction"
    else:
        return "neutral"

## Load ImpPres Dataset

In [4]:
# Evaluate the model on the ImpPres dataset
from tqdm import tqdm
def evaluate_on_dataset(dataset):
    results = []
    label_names = ["entailment", "neutral", "contradiction"]
    for example in tqdm(dataset):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = evaluate(premise, hypothesis)
        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'prediction': prediction,
            'pred_label': get_prediction(prediction),
            'gold_label': label_names[example['gold_label']],
        })
    return results

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [5]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [6]:
from evaluate import combine
clf_metrics = combine(["accuracy", "f1", "precision", "recall"])

In [8]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline model on each section of the ImpPres dataset.

https://www.kaggle.com/code/faijanahamadkhan/llm-evaluation-framework-hugging-face provides good documentation on how to use the Huggingface evaluate library.

In [7]:
def strip_presupposition_prefix(s: str) -> str:
    prefix = "presupposition_"
    if s.startswith(prefix):
        return s[len(prefix):]
    return s

In [11]:
import pandas as pd
from evaluate import load

# load metrics separately
accuracy_metric = load("accuracy")
f1_metric = load("f1")
precision_metric = load("precision")
recall_metric = load("recall")

def evaluate_from_loaded_csv(csv_path):
    df = pd.read_csv(csv_path)
    label_to_id = {"entailment": 0, "neutral": 1, "contradiction": 2}

    all_results = []
    metrics_table = []

    for section, section_df in df.groupby("section"):
        print(f"Evaluating section: {section}")

        results = evaluate_on_dataset(section_df.to_dict(orient="records"))
        preds = [label_to_id[res['pred_label']] for res in results]
        refs  = [label_to_id[res['gold_label']] for res in results]

        metrics = {
            "accuracy": accuracy_metric.compute(predictions=preds, references=refs)["accuracy"],
            "f1": f1_metric.compute(predictions=preds, references=refs, average="macro")["f1"],
            "precision": precision_metric.compute(predictions=preds, references=refs, average="macro")["precision"],
            "recall": recall_metric.compute(predictions=preds, references=refs, average="macro")["recall"],
        }

        row = {"section": section, **metrics}
        metrics_table.append(row)
        all_results.extend(results)

    # overall metrics
    all_preds = [label_to_id[res['pred_label']] for res in all_results]
    all_refs  = [label_to_id[res['gold_label']] for res in all_results]

    overall = {
        "accuracy": accuracy_metric.compute(predictions=all_preds, references=all_refs)["accuracy"],
        "f1": f1_metric.compute(predictions=all_preds, references=all_refs, average="macro")["f1"],
        "precision": precision_metric.compute(predictions=all_preds, references=all_refs, average="macro")["precision"],
        "recall": recall_metric.compute(predictions=all_preds, references=all_refs, average="macro")["recall"],
    }
    metrics_table.append({"section": "ALL", **overall})

    metrics_df = pd.DataFrame(metrics_table)
    print("\nFinal Metrics Table:")
    print(metrics_df.to_string(index=False))

    all_results_df = pd.DataFrame(all_results)
    all_results_df.to_csv("baseline_results.csv", index=False)
    return metrics_df

# usage
metrics_df = evaluate_from_loaded_csv("combined_presuppositions.csv")


Evaluating section: all_n_presupposition


100%|██████████| 633/633 [03:43<00:00,  2.83it/s]


Evaluating section: both_presupposition


100%|██████████| 633/633 [03:32<00:00,  2.98it/s]


Evaluating section: change_of_state


100%|██████████| 633/633 [03:17<00:00,  3.20it/s]


Evaluating section: cleft_existence


100%|██████████| 633/633 [03:21<00:00,  3.15it/s]


Evaluating section: cleft_uniqueness


100%|██████████| 633/633 [03:43<00:00,  2.83it/s]


Evaluating section: only_presupposition


100%|██████████| 633/633 [12:19<00:00,  1.17s/it]    


Evaluating section: possessed_definites_existence


100%|██████████| 633/633 [03:46<00:00,  2.79it/s]


Evaluating section: possessed_definites_uniqueness


100%|██████████| 633/633 [06:57<00:00,  1.52it/s]  


Evaluating section: question_presupposition


100%|██████████| 633/633 [03:47<00:00,  2.78it/s]



Final Metrics Table:
                       section  accuracy       f1  precision   recall
          all_n_presupposition  0.448657 0.398995   0.409991 0.459229
           both_presupposition  0.379147 0.300995   0.260814 0.380893
               change_of_state  0.304897 0.311641   0.336039 0.320337
               cleft_existence  0.646130 0.638290   0.677988 0.694040
              cleft_uniqueness  0.181675 0.175513   0.197492 0.171711
           only_presupposition  0.567141 0.547770   0.624397 0.622952
 possessed_definites_existence  0.658768 0.649411   0.797065 0.726513
possessed_definites_uniqueness  0.382306 0.290248   0.246895 0.374799
       question_presupposition  0.619273 0.587102   0.712293 0.692381
                           ALL  0.465333 0.457223   0.482703 0.493651


In [12]:
import pandas as pd
from IPython.display import display


def show_metrics_table(metrics_df, decimals=3):
    # Round numeric columns safely
    metrics_df_rounded = metrics_df.copy()
    numeric_cols = ["accuracy", "f1", "precision", "recall"]
    for col in numeric_cols:
        if col in metrics_df_rounded.columns:
            metrics_df_rounded[col] = pd.to_numeric(metrics_df_rounded[col], errors="coerce").round(decimals)

    # Style the DataFrame
    styled = (
        metrics_df_rounded.style
        .set_table_styles(
            [
                {"selector": "thead th", "props": [("background-color", "#4CAF50"),
                                                   ("color", "white"),
                                                   ("font-weight", "bold"),
                                                   ("text-align", "center")]},
                {"selector": "tbody td", "props": [("text-align", "center")]}
            ]
        )
        .bar(subset=numeric_cols, color="#87A7EBE8")
        .format({col: f"{{:.{decimals}f}}" for col in numeric_cols})
    )

    print("\nBeautiful Results Table:")
    display(styled)


# Example usage after evaluation:
show_metrics_table(metrics_df)



Beautiful Results Table:


Unnamed: 0,section,accuracy,f1,precision,recall
0,all_n_presupposition,0.449,0.399,0.41,0.459
1,both_presupposition,0.379,0.301,0.261,0.381
2,change_of_state,0.305,0.312,0.336,0.32
3,cleft_existence,0.646,0.638,0.678,0.694
4,cleft_uniqueness,0.182,0.176,0.197,0.172
5,only_presupposition,0.567,0.548,0.624,0.623
6,possessed_definites_existence,0.659,0.649,0.797,0.727
7,possessed_definites_uniqueness,0.382,0.29,0.247,0.375
8,question_presupposition,0.619,0.587,0.712,0.692
9,ALL,0.465,0.457,0.483,0.494
