# ImpPres with LLM

You have to implement in this notebook a better ImpPres classifier using an LLM.
This classifier must be implemented using DSPy.


In [1]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
import os
import dspy
import api_key

lm = dspy.LM('xai/grok-3-mini', api_key=api_key.grok_key)
# for ollama 
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [2]:
from typing import Literal

class NLIClassifier(dspy.Signature):
    """Classify a premise-hypothesis pair into entailment, contradiction, or neutral."""
    premise = dspy.InputField(desc="The premise sentence")
    hypothesis = dspy.InputField(desc="The hypothesis sentence")
    label = dspy.OutputField(
        desc="The NLI relation",
        format=Literal["entailment", "contradiction", "neutral"]
    )

class NLIProgram(dspy.Program):
    def __init__(self):
        super().__init__()
        self.classifier = dspy.Predict(NLIClassifier)

    def forward(self, premise: str, hypothesis: str):
        result = self.classifier(premise=premise, hypothesis=hypothesis)
        return result.label

## Load ImpPres Dataset

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [6]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [7]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score
from IPython.display import display

In [28]:
combined_df = pd.read_csv("combined_presuppositions.csv")
print("Combined dataset shape:", combined_df.shape)
# Sample 10% of paradigmIDs
sampled_ids = combined_df["paradigmID"].drop_duplicates().sample(frac=0.1, random_state=42)

# Filter dataset
filtered_df = combined_df[combined_df["paradigmID"].isin(sampled_ids)].reset_index(drop=True)

print("Original size:", combined_df.shape)
print("Sample size - 10%:", filtered_df.shape)


Combined dataset shape: (5697, 11)
Original size: (5697, 11)
Sample size - 10%: (513, 11)


In [29]:
def paradigm_reward(y_true, y_pred, alpha=0.5):
    acc = accuracy_score(y_true, y_pred)
    majority = max(set(y_pred), key=y_pred.count)
    consistency = sum(p == majority for p in y_pred) / len(y_pred)
    return alpha * acc + (1 - alpha) * consistency


def build_paradigm_examples(df):
    examples = []
    for paradigm_id, group in df.groupby("paradigmID"):
        group = group.sample(frac=1, random_state=42).reset_index(drop=True)
        premises = group["premise"].tolist()
        hypotheses = group["hypothesis"].tolist()
        golds = group["gold_label"].tolist()
        examples.append(dspy.Example(
            premises=premises,
            hypotheses=hypotheses,
            labels=golds,
            section=group["section"].iloc[0],
            paradigmID=paradigm_id
        ))
    return examples


# Use sampled data instead of full data
train_examples = build_paradigm_examples(filtered_df)

In [30]:
teleprompter = dspy.BootstrapFewShot(
    NLIProgram,
    paradigm_reward
)


optimized_program = teleprompter.compile(student=NLIProgram(), trainset=train_examples)

results = []
for ex in train_examples:
    preds = []
    for p, h in zip(ex.premises, ex.hypotheses):
        pred = optimized_program.forward(premise=p, hypothesis=h)
        preds.append(pred)

    preds_labels = [2 if p == "entailment" else 0 if p== "contradiction"else 1 for p in preds]
    reward = paradigm_reward(preds_labels, ex["labels"])
    results.append({
        "section": ex.section,
        "paradigmID": ex.paradigmID,
        "accuracy": accuracy_score(ex["labels"], preds_labels),
        "consistency": sum(p == max(set(preds), key=preds.count) for p in preds) / len(preds),
        "reward": reward
    })

metrics_df = pd.DataFrame(results)

  0%|          | 0/3 [00:00<?, ?it/s]2025/08/08 15:05:16 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example Example({'premises': ['Was Brad only spinning around?', 'Alan might have been climbing up ladders.', "If Alan was climbing up ladders, it's okay.", "Does Carrie's spotlight that was scaring Barbara vanish?", "Monet doesn't find out why the glove shrank.", 'It is James who is climbing up the stairs.', "Carrie's spotlight that was scaring Barbara does vanish.", 'Brad might have been only spinning around.', "If Brad was only spinning around, it's okay.", "If Brad was only spinning around, it's okay.", 'Monet might find out why the glove shrank.', "Carrie's spotlight that was scaring Barbara doesn't vanish.", "If Alan was climbing up ladders, it's okay.", "All three libraries that haven't needed to listen to George didn't forget the waiter.", 'Was Alan climbing up ladders?', 'Both senators who break a bicycle might have scratched.', "If it is Gerald who is arguing 

Bootstrapped 0 full traces after 2 examples for up to 1 rounds, amounting to 3 attempts.


In [32]:
section_summary = metrics_df.groupby("section")[["accuracy", "consistency", "reward"]].mean().reset_index()
overall_summary = pd.DataFrame([{
    "section": "ALL",
    "accuracy": metrics_df["accuracy"].mean(),
    "consistency": metrics_df["consistency"].mean(),
    "reward": metrics_df["reward"].mean()
}])
final_summary = pd.concat([section_summary, overall_summary], ignore_index=True)


def show_metrics_table(metrics_df, decimals=3):
    numeric_cols = ["accuracy", "consistency", "reward"]
    metrics_df_rounded = metrics_df.copy()
    for col in numeric_cols:
        metrics_df_rounded[col] = metrics_df_rounded[col].round(decimals)
    
    styled = (
        metrics_df_rounded.style
        .set_table_styles([
            {"selector": "thead th", "props": [("background-color", "#4CAF50"),
                                               ("color", "white"),
                                               ("font-weight", "bold"),
                                               ("text-align", "center")]},
            {"selector": "tbody td", "props": [("text-align", "center")]}
        ])
        .bar(subset=numeric_cols, color="#87A7EBE8")
    )
    display(styled)

print("Paradigm-level Results:")
display(metrics_df.head())

print("Section-level Summary:")
show_metrics_table(final_summary)

Paradigm-level Results:


Unnamed: 0,section,paradigmID,accuracy,consistency,reward
0,only_presupposition,15,0.421053,0.660819,0.421053
1,only_presupposition,19,0.409357,0.643275,0.415205
2,only_presupposition,27,0.415205,0.649123,0.418129


Section-level Summary:


Unnamed: 0,section,accuracy,consistency,reward
0,only_presupposition,0.415,0.651,0.418
1,ALL,0.415,0.651,0.418
