In [1]:
# Cell 1: Environment Setup

print("Uninstalling potentially conflicting libraries...")
!pip uninstall -y transformers accelerate peft torch optimum auto-gptq datasets evaluate

print("\nInstalling a stable, compatible set of required libraries...")
# Install specific versions known to be compatible.
!pip install -q "transformers==4.40.2"
!pip install -q "accelerate==0.29.3"
!pip install -q "peft==0.10.0"
!pip install -q "torch==2.3.0"
!pip install -q "datasets==2.19.0" "evaluate==0.4.2" "pandas"
!pip install -q "bert_score" "sentencepiece" "rouge_score"
# Install the AutoGPTQ library and its dependency, optimum.
!pip install -q "optimum==1.19.1" "auto-gptq==0.7.1" --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/

print("\nInstallations complete.")
print("Important: Restart the session from the 'Run' menu before proceeding.")

Uninstalling potentially conflicting libraries...
Found existing installation: transformers 4.52.4
Uninstalling transformers-4.52.4:
  Successfully uninstalled transformers-4.52.4
Found existing installation: accelerate 1.8.1
Uninstalling accelerate-1.8.1:
  Successfully uninstalled accelerate-1.8.1
Found existing installation: peft 0.15.2
Uninstalling peft-0.15.2:
  Successfully uninstalled peft-0.15.2
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
[0mFound existing installation: datasets 3.6.0
Uninstalling datasets-3.6.0:
  Successfully uninstalled datasets-3.6.0
[0m
Installing a stable, compatible set of required libraries...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━

In [2]:
# Cell 2: Load Model and Dataset

import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Configuration
MODEL_NAME = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
BENCHMARK_NAME = "snli"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

# Authentication
# Your token is hard-coded. This is a security risk.
# Delete this token from your Hugging Face account after use.
HF_TOKEN_DIRECT = "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"
login(token=HF_TOKEN_DIRECT)
print("Login to Hugging Face successful.")


# Load Model and Tokenizer
print(f"Loading baseline model: {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Baseline model and tokenizer loaded.")


# Prepare Benchmark Dataset
print(f"Loading benchmark dataset: {BENCHMARK_NAME}...")
dataset = load_dataset(BENCHMARK_NAME, split="test")

# Filter out examples where annotators did not agree (label == -1).
dataset_filtered = dataset.filter(lambda example: example['label'] != -1)
print(f"Original size of SNLI test set: {len(dataset)}")
print(f"Size after filtering ambiguous examples: {len(dataset_filtered)}")

# Use a smaller subset for the evaluation.
eval_subset = dataset_filtered.shuffle(seed=42).select(range(100))
print(f"Using a subset of {len(eval_subset)} examples for this run.")
print("Dataset prepared.")

Using device: cuda
Login to Hugging Face successful.
Loading baseline model: TheBloke/Mistral-7B-Instruct-v0.2-GPTQ...




config.json: 0.00B [00:00, ?B/s]

2025-07-20 20:13:05.859100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753042386.074778      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753042386.135364      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Baseline model and tokenizer loaded.
Loading benchmark dataset: snli...


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/412k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/413k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Original size of SNLI test set: 10000
Size after filtering ambiguous examples: 9824
Using a subset of 100 examples for this run.
Dataset prepared.


In [3]:
# Cell 3: Run Evaluation Loop

from tqdm import tqdm

# Define the prompt template for the model.
prompt_template = """### INSTRUCTION:
Explain the step-by-step reasoning that connects the following premise to the hypothesis.

### PREMISE:
{premise}

### CONCLUSION:
{hypothesis}

### RESPONSE:
"""

print(f"Running evaluation loop on {len(eval_subset)} examples...")
results_list = []

model.eval()
with torch.no_grad():
    for example in tqdm(eval_subset):
        prompt = prompt_template.format(premise=example['premise'], hypothesis=example['hypothesis'])
        
        # Tokenize the input prompt.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
        
        # Generate a response from the model.
        # Explicitly move input tensors to the primary GPU.
        outputs = model.generate(input_ids=inputs["input_ids"].to(device), max_new_tokens=128, do_sample=False)
        
        # Decode the generated tokens into text.
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Clean the output to isolate the response.
        try:
            explanation = generated_text.split("### RESPONSE:")[1].strip()
        except IndexError:
            explanation = "Model did not generate a valid response."

        results_list.append({
            "premise": example['premise'],
            "hypothesis": example['hypothesis'],
            "true_label_id": example['label'],
            "generated_explanation": explanation
        })

# Store results in a pandas DataFrame.
results_df = pd.DataFrame(results_list)
print("Evaluation complete. Results collected.")

Running evaluation loop on 100 examples...


  0%|          | 0/100 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/100 [00:51<1:25:32, 51.85s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/100 [02:00<1:40:38, 61.61s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for op

Evaluation complete. Results collected.





In [7]:
# Cell 4: Analyze Results (Corrected with Manual Metric Calculation)

# Import the metric libraries directly, bypassing the broken evaluate.load()
from rouge_score import rouge_scorer
from bert_score import score as bert_score_calc
import numpy as np
import pandas as pd
import torch

print("Analyzing results...")

# --- Qualitative Review ---
print("\nQualitative Review of Baseline Model's Reasoning (Top 10 Results):")

# Define human-readable labels.
labels_map = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
results_df['true_label'] = results_df['true_label_id'].map(labels_map)

# Configure pandas for better display.
pd.set_option('display.max_colwidth', 400)
display(results_df.head(10)[['premise', 'hypothesis', 'true_label', 'generated_explanation']])


# --- Automated Metrics (Manual Calculation) ---
print("\nAutomated Metrics on 'Entailment' Examples:")

# Filter for examples that are true entailments.
entailment_df = results_df[results_df['true_label'] == 'Entailment']

if not entailment_df.empty:
    predictions = entailment_df['generated_explanation'].tolist()
    references = entailment_df['hypothesis'].tolist()

    # 1. Manual ROUGE-2 Calculation
    print("Calculating ROUGE scores...")
    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    rouge2_scores = [scorer.score(ref, pred)['rouge2'].fmeasure for ref, pred in zip(references, predictions)]
    avg_rouge2 = np.mean(rouge2_scores)
    
    # 2. Manual BERTScore Calculation
    print("Calculating BERTScore... (This may take a moment)")
    # BERTScore needs to know the GPU device if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # The score function returns Precision, Recall, and F1 tensors. We want the F1.
    P, R, F1 = bert_score_calc(predictions, references, lang="en", device=device, verbose=False)
    # Calculate the average of the F1 scores.
    avg_bert_f1 = F1.mean().item()

    print(f"BERTScore F1 (Semantic Similarity): {avg_bert_f1:.4f}")
    print(f"ROUGE-2 Score (Lexical Overlap):   {avg_rouge2:.4f}")

else:
    print("No 'Entailment' examples were present in the random subset to calculate scores.")

print("\nPipeline execution finished.")

Analyzing results...

Qualitative Review of Baseline Model's Reasoning (Top 10 Results):


Unnamed: 0,premise,hypothesis,true_label,generated_explanation
0,A man is using what looks to be a fax machine.,A person is using what looks to be a fax machine.,Entailment,"The conclusion follows directly from the premise. The man's use of the fax machine is what is being described in the premise, and the conclusion simply restates that a person is using a fax machine based on that observation."
1,A man in a grassy field throws a stick for a group of three brown dogs.,A man is playing with dogs.,Entailment,"The man in the premise is engaging in an activity with the dogs. He is throwing a stick for them to retrieve and play with. This is a common behavior associated with playing with dogs. Therefore, based on the information given in the premise, it can be concluded that a man is playing with dogs."
2,Two people bicycle on a path separated by small mountains.,Two people are hiking up a mountain.,Contradiction,"The premise describes two people bicycling on a path, but the conclusion states that they are hiking up a mountain. This is a significant difference, as bicycling and hiking are two distinct activities.\n\nTo understand how the premise could lead to the conclusion, we need to consider some possible connections. However, without more information, it's difficult to make a definitive conclusion. ..."
3,Workers wearing orange vests stand near rail tracks.,Workers wearing orange are about to repair rail tracks.,Neutral,"The premise states that workers are wearing orange vests and they are standing near rail tracks. This observation alone does not definitively prove that the workers are about to repair the rail tracks. However, it is a common practice for workers to wear orange vests when they are working on or near rail tracks for safety reasons. Additionally, repairing rail tracks is a common activity that r..."
4,Three men in hats and a woman in a red shirt examine the produce presented to them by the man in the blue shirt and white hat.,The group is going to buy something,Neutral,"The premise describes a scene where three men and a woman are examining the produce presented to them by one of the men. While the premise does not explicitly state that the group intends to buy the produce, it is a common behavior for people to examine items before making a purchase. Therefore, based on this assumption, it can be inferred that the group is likely to buy something.\n\nHere's a..."
5,A man is talking on a cellphone while filling his car with gas.,A man talks to his boss while getting gas for his car.,Neutral,"The premise states that a man is talking on a cellphone while filling his car with gas. This means that the man is engaged in a conversation while performing another task, which is getting gas for his car. The hypothesis suggests that the man is talking to his boss. While we don't have any direct evidence from the premise that the man is talking to his boss, we can't rule it out either. There'..."
6,An older gentleman looks at the camera while he is building a deck.,An older gentleman looks away from the camera while he is building a birdhouse.,Contradiction,"The premise and the conclusion are not directly related. The premise only describes an action (building a deck) and the appearance (older gentleman looking at the camera) of the person performing the action. The conclusion, on the other hand, describes a different action (building a birdhouse) and a different behavior (looking away from the camera). There is no logical connection between the t..."
7,A woman is leaning against a wall with her shoe off.,"A woman is standing in high heels, leaning against a wall.",Contradiction,"The hypothesis goes beyond the information given in the premise. However, based on the given premise alone, it's impossible to rule out the hypothesis. Here's the reasoning:\n\n1. The premise states that a woman is leaning against a wall with one shoe off.\n2. The hypothesis states that a woman is standing in high heels, leaning against a wall.\n3. The premise does not provide any information ..."
8,A small quaint town all lit up during the holiday season.,The town is totally dark.,Contradiction,"The premise and the conclusion are contradictory to each other. The premise describes a town that is all lit up during the holiday season, while the conclusion states that the town is totally dark. There is no logical connection between the two statements."
9,"A young woman cooks a meal in a wok while conversing with another woman, as an illuminated shrine to Mr. T looks on.",Both of these women are clinically mute.,Contradiction,"The premise and the conclusion are not logically connected. The premise describes an action (cooking in a wok while conversing) and the presence of an object (an illuminated shrine to Mr. T), but it does not provide any information about the women's abilities to speak or communicate in any way. The conclusion, on the other hand, asserts that both women are clinically mute, which is not support..."



Automated Metrics on 'Entailment' Examples:
Calculating ROUGE scores...
Calculating BERTScore... (This may take a moment)




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1 (Semantic Similarity): 0.8667
ROUGE-2 Score (Lexical Overlap):   0.1183

Pipeline execution finished.
