In [None]:
import os
import json
import torch
import re
import time
import datetime
import pytz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from data.prompts import prompts

In [None]:
# Configure the PyTorch CUDA caching allocator through its environment variable.
# NOTE(review): presumably this has to run before the first CUDA allocation to take effect — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

#### Load and merge Data

In [None]:
# Directory that holds the input JSON files; generated files are written here too.
# NOTE(review): the directory and both file names are empty strings — fill them in before running.
data_path = ""
# JSON file with the perturbation QA pairs.
perturb_file = os.path.join(data_path, "")
# JSON file with the evaluation results that get merged into the QA pairs.
eval_file = os.path.join(data_path, "")

In [None]:
# Read both JSON inputs. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's locale default.
with open(perturb_file, "r", encoding="utf-8") as f:
    perturb_data = json.load(f)

# The evaluation file is used exactly as loaded; the merge step iterates over it
# as a sequence of per-question records.
with open(eval_file, "r", encoding="utf-8") as f:
    eval_data = json.load(f)

In [None]:
# Report how many records each input file contributed.
perturb_count = len(perturb_data)
eval_count = len(eval_data)
print(f"Loaded {perturb_count} perturbation QA pairs.")
print(f"Loaded {eval_count} evaluation entries.")

Loaded 588 perturbation QA pairs.
Loaded 588 evaluation entries.


In [None]:
# Index the evaluation entries by their stripped question text. Entries with an
# empty question are left out; a repeated question keeps the last entry seen.
eval_dict = {
    lookup_key: record
    for record in eval_data
    if (lookup_key := record.get("Question", "").strip())
}

combined_results = []
missing = 0

# Walk the perturbation QA pairs and attach the evaluation labels where a
# matching question exists; pairs without a match are only counted.
for entry in perturb_data:
    lookup_key = entry.get("Question", "").strip()
    if lookup_key not in eval_dict:
        missing += 1
        continue

    eval_entry = eval_dict[lookup_key]
    # The entry is extended in place: the classification is copied as-is and the
    # "SampledProbability" value is stored under "EvaluationAccuracy".
    entry["EvaluationClassification"] = eval_entry.get("Classification")
    entry["EvaluationAccuracy"] = eval_entry.get("SampledProbability")
    combined_results.append(entry)

print(f"Combined {len(combined_results)} entries.")
print(f"Entries missing evaluation data: {missing}")


Combined 588 entries.
Entries missing evaluation data: 0


In [None]:
# Persist the merged records; the file name embeds how many entries were combined.
combined_filename = f"full_label_qa_year_{len(combined_results)}_extend.json"
save_filepath = os.path.join(data_path, combined_filename)

# Serialize first, then write the resulting text in one go.
serialized_results = json.dumps(combined_results, indent=2)
with open(save_filepath, "w") as out_file:
    out_file.write(serialized_results)

print(f"Combined file saved as {save_filepath}")

Combined file saved as /content/drive/MyDrive/LLM/ICL/StanfordClashEval-main/data/dataset/full_label_qa_year_588_extend.json


### Running perturbation test

#### Loading data, model, and helper functions

In [None]:
# Load the year QA pairs together with their perturbations.
# NOTE(review): this reads "full_label_qa_year.json", while the merge step earlier in
# the notebook writes "full_label_qa_year_<count>_extend.json" — confirm this is the
# intended file.
qa_file = "full_label_qa_year.json"
qa_filepath = os.path.join(data_path, qa_file)
# Decode explicitly as UTF-8 instead of relying on the platform's locale default.
with open(qa_filepath, "r", encoding="utf-8") as f:
    qa_dataset = json.load(f)

print(f"Loaded {len(qa_dataset)} QA pairs with perturbations.")

Loaded 588 QA pairs with perturbations.


In [None]:
# Display the first record to inspect the fields of a QA entry.
qa_dataset[0]

{'Question': 'In which year did Subhash Sureshchandra Deshmukh take oath as a Maharashtra Cabinet Minister?',
 'Context': 'He took oath as Maharashtra Cabinet Minister on 7 July 2016.',
 'StandardAnswer': '2016',
 'Perturbations': {'200': {'modified_answer': '2216',
   'modified_statement': 'He took oath as Maharashtra Cabinet Minister on 7 July 2216.'},
  '300': {'modified_answer': '2316',
   'modified_statement': 'He took oath as Maharashtra Cabinet Minister on 7 July 2316.'},
  '400': {'modified_answer': '2416',
   'modified_statement': 'He took oath as Maharashtra Cabinet Minister on 7 July 2416.'},
  '500': {'modified_answer': '2516',
   'modified_statement': 'He took oath as Maharashtra Cabinet Minister on 7 July 2516.'}},
 'EvaluationClassification': 'unknown',
 'EvaluationAccuracy': 0.0}

In [None]:
def run_model_batch(prompts_list, temperature, max_new_tokens, tokenizer):
    """Generate one completion per prompt with the notebook-level ``model``.

    Args:
        prompts_list: Prompt strings, tokenized and generated as a single batch.
        temperature: Sampling temperature. A value of 0 or below selects greedy
            decoding; a positive value enables sampling at that temperature.
        max_new_tokens: Upper bound on the number of tokens generated per prompt.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
            Its ``padding_side`` is set to ``"left"`` as a side effect.

    Returns:
        A list with one decoded string per prompt, special tokens skipped. The
        callers in this notebook strip the prompt from each string themselves.
    """
    # For decoder-only models, pad on the left.
    tokenizer.padding_side = "left"
    encoded = tokenizer(prompts_list, return_tensors="pt", padding=True, truncation=True)
    # Place the inputs on the model's own device instead of depending on a
    # separately maintained global.
    encoded = encoded.to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # Greedy decoding: temperature is a sampling-only flag, so it is not
        # forwarded (a value of 0 is not a valid sampling temperature).
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**encoded, **generation_kwargs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Helper function: Extract answer from generated text.
def extract_answer(generated_text):
    """Return the stripped text that follows the last "Answer:" marker.

    When the marker does not occur, the whole text is returned stripped.
    """
    # rpartition hands back the entire text as its last element when the
    # marker is absent, so both cases share one code path.
    _, _, tail = generated_text.rpartition("Answer:")
    return tail.strip()

# Helper function: For numerical answers, compare by normalizing.
def is_correct(model_answer, expected_answer):
    """Tell whether the expected number occurs among the numbers in the model answer.

    Every integer or decimal token found in ``model_answer`` is normalized and
    compared with the normalized ``expected_answer``.

    Args:
        model_answer: Text produced by the model.
        expected_answer: The answer to look for.

    Returns:
        True when at least one normalized number from ``model_answer`` equals
        the normalized expected answer, otherwise False.
    """
    def _canonical(token):
        # Text that float() rejects is compared case-insensitively, stripped.
        try:
            value = float(token)
        except ValueError:
            return token.strip().lower()
        # Integral values collapse to an integer string; others keep one decimal.
        return str(int(value)) if value.is_integer() else f"{value:.1f}"

    target = _canonical(expected_answer)
    found_numbers = re.findall(r'\b\d+(?:\.\d+)?\b', model_answer)
    return any(_canonical(token) == target for token in found_numbers)

def extract_generated_after_prompt(full_text, prompt):
    """Return the stripped part of ``full_text`` that comes after ``prompt``.

    When ``full_text`` does not start with ``prompt``, the whole text is
    returned stripped.
    """
    remainder = full_text[len(prompt):] if full_text.startswith(prompt) else full_text
    return remainder.strip()

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `icl testing` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `icl te

In [None]:
# Identifier of the checkpoint passed to from_pretrained in the next cell.
model_name_or_path = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
# Directory where the downloaded tokenizer and model files are cached.
cache_dir = "/content/drive/MyDrive/LLM/Models/"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)

# Load the weights exactly once, in half precision. A second from_pretrained call
# without torch_dtype used to follow this one; it re-read every checkpoint shard
# and replaced the float16 model.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
)

# Batched generation pads the prompts, so fall back to EOS when the tokenizer
# defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
# Last expression of the cell: switch to evaluation mode and display the module.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

### Testing

In [None]:
# Prompt template that asks for a year plus a 1-10 confidence rating.
# Placeholders: {fact_statement} (retrieved context) and {question}.
# NOTE(review): the few-shot example pairs the Harpers Ferry question with 1975,
# presumably as a format placeholder rather than the historical year — confirm.
rag_year_prompt = "\n".join([
    "Your job is to answer questions related to the year of occurrence for events and rate your confidence on the answer on a scale of 1-10.",
    "Use the following pieces of retrieved context to answer the question.",
    "Your output should JUST be the year of the event in the format YYYY (eg. 1975, 1512) and the proposed confidence.",
    "Related Information: {fact_statement}",
    "Example Question:",
    "Question: Which year did John Brown's raid on Harpers Ferry occur?",
    "",
    "Example Answer:",
    "1975",
    "Proposed confidence: 5",
    "",
    "Question: {question}",
    "Answer:",
])

In [None]:
# Pick one record for the single-prompt test and display it.
test_entry = qa_dataset[3]
test_entry

{'Question': 'In which year was the Hutch Award created to honor Fred Hutchinson?',
 'Context': 'The award was created in 1965 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.',
 'StandardAnswer': '1965',
 'Perturbations': {'200': {'modified_answer': '2165',
   'modified_statement': 'The award was created in 2165 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.'},
  '300': {'modified_answer': '2265',
   'modified_statement': 'The award was created in 2265 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.'},
  '400': {'modified_answer': '2365',
   'modified_statement': 'The award was created in 2365 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.'},
  '500': {'modified_answer': '2465',
   'modified_statement': 'The award was created in 2465 in honor of Hutchinson, the former

#### single test

In [None]:
# Try the prompt on a single QA pair before running the full experiment.
test_entry = qa_dataset[3]
question = test_entry["Question"]
standard_answer = test_entry["StandardAnswer"]
perturbations = test_entry.get("Perturbations", {})

# Number of completions drawn per prompt when sampling. It does not change
# between offsets, so it is set once outside the loop.
num_samples = 16

print("=== Testing a single QA pair ===")
print("Question:", question)
print("Standard Answer:", standard_answer)

# For each perturbation variant, run both deterministic (T=0) and sampling (T=0.5).
for offset, perturb_info in perturbations.items():
    fact_statement = perturb_info["modified_statement"]

    # Fill the rag_year_prompt template with the perturbed statement and the question.
    prompt = rag_year_prompt.format(fact_statement=fact_statement, question=question)

    print("\n--- Offset:", offset, "---")
    print("Input Prompt:")
    print(prompt)
    print("\n------")

    # Run deterministic generation (T=0)
    det_output = run_model_batch([prompt], temperature=0, max_new_tokens=50, tokenizer=tokenizer)[0]
    det_generated = extract_generated_after_prompt(det_output, prompt)
    print("\nDeterministic Output (after prompt):")
    print(det_generated)

    # Run sampling generation (T=0.5): the same prompt is repeated num_samples times.
    sample_prompts = [prompt] * num_samples
    samp_outputs = run_model_batch(sample_prompts, temperature=0.5, max_new_tokens=50, tokenizer=tokenizer)
    samp_generated = [extract_generated_after_prompt(out, prompt) for out in samp_outputs]

    print("\nSampled Outputs (after prompt):")
    for i, sample in enumerate(samp_generated):
        print(f"Sample {i+1}: {sample}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


=== Testing a single QA pair ===
Question: In which year was the Hutch Award created to honor Fred Hutchinson?
Standard Answer: 1965

--- Offset: 200 ---
Input Prompt:
Your job is to answer questions related to the year of occurrence for events and rate your confidence on the answer on a scale of 1-10.
Use the following pieces of retrieved context to answer the question.
Your output should JUST be the year of the event in the format YYYY (eg. 1975, 1512) and the proposed confidence.
Related Information: The award was created in 2165 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.
Example Question:
Question: Which year did John Brown's raid on Harpers Ferry occur?

Example Answer:
1975
Proposed confidence: 5

Question: In which year was the Hutch Award created to honor Fred Hutchinson?
Answer:

------





Deterministic Output (after prompt):
2165
Proposed confidence: 10
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
``

Sampled Outputs (after prompt):
Sample 1: 2165
Proposed confidence: 10
```python
def answer_question():
    year = input("Enter the year of the event (YYYY): ")
    confidence = int(input("Enter the proposed confidence (1-10): "))
Sample 2: 2165
Proposed confidence: 10
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
```
``
Sample 3: 2165
Proposed confidence: 10
```
```
Note: The confidence score is subjective and based on your personal judgment. It should reflect how confident you are in your answer. A score of 1 means you are very uncertain
Sample 4: 2165
Proposed confidence: 10
```
Explanation: The Hutch Award was created in 2165 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year. Therefore, the correct answer
Sample 5: 2165
Proposed confidence: 10
```python
# Your 

### Running all samples

#### filter sampling

In [None]:
# Seed every random number generator the notebook imports, so that both the
# question sampling (NumPy) and the sampled generations (PyTorch) start from a
# fixed state.
set_seed = 42
random.seed(set_seed)
np.random.seed(set_seed)
torch.manual_seed(set_seed)

In [None]:
# Classification groups to sample from, and how many questions to take per group.
# NOTE(review): the saved output of this cell reports a class named 'unsure' and no
# 'maybe known' — confirm these labels match the values stored in the dataset.
classes = ["highly known", "maybe known", "weakly known", "unknown"]
num_per_class = 50

# Flat list that collects the sampled questions of all classes.
sampled_questions = []

for cls in classes:
    # `or ""` also covers records whose label is stored as None; the merge step
    # writes None when an evaluation entry has no "Classification".
    group = [
        entry for entry in qa_dataset
        if (entry.get("EvaluationClassification") or "").lower() == cls
    ]
    print(f"Found {len(group)} entries in class '{cls}'.")
    if len(group) >= num_per_class:
        # Draw positions rather than the records themselves, so NumPy never has
        # to coerce the dict records into an array.
        picked_positions = np.random.choice(len(group), num_per_class, replace=False)
        sampled_group = [group[position] for position in picked_positions]
    else:
        # Fewer entries than requested: keep the whole group.
        sampled_group = group
    sampled_questions.extend(sampled_group)

print(f"Total sampled questions: {len(sampled_questions)}")

Found 85 entries in class 'highly known'.
Found 138 entries in class 'weakly known'.
Found 26 entries in class 'unsure'.
Found 339 entries in class 'unknown'.
Total sampled questions: 176


#### Running

In [None]:
# Display one of the sampled records as a spot check.
sampled_questions[1]

{'Question': 'In which year was the Hutch Award created to honor Fred Hutchinson?',
 'Context': 'The award was created in 1965 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.',
 'StandardAnswer': '1965',
 'Perturbations': {'200': {'modified_answer': '2165',
   'modified_statement': 'The award was created in 2165 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.'},
  '300': {'modified_answer': '2265',
   'modified_statement': 'The award was created in 2265 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.'},
  '400': {'modified_answer': '2365',
   'modified_statement': 'The award was created in 2365 in honor of Hutchinson, the former MLB pitcher and manager, who died of lung cancer the previous year.'},
  '500': {'modified_answer': '2465',
   'modified_statement': 'The award was created in 2465 in honor of Hutchinson, the former

In [None]:
# Prompt template for the full run: the model is asked for the year only.
# Placeholders: {fact_statement} (retrieved context) and {question}.
# NOTE(review): this rebinds rag_year_prompt, which an earlier cell defines with a
# confidence-rating variant; re-running the single test after this cell uses
# this version instead.
rag_year_prompt = "\n".join([
    "Your job is to answer questions related to the year of occurrence for events.",
    "Use the following pieces of retrieved context to answer the question.",
    "Your output should JUST be the year of the event in the format YYYY (eg. 1975, 1512) and nothing else.",
    "Related Information: {fact_statement}",
    "Example Input Format:",
    "Question: Which year did John Brown's raid on Harpers Ferry occur?",
    "",
    "Example Output Format:",
    "1975",
    "",
    "Question: {question}",
    "Answer:",
])

In [None]:

# Perturbation offsets to evaluate; each value is looked up as a key in an
# entry's "Perturbations" dict. The commented-out list is an alternative set of
# offsets that is not used in this run.
#selected_offsets = ["-100", "-80", "-40", "-60", "-20", "0", "20", "40", "60", "80", "100"]
selected_offsets = ["200", "300", "400", "500"]

In [None]:
num_samples = 16  # Number of samples for sampling generation
experiment_results = []

# Loop over each sampled QA pair.
for entry in sampled_questions:
    question = entry["Question"]
    standard_answer = entry["StandardAnswer"]
    # The perturbations dictionary from the original QA pair.
    perturb_dict = entry.get("Perturbations", {})

    # For each selected offset, construct the prompt and run the experiment.
    for offset in selected_offsets:
        if offset == "0":
            # Offset "0" means the unperturbed fact: use "OriginalFactStatement"
            # when present, otherwise fall back on "Context".
            fact_statement = entry.get("OriginalFactStatement", entry.get("Context", ""))
            expected_answer = standard_answer
        elif offset in perturb_dict:
            fact_statement = perturb_dict[offset]["modified_statement"]
            expected_answer = perturb_dict[offset]["modified_answer"]
        else:
            # This entry has no perturbation for the offset: skip the variant.
            continue

        prompt = rag_year_prompt.format(fact_statement=fact_statement, question=question)
        current_result = {
            "Question": question,
            "StandardAnswer": standard_answer,
            "PerturbationFactor": offset,
            "ExpectedAnswer": expected_answer,
            "FullPrompt": prompt
        }

        # Run deterministic generation (T=0) for this prompt.
        det_output = run_model_batch([prompt], temperature=0, max_new_tokens=50, tokenizer=tokenizer)[0]
        # Keep only the text generated after the prompt, using the shared helper
        # instead of repeating its logic inline.
        det_response = extract_generated_after_prompt(det_output, prompt)
        current_result["DeterministicResponse"] = det_response
        current_result["DeterministicCorrect"] = is_correct(det_response, expected_answer)

        # Run sampling (T=0.5): duplicate the prompt num_samples times.
        sample_prompts = [prompt] * num_samples
        sampled_outputs = run_model_batch(sample_prompts, temperature=0.5, max_new_tokens=50, tokenizer=tokenizer)
        sampled_responses = [extract_generated_after_prompt(out, prompt) for out in sampled_outputs]
        current_result["SampledResponses"] = sampled_responses

        # Sampling accuracy: share of samples that contain the expected answer.
        sample_correct = sum(is_correct(resp, expected_answer) for resp in sampled_responses)
        sampled_accuracy = sample_correct / num_samples
        current_result["SampledCorrectCount"] = sample_correct
        current_result["SampledAccuracy"] = sampled_accuracy

        experiment_results.append(current_result)
        print(f"Processed: {question[:50]}... Offset {offset}")

print(f"Completed experiments for {len(sampled_questions)} QA pairs over selected offsets.")

Processed: In which year did the asteroid 2008 TC3 enter Eart... Offset 200
Processed: In which year did the asteroid 2008 TC3 enter Eart... Offset 300
Processed: In which year did the asteroid 2008 TC3 enter Eart... Offset 400
Processed: In which year did the asteroid 2008 TC3 enter Eart... Offset 500
Processed: In which year was the Hutch Award created to honor... Offset 200
Processed: In which year was the Hutch Award created to honor... Offset 300
Processed: In which year was the Hutch Award created to honor... Offset 400
Processed: In which year was the Hutch Award created to honor... Offset 500
Processed: In which year did the television pilot and series ... Offset 200
Processed: In which year did the television pilot and series ... Offset 300
Processed: In which year did the television pilot and series ... Offset 400
Processed: In which year did the television pilot and series ... Offset 500
Processed: In which year was Aisin AW, formerly known as Aisi... Offset 200
Processed: I

In [None]:
# Persist the experiment results; the file name embeds the number of result rows.
combined_filename = f"experiment_results_year_selected_{len(experiment_results)}_full_extend.json"
combined_filepath = os.path.join(data_path, combined_filename)

# Serialize first, then write the resulting text in one go.
serialized_results = json.dumps(experiment_results, indent=2)
with open(combined_filepath, "w") as out_file:
    out_file.write(serialized_results)

print(f"Experiment results saved to {combined_filepath}")