In [None]:
import pandas as pd
import json
import numpy as np
import re
import os

#### read data

In [None]:
from data.prompts import prompts
from openai import OpenAI

In [None]:
# Location of the QA dataset. With an empty `data_path`, os.path.join returns
# just the file name, so the file is looked up in the current working directory.
data_path = ""
qa_file = "year_qa_pairs_600.json"
qa_filepath = os.path.join(data_path, qa_file)

# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode (or fail on) non-ASCII text in the JSON.
with open(qa_filepath, "r", encoding="utf-8") as f:
    year_qa_data = json.load(f)

print(f"Loaded {len(year_qa_data)} year QA pairs.")

Loaded 600 year QA pairs.


#### Single-entry test

In [None]:
# Smoke test: build the year perturbations for a single QA entry and print them.

# Select one entry for testing.
sample_entry = year_qa_data[0]  # for example, the first entry
question = sample_entry["Question"]
context = sample_entry["Context"]
# Strip surrounding whitespace so stray padding in the answer field does not
# prevent the year from being found in the context.
year_str = sample_entry["Answer"].strip()

print("Original Question:", question)
print("Original Context (excerpt):", context[:300])
print("Original Answer:", year_str)

# Convert the answer to an integer.
try:
    year_int = int(year_str)
except ValueError as e:
    print("Error converting answer to int:", e)
    year_int = None

# Offsets (in years) added to the original answer: 20 values, 0 excluded.
# NOTE(review): for answers earlier than year 500 the largest negative offsets
# yield zero or negative years -- confirm that is acceptable downstream.
offsets = [-500, -400, -300, -200, -100, -80, -60, -40, -20, -10, 10, 20, 40, 60, 80, 100, 200, 300, 400, 500]

# Generate perturbations.
perturbations = {}
if year_int is None:
    # Without a numeric year there is nothing to offset; leave the result empty
    # instead of failing on `None + offset`.
    print("Skipping perturbations: the answer is not an integer year.")
else:
    # Match the year only as a standalone number (no digit directly before or
    # after it). A plain substring replace could rewrite digits inside a longer
    # number, e.g. "476" inside "1476".
    year_pattern = re.compile(rf"(?<!\d){re.escape(year_str)}(?!\d)")
    if year_pattern.search(context) is None:
        print("Warning: the answer year does not occur in the context; statements are left unchanged.")

    for offset in offsets:
        new_year = year_int + offset
        # Only the first standalone occurrence of the original year is replaced.
        # (You might adjust this if the year appears multiple times.)
        new_statement = year_pattern.sub(str(new_year), context, count=1)
        perturbations[str(offset)] = {
            "modified_answer": str(new_year),
            "modified_statement": new_statement
        }

# Print the result.
print(json.dumps(perturbations, indent=2))

Original Question: In which year did Subhash Sureshchandra Deshmukh take oath as a Maharashtra Cabinet Minister?
Original Context (excerpt): He took oath as Maharashtra Cabinet Minister on 7 July 2016.
Original Answer: 2016
{
  "-500": {
    "modified_answer": "1516",
    "modified_statement": "He took oath as Maharashtra Cabinet Minister on 7 July 1516."
  },
  "-400": {
    "modified_answer": "1616",
    "modified_statement": "He took oath as Maharashtra Cabinet Minister on 7 July 1616."
  },
  "-300": {
    "modified_answer": "1716",
    "modified_statement": "He took oath as Maharashtra Cabinet Minister on 7 July 1716."
  },
  "-200": {
    "modified_answer": "1816",
    "modified_statement": "He took oath as Maharashtra Cabinet Minister on 7 July 1816."
  },
  "-100": {
    "modified_answer": "1916",
    "modified_statement": "He took oath as Maharashtra Cabinet Minister on 7 July 1916."
  },
  "-80": {
    "modified_answer": "1936",
    "modified_statement": "He took oath as Maha

#### Generate perturbations for all QA pairs

In [None]:
# Build year perturbations for every QA pair and save the successful ones to JSON.

# Containers for results and counters.
successful_perturbations = []
success_count = 0
failure_count = 0

# Offsets (in years) added to each answer: 20 values from -500 to +500, 0 excluded.
# Defined once here because the list is identical for every entry.
# NOTE(review): for answers earlier than year 500 the largest negative offsets
# yield zero or negative years -- confirm that is acceptable downstream.
offsets = [-500, -400, -300, -200, -100, -80, -60, -40, -20, -10, 10, 20, 40, 60, 80, 100, 200, 300, 400, 500]

# Process each QA pair.
for entry in year_qa_data:
    question = entry["Question"]
    context = entry["Context"]
    year_str = entry["Answer"].strip()  # standard answer (a year as a string)

    try:
        year_int = int(year_str)
    except ValueError:
        # The answer is not an integer year: skip this QA pair.
        failure_count += 1
        continue

    # Match the year only as a standalone number (no digit directly before or
    # after it). A plain substring test/replace could hit digits inside a longer
    # number, e.g. "476" inside "1476", and corrupt unrelated text.
    year_pattern = re.compile(rf"(?<!\d){re.escape(year_str)}(?!\d)")

    # Check if the standard answer is present in the context.
    if year_pattern.search(context) is None:
        failure_count += 1
        continue  # Skip this QA pair if the year is not found in the context.

    perturbations = {}
    # For each offset, create a modified year and a modified statement.
    for offset in offsets:
        new_year = year_int + offset
        # Replace only the first standalone occurrence of the standard answer.
        modified_statement = year_pattern.sub(str(new_year), context, count=1)
        perturbations[str(offset)] = {
            "modified_answer": str(new_year),
            "modified_statement": modified_statement
        }

    # Build a new entry with the original QA pair and the generated perturbations.
    new_entry = {
        "Question": question,
        "Context": context,
        "StandardAnswer": year_str,
        "Perturbations": perturbations
    }
    successful_perturbations.append(new_entry)
    success_count += 1

print(f"Successful perturbations: {success_count}")
print(f"Failed QA pairs (year not found or conversion error): {failure_count}")

# Save only the successful entries to a new JSON file; the file name embeds the
# number of successful entries.
save_filename = f"year_qa_perturbations_{len(successful_perturbations)}_full_extended.json"
save_filepath = os.path.join(data_path, save_filename)

# Explicit UTF-8 keeps the written file independent of the platform's locale.
with open(save_filepath, "w", encoding="utf-8") as f:
    json.dump(successful_perturbations, f, indent=2)

print(f"Perturbation data saved to {save_filepath}")

Successful perturbations: 588
Failed QA pairs (year not found or conversion error): 12
Perturbation data saved to /content/drive/MyDrive/LLM/ICL/StanfordClashEval-main/data/dataset/year_qa_perturbations_588_full_extended.json
