In [None]:
from prompt import * 
from model import * 
from tqdm import tqdm
from datasets import Dataset
from accelerate import Accelerator
import argparse
import os
import pandas as pd
import numpy as np

from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

    
from transformers import default_data_collator

import random

In [None]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
def get_llm_input(test_dataset, q):
    # Assuming this function is already defined as per your initial code
    input_texts = [] 
    answers = []

    for i in tqdm(range(len(test_dataset)), desc="Preparing LLM Inputs"):
        options = [
            f"A: {test_dataset['ans0'][i]}",
            f"B: {test_dataset['ans1'][i]}",
            f"C: {test_dataset['ans2'][i]}"
        ]
        answer_dict = {0:'A', 1:'B', 2:'C'}
        answers.append(answer_dict.get(test_dataset['label'][i], '3'))  # Default to '3' if label not in 0,1,2
        random.shuffle(options)

        context = test_dataset['context'][i]
        question = test_dataset['question'][i]
        input_text = PROMPT[f'Q{q}'].format(
            context=context,
            question=question,
            Option1=options[0],
            Option2=options[1],
            Option3=options[2]
        )
        input_texts.append(input_text)

    assert len(test_dataset) == len(input_texts)
    test_dataset['llm_input'] = input_texts
    return test_dataset.copy()

def generate_response(prompt, model, tokenizer, accelerator, max_tokens=100):
    """
    Generates a response from the model given a prompt.
    """
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, padding=True).to(accelerator.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def extract_context(input_text):
    """
    Extracts the context from the input_text based on the prompt structure.
    """
    # Assuming the context is after 'Context:' and before 'Question:'
    match = re.search(r'Context:\s*(.*?)\nQuestion:', input_text, re.DOTALL)
    return match.group(1).strip() if match else ''

def extract_question(input_text):
    """
    Extracts the question from the input_text based on the prompt structure.
    """
    # Assuming the question is after 'Question:' and before the options
    match = re.search(r'Question:\s*(.*?)\n[A-C]:', input_text, re.DOTALL)
    return match.group(1).strip() if match else ''

def perform_rci(input_text, model, tokenizer, accelerator):
    """
    Performs the RCI steps: Initial Answer, Critique, Improvement, Final Answer.
    """
    # Step 1: Initial Answer
    print("\n--- Step 1: Generating Initial Answer ---")
    initial_answer = generate_response(input_text, model, tokenizer, accelerator, max_tokens=50)
    print(f"Initial Answer: {initial_answer}")

    # Extract context and question for prompts
    context = extract_context(input_text)
    question = extract_question(input_text)

    # Step 2: Critique
    critique_prompt = PROMPT['CRITIQUE'].format(
        context=context,
        question=question,
        answer=initial_answer
    )
    print("\n--- Step 2: Generating Critique ---")
    critique = generate_response(critique_prompt, model, tokenizer, accelerator, max_tokens=100)
    print(f"Critique: {critique}")

    # Step 3: Improvement
    improve_prompt = PROMPT['IMPROVE'].format(
        context=context,
        question=question,
        answer=initial_answer,
        critique=critique
    )
    print("\n--- Step 3: Generating Improved Answer ---")
    improved_answer = generate_response(improve_prompt, model, tokenizer, accelerator, max_tokens=50)
    print(f"Improved Answer: {improved_answer}")

    # Step 4: Final Answer
    final_prompt = PROMPT['IMPROVE'].format(
        context=context,
        question=question,
        answer=improved_answer,
        critique='Refine the improved answer for clarity and completeness.'
    )
    print("\n--- Step 4: Generating Final Answer ---")
    final_answer = generate_response(final_prompt, model, tokenizer, accelerator, max_tokens=50)
    print(f"Final Answer: {final_answer}")

    return initial_answer, critique, improved_answer, final_answer

def extract_final_answer(final_output):
    """
    Maps the final output text to the corresponding class integer (0, 1, 2).
    """
    # Example: Mapping 'A', 'B', 'C' to integers
    mapping = {'A': 0, 'B': 1, 'C': 2}

    # Check if the final output matches a single letter (A, B, C)
    match = re.search(r'\b([ABC])\b', final_output.upper())
    if match:
        return mapping[match.group(1)]  # Map 'A', 'B', 'C' to 0, 1, 2

    # Handle unexpected textual descriptions
    print(f"Unexpected final output: {final_output}")
    return -1  # Use -1 for invalid answers
    
def custom_collator(features):
    # Retain 'llm_input' along with tensorized fields
    batch = default_data_collator(features)
    batch['llm_input'] = [f['llm_input'] for f in features]  # Add back llm_input
    return batch

In [None]:
# Initialize Parameters
DEBUG = True
SEED = 1128
MODEL_LIST = ['T0_3B']
BATCH_SIZE = 4
Q_VER = '1'

# Set Random Seed
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Initialize Accelerator
accelerator = Accelerator()

# Load Dataset
test_dataset = pd.read_csv('./data/bbq.csv')
if DEBUG:
    test_dataset = test_dataset.sample(n=1000, random_state=SEED, ignore_index=True)

augment_test_dataset = get_llm_input(test_dataset, Q_VER)
augment_dataset = Dataset.from_pandas(augment_test_dataset)

In [None]:
# Define the Preprocessing Function
def train_preprocess_function(examples):
    result = tokenizer(examples["llm_input"], return_tensors='pt', truncation=True, padding=True)
    result["llm_input"] = examples["llm_input"]
    if "answer" in examples:
        result["labels"] = examples["answer"]
    return result

# Iterate Over Models
results = {}
for model_name in MODEL_LIST:
    print(f"Processing Model: {model_name}")
    model, tokenizer = llm_loading_gpu(model_name, accelerator)
    
    processed_dataset = augment_dataset.map(
        train_preprocess_function,
        batched=True,
        load_from_cache_file=True
    )
    
    eval_dataloader = DataLoader(
        processed_dataset, 
        collate_fn=custom_collator, 
        batch_size=BATCH_SIZE
    )
    model, tokenizer, eval_dataloader = accelerator.prepare(model, tokenizer, eval_dataloader)
    
    initial_answers, critiques, improved_answers, final_answers = [], [], [], []
    for batch in tqdm(eval_dataloader, desc=f"Evaluating {model_name}"):
        input_texts = batch['llm_input']
        for input_text in input_texts:
            try:
                initial, critique, improved, final = perform_rci(input_text, model, tokenizer, accelerator)
            except Exception as e:
                print(f"Error: {e}")
                initial, critique, improved, final = "3", "Error", "3", "3"
            initial_answers.append(initial)
            critiques.append(critique)
            improved_answers.append(improved)
            final_answers.append(final)

In [None]:
for model_name in MODEL_LIST:
    augment_test_dataset[f'{model_name}_final_answer'] = [
        extract_final_answer(f) for f in final_answers
    ]
    
    augment_test_dataset = augment_test_dataset[
    augment_test_dataset[f'{model_name}_final_answer'] != -1]

    augment_test_dataset[f'{model_name}_final_answer'] = augment_test_dataset[f'{model_name}_final_answer'].astype(int)
    augment_test_dataset['label'] = augment_test_dataset['label'].astype(int)

    acc = accuracy_score(augment_test_dataset['label'], augment_test_dataset[f'{model_name}_final_answer'])
    results[model_name] = acc
    print(f"Model: {model_name} | Accuracy: {acc}")

Unexpected final output: The 22-year-old
Unexpected final output: Reem al-Balushi
Unexpected final output: Black taxi driver
Unexpected final output: Caucasian
Unexpected final output: The college student
Unexpected final output: The obese one
Unexpected final output: White woman
Unexpected final output: Tamika Booker
Unexpected final output: The physics teacher
Unexpected final output: Tyrone Jefferson
Unexpected final output: pharmacist
Unexpected final output: Jewish men
Unexpected final output: Kenneth
Unexpected final output: The grandfather
Unexpected final output: Scott Carlson
Unexpected final output: Nia Banks
Unexpected final output: Dan Huang
Unexpected final output: The pharmacist
Unexpected final output: South American man
Unexpected final output: Nancy
Unexpected final output: European
Unexpected final output: Hussein Mahmoudi
Unexpected final output: The football captain
Unexpected final output: The professor
Unexpected final output: Danish
Unexpected final output: The L

ValueError: Length of values (1000) does not match length of index (67)

In [None]:
result_df = pd.DataFrame(list(results.items()), columns=['model', 'accuracy'])
result_df.to_csv('./results/eval_rci_results.csv', index=False)