# Qwen2.5 Classifier for Acceptance Classification


## Import Dependencies

In [2]:
#!pip install transformers datasets accelerate bitsandbytes -q

In [3]:
import torch

print("GPU available:", torch.cuda.is_available())
print("MPS available:", torch.backends.mps.is_available())

from datasets import load_dataset
import json

GPU available: False
MPS available: True


## Data Preparation


In [4]:
# Load the dataset
dataset_name = "smallari/openreview-iclr2024-peer-reviews-RAW"
dataset = load_dataset(dataset_name)

# Display dataset structure
print("Dataset structure:", dataset)
print("\nFirst example from raw split:", dataset['raw'][0])
original_count = len(dataset['raw'])

# Remove entries with empty reviews
filtered_dataset = dataset.filter(lambda x: len(x['reviews']) > 0)

# remove first three entries as we will use them for few-shot examples
filtered_dataset = filtered_dataset.filter(lambda x, idx: idx > 2, with_indices=True)

# Calculate filtered size
filtered_count = len(filtered_dataset['raw'])

print(f"\nOriginal raw set size: {original_count}")
print(f"Filtered raw set size: {filtered_count}")
print(f"Removed {original_count - filtered_count} entries.")

Dataset structure: DatasetDict({
    raw: Dataset({
        features: ['venue', 'year', 'paper_id', 'title', 'abstract', 'decision', 'label', 'reviews'],
        num_rows: 7404
    })
})

First example from raw split: {'venue': 'ICLR.cc', 'year': '2024', 'paper_id': 'cXs5md5wAq', 'title': 'Modelling Microbial Communities with Graph Neural Networks', 'abstract': 'Understanding the interactions and interplay of microorganisms is a great challenge with many applications in medical and environmental settings. In this work, we model bacterial communities directly from their genomes using graph neural networks (GNNs). GNNs leverage the inductive bias induced by the set nature of bacteria, enforcing permutation invariance and granting combinatorial generalization. We propose to learn the dynamics implicitly by directly predicting community relative abundance profiles at steady state, thus escaping the need for growth curves. On two real-world datasets, we show for the first time generalizatio

## Prompt Engineering

Generate structured chat prompts for zero-shot and few-shot classification.

We use 'system' and 'user' roles in the prompt to structure the conversation and provide clear instructions to the language model, reflecting the data that instruction-tuned models like Qwen and LLaMa are pre-trained on.

### System Role
The `system` role is used to set the overall behavior, persona, or instructions for the AI. It tells the model how it should respond. In our case, `"You are an expert reviewer. Predict whether the paper was accepted or rejected based on the following reviews. Output only 'Accept' or 'Reject'."` establishes the model's persona (expert reviewer), the task (predict acceptance/rejection), and the desired output format ('Accept' or 'Reject').

### User Role
The `user` role represents the input or query from the human. It's where we provide the actual content the model needs to process, such as the paper's title, abstract, and reviews. The model is expected to generate a response based on this specific input, adhering to the guidelines set by the system prompt.

In [5]:
def format_data(entry):
    """Formats data into a readable string for the model."""
    submission_text = f"Title: {entry['title']}\nAbstract: {entry['abstract']}\n\nReviews:\n"
    for i, review in enumerate(entry['reviews']):
        submission_text += f"Review {i+1}:\n"
        submission_text += f"Summary: {review.get('summary', 'N/A')}\n"
        submission_text += f"Strengths: {review.get('strengths', 'N/A')}\n"
        submission_text += f"Weaknesses: {review.get('weaknesses', 'N/A')}\n"
        submission_text += f"Questions: {review.get('questions', 'N/A')}\n"
        # submission_text += f"Rating: {review.get('rating', 'N/A')}\n"
        # submission_text += f"Confidence: {review.get('confidence', 'N/A')}\n"
        submission_text += "\n"
    return submission_text.strip()

def create_prompt(entry, few_shot_entries=None):
    """
    Creates a structured chat prompt for the model.
    :param entry: A single record from the dataset.
    :param few_shot_entries: Optional list of example pairs, where each pair is a list of message dicts.
    """
    messages = [
        {"role": "system", "content": "You are an expert Area Chair for a computer science conference. Your goal is to determine if a paper should be accepted or rejected based on the provided peer reviews.\n\nYou must think step-by-step to reach a conclusion. Output your thought process inside <reasoning> tags, following this specific structure:\n\n1. **Review Analysis**: Briefly list the scores (if available) and the general sentiment of each reviewer (e.g., Reviewer 1: Weak Accept, Reviewer 2: Strong Reject).\n2. **Key Strengths**: Identify the strongest points agreed upon by the reviewers.\n3. **Critical Weaknesses & Severity**: Identify the weaknesses. Crucially, determine if these are \"fatal flaws\" (methodological errors, lack of novelty) or \"fixable issues\" (typos, clarity).\n4. **Conflict Resolution**: If reviewers disagree, evaluate which argument is more grounded in the paper's evidence. Discard vague or unsubstantiated reviewer claims.\n5. **Final Verdict Formulation**: Weigh the technical contribution against the severity of the flaws.\n\nAfter your analysis, output your final decision inside <decision> tags. The valid values are ACCEPT or REJECT.\n\nExample Output Structure:\n<reasoning>\n[Your step-by-step analysis here]\n</reasoning>\n<decision>\nACCEPT\n</decision>"}
    ]

    # Add few-shot examples if provided
    if few_shot_entries:
        for example_pair in few_shot_entries:
            # example_pair is a list of message dicts [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
            for message in example_pair:
                messages.append(message)
    
    # Target entry
    user_content = format_data(entry)
    messages.append({"role": "user", "content": user_content})

    return messages

## Example Prompts

`three_shot_prompt.json` contains example (user, assistant) message pairs. The assistant text includes additional reasoning that was generated by passing the system prompt and user examples to an expert model (Gemini 3.0 Pro with Thinking) through a web-based UI.

In [6]:
import json

# open pre-made three_shot_prompt and save to file
with open("three_shot_prompt.json", "r") as f:
    three_shot_prompt = json.load(f)

# print an example of a one-shot prompt with a sample target entry
target_sample = filtered_dataset['raw'][0]
example_prompt = create_prompt(target_sample, few_shot_entries=three_shot_prompt)

for msg in example_prompt:
    if msg['role'] == 'system':
        print(f"{msg['role'].upper()}:\n{msg['content']}")
        print("=" * 100)
    else:
        print(f"{msg['role'].upper()}:\nCONTENT PREVIEW:\n{msg['content'][:50]} ... {msg['content'][-50:]}")
        print("=" * 100)
        # print(f"{msg['role'].upper()}:\nCONTENT PREVIEW:\n{msg['content']}")
        # print("=" * 100)

SYSTEM:
You are an expert Area Chair for a computer science conference. Your goal is to determine if a paper should be accepted or rejected based on the provided peer reviews.

You must think step-by-step to reach a conclusion. Output your thought process inside <reasoning> tags, following this specific structure:

1. **Review Analysis**: Briefly list the scores (if available) and the general sentiment of each reviewer (e.g., Reviewer 1: Weak Accept, Reviewer 2: Strong Reject).
2. **Key Strengths**: Identify the strongest points agreed upon by the reviewers.
3. **Critical Weaknesses & Severity**: Identify the weaknesses. Crucially, determine if these are "fatal flaws" (methodological errors, lack of novelty) or "fixable issues" (typos, clarity).
4. **Conflict Resolution**: If reviewers disagree, evaluate which argument is more grounded in the paper's evidence. Discard vague or unsubstantiated reviewer claims.
5. **Final Verdict Formulation**: Weigh the technical contribution against th

## Model Evaluation Function


In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm.auto import tqdm

def evaluate_model(model_id, dataset, num_test_samples=20, shots=0, few_shot_prompt=None):
    # print(f"Loading model: {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto"
    )

    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    model.to(device)

    # Access the 'raw' split
    data = dataset['raw']

    # Select test entries (start from index 0 since pre-made examples don't come from dataset)
    test_entries = [data[i] for i in range(num_test_samples)]

    # Use pre-made few-shot examples if provided and shots > 0
    few_shot_entries = None
    if shots > 0 and few_shot_prompt is not None:
        # Use the first 'shots' examples from the pre-made prompt
        few_shot_entries = few_shot_prompt[:shots]

    correct_predictions = 0
    all_results = [] # List to store all results for analysis

    # print(f"Starting evaluation (Shots: {shots}, Test Samples: {num_test_samples})...")

    for idx, entry in tqdm(enumerate(test_entries), total=num_test_samples, desc="Evaluating samples"):
        # Create prompt with pre-made few-shot examples
        messages = create_prompt(entry, few_shot_entries)

        # Prepare input
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Generate response
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=10,
            do_sample=False  # Deterministic for reproducibility
        )

        # Decode output
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

        # Determine ground truth (Label 1 = Accept, Label 0 = Reject)
        ground_truth = "Accept" if entry['label'] == 1 else "Reject"

        # Store results for analysis
        all_results.append({
            "input_text": text,
            "model_response": response,
            "ground_truth": ground_truth,
            "is_correct": (ground_truth.lower() in response.lower())
        })

        # Check prediction
        # We check if the ground truth keyword is present in the response (case-insensitive)
        if ground_truth.lower() in response.lower():
            correct_predictions += 1

    accuracy = correct_predictions / num_test_samples

    # Clean up to free memory
    del model
    del tokenizer
    torch.cuda.empty_cache()

    return shots, accuracy, all_results

## Model Evaluation

In [10]:
# ZERO SHOT EVALUATION
model_id = 'Qwen/Qwen2.5-3B-Instruct'
num_test_samples = 20
shots = 0
print("=== ZERO-SHOT EVALUATIONS ===")
shots0, acc_shots0, results_shots0 = evaluate_model(model_id, filtered_dataset, num_test_samples, shots, few_shot_prompt=three_shot_prompt)
print(f"{model_id}: {shots0} SHOT ACCURACY: {acc_shots0:.2%}\n")


=== ZERO-SHOT EVALUATIONS ===


Fetching 2 files:   0%|          | 0/2 [02:17<?, ?it/s]


KeyboardInterrupt: 

# Data Analysis

In [9]:
import json
import os

# Print results
print("\n" + "="*50)
print(f"MODEL: {model_id}")
print(f"SHOTS: {shots0}")
print(f"ACCURACY: {acc_shots0:.2%}")
print(f"CORRECT: {sum(r['is_correct'] for r in results_shots0)}/{len(results_shots0)}")
print("="*50)

# Save results if output path provided
OUTPUT_DIR = f"./llm_results/{dataset_name}"
OUTPUT_FILE = f"{OUTPUT_DIR}/{model_id}_{shots0}shot.json"
OUTPUT_FILE = None
if OUTPUT_FILE:
    os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True)
    output_data = {
        "model_id": model_id,
        "dataset": dataset_name,
        "shots": shots0,
        "accuracy": acc_shots0,
        "total_samples": len(results_shots0),
        "correct_predictions": sum(r['is_correct'] for r in results_shots0),
        "results": results_shots0
    }

    with open(OUTPUT_FILE, 'w') as f:
        json.dump(output_data, f, indent=2)
    print(f"\nResults saved to: {OUTPUT_FILE}")


MODEL: Qwen/Qwen2.5-1.5B-Instruct
SHOTS: 0
ACCURACY: 0.00%
CORRECT: 0/20
