In [2]:
# Import utility functions
import os

from utils import (
    evaluate_summary,
    evaluate_summaries_batch,
    display_text,
    display_message,
    load_conversations,
    client,
    JUDGE_SYSTEM_INSTRUCTION,
    JUDGE_PROMPT_TEMPLATE_WITH_REQUEST_AND_RESPONSE
)




# Examine data

In [3]:

# train_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train.jsonl"
train_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_filtered_10k.jsonl"
val_path =   "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_filtered_10k.jsonl"
test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_filtered_10k.jsonl"
# train_distilled_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_abstract_train_gpt5mini_think2.jsonl"
train_conversations = load_conversations(train_path)
val_conversations = load_conversations(val_path)
test_conversations = load_conversations(test_path)
# train_distilled_conversations = load_conversations(train_distilled_path)

In [3]:
# Display the first test conversation to inspect the structure of one record.
test_conversations[0]

[{'role': 'user',
  'content': 'as a common quantum phenomenon , the tunneling through a potential barrier plays a very important role in the microscopic world and has been studied extensively since the birth of quantum mechanics . \n one of the earliest applications of quantum tunneling is the explanation of @xmath0 decays in atomic nuclei . \n the quantum tunneling effect governs also many other nuclear processes such as fission and fusion . \n in particular , a lot of new features are revealed in sub - barrier fusion reactions which are closely connected with the tunneling phenomena  @xcite .    for most of the potential barriers , the penetrability can not be calculated analytically  @xcite . among those potentials for which analytical solutions can be obtained , \n the parabolic potential  @xcite is the mostly used in the study of nuclear fusion . by approximating the coulomb barrier to a parabola \n , wong derived an analytic expression for the fusion cross section  @xcite which 

# Judge

In [11]:
# Run the judge on a single training conversation and print its verdict.
SAMPLE_INDEX = 6  # index of the training conversation to inspect

conv = train_conversations[SAMPLE_INDEX]
separator = "=" * 80

print(separator)
print(f"Number of messages: {len(conv)}")
print(f"Message roles: {[msg['role'] for msg in conv]}")

# The user message (the paper content) is not displayed in this cell, so no
# header is printed for it; only the assistant summary is shown.
print("\n" + separator)
print("ASSISTANT MESSAGE (the summary being evaluated):")
print(separator)
display_message(conv, role='assistant')

print("\n" + separator)
print("JUDGE EVALUATION:")
print(separator)
# evaluate_summary returns the judge result together with the prompt it used.
result, evaluation_prompt = evaluate_summary(conv, model="gpt-5")

print(f"Judgment: {result['judgment']}")
print(f"\nExplanation:\n{result['explanation']}")


Number of messages: 2
Message roles: ['user', 'assistant']

USER MESSAGE (the paper content):
ASSISTANT MESSAGE (the summary being evaluated):
Role: ASSISTANT
Characters: 1,721 | Words: 315 | Lines: 8




JUDGE EVALUATION:
Judgment: No

Explanation:
Faithfulness (60/100):
- The summary largely reflects the paper’s topic (object recognition to aid segmentation via b-scale and shape models) and the hierarchical recognition idea. However, it introduces a specific evaluation detail not present in the provided text: “a set of 20 routine clinical abdominal female and male CT data sets.” The number 20 is not supported by the text (which uses variables for the number of subjects and mentions only that data are routine PET/CT-derived CT scans).
- It also paraphrases a reported finding as “recognition accuracy” improving with more objects, whereas the paper explicitly states “specificity” increases; this is a subtle but meaningful shift in the reported metric.
- It slightly overstates the conclusion with “make delineation most accurate,” which is stronger than the paper’s framing that efficient recognition enables successful delineation.

Coverage (85/100):
- The summary states the main problem 

In [12]:
# Batch evaluation for the first 10 conversations

first_10_conversations = train_conversations[:10]

results, errors = evaluate_summaries_batch(
    first_10_conversations,
    model="gpt-5",
    temperature=1.0,
    max_workers=10,  # Adjust based on your API rate limits
    show_progress=True
)

# Display results: one judgment and a short explanation preview per conversation
print("\n" + "="*80)
print("BATCH EVALUATION RESULTS")
print("="*80)

PREVIEW_CHARS = 200  # maximum explanation characters shown per conversation

for idx, result, prompt in results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    explanation = result['explanation']
    # Only append an ellipsis when the explanation was actually truncated.
    explanation_preview = (
        (explanation[:PREVIEW_CHARS] + "...")
        if len(explanation) > PREVIEW_CHARS
        else explanation
    )
    print(f"  Explanation preview: {explanation_preview}")

if errors:
    print(f"\nErrors encountered: {len(errors)}")
    for idx, error in errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in results]
total_evaluated = len(results)
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {total_evaluated}")
for label, count in (("Yes", yes_count), ("No", no_count), ("Unknown", unknown_count)):
    # If every evaluation failed, `results` is empty; report 0.0% instead of
    # raising ZeroDivisionError.
    share = count / total_evaluated * 100 if total_evaluated else 0.0
    print(f"{label}: {count} ({share:.1f}%)")


Evaluating 10 conversations with 10 workers...
  Completed 1/10
  Completed 2/10
  Completed 3/10
  Completed 4/10
  Completed 5/10
  Completed 6/10
  Completed 7/10
  Completed 8/10
  Completed 9/10
  Completed 10/10

✓ Completed: 10 successful, 0 errors

BATCH EVALUATION RESULTS

Conversation 0:
  Judgment: Yes
  Explanation preview: Faithfulness (92/100):
- The summary accurately reflects the paper’s core content: learning rates for SVMs with additive kernels in additive models, favorable high-dimensional behavior compared to Gau...

Conversation 1:
  Judgment: Yes
  Explanation preview: Faithfulness (95/100):
- The summary accurately reflects the paper’s content: it describes studying the leptonic decay via a specific tau decay mode using tagged events at CLEO-c, and reports both a b...

Conversation 2:
  Judgment: Yes
  Explanation preview: Faithfulness (85/100):
- The summary accurately reflects the paper’s core content: it frames Mateos’ conjecture, Barbi and Salerno’s critique 

In [5]:
# Show the list of judgments gathered from the batch evaluation results.
judgments

['Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes']

CREATING TRAIN DATASET
Creating Qwen4b instruct format for 10000 conversations...


100%|██████████| 10000/10000 [00:00<00:00, 307079.30it/s]



Saving 10000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_qwen4b_instruct.jsonl...
✓ Saved 10000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_qwen4b_instruct.jsonl

CREATING VALIDATION DATASET
Creating Qwen4b instruct format for 1000 conversations...


100%|██████████| 1000/1000 [00:00<00:00, 219058.02it/s]



Saving 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_qwen4b_instruct.jsonl...
✓ Saved 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_qwen4b_instruct.jsonl


In [5]:
# Sanity-check the converted dataset: report message sizes and short previews
# for the first training example, then the number of examples in each split.
print("Example from train dataset:")
print("="*80)
if train_qwen4b_conversations:
    example = train_qwen4b_conversations[0]
    example_messages = example['messages']
    # (label, position in the messages list, preview length in characters)
    sections = (
        ("System instruction", 0, 200),
        ("User message", 1, 300),
        ("Assistant message", 2, 200),
    )
    # First all the lengths, then all the previews.
    for label, position, _ in sections:
        print(f"{label} length: {len(example_messages[position]['content'])} chars")
    for label, position, preview_chars in sections:
        print(f"\n{label} preview:")
        print(example_messages[position]['content'][:preview_chars] + "...")

print(f"\nTotal train examples: {len(train_qwen4b_conversations)}")
print(f"Total val examples: {len(val_qwen4b_conversations)}")


Example from train dataset:
System instruction length: 1062 chars
User message length: 26760 chars
Assistant message length: 930 chars

System instruction preview:
You are an expert academic abstract writer. Your task is to create a high-quality abstract for an arXiv paper based on the paper content and judge evaluation feedback.

The judge evaluates abstracts b...

User message preview:
Paper Content:
additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when...

Assistant message preview:
additive models play an important role in semiparametric statistics . 
 this paper gives learning rates for regularized kernel based methods for additive models . 
 these learning rates compare favour...

Total train examples: 10000
Total val examples: 1000
