# Generate High-Quality Abstracts using GPT-5

This notebook generates high-quality abstracts for training and validation datasets using OpenAI's GPT-5 (or other high-quality models).

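The generated abstracts replace the existing assistant responses (abstracts) in the training and validation data, providing better quality examples for fine-tuning. The results are written to new `*_gpt5.jsonl` files alongside the input files.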


In [8]:
import json
import sys
from pathlib import Path
import os

from utils import (
    generate_abstracts_batch,
    load_conversations,
    get_openai_client,
    display_text,
    display_message,
)

# Report whether an OpenAI API key is present in this kernel's environment.
# This cell only prints a status line; it never sets or validates the key.
if 'OPENAI_API_KEY' in os.environ:
    print("✓ OPENAI_API_KEY found")
else:
    print("Warning: OPENAI_API_KEY not set. Please set it before running.")
    print("You can set it with: os.environ['OPENAI_API_KEY'] = 'your-key-here'")


✓ OPENAI_API_KEY found


In [3]:
# Configuration
# Model name passed to generate_abstracts_batch.
# Alternatives that can be substituted here: "gpt-4o", "o1-preview", "gpt-4-turbo".
MODEL = "gpt-5"
TEMPERATURE = 1.0
MAX_WORKERS = 1000  # Number of parallel API calls

# File paths
# The data directory defaults to the original location, but can be redirected by
# setting the ARXIV_ABSTRACT_DATA_DIR environment variable so the notebook is not
# tied to one machine's home directory. An empty value falls back to the default.
BASE_DIR = Path(
    os.environ.get("ARXIV_ABSTRACT_DATA_DIR")
    or "/Users/ryanarman/code/lab/arxiv_abstract/data"
)
TRAIN_INPUT = BASE_DIR / "arxiv_summarization_train_instruct.jsonl"
VAL_INPUT = BASE_DIR / "arxiv_summarization_val_instruct.jsonl"

# Output paths (new files next to the inputs; the inputs are not the write target)
TRAIN_OUTPUT = BASE_DIR / "arxiv_summarization_train_instruct_gpt5.jsonl"
VAL_OUTPUT = BASE_DIR / "arxiv_summarization_val_instruct_gpt5.jsonl"

# Echo the effective settings so the run is self-describing.
print(f"Model: {MODEL}")
print(f"Temperature: {TEMPERATURE}")
print(f"Max workers: {MAX_WORKERS}")
print(f"\nTrain input: {TRAIN_INPUT}")
print(f"Train output: {TRAIN_OUTPUT}")
print(f"\nVal input: {VAL_INPUT}")
print(f"Val output: {VAL_OUTPUT}")


Model: gpt-5
Temperature: 1.0
Max workers: 1000

Train input: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct.jsonl
Train output: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct_gpt5.jsonl

Val input: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_instruct.jsonl
Val output: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_instruct_gpt5.jsonl


In [4]:
# Load training conversations
# load_conversations is called with the path configured as TRAIN_INPUT; the
# count printed afterwards is the length of whatever it returned.
print("Loading training data...")
train_conversations = load_conversations(TRAIN_INPUT)
print(f"Loaded {len(train_conversations)} training conversations")

# Load validation conversations
# Same procedure for the validation split (VAL_INPUT). The leading "\n" in the
# message only inserts a blank line between the two progress reports.
print("\nLoading validation data...")
val_conversations = load_conversations(VAL_INPUT)
print(f"Loaded {len(val_conversations)} validation conversations")


Loading training data...
Loaded 10000 training conversations

Loading validation data...
Loaded 1000 validation conversations


In [5]:
def prepare_conversations_for_generation(conversations):
    """
    Reduce each conversation to the prompt turns used for abstract generation.

    Only messages whose role is 'system' or 'user' are kept (original order
    preserved; the message dicts themselves are reused, not copied), which
    removes any assistant messages. A conversation that ends up with fewer
    than two such messages is left out of the returned list.
    """
    prompt_roles = ('system', 'user')

    def _prompt_turns(conversation):
        # Keep the turns that form the generation prompt, drop everything else.
        return [turn for turn in conversation if turn.get('role') in prompt_roles]

    stripped = (_prompt_turns(conversation) for conversation in conversations)
    # Need at least two prompt messages (system and user) to be usable.
    return [turns for turns in stripped if len(turns) >= 2]

# Prepare training conversations
train_prepared = prepare_conversations_for_generation(train_conversations)
print(f"Prepared {len(train_prepared)} training conversations for generation")

# Prepare validation conversations
val_prepared = prepare_conversations_for_generation(val_conversations)
print(f"Prepared {len(val_prepared)} validation conversations for generation")

# save_generated_abstracts pairs each generated abstract with an original
# conversation by list position. prepare_conversations_for_generation drops
# conversations with fewer than two system/user messages, which would shift every
# later position and attach abstracts to the wrong papers. Stop here if that happened.
assert len(train_prepared) == len(train_conversations), (
    f"{len(train_conversations) - len(train_prepared)} training conversations were "
    "dropped during preparation; positions no longer match train_conversations"
)
assert len(val_prepared) == len(val_conversations), (
    f"{len(val_conversations) - len(val_prepared)} validation conversations were "
    "dropped during preparation; positions no longer match val_conversations"
)


Prepared 10000 training conversations for generation
Prepared 1000 validation conversations for generation


In [6]:
# Inspect the first prepared conversation (the system and user messages kept by
# prepare_conversations_for_generation); being the cell's last expression, it is displayed.
train_prepared[0]

[{'role': 'system',
  'content': "You are an expert academic abstract writer. Your task is to create a high-quality abstract for an arXiv paper based on the paper content and judge evaluation feedback.\n\nThe judge evaluates abstracts based on five dimensions:\n1. Faithfulness: The abstract must accurately reflect the paper's content without hallucination\n2. Coverage: The abstract must include the essential aspects (main problem, approach, and key results)\n3. Clarity: The abstract must be understandable and readable\n4. Conciseness: The abstract must be focused and not verbose\n5. Coherence: The abstract must be logically structured and flow naturally\n\nWhen creating the abstract:\n- Read the paper content carefully\n- Pay attention to the judge's feedback on what makes a good abstract\n- Ensure your abstract meets all five evaluation criteria\n- Write a concise, clear, and coherent summary that accurately covers the paper's main contributions\n- Focus on the main problem, approach,

In [9]:
# Render the system instruction, i.e. the first message of the first prepared conversation.
first_prepared = train_prepared[0]
display_text(first_prepared[0]['content'])
# To view the user message (paper content) instead, run:
# display_text(first_prepared[1]['content'])


Characters: 1,062 | Words: 163 | Lines: 17



In [10]:
# Generate abstracts for training set
train_banner = "=" * 80
print(train_banner)
print("GENERATING ABSTRACTS FOR TRAINING SET")
print(train_banner)

# Settings from the configuration cell, forwarded unchanged to the batch helper.
train_generation_kwargs = dict(
    model=MODEL,
    temperature=TEMPERATURE,
    max_workers=MAX_WORKERS,
    show_progress=True,
)

# Only the first 1000 prepared training conversations are submitted.
train_results, train_errors = generate_abstracts_batch(
    train_prepared[:1000],
    **train_generation_kwargs,
)

print(f"\nTraining set: {len(train_results)} successful, {len(train_errors)} errors")


GENERATING ABSTRACTS FOR TRAINING SET
Generating abstracts for 1000 papers with 1000 workers...
Using model: gpt-5


Generating abstracts: 100%|██████████| 1000/1000 [02:12<00:00,  7.53abstract/s, success=1000, errors=0]



✓ Completed: 1000 successful, 0 errors

Training set: 1000 successful, 0 errors


In [11]:
# Preview the generated abstract held in the first entry of train_results
# (each entry is indexed as (index, result_dict)).
first_train_result = train_results[0][1]
display_text(first_train_result['abstract'])


Characters: 1,553 | Words: 210 | Lines: 1



In [None]:
# Generate abstracts for validation set
val_banner = "=" * 80
print(val_banner)
print("GENERATING ABSTRACTS FOR VALIDATION SET")
print(val_banner)

# Settings from the configuration cell, forwarded unchanged to the batch helper.
val_generation_kwargs = dict(
    model=MODEL,
    temperature=TEMPERATURE,
    max_workers=MAX_WORKERS,
    show_progress=True,
)

# The whole prepared validation set is submitted.
val_results, val_errors = generate_abstracts_batch(val_prepared, **val_generation_kwargs)

print(f"\nValidation set: {len(val_results)} successful, {len(val_errors)} errors")


GENERATING ABSTRACTS FOR VALIDATION SET
Generating abstracts for 1000 papers with 1000 workers...
Using model: gpt-5


Generating abstracts:   0%|          | 1/1000 [00:13<3:50:48, 13.86s/abstract, success=1, errors=0]

In [None]:
def save_generated_abstracts(original_conversations, generation_results, output_path):
    """
    Save generated abstracts in the same format as training data.

    Each conversation whose list position has a generated abstract is written as
    one JSON line of the form {"messages": [...]}. Every existing assistant
    message is replaced by the generated abstract; if the conversation has no
    assistant message, the abstract is appended as a new one. Conversations
    without a generated abstract are skipped and a warning is printed for each.
    The input conversations are not modified.

    Args:
        original_conversations: Original conversations (lists of message dicts).
            They are matched to generation_results by list position.
        generation_results: List of (index, result_dict) tuples from
            generate_abstracts_batch; each result_dict is read via its
            'abstract' key.
        output_path: Path (str or Path) of the output JSONL file. Missing parent
            directories are created; an existing file is overwritten.

    Returns:
        The number of conversations written to output_path.

    Raises:
        KeyError: If a result_dict has no 'abstract' key.
    """
    # Create a mapping of index to generated abstract
    abstract_map = {idx: result['abstract'] for idx, result in generation_results}

    # Make sure the destination directory exists; otherwise open() below would
    # raise FileNotFoundError after all the generation work has already been done.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    saved_count = 0
    with open(output_path, 'w', encoding='utf-8') as f:
        for idx, original_conv in enumerate(original_conversations):
            if idx not in abstract_map:
                # If generation failed, skip this example
                print(f"Warning: Skipping index {idx} (generation failed)")
                continue

            abstract = abstract_map[idx]

            # Swap every assistant turn for the generated abstract and keep
            # system/user (and any other) messages as-is.
            new_conv = [
                {'role': 'assistant', 'content': abstract}
                if msg.get('role') == 'assistant'
                else msg
                for msg in original_conv
            ]

            # If no assistant message existed, add the generated one
            if not any(msg.get('role') == 'assistant' for msg in original_conv):
                new_conv.append({'role': 'assistant', 'content': abstract})

            # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
            json.dump({'messages': new_conv}, f, ensure_ascii=False)
            f.write('\n')
            saved_count += 1

    return saved_count

# Save training results
# Generation was run on the first 1000 prepared training conversations, so the
# matching slice of the original conversations is handed to the writer.
print("Saving training results...")
train_subset = train_conversations[:1000]
train_saved = save_generated_abstracts(train_subset, train_results, TRAIN_OUTPUT)
print(f"Saved {train_saved} training examples to {TRAIN_OUTPUT}")

# Save validation results
# The full validation set was submitted, so no slicing is needed here.
print("\nSaving validation results...")
val_saved = save_generated_abstracts(val_conversations, val_results, VAL_OUTPUT)
print(f"Saved {val_saved} validation examples to {VAL_OUTPUT}")


Saving training results...
Saved 10 training examples to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct_gpt5.jsonl


In [30]:
# Reload the file written by the save step to spot-check its contents.
# TRAIN_OUTPUT from the configuration cell is reused rather than repeating the
# absolute path, so this check always reads the file that was actually written.
gen_train_path = TRAIN_OUTPUT
gen_train_conversations = load_conversations(gen_train_path)

# Uncomment to inspect the other roles of the first saved conversation:
# display_message(gen_train_conversations[0], "system")
# display_message(gen_train_conversations[0], "user")
display_message(gen_train_conversations[0], "assistant")

Role: ASSISTANT
Characters: 1,774 | Words: 245 | Lines: 1



In [None]:
# Final report: counts and output locations for both splits.
summary_banner = "=" * 80

print(summary_banner)
print("GENERATION SUMMARY")
print(summary_banner)
print(f"Model used: {MODEL}")

print(f"\nTraining set:")
print(f"  Generated: {len(train_results)} abstracts")
print(f"  Errors: {len(train_errors)}")
print(f"  Saved to: {TRAIN_OUTPUT}")

print(f"\nValidation set:")
print(f"  Generated: {len(val_results)} abstracts")
print(f"  Errors: {len(val_errors)}")
print(f"  Saved to: {VAL_OUTPUT}")

# Show sample generated abstract
if train_results:
    print("\n" + summary_banner)
    print("SAMPLE GENERATED ABSTRACT")
    print(summary_banner)
    idx, result = train_results[0]
    print(f"\nIndex {idx}:")
    sample_abstract = result['abstract']
    # Abstracts longer than 500 characters are cut off and marked with an ellipsis.
    if len(sample_abstract) > 500:
        print(sample_abstract[:500] + "...")
    else:
        print(sample_abstract)
