In [3]:
# Load the arxiv-summarization dataset from Hugging Face
from datasets import load_dataset
import json

# Read the Hugging Face dataset

In [5]:
# Load every split of the arxiv-summarization dataset
dataset = load_dataset("ccdv/arxiv-summarization")

# High-level overview of what was loaded
print("Dataset splits:", dataset.keys())
print("\nDataset info:")
print(dataset)

# Per-split summary: row count, schema, and a preview of the first record
for split_name in dataset.keys():
    print(f"\n{split_name} split:")
    print(f"  Number of examples: {len(dataset[split_name])}")
    if len(dataset[split_name]) == 0:
        # Nothing to preview for an empty split
        continue
    print(f"  Features: {dataset[split_name].features}")
    print(f"  First example keys: {list(dataset[split_name][0].keys())}")
    print(f"  First example:")
    for key, value in dataset[split_name][0].items():
        # String values longer than 200 characters are cut and suffixed with "..."
        is_long_text = isinstance(value, str) and len(value) > 200
        print(f"    {key}: {value[:200]}..." if is_long_text else f"    {key}: {value}")



Dataset splits: dict_keys(['train', 'validation', 'test'])

Dataset info:
DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

train split:
  Number of examples: 203037
  Features: {'article': Value(dtype='string', id=None), 'abstract': Value(dtype='string', id=None)}
  First example keys: ['article', 'abstract']
  First example:
    article: additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when comp...
    abstract: additive models play an important role in semiparametric statistics . 
 this paper gives learning rates for regularized kernel based methods for additive models . 
 these learning rate

## Write train

In [None]:
# Export the whole training split as an oumi-format JSONL file
from pathlib import Path
from tqdm import tqdm

# Destination directory and file for the converted split
output_dir = Path("/Users/ryanarman/code/lab/arxiv_abstract/data")
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / "arxiv_summarization_train_oumi.jsonl"

train_dataset = dataset['train']
print(f"Converting {len(train_dataset)} training examples to oumi format...")
print(f"Output file: {output_path}")

# Each record becomes a plain dict holding one user turn (the article) and one
# assistant turn (the abstract); no Conversation/Message objects are created.
conversations_written = 0
with open(output_path, 'w', encoding='utf-8') as f:
    for example in tqdm(train_dataset, desc="Processing", total=len(train_dataset)):
        article, abstract = example['article'], example['abstract']

        messages = [
            {"role": "user", "content": article},
            {"role": "assistant", "content": abstract}
        ]

        # JSONL: one serialized conversation per line, non-ASCII kept as-is
        f.write(json.dumps({"messages": messages}, ensure_ascii=False) + '\n')
        conversations_written += 1

# Report how many conversations were written and the resulting file size
print(f"\n✓ Successfully created oumi dataset with {conversations_written} conversations")
print(f"  File: {output_path}")
print(f"  File size: {output_path.stat().st_size / (1024*1024):.2f} MB")


## Write validation

In [6]:
# Export the validation split as an oumi-format JSONL file
from pathlib import Path
from tqdm import tqdm

# Destination directory and file for the converted split
output_dir = Path("/Users/ryanarman/code/lab/arxiv_abstract/data")
output_dir.mkdir(parents=True, exist_ok=True)
val_output_path = output_dir / "arxiv_summarization_validation_oumi.jsonl"

val_dataset = dataset['validation']
print(f"Converting {len(val_dataset)} validation examples to oumi format...")
print(f"Output file: {val_output_path}")

# Each record becomes a plain dict holding one user turn (the article) and one
# assistant turn (the abstract).
conversations_written = 0
with open(val_output_path, 'w', encoding='utf-8') as f:
    for example in tqdm(val_dataset, desc="Processing validation", total=len(val_dataset)):
        article, abstract = example['article'], example['abstract']

        messages = [
            {"role": "user", "content": article},
            {"role": "assistant", "content": abstract}
        ]

        # JSONL: one serialized conversation per line, non-ASCII kept as-is
        f.write(json.dumps({"messages": messages}, ensure_ascii=False) + '\n')
        conversations_written += 1

# Report how many conversations were written and the resulting file size
print(f"\n✓ Successfully created oumi dataset with {conversations_written} conversations")
print(f"  File: {val_output_path}")
print(f"  File size: {val_output_path.stat().st_size / (1024*1024):.2f} MB")

Converting 6436 validation examples to oumi format...
Output file: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_validation_oumi.jsonl


Processing validation: 100%|██████████| 6436/6436 [00:00<00:00, 9249.48it/s]



✓ Successfully created oumi dataset with 6436 conversations
  File: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_validation_oumi.jsonl
  File size: 210.91 MB


## Write test

In [7]:
# Export the test split as an oumi-format JSONL file
from pathlib import Path
from tqdm import tqdm

# Destination directory and file for the converted split
output_dir = Path("/Users/ryanarman/code/lab/arxiv_abstract/data")
output_dir.mkdir(parents=True, exist_ok=True)
test_output_path = output_dir / "arxiv_summarization_test_oumi.jsonl"

test_dataset = dataset['test']
print(f"Converting {len(test_dataset)} test examples to oumi format...")
print(f"Output file: {test_output_path}")

# Each record becomes a plain dict holding one user turn (the article) and one
# assistant turn (the abstract).
conversations_written = 0
with open(test_output_path, 'w', encoding='utf-8') as f:
    for example in tqdm(test_dataset, desc="Processing test", total=len(test_dataset)):
        article, abstract = example['article'], example['abstract']

        messages = [
            {"role": "user", "content": article},
            {"role": "assistant", "content": abstract}
        ]

        # JSONL: one serialized conversation per line, non-ASCII kept as-is
        f.write(json.dumps({"messages": messages}, ensure_ascii=False) + '\n')
        conversations_written += 1

# Report how many conversations were written and the resulting file size
print(f"\n✓ Successfully created oumi dataset with {conversations_written} conversations")
print(f"  File: {test_output_path}")
print(f"  File size: {test_output_path.stat().st_size / (1024*1024):.2f} MB")


Converting 6440 test examples to oumi format...
Output file: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_oumi.jsonl


Processing test: 100%|██████████| 6440/6440 [00:00<00:00, 9217.84it/s]



✓ Successfully created oumi dataset with 6440 conversations
  File: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_oumi.jsonl
  File size: 211.29 MB


# Filter

In [1]:
# Import oumi DatasetAnalyzer to get token length for Qwen3-8B
from oumi.core.analyze import DatasetAnalyzer
from oumi.core.configs import AnalyzeConfig, DatasetSource, SampleAnalyzerParams
from oumi.datasets import TextSftJsonLinesDataset
from oumi.core.types.conversation import Conversation, Message, Role
import json
import tempfile
import os



## Train

In [18]:
# Load the converted training split as an oumi dataset (the input for DatasetAnalyzer)
from pathlib import Path

# Set the path explicitly, as the validation and test analysis cells do, instead
# of probing locals(): the file that gets analysed must not depend on which
# cells happened to run earlier in this kernel session.
output_dir = Path("/Users/ryanarman/code/lab/arxiv_abstract/data")
output_path = output_dir / "arxiv_summarization_train_oumi.jsonl"

# Fail early with a readable message if the export has not been produced yet
if not output_path.exists():
    raise FileNotFoundError(
        f"Dataset file not found: {output_path}\n"
        f"Please run the previous cell to create the dataset first."
    )

print("Loading oumi dataset and running analysis...")
print(f"Dataset file: {output_path}")

# Load the oumi dataset from the JSONL export
train_oumi_dataset = TextSftJsonLinesDataset(dataset_path=str(output_path))

print(f"Dataset loaded: {len(train_oumi_dataset)} conversations")

Loading oumi dataset and running analysis...
Dataset file: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_oumi.jsonl
[2025-11-12 16:40:12,131][oumi][rank0][pid:33744][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
Dataset loaded: 203037 conversations


In [19]:
# Length analyzer settings: character, word and token counts, with special
# tokens included in the token count
length_analyzer_params = SampleAnalyzerParams(
    id="length",
    params={
        "char_count": True,
        "word_count": True,
        "token_count": True,
        "include_special_tokens": True
    }
)

# Analysis configuration using the Qwen3-8B tokenizer.
# NOTE(review): sample_count=20000 — the run log shows 20000 items converted for a
# dataset of 203037 conversations, so this is not a full-dataset pass even though
# the progress message below says "full dataset".
config = AnalyzeConfig(
    dataset_source=DatasetSource.DIRECT,
    dataset_name="arxiv_summarization_train",
    sample_count=20000,
    analyzers=[length_analyzer_params],
    tokenizer_config={
        "model_name": "Qwen/Qwen3-8B",
        "trust_remote_code": True
    }
)

# Build the analyzer on the already-loaded training dataset
print("Running DatasetAnalyzer (this may take a while for the full dataset)...")
analyzer = DatasetAnalyzer(config, dataset=train_oumi_dataset)

# Run the analysis; only the known numpy incompatibility is tolerated
try:
    analyzer.analyze_dataset()
    print("✓ Analysis completed successfully")
except AttributeError as e:
    if "module 'numpy' has no attribute 'ma'" not in str(e):
        # Any other AttributeError is a genuine failure
        raise
    print("⚠ Warning: Analysis completed but summary generation failed due to numpy compatibility issue.")
    print("  The analysis data is still available, but summary statistics may be incomplete.")

# Read the per-message results; this also runs after the tolerated numpy error.
# NOTE(review): presumably analysis_df is populated before summary generation — confirm.
analysis_df = analyzer.analysis_df


Running DatasetAnalyzer (this may take a while for the full dataset)...
[2025-11-12 16:40:13,278][oumi][rank0][pid:33744][MainThread][INFO]][models.py:544] Using the model's built-in chat template for model 'Qwen/Qwen3-8B'.
[2025-11-12 16:40:13,279][oumi.utils.analysis_utils][rank0][pid:33744][MainThread][INFO]][analysis_utils.py:57] Built tokenizer for model: Qwen/Qwen3-8B
[2025-11-12 16:40:13,280][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:142] Using provided dataset 'arxiv_summarization_train' with 203037 conversations
[2025-11-12 16:40:13,280][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:275] Initialized sample analyzer: length
[2025-11-12 16:40:13,280][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:301] Starting analysis of dataset: arxiv_summarization_train
[2025-11-12 16:40:13,282][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:302] Using 1 sample analyzers: ['length']
[2025-11-12 16:40:13,283][oumi][rank0][pi

Converting arxiv_summarization_train to DataFrames: 100%|██████████| 20000/20000 [00:04<00:00, 4640.84item/s]




Token indices sequence length is longer than the specified maximum sequence length for this model (136905 > 131072). Running this sequence through the model will result in indexing errors


✓ Analysis completed successfully


In [28]:
# Standard deviation of the assistant (abstract) token counts in the analysed rows
assistant_rows = analyzer.analysis_df.query("role == 'assistant'")
assistant_rows['text_content_token_count'].std()

839.5079106846358

In [21]:
# Filter dataset to conversations where user (article) token count < 7500
# NOTE(review): the analyzer was configured with sample_count=20000, and the log
# below reports 10402 matches "out of 203037 total" — presumably only the
# analysed sample can match, not the whole training set; confirm against oumi.
train_filtered_dataset = analyzer.filter("text_content_token_count < 7500 & role == 'user'")

# Report how many conversations passed the filter
print(f"Filtered dataset size: {len(train_filtered_dataset)} conversations")

[2025-11-12 16:44:34,727][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:435] Query 'text_content_token_count < 7500 & role == 'user'' returned 10402 rows
[2025-11-12 16:44:34,732][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:580] Filtered dataset: 10402 conversations out of 203037 total
Filtered dataset size: 10402 conversations


In [None]:

# Write at most the first 10,000 filtered training conversations to JSONL
from pathlib import Path
from oumi.utils.io_utils import save_jsonlines

# Destination file for the filtered training subset
output_dir = Path("/Users/ryanarman/code/lab/arxiv_abstract/data")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "arxiv_summarization_train_filtered_10k.jsonl"

# Cap the export at 10,000 conversations (fewer if the filter kept fewer)
num_samples = min(10000, len(train_filtered_dataset))
print(f"Saving first {num_samples} conversations to {output_file}...")

# Collect the raw record behind each sample via raw(), which the original author
# chose in order to avoid tokenization.
conversations_data = []
for i in range(num_samples):
    raw_sample = train_filtered_dataset.raw(i)

    # Prefer the '_messages_column' entry; otherwise take the first positional
    # value when the sample supports .iloc, else the sample itself.
    if '_messages_column' in raw_sample:
        conversation_dict = raw_sample['_messages_column']
    elif hasattr(raw_sample, 'iloc'):
        conversation_dict = raw_sample.iloc[0]
    else:
        conversation_dict = raw_sample

    # Append a plain dict: as-is, via to_dict(), or rebuilt from a 'messages'
    # attribute (empty list when that attribute is absent).
    if isinstance(conversation_dict, dict):
        conversations_data.append(conversation_dict)
    elif hasattr(conversation_dict, 'to_dict'):
        conversations_data.append(conversation_dict.to_dict())
    else:
        conversations_data.append({"messages": getattr(conversation_dict, 'messages', [])})

# Persist the collected conversations with oumi's JSONL writer
save_jsonlines(output_file, conversations_data)

print(f"✓ Successfully saved {len(conversations_data)} conversations to {output_file}")
print(f"  File size: {output_file.stat().st_size / (1024*1024):.2f} MB")

## Validation

In [9]:
# Length analysis (chars / words / tokens) of the validation split with the Qwen3-8B tokenizer
from pathlib import Path

# Converted validation split produced by the export step
val_output_path = Path("/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_validation_oumi.jsonl")

# Stop early if the export has not been generated
if not val_output_path.exists():
    raise FileNotFoundError(
        f"Dataset file not found: {val_output_path}\n"
        f"Please run the previous cell to create the dataset first."
    )

print("Loading validation oumi dataset and running analysis...")
print(f"Dataset file: {val_output_path}")

# Wrap the JSONL export as an oumi dataset
val_oumi_dataset = TextSftJsonLinesDataset(dataset_path=str(val_output_path))

print(f"Validation dataset loaded: {len(val_oumi_dataset)} conversations")

# Length analyzer settings: character, word and token counts, with special
# tokens included in the token count
val_length_analyzer_params = SampleAnalyzerParams(
    id="length",
    params={
        "char_count": True,
        "word_count": True,
        "token_count": True,
        "include_special_tokens": True
    }
)

# Analysis configuration; sample_count=2000, so the statistics printed below
# describe 2000 conversations rather than the whole split
val_config = AnalyzeConfig(
    dataset_source=DatasetSource.DIRECT,
    dataset_name="arxiv_summarization_validation",
    sample_count=2000,
    analyzers=[val_length_analyzer_params],
    tokenizer_config={
        "model_name": "Qwen/Qwen3-8B",
        "trust_remote_code": True
    }
)

# Build the analyzer on the loaded validation dataset
print("Running DatasetAnalyzer on validation set...")
val_analyzer = DatasetAnalyzer(val_config, dataset=val_oumi_dataset)

# Run the analysis; only the known numpy incompatibility is tolerated
try:
    val_analyzer.analyze_dataset()
    print("✓ Validation analysis completed successfully")
except AttributeError as e:
    if "module 'numpy' has no attribute 'ma'" not in str(e):
        # Any other AttributeError is a genuine failure
        raise
    print("⚠ Warning: Analysis completed but summary generation failed due to numpy compatibility issue.")
    print("  The analysis data is still available, but summary statistics may be incomplete.")

# Per-message analysis results
val_analysis_df = val_analyzer.analysis_df

print(f"\n✓ Validation analysis complete!")
print(f"  Total messages analyzed: {len(val_analysis_df)}")

# Locate metric columns by case-insensitive substring match on the column name
token_cols = [name for name in val_analysis_df.columns if 'token_count' in name.lower()]
char_cols = [name for name in val_analysis_df.columns if 'char_count' in name.lower()]
word_cols = [name for name in val_analysis_df.columns if 'word_count' in name.lower()]

# Summary statistics, reported separately for each role
print("\n" + "=" * 80)
print("Validation Set Summary Statistics by Role:")
print("=" * 80)

for role in ['user', 'assistant']:
    role_rows = val_analysis_df[val_analysis_df['role'] == role]
    if len(role_rows) == 0:
        # No messages for this role
        continue
    print(f"\n{role.upper()} messages ({len(role_rows)} total):")
    # Only the first matching column of each metric is reported
    if token_cols:
        token_col = token_cols[0]
        print(f"  Token count - Mean: {role_rows[token_col].mean():.1f}, "
              f"Median: {role_rows[token_col].median():.1f}, "
              f"Min: {role_rows[token_col].min()}, "
              f"Max: {role_rows[token_col].max()}")
    if char_cols:
        char_col = char_cols[0]
        print(f"  Character count - Mean: {role_rows[char_col].mean():.1f}, "
              f"Median: {role_rows[char_col].median():.1f}")
    if word_cols:
        word_col = word_cols[0]
        print(f"  Word count - Mean: {role_rows[word_col].mean():.1f}, "
              f"Median: {role_rows[word_col].median():.1f}")


Loading validation oumi dataset and running analysis...
Dataset file: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_validation_oumi.jsonl
[2025-11-12 16:04:29,196][oumi][rank0][pid:33744][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
Validation dataset loaded: 6436 conversations
Running DatasetAnalyzer on validation set...
[2025-11-12 16:04:29,651][oumi][rank0][pid:33744][MainThread][INFO]][models.py:544] Using the model's built-in chat template for model 'Qwen/Qwen3-8B'.
[2025-11-12 16:04:29,652][oumi.utils.analysis_utils][rank0][pid:33744][MainThread][INFO]][analysis_utils.py:57] Built tokenizer for model: Qwen/Qwen3-8B
[2025-11-12 16:04:29,653][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:142] Using provided dataset 'arxiv_summarization_validation' with 6436 conversations
[2025-11-12 16:04:29,653][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:275] Initia

Converting arxiv_summarization_validation to DataFrames: 100%|██████████| 2000/2000 [00:00<00:00, 4276.16item/s]


✓ Validation analysis completed successfully

✓ Validation analysis complete!
  Total messages analyzed: 4000

Validation Set Summary Statistics by Role:

USER messages (2000 total):
  Token count - Mean: 8948.6, Median: 7461.0, Min: 245, Max: 55237
  Character count - Mean: 33872.9, Median: 29023.5
  Word count - Mean: 6035.0, Median: 5166.0

ASSISTANT messages (2000 total):
  Token count - Mean: 199.7, Median: 192.0, Min: 52, Max: 697
  Character count - Mean: 958.7, Median: 937.0
  Word count - Mean: 161.4, Median: 157.0


In [13]:
# Filter dataset to conversations where user (article) token count < 7500
# NOTE(review): val_config sets sample_count=2000, and the log below reports 1008
# matches "out of 6436 total" — presumably only the analysed sample can match,
# not the whole validation split; confirm against oumi.
val_filtered_dataset = val_analyzer.filter("text_content_token_count < 7500 & role == 'user'")
# Report how many conversations passed the filter
print(f"Filtered dataset size: {len(val_filtered_dataset)} conversations")

[2025-11-12 16:07:32,031][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:435] Query 'text_content_token_count < 7500 & role == 'user'' returned 1008 rows
[2025-11-12 16:07:32,033][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:580] Filtered dataset: 1008 conversations out of 6436 total
Filtered dataset size: 1008 conversations


In [14]:
# Write at most the first 1,000 filtered validation conversations to JSONL
from pathlib import Path
from oumi.utils.io_utils import save_jsonlines

# Destination file; the name says "10k" but the cap applied below is 1,000
output_dir = Path("/Users/ryanarman/code/lab/arxiv_abstract/data")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "arxiv_summarization_val_filtered_10k.jsonl"

# Cap the export at 1,000 conversations (fewer if the filter kept fewer)
num_samples = min(1000, len(val_filtered_dataset))
print(f"Saving first {num_samples} conversations to {output_file}...")

# Collect the raw record behind each sample via raw(), which the original author
# chose in order to avoid tokenization.
conversations_data = []
for i in range(num_samples):
    raw_sample = val_filtered_dataset.raw(i)

    # Prefer the '_messages_column' entry; otherwise take the first positional
    # value when the sample supports .iloc, else the sample itself.
    if '_messages_column' in raw_sample:
        conversation_dict = raw_sample['_messages_column']
    elif hasattr(raw_sample, 'iloc'):
        conversation_dict = raw_sample.iloc[0]
    else:
        conversation_dict = raw_sample

    # Append a plain dict: as-is, via to_dict(), or rebuilt from a 'messages'
    # attribute (empty list when that attribute is absent).
    if isinstance(conversation_dict, dict):
        conversations_data.append(conversation_dict)
    elif hasattr(conversation_dict, 'to_dict'):
        conversations_data.append(conversation_dict.to_dict())
    else:
        conversations_data.append({"messages": getattr(conversation_dict, 'messages', [])})

# Persist the collected conversations with oumi's JSONL writer
save_jsonlines(output_file, conversations_data)

print(f"✓ Successfully saved {len(conversations_data)} conversations to {output_file}")
print(f"  File size: {output_file.stat().st_size / (1024*1024):.2f} MB")

Saving first 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_filtered_10k.jsonl...
✓ Successfully saved 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_filtered_10k.jsonl
  File size: 18.96 MB


## Test

In [15]:
# Length analysis (chars / words / tokens) of the test split with the Qwen3-8B tokenizer
from pathlib import Path

# Converted test split produced by the export step
test_output_path = Path("/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_oumi.jsonl")

# Stop early if the export has not been generated
if not test_output_path.exists():
    raise FileNotFoundError(
        f"Dataset file not found: {test_output_path}\n"
        f"Please run the previous cell to create the dataset first."
    )

print("Loading test oumi dataset and running analysis...")
print(f"Dataset file: {test_output_path}")

# Wrap the JSONL export as an oumi dataset
test_oumi_dataset = TextSftJsonLinesDataset(dataset_path=str(test_output_path))

print(f"Test dataset loaded: {len(test_oumi_dataset)} conversations")

# Length analyzer settings: character, word and token counts, with special
# tokens included in the token count
test_length_analyzer_params = SampleAnalyzerParams(
    id="length",
    params={
        "char_count": True,
        "word_count": True,
        "token_count": True,
        "include_special_tokens": True
    }
)

# Analysis configuration; sample_count=2000, so the statistics printed below
# describe 2000 conversations rather than the whole split
test_config = AnalyzeConfig(
    dataset_source=DatasetSource.DIRECT,
    dataset_name="arxiv_summarization_test",
    sample_count=2000,
    analyzers=[test_length_analyzer_params],
    tokenizer_config={
        "model_name": "Qwen/Qwen3-8B",
        "trust_remote_code": True
    }
)

# Build the analyzer on the loaded test dataset
print("Running DatasetAnalyzer on test set...")
test_analyzer = DatasetAnalyzer(test_config, dataset=test_oumi_dataset)

# Run the analysis; only the known numpy incompatibility is tolerated
try:
    test_analyzer.analyze_dataset()
    print("✓ Test analysis completed successfully")
except AttributeError as e:
    if "module 'numpy' has no attribute 'ma'" not in str(e):
        # Any other AttributeError is a genuine failure
        raise
    print("⚠ Warning: Analysis completed but summary generation failed due to numpy compatibility issue.")
    print("  The analysis data is still available, but summary statistics may be incomplete.")

# Per-message analysis results
test_analysis_df = test_analyzer.analysis_df

print(f"\n✓ Test analysis complete!")
print(f"  Total messages analyzed: {len(test_analysis_df)}")

# Locate metric columns by case-insensitive substring match on the column name
token_cols = [name for name in test_analysis_df.columns if 'token_count' in name.lower()]
char_cols = [name for name in test_analysis_df.columns if 'char_count' in name.lower()]
word_cols = [name for name in test_analysis_df.columns if 'word_count' in name.lower()]

# Summary statistics, reported separately for each role
print("\n" + "=" * 80)
print("Test Set Summary Statistics by Role:")
print("=" * 80)

for role in ['user', 'assistant']:
    role_rows = test_analysis_df[test_analysis_df['role'] == role]
    if len(role_rows) == 0:
        # No messages for this role
        continue
    print(f"\n{role.upper()} messages ({len(role_rows)} total):")
    # Only the first matching column of each metric is reported
    if token_cols:
        token_col = token_cols[0]
        print(f"  Token count - Mean: {role_rows[token_col].mean():.1f}, "
              f"Median: {role_rows[token_col].median():.1f}, "
              f"Min: {role_rows[token_col].min()}, "
              f"Max: {role_rows[token_col].max()}")
    if char_cols:
        char_col = char_cols[0]
        print(f"  Character count - Mean: {role_rows[char_col].mean():.1f}, "
              f"Median: {role_rows[char_col].median():.1f}")
    if word_cols:
        word_col = word_cols[0]
        print(f"  Word count - Mean: {role_rows[word_col].mean():.1f}, "
              f"Median: {role_rows[word_col].median():.1f}")


Loading test oumi dataset and running analysis...
Dataset file: /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_oumi.jsonl
[2025-11-12 16:08:47,406][oumi][rank0][pid:33744][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
Test dataset loaded: 6440 conversations
Running DatasetAnalyzer on test set...
[2025-11-12 16:08:47,803][oumi][rank0][pid:33744][MainThread][INFO]][models.py:544] Using the model's built-in chat template for model 'Qwen/Qwen3-8B'.
[2025-11-12 16:08:47,803][oumi.utils.analysis_utils][rank0][pid:33744][MainThread][INFO]][analysis_utils.py:57] Built tokenizer for model: Qwen/Qwen3-8B
[2025-11-12 16:08:47,804][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:142] Using provided dataset 'arxiv_summarization_test' with 6440 conversations
[2025-11-12 16:08:47,804][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:275] Initialized sample analyzer: length


Converting arxiv_summarization_test to DataFrames: 100%|██████████| 2000/2000 [00:00<00:00, 6007.58item/s]


✓ Test analysis completed successfully

✓ Test analysis complete!
  Total messages analyzed: 4000

Test Set Summary Statistics by Role:

USER messages (2000 total):
  Token count - Mean: 8772.7, Median: 7324.0, Min: 282, Max: 67349
  Character count - Mean: 33309.4, Median: 28449.0
  Word count - Mean: 5937.6, Median: 5080.0

ASSISTANT messages (2000 total):
  Token count - Mean: 203.5, Median: 198.0, Min: 54, Max: 608
  Character count - Mean: 973.0, Median: 962.5
  Word count - Mean: 164.2, Median: 161.0


In [16]:
# Filter dataset to conversations where user (article) token count < 7500
# NOTE(review): test_config sets sample_count=2000, and the log below reports 1026
# matches "out of 6440 total" — presumably only the analysed sample can match,
# not the whole test split; confirm against oumi.
test_filtered_dataset = test_analyzer.filter("text_content_token_count < 7500 & role == 'user'")
# Report how many conversations passed the filter
print(f"Filtered dataset size: {len(test_filtered_dataset)} conversations")

[2025-11-12 16:09:12,514][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:435] Query 'text_content_token_count < 7500 & role == 'user'' returned 1026 rows
[2025-11-12 16:09:12,516][oumi][rank0][pid:33744][MainThread][INFO]][dataset_analyzer.py:580] Filtered dataset: 1026 conversations out of 6440 total
Filtered dataset size: 1026 conversations


In [17]:


# Write at most the first 1,000 filtered test conversations to JSONL
from pathlib import Path
from oumi.utils.io_utils import save_jsonlines

# Destination file; the name says "10k" but the cap applied below is 1,000
output_dir = Path("/Users/ryanarman/code/lab/arxiv_abstract/data")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "arxiv_summarization_test_filtered_10k.jsonl"

# Cap the export at 1,000 conversations (fewer if the filter kept fewer)
num_samples = min(1000, len(test_filtered_dataset))
print(f"Saving first {num_samples} conversations to {output_file}...")

# Collect the raw record behind each sample via raw(), which the original author
# chose in order to avoid tokenization.
conversations_data = []
for i in range(num_samples):
    raw_sample = test_filtered_dataset.raw(i)

    # Prefer the '_messages_column' entry; otherwise take the first positional
    # value when the sample supports .iloc, else the sample itself.
    if '_messages_column' in raw_sample:
        conversation_dict = raw_sample['_messages_column']
    elif hasattr(raw_sample, 'iloc'):
        conversation_dict = raw_sample.iloc[0]
    else:
        conversation_dict = raw_sample

    # Append a plain dict: as-is, via to_dict(), or rebuilt from a 'messages'
    # attribute (empty list when that attribute is absent).
    if isinstance(conversation_dict, dict):
        conversations_data.append(conversation_dict)
    elif hasattr(conversation_dict, 'to_dict'):
        conversations_data.append(conversation_dict.to_dict())
    else:
        conversations_data.append({"messages": getattr(conversation_dict, 'messages', [])})

# Persist the collected conversations with oumi's JSONL writer
save_jsonlines(output_file, conversations_data)

print(f"✓ Successfully saved {len(conversations_data)} conversations to {output_file}")
print(f"  File size: {output_file.stat().st_size / (1024*1024):.2f} MB")

Saving first 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_filtered_10k.jsonl...
✓ Successfully saved 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_filtered_10k.jsonl
  File size: 19.09 MB


# System instructions

In [32]:
from utils import (
    evaluate_summary,
    evaluate_summaries_batch,
    display_text,
    display_message,
    load_conversations,
    client,
    JUDGE_SYSTEM_INSTRUCTION,
    JUDGE_PROMPT_TEMPLATE_WITH_REQUEST_AND_RESPONSE
)

In [33]:
# Filtered, size-capped JSONL files written by the filter/save cells of this notebook.
# Despite the "10k" suffix, the validation and test files hold at most 1,000
# conversations each (their save cells cap at min(1000, ...)); only the training
# file is capped at 10,000.
train_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_filtered_10k.jsonl"
val_path =   "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_filtered_10k.jsonl"
test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_filtered_10k.jsonl"
# train_distilled_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_abstract_train_gpt5mini_think2.jsonl"
# Load each split with the load_conversations helper imported from utils.
# NOTE(review): presumably returns a list of per-conversation message lists, which
# is what convert_conversations_to_content_format expects — confirm in utils.
train_conversations = load_conversations(train_path)
val_conversations = load_conversations(val_path)
test_conversations = load_conversations(test_path)


In [42]:
def convert_conversations_to_content_format(conversations):
    """
    Flatten chat-style conversations into request/response records.

    Args:
        conversations: Iterable of conversations. Each conversation is an
                      iterable of message dicts; every message must carry a
                      'role' key, and 'user' / 'assistant' messages must also
                      carry a 'content' key.
                      Example: [[{'role': 'user', 'content': '...'},
                                 {'role': 'assistant', 'content': '...'}], ...]

    Returns:
        A list with one {"content": {"request": ..., "response": ...}} dict per
        conversation that has both a user and an assistant message, in input
        order. When a role appears several times, the content of its last
        message is used. Conversations lacking either role (or whose content
        is None) are left out; messages with other roles are ignored.
    """
    records = []

    for messages in conversations:
        request = response = None

        # A later message overwrites an earlier one with the same role
        for message in messages:
            role = message['role']
            if role == 'user':
                request = message['content']
            elif role == 'assistant':
                response = message['content']

        # Keep only conversations where both sides were found
        if request is not None and response is not None:
            records.append({
                "content": {
                    "request": request,
                    "response": response
                }
            })

    return records

# Example usage: convert the loaded test conversations and print the second
# converted record (index 1) as a quick sanity check.
converted_test = convert_conversations_to_content_format(test_conversations)
print(converted_test[1])


{'content': {'request': 'for the hybrid monte carlo algorithm ( hmc)@xcite , often used to study quantum chromodynamics ( qcd ) on the lattice , one is interested in efficient numerical time integration schemes which are optimal in terms of computational costs per trajectory for a given acceptance rate . high order \n numerical methods allow the use of larger step sizes , but demand a larger computational effort per step ; low order schemes do not require such large computational costs per step , but need more steps per trajectory . \n so there is a need to balance these opposing effects . \n omelyan integration schemes @xcite of a force - gradient type have proved to be an efficient choice , since it is easy to obtain higher order schemes that demand a small additional computational effort . \n these schemes use higher - order information from force - gradient terms to both increase the convergence of the method and decrease the size of the leading error coefficient . other ideas to a

In [None]:
def save_converted_to_jsonl(converted_data, output_path):
    """
    Write converted conversations to disk as JSON Lines (one object per line).

    Missing parent directories of the destination are created first. Progress,
    the final record count and the resulting file size (in MB) are printed.

    Args:
        converted_data: Sized collection of JSON-serialisable dicts, e.g.
            [{"content": {"request": "...", "response": "..."}}, ...]
        output_path: Destination of the JSONL file (str or Path)
    """
    from pathlib import Path

    destination = Path(output_path)

    # Make sure the target folder exists before opening the file for writing
    destination.parent.mkdir(parents=True, exist_ok=True)

    print(f"Saving {len(converted_data)} converted conversations to {destination}...")
    with destination.open('w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in converted_data
        )

    print(f"✓ Successfully saved {len(converted_data)} conversations to {destination}")
    size_in_mb = destination.stat().st_size / (1024*1024)
    print(f"  File size: {size_in_mb:.2f} MB")

# Example usage:
# converted_test = convert_conversations_to_content_format(test_conversations)
# save_converted_to_jsonl(converted_test, "/path/to/output.jsonl")


In [38]:
# Create Qwen4b Instruct datasets with judge evaluation feedback

# System instruction for the Qwen4b instruct model.
# create_instruct_dataset places this text in the "system" message of every
# record it builds. The prompt lists the five dimensions the judge scores
# (faithfulness, coverage, clarity, conciseness, coherence) and tells the model
# to respond with the abstract only.
SYSTEM_INSTRUCTION = """You are an expert academic abstract writer. Your task is to create a high-quality abstract for an arXiv paper based on the paper content and judge evaluation feedback.

The judge evaluates abstracts based on five dimensions:
1. Faithfulness: The abstract must accurately reflect the paper's content without hallucination
2. Coverage: The abstract must include the essential aspects (main problem, approach, and key results)
3. Clarity: The abstract must be understandable and readable
4. Conciseness: The abstract must be focused and not verbose
5. Coherence: The abstract must be logically structured and flow naturally

When creating the abstract:
- Read the paper content carefully
- Pay attention to the judge's feedback on what makes a good abstract
- Ensure your abstract meets all five evaluation criteria
- Write a concise, clear, and coherent summary that accurately covers the paper's main contributions
- Focus on the main problem, approach, and key results

Your response should be the abstract only, without any additional commentary or explanation."""

import json
from tqdm import tqdm

def create_instruct_dataset(conversations, output_path, add_abstract=True):
    """
    Create Qwen4b instruct dataset with judge evaluation criteria.

    Each input conversation becomes one record of the form
    {"messages": [system, user, (assistant)]}: the system message is
    SYSTEM_INSTRUCTION, the user message wraps the paper content with the five
    evaluation criteria, and the assistant message (the reference abstract) is
    included only when add_abstract is True. Records are written to
    output_path as JSON Lines and also returned.

    Args:
        conversations: List of conversation message lists; each message is a
            dict with 'role' and 'content'. If a role occurs more than once in
            a conversation, the last occurrence is used.
        output_path: Path to save the output JSONL file (str or Path). Missing
            parent directories are created.
        add_abstract: If True (default), append the assistant message holding
            the reference abstract (training / validation data) and skip
            conversations that have no assistant message. If False, emit
            prompt-only records (inference / test data); a conversation then
            only needs a user message.

    Returns:
        List of the generated {"messages": [...]} dicts, in input order.
    """
    from pathlib import Path

    # Create instruct format
    instruct_conversations = []

    print(f"Creating instruct format for {len(conversations)} conversations...")
    for conv in tqdm(conversations):
        # Extract paper content (user message) and abstract (assistant message)
        paper_content = None
        abstract = None

        for msg in conv:
            if msg['role'] == 'user':
                paper_content = msg['content']
            elif msg['role'] == 'assistant':
                abstract = msg['content']

        # The paper is always required; the abstract only when it is emitted.
        # Prompt-only datasets must not lose rows just because no reference
        # abstract is available.
        if paper_content is None or (add_abstract and abstract is None):
            continue

        # Create user message with paper content and instructions about judge expectations
        user_content = f"""Paper Content:
{paper_content}

---

Create a high-quality abstract for this paper that meets all five evaluation criteria:
1. Faithfulness: Accurately reflect the paper's content without hallucination
2. Coverage: Include the essential aspects (main problem, approach, and key results)
3. Clarity: Be understandable and readable
4. Conciseness: Be focused and not verbose
5. Coherence: Be logically structured and flow naturally"""

        # Create Qwen4b instruct format
        new_conv = {
            "messages": [
                {
                    "role": "system",
                    "content": SYSTEM_INSTRUCTION
                },
                {
                    "role": "user",
                    "content": user_content
                }
            ]
        }

        if add_abstract:
            new_conv['messages'].append({
                "role": "assistant",
                "content": abstract
            })

        # Append the conversation to the list
        instruct_conversations.append(new_conv)

    # Create parent directories if they don't exist (same behaviour as
    # save_converted_to_jsonl) so a fresh checkout does not fail on open()
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save to JSONL
    print(f"\nSaving {len(instruct_conversations)} conversations to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        for conv in instruct_conversations:
            f.write(json.dumps(conv, ensure_ascii=False) + '\n')

    print(f"✓ Saved {len(instruct_conversations)} conversations to {output_path}")

    return instruct_conversations

# Directory that receives the three instruct JSONL files.
# NOTE(review): this is a machine-specific absolute path; consider moving it to
# a notebook-level config cell so the notebook runs on other machines.
INSTRUCT_OUTPUT_DIR = "/Users/ryanarman/code/lab/arxiv_abstract/data"

# Create train dataset (prompt + reference abstract)
train_instruct_path = f"{INSTRUCT_OUTPUT_DIR}/arxiv_summarization_train_instruct.jsonl"
print("="*80)
print("CREATING TRAIN DATASET")
print("="*80)


train_instruct_conversations = create_instruct_dataset(
    train_conversations,
    train_instruct_path,
    add_abstract=True
)

# Create validation dataset (prompt + reference abstract)
val_instruct_path = f"{INSTRUCT_OUTPUT_DIR}/arxiv_summarization_val_instruct.jsonl"
print("\n" + "="*80)
print("CREATING VALIDATION DATASET")
print("="*80)
val_instruct_conversations = create_instruct_dataset(
    val_conversations,
    val_instruct_path,
    add_abstract=True
)

# Create test dataset (prompt only: the abstract is withheld via add_abstract=False)
test_instruct_path = f"{INSTRUCT_OUTPUT_DIR}/arxiv_summarization_test_instruct.jsonl"
print("\n" + "="*80)
print("CREATING TEST DATASET")
print("="*80)
test_instruct_conversations = create_instruct_dataset(
    test_conversations,
    test_instruct_path,
    add_abstract=False
)

CREATING TRAIN DATASET
Creating instruct format for 10000 conversations...


100%|██████████| 10000/10000 [00:00<00:00, 316055.10it/s]



Saving 10000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct.jsonl...
✓ Saved 10000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct.jsonl

CREATING VALIDATION DATASET
Creating instruct format for 1000 conversations...


100%|██████████| 1000/1000 [00:00<00:00, 374792.60it/s]



Saving 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_instruct.jsonl...
✓ Saved 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_instruct.jsonl

CREATING VALIDATION DATASET
Creating instruct format for 1000 conversations...


100%|██████████| 1000/1000 [00:00<00:00, 321846.53it/s]



Saving 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_instruct.jsonl...
✓ Saved 1000 conversations to /Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_instruct.jsonl
