In [2]:
# Import utility functions
from utils import (
    evaluate_summary,
    evaluate_summaries_batch,
    display_text,
    display_message,
    load_conversations,
    client,
    JUDGE_SYSTEM_INSTRUCTION,
    JUDGE_PROMPT_TEMPLATE_WITH_REQUEST_AND_RESPONSE
)




# Examine data

In [3]:

# train_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train.jsonl"
train_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_filtered_10k.jsonl"
val_path =   "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_filtered_10k.jsonl"
test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_filtered_10k.jsonl"
# train_distilled_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_abstract_train_gpt5mini_think2.jsonl"
train_conversations = load_conversations(train_path)
val_conversations = load_conversations(val_path)
test_conversations = load_conversations(test_path)
# train_distilled_conversations = load_conversations(train_distilled_path)

In [3]:
test_conversations[0]

[{'role': 'user',
  'content': 'as a common quantum phenomenon , the tunneling through a potential barrier plays a very important role in the microscopic world and has been studied extensively since the birth of quantum mechanics . \n one of the earliest applications of quantum tunneling is the explanation of @xmath0 decays in atomic nuclei . \n the quantum tunneling effect governs also many other nuclear processes such as fission and fusion . \n in particular , a lot of new features are revealed in sub - barrier fusion reactions which are closely connected with the tunneling phenomena  @xcite .    for most of the potential barriers , the penetrability can not be calculated analytically  @xcite . among those potentials for which analytical solutions can be obtained , \n the parabolic potential  @xcite is the mostly used in the study of nuclear fusion . by approximating the coulomb barrier to a parabola \n , wong derived an analytic expression for the fusion cross section  @xcite which 

# Judge

In [None]:
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# File paths
finetuned_eval = '/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_eval_qwen3_4b_test.jsonl'
original_eval = '/Users/ryanarman/code/lab/arxiv_abstract/data/original_eval_qwen3_4b_test.jsonl'
base_eval = '/Users/ryanarman/code/lab/arxiv_abstract/data/baseline_eval_qwen3_4b_test.jsonl'

def parse_scores(explanation):
    """
    Parse scores from explanation text, handling different formats:
    - "Faithfulness (78/100):"
    - "Faithfulness: 78/100"
    - "Faithfulness (78):"
    - "Overall (82/100):"
    """
    scores = {}
    
    # Dimensions to extract
    dimensions = ['Faithfulness', 'Coverage', 'Clarity', 'Conciseness', 'Coherence', 'Overall']
    
    for dim in dimensions:
        # Try different patterns
        patterns = [
            rf'{dim}\s*\((\d+)/100\)',  # "Faithfulness (78/100):"
            rf'{dim}\s*\((\d+)\)',      # "Faithfulness (78):"
            rf'{dim}:\s*(\d+)/100',     # "Faithfulness: 78/100"
            rf'{dim}:\s*(\d+)',         # "Faithfulness: 78"
            rf'{dim}\s+(\d+)/100',      # "Faithfulness 78/100"
            rf'{dim}\s+(\d+)',          # "Faithfulness 78"
        ]
        
        for pattern in patterns:
            match = re.search(pattern, explanation, re.IGNORECASE)
            if match:
                score = int(match.group(1))
                scores[dim] = score
                break
    
    return scores

def load_eval_results(filepath):
    """Load evaluation results from JSONL file"""
    results = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data = json.loads(line)
                scores = parse_scores(data.get('explanation', ''))
                results.append({
                    'index': data.get('index', len(results)),
                    'judgment': data.get('judgment', ''),
                    'scores': scores,
                    'explanation': data.get('explanation', '')
                })
    return results

# Load all results
print("Loading evaluation results...")
finetuned_results = load_eval_results(finetuned_eval)
original_results = load_eval_results(original_eval)
base_results = load_eval_results(base_eval)

print(f"Loaded {len(finetuned_results)} fine-tuned results")
print(f"Loaded {len(original_results)} original results")
print(f"Loaded {len(base_results)} baseline results")

# Create comparison DataFrame
def create_comparison_df(results_list, names):
    """Create a comparison DataFrame from multiple result sets"""
    all_data = []
    dimensions = ['Faithfulness', 'Coverage', 'Clarity', 'Conciseness', 'Coherence', 'Overall']
    
    for idx in range(len(results_list[0])):
        row = {'index': idx}
        for i, (results, name) in enumerate(zip(results_list, names)):
            if idx < len(results):
                result = results[idx]
                row[f'{name}_judgment'] = result['judgment']
                for dim in dimensions:
                    score = result['scores'].get(dim, np.nan)
                    row[f'{name}_{dim}'] = score
        all_data.append(row)
    
    return pd.DataFrame(all_data)

# Create comparison DataFrame
comparison_df = create_comparison_df(
    [finetuned_results, original_results, base_results],
    ['finetuned', 'original', 'base']
)

print("\nComparison DataFrame created!")
print(f"Shape: {comparison_df.shape}")
print("\nFirst few rows:")
print(comparison_df.head())


In [None]:
# Calculate summary statistics
dimensions = ['Faithfulness', 'Coverage', 'Clarity', 'Conciseness', 'Coherence', 'Overall']
models = ['finetuned', 'original', 'base']

summary_stats = []
for dim in dimensions:
    for model in models:
        col = f'{model}_{dim}'
        if col in comparison_df.columns:
            scores = comparison_df[col].dropna()
            if len(scores) > 0:
                summary_stats.append({
                    'Dimension': dim,
                    'Model': model,
                    'Mean': scores.mean(),
                    'Std': scores.std(),
                    'Min': scores.min(),
                    'Max': scores.max(),
                    'Median': scores.median(),
                    'Count': len(scores)
                })

summary_df = pd.DataFrame(summary_stats)
print("Summary Statistics by Dimension and Model:")
print("=" * 80)
print(summary_df.to_string(index=False))

# Pivot for easier comparison
pivot_mean = summary_df.pivot(index='Dimension', columns='Model', values='Mean')
print("\n\nMean Scores Comparison:")
print("=" * 80)
print(pivot_mean.round(2))

# Calculate improvements
print("\n\nFine-tuned vs Baseline Improvements:")
print("=" * 80)
improvements = []
for dim in dimensions:
    base_col = f'base_{dim}'
    finetuned_col = f'finetuned_{dim}'
    if base_col in comparison_df.columns and finetuned_col in comparison_df.columns:
        base_mean = comparison_df[base_col].mean()
        finetuned_mean = comparison_df[finetuned_col].mean()
        improvement = finetuned_mean - base_mean
        improvement_pct = (improvement / base_mean * 100) if base_mean > 0 else 0
        improvements.append({
            'Dimension': dim,
            'Baseline': base_mean,
            'Fine-tuned': finetuned_mean,
            'Improvement': improvement,
            'Improvement %': improvement_pct
        })

improvements_df = pd.DataFrame(improvements)
print(improvements_df.round(2).to_string(index=False))


In [None]:
# Yes/No judgment comparison
print("Judgment (Yes/No) Comparison:")
print("=" * 80)

judgment_counts = {}
for model in models:
    col = f'{model}_judgment'
    if col in comparison_df.columns:
        counts = comparison_df[col].value_counts()
        judgment_counts[model] = counts
        print(f"\n{model.upper()}:")
        print(counts)
        if len(counts) > 0:
            yes_pct = (counts.get('Yes', 0) / len(comparison_df[col].dropna()) * 100) if len(comparison_df[col].dropna()) > 0 else 0
            print(f"  Yes: {yes_pct:.1f}%")

# Create judgment comparison DataFrame
judgment_df = pd.DataFrame(judgment_counts).fillna(0)
print("\n\nJudgment Counts Table:")
print(judgment_df)


In [None]:
# Visualizations
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, dim in enumerate(dimensions):
    ax = axes[idx]
    
    # Prepare data for box plot
    data_to_plot = []
    labels = []
    for model in models:
        col = f'{model}_{dim}'
        if col in comparison_df.columns:
            scores = comparison_df[col].dropna().tolist()
            if scores:
                data_to_plot.append(scores)
                labels.append(model.capitalize())
    
    if data_to_plot:
        bp = ax.boxplot(data_to_plot, labels=labels, patch_artist=True)
        
        # Color the boxes
        colors = ['#FF6B6B', '#4ECDC4', '#95E1D3']
        for patch, color in zip(bp['boxes'], colors[:len(bp['boxes'])]):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)
        
        ax.set_title(f'{dim} Scores Comparison', fontsize=14, fontweight='bold')
        ax.set_ylabel('Score', fontsize=12)
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 100)

plt.tight_layout()
plt.suptitle('Score Distributions by Dimension and Model', fontsize=16, fontweight='bold', y=1.02)
plt.show()


In [None]:
# Bar plot comparing mean scores
fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(dimensions))
width = 0.25

for i, model in enumerate(models):
    means = []
    stds = []
    for dim in dimensions:
        col = f'{model}_{dim}'
        if col in comparison_df.columns:
            scores = comparison_df[col].dropna()
            means.append(scores.mean() if len(scores) > 0 else 0)
            stds.append(scores.std() if len(scores) > 0 else 0)
        else:
            means.append(0)
            stds.append(0)
    
    offset = (i - 1) * width
    ax.bar(x + offset, means, width, label=model.capitalize(), 
           yerr=stds, capsize=5, alpha=0.8)

ax.set_xlabel('Dimension', fontsize=12, fontweight='bold')
ax.set_ylabel('Mean Score', fontsize=12, fontweight='bold')
ax.set_title('Mean Scores by Dimension and Model', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(dimensions, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim(0, 100)

plt.tight_layout()
plt.show()


In [None]:
# Heatmap of mean scores
heatmap_data = []
for dim in dimensions:
    row = []
    for model in models:
        col = f'{model}_{dim}'
        if col in comparison_df.columns:
            mean_score = comparison_df[col].mean()
            row.append(mean_score)
        else:
            row.append(np.nan)
    heatmap_data.append(row)

heatmap_df = pd.DataFrame(heatmap_data, index=dimensions, columns=[m.capitalize() for m in models])

plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_df, annot=True, fmt='.1f', cmap='RdYlGn', 
            vmin=0, vmax=100, cbar_kws={'label': 'Mean Score'}, 
            linewidths=0.5, linecolor='gray')
plt.title('Mean Scores Heatmap', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Model', fontsize=12, fontweight='bold')
plt.ylabel('Dimension', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# Detailed per-example comparison
print("Examples where fine-tuned model improved over baseline:")
print("=" * 80)

improved_examples = []
for idx in range(len(comparison_df)):
    row = comparison_df.iloc[idx]
    
    # Check overall score improvement
    base_overall = row.get('base_Overall', np.nan)
    finetuned_overall = row.get('finetuned_Overall', np.nan)
    
    if not np.isnan(base_overall) and not np.isnan(finetuned_overall):
        if finetuned_overall > base_overall:
            improvement = finetuned_overall - base_overall
            improved_examples.append({
                'index': idx,
                'baseline_overall': base_overall,
                'finetuned_overall': finetuned_overall,
                'improvement': improvement
            })

improved_df = pd.DataFrame(improved_examples)
if len(improved_df) > 0:
    print(f"\nTotal examples improved: {len(improved_df)} out of {len(comparison_df)}")
    print(f"Average improvement: {improved_df['improvement'].mean():.2f} points")
    print(f"Max improvement: {improved_df['improvement'].max():.2f} points")
    print("\nTop 10 improvements:")
    print(improved_df.nlargest(10, 'improvement')[['index', 'baseline_overall', 'finetuned_overall', 'improvement']].to_string(index=False))
else:
    print("No examples found where fine-tuned improved over baseline")

# Count improvements by dimension
print("\n\nImprovements by Dimension:")
print("=" * 80)
for dim in dimensions:
    base_col = f'base_{dim}'
    finetuned_col = f'finetuned_{dim}'
    if base_col in comparison_df.columns and finetuned_col in comparison_df.columns:
        improved = (comparison_df[finetuned_col] > comparison_df[base_col]).sum()
        same = (comparison_df[finetuned_col] == comparison_df[base_col]).sum()
        worse = (comparison_df[finetuned_col] < comparison_df[base_col]).sum()
        total = improved + same + worse
        print(f"\n{dim}:")
        print(f"  Improved: {improved} ({improved/total*100:.1f}%)")
        print(f"  Same: {same} ({same/total*100:.1f}%)")
        print(f"  Worse: {worse} ({worse/total*100:.1f}%)")


In [None]:
# Export comparison results to CSV
output_file = '/Users/ryanarman/code/lab/arxiv_abstract/data/evaluation_comparison.csv'
comparison_df.to_csv(output_file, index=False)
print(f"Comparison results exported to: {output_file}")

# Export summary statistics
summary_file = '/Users/ryanarman/code/lab/arxiv_abstract/data/evaluation_summary_stats.csv'
summary_df.to_csv(summary_file, index=False)
print(f"Summary statistics exported to: {summary_file}")

# Export improvements
if len(improvements_df) > 0:
    improvements_file = '/Users/ryanarman/code/lab/arxiv_abstract/data/evaluation_improvements.csv'
    improvements_df.to_csv(improvements_file, index=False)
    print(f"Improvements data exported to: {improvements_file}")

print("\nAll comparison data has been exported!")


In [11]:
# Test the helper function on the first message
# First, let's examine train_conversations[0]

conv = train_conversations[6]
print("="*80)
print(f"Number of messages: {len(conv)}")
print(f"Message roles: {[msg['role'] for msg in conv]}")
print("\n" + "="*80)
print("USER MESSAGE (the paper content):")
print("="*80)
# display_message(train_conversations[0], role='user')
# print("\n" + "="*80)
print("ASSISTANT MESSAGE (the summary being evaluated):")
print("="*80)
display_message(conv, role='assistant')

print("\n" + "="*80)
print("JUDGE EVALUATION:")
print("="*80)
result, evaluation_prompt = evaluate_summary(conv, model="gpt-5")

print(f"Judgment: {result['judgment']}")
print(f"\nExplanation:\n{result['explanation']}")


Number of messages: 2
Message roles: ['user', 'assistant']

USER MESSAGE (the paper content):
ASSISTANT MESSAGE (the summary being evaluated):
Role: ASSISTANT
Characters: 1,721 | Words: 315 | Lines: 8




JUDGE EVALUATION:
Judgment: No

Explanation:
Faithfulness (60/100):
- The summary largely reflects the paper’s topic (object recognition to aid segmentation via b-scale and shape models) and the hierarchical recognition idea. However, it introduces a specific evaluation detail not present in the provided text: “a set of 20 routine clinical abdominal female and male CT data sets.” The number 20 is not supported by the text (which uses variables for the number of subjects and mentions only that data are routine PET/CT-derived CT scans).
- It also paraphrases a reported finding as “recognition accuracy” improving with more objects, whereas the paper explicitly states “specificity” increases; this is a subtle but meaningful shift in the reported metric.
- It slightly overstates the conclusion with “make delineation most accurate,” which is stronger than the paper’s framing that efficient recognition enables successful delineation.

Coverage (85/100):
- The summary states the main problem 

## Eval original abstracts

In [64]:
# Batch evaluation for the first 10 conversations

original_results, original_errors = evaluate_summaries_batch(
    test_conversations,
    model="gpt-5",
    temperature=1.0,
    max_workers=1000,  # Adjust based on your API rate limits
    show_progress=True
)

# Display results
print("\n" + "="*80)
print("BATCH EVALUATION RESULTS")
print("="*80)

for idx, result, prompt in original_results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    # Optionally show a snippet of the explanation
    explanation_preview = result['explanation'][:200] + "..." if len(result['explanation']) > 200 else result['explanation']
    print(f"  Explanation preview: {explanation_preview}")

if original_errors:
    print(f"\nErrors encountered: {len(original_errors)}")
    for idx, error in original_errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in original_results]
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {len(original_results)}")
print(f"Yes: {yes_count} ({yes_count/len(original_results)*100:.1f}%)")
print(f"No: {no_count} ({no_count/len(original_results)*100:.1f}%)")
print(f"Unknown: {unknown_count} ({unknown_count/len(original_results)*100:.1f}%)")


Evaluating 1000 conversations with 1000 workers...
  Completed 1/1000
  Completed 2/1000
  Completed 3/1000
  Completed 4/1000
  Completed 5/1000
  Completed 6/1000
  Completed 7/1000
  Completed 8/1000
  Completed 9/1000
  Completed 10/1000
  Completed 11/1000
  Completed 12/1000
  Completed 13/1000
  Completed 14/1000
  Completed 15/1000
  Completed 16/1000
  Completed 17/1000
  Completed 18/1000
  Completed 19/1000
  Completed 20/1000
  Completed 21/1000
  Completed 22/1000
  Completed 23/1000
  Completed 24/1000
  Completed 25/1000
  Completed 26/1000
  Completed 27/1000
  Completed 28/1000
  Completed 29/1000
  Completed 30/1000
  Completed 31/1000
  Completed 32/1000
  Completed 33/1000
  Completed 34/1000
  Completed 35/1000
  Completed 36/1000
  Completed 37/1000
  Completed 38/1000
  Completed 39/1000
  Completed 40/1000
  Completed 41/1000
  Completed 42/1000
  Completed 43/1000
  Completed 44/1000
  Completed 45/1000
  Completed 46/1000
  Completed 47/1000
  Completed 48/100

In [65]:
# Save results to JSONL file
import json

# Create filename with baseline_eval in it
output_filename = f"/Users/ryanarman/code/lab/arxiv_abstract/data/original_eval_qwen3_4b_test.jsonl"

# Write results to JSONL file
with open(output_filename, 'w', encoding='utf-8') as f:
    for idx, result, prompt in original_results:
        output_data = {
            'index': idx,
            'judgment': result['judgment'],
            'explanation': result['explanation'],
            'evaluation_prompt': prompt
        }
        f.write(json.dumps(output_data, ensure_ascii=False) + '\n')

print(f"Saved {len(original_results)} results to {output_filename}")


Saved 1000 results to /Users/ryanarman/code/lab/arxiv_abstract/data/original_eval_qwen3_4b_test.jsonl


## original abstract: 75%

# Run inference on the base model

In [59]:
train_instruct_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct.jsonl"
val_instruct_path =   "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_instruct.jsonl"
test_instruct_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_instruct.jsonl"
# train_distilled_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_abstract_train_gpt5mini_think2.jsonl"
train_instruct_conversations = load_conversations(train_instruct_path)
val_instruct_conversations = load_conversations(val_instruct_path)
test_instruct_conversations = load_conversations(test_instruct_path)


In [60]:
train_instruct_conversations[0]

[{'role': 'system',
  'content': "You are an expert academic abstract writer. Your task is to create a high-quality abstract for an arXiv paper based on the paper content and judge evaluation feedback.\n\nThe judge evaluates abstracts based on five dimensions:\n1. Faithfulness: The abstract must accurately reflect the paper's content without hallucination\n2. Coverage: The abstract must include the essential aspects (main problem, approach, and key results)\n3. Clarity: The abstract must be understandable and readable\n4. Conciseness: The abstract must be focused and not verbose\n5. Coherence: The abstract must be logically structured and flow naturally\n\nWhen creating the abstract:\n- Read the paper content carefully\n- Pay attention to the judge's feedback on what makes a good abstract\n- Ensure your abstract meets all five evaluation criteria\n- Write a concise, clear, and coherent summary that accurately covers the paper's main contributions\n- Focus on the main problem, approach,

In [14]:
test_instruct_conversations[0]

[{'role': 'system',
  'content': "You are an expert academic abstract writer. Your task is to create a high-quality abstract for an arXiv paper based on the paper content and judge evaluation feedback.\n\nThe judge evaluates abstracts based on five dimensions:\n1. Faithfulness: The abstract must accurately reflect the paper's content without hallucination\n2. Coverage: The abstract must include the essential aspects (main problem, approach, and key results)\n3. Clarity: The abstract must be understandable and readable\n4. Conciseness: The abstract must be focused and not verbose\n5. Coherence: The abstract must be logically structured and flow naturally\n\nWhen creating the abstract:\n- Read the paper content carefully\n- Pay attention to the judge's feedback on what makes a good abstract\n- Ensure your abstract meets all five evaluation criteria\n- Write a concise, clear, and coherent summary that accurately covers the paper's main contributions\n- Focus on the main problem, approach,

How to run inference on the cluster

From the arxiv_abstract root directory:

Option 1: Basic usage (uses defaults)
  cd scripts
  ./submit_inference_rsync.sh

Option 2: Custom input file
  cd scripts
  ./submit_inference_rsync.sh ../data/arxiv_summarization_test_instruct.jsonl

Option 3: Full specification
  cd scripts
  ./submit_inference_rsync.sh \
    ../data/arxiv_summarization_test_instruct.jsonl \
    ../configs/4b_instruct_vllm_infer.yaml \
    qwen3_4b_test

Option 4: With trained LoRA adapter
  cd scripts
  ./submit_inference_rsync.sh \
    ../data/arxiv_summarization_test_instruct.jsonl \
    ../configs/4b_instruct_vllm_infer.yaml \
    output_name \
    path/to/adapter/checkpoint \
    ryan@exun

Defaults:
  - Input: data/arxiv_summarization_test_instruct.jsonl
  - Config: configs/4b_instruct_vllm_infer.yaml
  - Output name: output
  - Cluster: ryan@exun

The script will:
  1. Copy files to the cluster using rsync
  2. Submit a SLURM job for inference
  3. Output will be saved to: data/output_<job_id>.jsonl on the cluster

To check job status:
  ssh ryan@exun 'squeue -u ryan'

To view logs:
  ssh ryan@exun 'tail -f /home/ryan/code/oumi/lab/arxiv_abstract/logs/arxiv_abstract_inference_qwen3_4b_*.log'

To download results:
  scp ryan@exun:/home/ryan/code/oumi/lab/arxiv_abstract/data/output_*.jsonl ./data/

In [30]:
base_qwen3_4b_test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/qwen3_4b_test_2795.jsonl"
# train_distilled_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_abstract_train_gpt5mini_think2.jsonl"
base_qwen3_4b_test_conversations = load_conversations(base_qwen3_4b_test_path)


In [41]:
base_qwen3_4b_test_conversations_eval = [conv[1:] for conv in base_qwen3_4b_test_conversations]

In [None]:
conv = base_qwen3_4b_test_conversations_eval[0]

[{'content': 'Paper Content:\nas a common quantum phenomenon , the tunneling through a potential barrier plays a very important role in the microscopic world and has been studied extensively since the birth of quantum mechanics . \n one of the earliest applications of quantum tunneling is the explanation of @xmath0 decays in atomic nuclei . \n the quantum tunneling effect governs also many other nuclear processes such as fission and fusion . \n in particular , a lot of new features are revealed in sub - barrier fusion reactions which are closely connected with the tunneling phenomena  @xcite .    for most of the potential barriers , the penetrability can not be calculated analytically  @xcite . among those potentials for which analytical solutions can be obtained , \n the parabolic potential  @xcite is the mostly used in the study of nuclear fusion . by approximating the coulomb barrier to a parabola \n , wong derived an analytic expression for the fusion cross section  @xcite which is

In [None]:
result, evaluation_prompt = evaluate_summary(conv, model="gpt-5")
print(f"Judgment: {result['judgment']}")
print(f"\nExplanation:\n{result['explanation']}")



In [None]:
# Batch evaluation for the first 10 conversations

base_results, base_errors = evaluate_summaries_batch(
    # base_qwen3_4b_test_conversations_eval[:10],
    base_qwen3_4b_test_conversations_eval,
    model="gpt-5",
    temperature=1.0,
    max_workers=1000,  # Adjust based on your API rate limits
    show_progress=True
)

# Display results
print("\n" + "="*80)
print("BATCH EVALUATION RESULTS")
print("="*80)

for idx, result, prompt in base_results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    # Optionally show a snippet of the explanation
    explanation_preview = result['explanation'][:200] + "..." if len(result['explanation']) > 200 else result['explanation']
    print(f"  Explanation preview: {explanation_preview}")

if base_errors:
    print(f"\nErrors encountered: {len(base_errors)}")
    for idx, error in base_errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in base_results]
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {len(base_results)}")
print(f"Yes: {yes_count} ({yes_count/len(base_results)*100:.1f}%)")
print(f"No: {no_count} ({no_count/len(base_results)*100:.1f}%)")
print(f"Unknown: {unknown_count} ({unknown_count/len(base_results)*100:.1f}%)")


Evaluating 1000 conversations with 1000 workers...
  Completed 1/1000
  Completed 2/1000
  Completed 3/1000
  Completed 4/1000
  Completed 5/1000
  Completed 6/1000
  Completed 7/1000
  Completed 8/1000
  Completed 9/1000
  Completed 10/1000
  Completed 11/1000
  Completed 12/1000
  Completed 13/1000
  Completed 14/1000
  Completed 15/1000
  Completed 16/1000
  Completed 17/1000
  Completed 18/1000
  Completed 19/1000
  Completed 20/1000
  Completed 21/1000
  Completed 22/1000
  Completed 23/1000
  Completed 24/1000
  Completed 25/1000
  Completed 26/1000
  Completed 27/1000
  Completed 28/1000
  Completed 29/1000
  Completed 30/1000
  Completed 31/1000
  Completed 32/1000
  Completed 33/1000
  Completed 34/1000
  Completed 35/1000
  Completed 36/1000
  Completed 37/1000
  Completed 38/1000
  Completed 39/1000
  Completed 40/1000
  Completed 41/1000
  Completed 42/1000
  Completed 43/1000
  Completed 44/1000
  Completed 45/1000
  Completed 46/1000
  Completed 47/1000
  Completed 48/100

In [None]:
# Save results to JSONL file
import json

# Create filename with baseline_eval in it
output_filename = f"/Users/ryanarman/code/lab/arxiv_abstract/data/baseline_eval_qwen3_4b_test.jsonl"

# Write results to JSONL file
with open(output_filename, 'w', encoding='utf-8') as f:
    for idx, result, prompt in results:
        output_data = {
            'index': idx,
            'judgment': result['judgment'],
            'explanation': result['explanation'],
            'evaluation_prompt': prompt
        }
        f.write(json.dumps(output_data, ensure_ascii=False) + '\n')

print(f"Saved {len(results)} results to {output_filename}")


Saved 1000 results to /Users/ryanarman/code/lab/arxiv_abstract/data/baseline_eval_qwen3_4b_test.jsonl


score of basemodel: 49.7%

# Training

train_instruct_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct.jsonl"
val_instruct_path =   "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_instruct.jsonl"

./submit_training_rsync.sh \
  data/arxiv_summarization_train_instruct.jsonl \
  data/arxiv_summarization_val_instruct.jsonl \
  configs/qwen4b_train_lora.yaml \
  my_custom_output_name \
  ryan@exun \
  arxiv-abstract-qwen3-4b \
  my-team

In [None]:
cd /Users/ryanarman/code/lab/arxiv_abstract/scripts
./submit_training_rsync.sh

#Eval trained model

In [None]:
cd /Users/ryanarman/code/lab/arxiv_abstract/scripts

./submit_inference_rsync.sh \
  data/arxiv_summarization_test_instruct.jsonl \
  configs/4b_instruct_vllm_infer_checkpoint.yaml \
  fine_tuned_results \
  output/arxiv_abstract_qwen3_4b_lora_2800

# Run judge

In [61]:
tuned_qwen3_4b_test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_results_2801.jsonl"
tuned_qwen3_4b_test_conversations = load_conversations(tuned_qwen3_4b_test_path)


In [None]:
# Batch evaluation for the first 10 conversations

fine_tuned_results, fine_tuned_errors = evaluate_summaries_batch(
    # base_qwen3_4b_test_conversations_eval[:10],
    tuned_qwen3_4b_test_conversations,
    model="gpt-5",
    temperature=1.0,
    max_workers=1000,  # Adjust based on your API rate limits
    show_progress=True
)

# Display results
print("\n" + "="*80)
print("BATCH EVALUATION RESULTS")
print("="*80)

for idx, result, prompt in fine_tuned_results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    # Optionally show a snippet of the explanation
    explanation_preview = result['explanation'][:200] + "..." if len(result['explanation']) > 200 else result['explanation']
    print(f"  Explanation preview: {explanation_preview}")

if fine_tuned_errors:
    print(f"\nErrors encountered: {len(fine_tuned_errors)}")
    for idx, error in fine_tuned_errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in fine_tuned_results]
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {len(fine_tuned_results)}")
print(f"Yes: {yes_count} ({yes_count/len(fine_tuned_results)*100:.1f}%)")
print(f"No: {no_count} ({no_count/len(fine_tuned_results)*100:.1f}%)")
print(f"Unknown: {unknown_count} ({unknown_count/len(fine_tuned_results)*100:.1f}%)")


Evaluating 1000 conversations with 1000 workers...
  Completed 1/1000
  Completed 2/1000
  Completed 3/1000
  Completed 4/1000
  Completed 5/1000
  Completed 6/1000
  Completed 7/1000
  Completed 8/1000
  Completed 9/1000
  Completed 10/1000
  Completed 11/1000
  Completed 12/1000
  Completed 13/1000
  Completed 14/1000
  Completed 15/1000
  Completed 16/1000
  Completed 17/1000
  Completed 18/1000
  Completed 19/1000
  Completed 20/1000
  Completed 21/1000
  Completed 22/1000
  Completed 23/1000
  Completed 24/1000
  Completed 25/1000
  Completed 26/1000
  Completed 27/1000
  Completed 28/1000
  Completed 29/1000
  Completed 30/1000
  Completed 31/1000
  Completed 32/1000
  Completed 33/1000
  Completed 34/1000
  Completed 35/1000
  Completed 36/1000
  Completed 37/1000
  Completed 38/1000
  Completed 39/1000
  Completed 40/1000
  Completed 41/1000
  Completed 42/1000
  Completed 43/1000
  Completed 44/1000
  Completed 45/1000
  Completed 46/1000
  Completed 47/1000
  Completed 48/100

In [None]:
# Save results to JSONL file
import json

# Create filename with baseline_eval in it
output_filename = f"/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_eval_qwen3_4b_test.jsonl"

# Write results to JSONL file
with open(output_filename, 'w', encoding='utf-8') as f:
    for idx, result, prompt in fine_tuned_results:
        output_data = {
            'index': idx,
            'judgment': result['judgment'],
            'explanation': result['explanation'],
            'evaluation_prompt': prompt
        }
        f.write(json.dumps(output_data, ensure_ascii=False) + '\n')

print(f"Saved {len(fine_tuned_results)} results to {output_filename}")


Saved 1000 results to /Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_eval_qwen3_4b_test.jsonl


## score finetuned: 7%

# Compare results

In [None]:
finetuned_eval = '/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_eval_qwen3_4b_test.jsonl'
original_eval = '/Users/ryanarman/code/lab/arxiv_abstract/data/original_eval_qwen3_4b_test.jsonl'
base_eval = '/Users/ryanarman/code/lab/arxiv_abstract/data/baseline_eval_qwen3_4b_test.jsonl'





NameError: name 'load_results' is not defined