In [2]:
# Import utility functions
from utils import (
    evaluate_summary,
    evaluate_summaries_batch,
    display_text,
    display_message,
    load_conversations,
    client,
    JUDGE_SYSTEM_INSTRUCTION,
    JUDGE_PROMPT_TEMPLATE_WITH_REQUEST_AND_RESPONSE
)




# Examine data

In [3]:

# Filtered 10k splits used throughout this notebook. Alternatives that were
# tried earlier: the unfiltered arxiv_summarization_train.jsonl and the
# distilled arxiv_abstract_train_gpt5mini_think2.jsonl.
data_dir = "/Users/ryanarman/code/lab/arxiv_abstract/data"
train_path = f"{data_dir}/arxiv_summarization_train_filtered_10k.jsonl"
val_path = f"{data_dir}/arxiv_summarization_val_filtered_10k.jsonl"
test_path = f"{data_dir}/arxiv_summarization_test_filtered_10k.jsonl"

# Load the three splits in train / val / test order.
train_conversations, val_conversations, test_conversations = (
    load_conversations(split_path)
    for split_path in (train_path, val_path, test_path)
)

In [3]:
# Peek at the first test conversation to see how its messages are structured.
test_conversations[0]

[{'role': 'user',
  'content': 'as a common quantum phenomenon , the tunneling through a potential barrier plays a very important role in the microscopic world and has been studied extensively since the birth of quantum mechanics . \n one of the earliest applications of quantum tunneling is the explanation of @xmath0 decays in atomic nuclei . \n the quantum tunneling effect governs also many other nuclear processes such as fission and fusion . \n in particular , a lot of new features are revealed in sub - barrier fusion reactions which are closely connected with the tunneling phenomena  @xcite .    for most of the potential barriers , the penetrability can not be calculated analytically  @xcite . among those potentials for which analytical solutions can be obtained , \n the parabolic potential  @xcite is the mostly used in the study of nuclear fusion . by approximating the coulomb barrier to a parabola \n , wong derived an analytic expression for the fusion cross section  @xcite which 

# Judge

In [11]:
# Smoke-test the judge on a single training conversation.
# Index 6 is just the example being inspected; change it to look at another one.
conv = train_conversations[6]
print("="*80)
print(f"Number of messages: {len(conv)}")
print(f"Message roles: {[msg['role'] for msg in conv]}")

# The user message (the paper content) is deliberately not displayed here.
# Call display_message(conv, role='user') to inspect it.
print("\n" + "="*80)
print("ASSISTANT MESSAGE (the summary being evaluated):")
print("="*80)
display_message(conv, role='assistant')

print("\n" + "="*80)
print("JUDGE EVALUATION:")
print("="*80)
# evaluate_summary returns a (result, evaluation_prompt) pair; result exposes
# the 'judgment' and 'explanation' keys printed below.
result, evaluation_prompt = evaluate_summary(conv, model="gpt-5")

print(f"Judgment: {result['judgment']}")
print(f"\nExplanation:\n{result['explanation']}")


Number of messages: 2
Message roles: ['user', 'assistant']

USER MESSAGE (the paper content):
ASSISTANT MESSAGE (the summary being evaluated):
Role: ASSISTANT
Characters: 1,721 | Words: 315 | Lines: 8




JUDGE EVALUATION:
Judgment: No

Explanation:
Faithfulness (60/100):
- The summary largely reflects the paper’s topic (object recognition to aid segmentation via b-scale and shape models) and the hierarchical recognition idea. However, it introduces a specific evaluation detail not present in the provided text: “a set of 20 routine clinical abdominal female and male CT data sets.” The number 20 is not supported by the text (which uses variables for the number of subjects and mentions only that data are routine PET/CT-derived CT scans).
- It also paraphrases a reported finding as “recognition accuracy” improving with more objects, whereas the paper explicitly states “specificity” increases; this is a subtle but meaningful shift in the reported metric.
- It slightly overstates the conclusion with “make delineation most accurate,” which is stronger than the paper’s framing that efficient recognition enables successful delineation.

Coverage (85/100):
- The summary states the main problem 

## Eval original abstracts

In [64]:
# Batch evaluation of every conversation in the test split (original abstracts)
original_results, original_errors = evaluate_summaries_batch(
    test_conversations,
    model="gpt-5",
    temperature=1.0,
    max_workers=1000,  # Adjust based on your API rate limits
    show_progress=True
)

# Display results
print("\n" + "="*80)
print("BATCH EVALUATION RESULTS")
print("="*80)

for idx, result, prompt in original_results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    # Show only the first 200 characters of the explanation to keep the listing short
    explanation_preview = result['explanation'][:200] + "..." if len(result['explanation']) > 200 else result['explanation']
    print(f"  Explanation preview: {explanation_preview}")

if original_errors:
    print(f"\nErrors encountered: {len(original_errors)}")
    for idx, error in original_errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in original_results]
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')
total = len(original_results)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {total}")
for label, count in (("Yes", yes_count), ("No", no_count), ("Unknown", unknown_count)):
    # Report 0.0% instead of raising ZeroDivisionError when nothing was evaluated
    # (e.g. every request ended up in original_errors).
    share = count / total * 100 if total else 0.0
    print(f"{label}: {count} ({share:.1f}%)")


Evaluating 1000 conversations with 1000 workers...
  Completed 1/1000
  Completed 2/1000
  Completed 3/1000
  Completed 4/1000
  Completed 5/1000
  Completed 6/1000
  Completed 7/1000
  Completed 8/1000
  Completed 9/1000
  Completed 10/1000
  Completed 11/1000
  Completed 12/1000
  Completed 13/1000
  Completed 14/1000
  Completed 15/1000
  Completed 16/1000
  Completed 17/1000
  Completed 18/1000
  Completed 19/1000
  Completed 20/1000
  Completed 21/1000
  Completed 22/1000
  Completed 23/1000
  Completed 24/1000
  Completed 25/1000
  Completed 26/1000
  Completed 27/1000
  Completed 28/1000
  Completed 29/1000
  Completed 30/1000
  Completed 31/1000
  Completed 32/1000
  Completed 33/1000
  Completed 34/1000
  Completed 35/1000
  Completed 36/1000
  Completed 37/1000
  Completed 38/1000
  Completed 39/1000
  Completed 40/1000
  Completed 41/1000
  Completed 42/1000
  Completed 43/1000
  Completed 44/1000
  Completed 45/1000
  Completed 46/1000
  Completed 47/1000
  Completed 48/100

In [65]:
# Persist the judge output for the original abstracts as JSONL (one object per line)
import json

# Destination file for this evaluation run
output_filename = "/Users/ryanarman/code/lab/arxiv_abstract/data/original_eval_qwen3_4b_test.jsonl"

with open(output_filename, 'w', encoding='utf-8') as f:
    for idx, result, prompt in original_results:
        output_data = dict(
            index=idx,
            judgment=result['judgment'],
            explanation=result['explanation'],
            evaluation_prompt=prompt,
        )
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
        print(json.dumps(output_data, ensure_ascii=False), file=f)

print("Saved {} results to {}".format(len(original_results), output_filename))


Saved 1000 results to /Users/ryanarman/code/lab/arxiv_abstract/data/original_eval_qwen3_4b_test.jsonl


## original abstract: 75%

# Evaluate the teacher

In [86]:
# Test conversations for the teacher evaluation — judging by the file name these
# carry GPT-5-written summaries; confirm against how the file was produced.
gpt5_test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_test_instruct_gpt5.jsonl"
gpt5_test_conversations = load_conversations(gpt5_test_path)

In [None]:
# Run the judge over the teacher conversations loaded above.
gpt5_results, gpt5_errors = evaluate_summaries_batch(
    gpt5_test_conversations,
    model="gpt-5",
    temperature=1.0,
    max_workers=1000,  # Adjust based on your API rate limits
    show_progress=True,
)

# Banner for the per-conversation listing
print("\n" + "=" * 80, "BATCH EVALUATION RESULTS", "=" * 80, sep="\n")

In [89]:


# Per-conversation listing of the teacher evaluation
for idx, result, prompt in gpt5_results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    # Show only the first 200 characters of the explanation to keep the listing short
    explanation_preview = result['explanation'][:200] + "..." if len(result['explanation']) > 200 else result['explanation']
    print(f"  Explanation preview: {explanation_preview}")

if gpt5_errors:
    print(f"\nErrors encountered: {len(gpt5_errors)}")
    for idx, error in gpt5_errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in gpt5_results]
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')
total = len(gpt5_results)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {total}")
for label, count in (("Yes", yes_count), ("No", no_count), ("Unknown", unknown_count)):
    # Report 0.0% instead of raising ZeroDivisionError when nothing was evaluated
    share = count / total * 100 if total else 0.0
    print(f"{label}: {count} ({share:.1f}%)")



Conversation 0:
  Judgment: Yes
  Explanation preview: Faithfulness (97/100):
- The summary accurately reflects the paper’s topic (quantum tunneling/barrier penetration), the limitation of the parabolic (Hill–Wheeler) approximation at deep sub-barrier ene...

Conversation 1:
  Judgment: Yes
  Explanation preview: Faithfulness (98/100):
- The summary accurately reflects the paper’s goals, methods, and findings. It correctly describes the combination of force-gradient and multirate (nested) ideas, the introducti...

Conversation 2:
  Judgment: Yes
  Explanation preview: Faithfulness (94/100):
- The summary accurately reflects the paper’s main contributions and scope. It correctly identifies two integration-by-parts–free methods: (1) algebraic relations between produc...

Conversation 3:
  Judgment: Yes
  Explanation preview: Faithfulness (0-100): 96
- The summary accurately reflects the paper’s topic (hierarchical Hough-transform searches for continuous GWs), the proposed method (freque

In [91]:
# Persist the teacher evaluation as JSONL (one object per line)
import json

# Destination file for this evaluation run
output_filename = "/Users/ryanarman/code/lab/arxiv_abstract/data/eval_gpt5_test.jsonl"

with open(output_filename, 'w', encoding='utf-8') as f:
    for idx, result, prompt in gpt5_results:
        output_data = dict(
            index=idx,
            judgment=result['judgment'],
            explanation=result['explanation'],
            evaluation_prompt=prompt,
        )
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
        print(json.dumps(output_data, ensure_ascii=False), file=f)

print("Saved {} results to {}".format(len(gpt5_results), output_filename))


Saved 1000 results to /Users/ryanarman/code/lab/arxiv_abstract/data/eval_gpt5_test.jsonl


# Run inference on the base model

In [59]:
# Instruct-formatted splits, used for inference and training.
instruct_dir = "/Users/ryanarman/code/lab/arxiv_abstract/data"
train_instruct_path = f"{instruct_dir}/arxiv_summarization_train_instruct.jsonl"
val_instruct_path = f"{instruct_dir}/arxiv_summarization_val_instruct.jsonl"
test_instruct_path = f"{instruct_dir}/arxiv_summarization_test_instruct.jsonl"

# Load the three splits in train / val / test order.
train_instruct_conversations, val_instruct_conversations, test_instruct_conversations = (
    load_conversations(split_path)
    for split_path in (train_instruct_path, val_instruct_path, test_instruct_path)
)


In [60]:
# Inspect the first instruct-formatted training conversation.
train_instruct_conversations[0]

[{'role': 'system',
  'content': "You are an expert academic abstract writer. Your task is to create a high-quality abstract for an arXiv paper based on the paper content and judge evaluation feedback.\n\nThe judge evaluates abstracts based on five dimensions:\n1. Faithfulness: The abstract must accurately reflect the paper's content without hallucination\n2. Coverage: The abstract must include the essential aspects (main problem, approach, and key results)\n3. Clarity: The abstract must be understandable and readable\n4. Conciseness: The abstract must be focused and not verbose\n5. Coherence: The abstract must be logically structured and flow naturally\n\nWhen creating the abstract:\n- Read the paper content carefully\n- Pay attention to the judge's feedback on what makes a good abstract\n- Ensure your abstract meets all five evaluation criteria\n- Write a concise, clear, and coherent summary that accurately covers the paper's main contributions\n- Focus on the main problem, approach,

In [14]:
# Inspect the first instruct-formatted test conversation.
test_instruct_conversations[0]

[{'role': 'system',
  'content': "You are an expert academic abstract writer. Your task is to create a high-quality abstract for an arXiv paper based on the paper content and judge evaluation feedback.\n\nThe judge evaluates abstracts based on five dimensions:\n1. Faithfulness: The abstract must accurately reflect the paper's content without hallucination\n2. Coverage: The abstract must include the essential aspects (main problem, approach, and key results)\n3. Clarity: The abstract must be understandable and readable\n4. Conciseness: The abstract must be focused and not verbose\n5. Coherence: The abstract must be logically structured and flow naturally\n\nWhen creating the abstract:\n- Read the paper content carefully\n- Pay attention to the judge's feedback on what makes a good abstract\n- Ensure your abstract meets all five evaluation criteria\n- Write a concise, clear, and coherent summary that accurately covers the paper's main contributions\n- Focus on the main problem, approach,

How to run inference on the cluster

From the arxiv_abstract root directory:

Option 1: Basic usage (uses defaults)
  cd scripts
  ./submit_inference_rsync.sh

Option 2: Custom input file
  cd scripts
  ./submit_inference_rsync.sh ../data/arxiv_summarization_test_instruct.jsonl

Option 3: Full specification
  cd scripts
  ./submit_inference_rsync.sh \
    ../data/arxiv_summarization_test_instruct.jsonl \
    ../configs/4b_instruct_vllm_infer.yaml \
    qwen3_4b_test

Option 4: With trained LoRA adapter
  cd scripts
  ./submit_inference_rsync.sh \
    ../data/arxiv_summarization_test_instruct.jsonl \
    ../configs/4b_instruct_vllm_infer.yaml \
    output_name \
    path/to/adapter/checkpoint \
    ryan@exun

Defaults:
  - Input: data/arxiv_summarization_test_instruct.jsonl
  - Config: configs/4b_instruct_vllm_infer.yaml
  - Output name: output
  - Cluster: ryan@exun

The script will:
  1. Copy files to the cluster using rsync
  2. Submit a SLURM job for inference
  3. Output will be saved to: data/output_<job_id>.jsonl on the cluster

To check job status:
  ssh ryan@exun 'squeue -u ryan'

To view logs:
  ssh ryan@exun 'tail -f /home/ryan/code/oumi/lab/arxiv_abstract/logs/arxiv_abstract_inference_qwen3_4b_*.log'

To download results:
  scp ryan@exun:/home/ryan/code/oumi/lab/arxiv_abstract/data/output_*.jsonl ./data/

In [30]:
# Inference output to judge for the base model — presumably the file downloaded
# from the cluster after the inference job; verify the job id in the name.
base_qwen3_4b_test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/qwen3_4b_test_2795.jsonl"
# train_distilled_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_abstract_train_gpt5mini_think2.jsonl"
base_qwen3_4b_test_conversations = load_conversations(base_qwen3_4b_test_path)


In [41]:
# Drop the first message of every conversation before judging
# (presumably the system prompt of the instruct format — TODO confirm).
base_qwen3_4b_test_conversations_eval = [conv[1:] for conv in base_qwen3_4b_test_conversations]

In [None]:
# Pick the first stripped base-model conversation for a single-example judge run.
conv = base_qwen3_4b_test_conversations_eval[0]

[{'content': 'Paper Content:\nas a common quantum phenomenon , the tunneling through a potential barrier plays a very important role in the microscopic world and has been studied extensively since the birth of quantum mechanics . \n one of the earliest applications of quantum tunneling is the explanation of @xmath0 decays in atomic nuclei . \n the quantum tunneling effect governs also many other nuclear processes such as fission and fusion . \n in particular , a lot of new features are revealed in sub - barrier fusion reactions which are closely connected with the tunneling phenomena  @xcite .    for most of the potential barriers , the penetrability can not be calculated analytically  @xcite . among those potentials for which analytical solutions can be obtained , \n the parabolic potential  @xcite is the mostly used in the study of nuclear fusion . by approximating the coulomb barrier to a parabola \n , wong derived an analytic expression for the fusion cross section  @xcite which is

In [None]:
# Judge the single conversation held in `conv` before launching the full batch.
# evaluate_summary returns a (result, evaluation_prompt) pair; result exposes
# the 'judgment' and 'explanation' keys printed below.
result, evaluation_prompt = evaluate_summary(conv, model="gpt-5")
print(f"Judgment: {result['judgment']}")
print(f"\nExplanation:\n{result['explanation']}")



In [None]:
# Batch evaluation of all base-model conversations.
# For a quick trial run, pass a slice such as base_qwen3_4b_test_conversations_eval[:10].
base_results, base_errors = evaluate_summaries_batch(
    base_qwen3_4b_test_conversations_eval,
    model="gpt-5",
    temperature=1.0,
    max_workers=1000,  # Adjust based on your API rate limits
    show_progress=True
)

# Display results
print("\n" + "="*80)
print("BATCH EVALUATION RESULTS")
print("="*80)

for idx, result, prompt in base_results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    # Show only the first 200 characters of the explanation to keep the listing short
    explanation_preview = result['explanation'][:200] + "..." if len(result['explanation']) > 200 else result['explanation']
    print(f"  Explanation preview: {explanation_preview}")

if base_errors:
    print(f"\nErrors encountered: {len(base_errors)}")
    for idx, error in base_errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in base_results]
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')
total = len(base_results)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {total}")
for label, count in (("Yes", yes_count), ("No", no_count), ("Unknown", unknown_count)):
    # Report 0.0% instead of raising ZeroDivisionError when nothing was evaluated
    share = count / total * 100 if total else 0.0
    print(f"{label}: {count} ({share:.1f}%)")


Evaluating 1000 conversations with 1000 workers...
  Completed 1/1000
  Completed 2/1000
  Completed 3/1000
  Completed 4/1000
  Completed 5/1000
  Completed 6/1000
  Completed 7/1000
  Completed 8/1000
  Completed 9/1000
  Completed 10/1000
  Completed 11/1000
  Completed 12/1000
  Completed 13/1000
  Completed 14/1000
  Completed 15/1000
  Completed 16/1000
  Completed 17/1000
  Completed 18/1000
  Completed 19/1000
  Completed 20/1000
  Completed 21/1000
  Completed 22/1000
  Completed 23/1000
  Completed 24/1000
  Completed 25/1000
  Completed 26/1000
  Completed 27/1000
  Completed 28/1000
  Completed 29/1000
  Completed 30/1000
  Completed 31/1000
  Completed 32/1000
  Completed 33/1000
  Completed 34/1000
  Completed 35/1000
  Completed 36/1000
  Completed 37/1000
  Completed 38/1000
  Completed 39/1000
  Completed 40/1000
  Completed 41/1000
  Completed 42/1000
  Completed 43/1000
  Completed 44/1000
  Completed 45/1000
  Completed 46/1000
  Completed 47/1000
  Completed 48/100

In [None]:
# Save the base-model judge results to a JSONL file (one object per line)
import json

# Destination file for the baseline (base model) evaluation
output_filename = "/Users/ryanarman/code/lab/arxiv_abstract/data/baseline_eval_qwen3_4b_test.jsonl"

# Write results to JSONL file.
# Iterate over base_results (produced by the base-model batch evaluation); the
# previous name `results` is not defined anywhere in this notebook and only
# worked through leftover kernel state.
with open(output_filename, 'w', encoding='utf-8') as f:
    for idx, result, prompt in base_results:
        output_data = {
            'index': idx,
            'judgment': result['judgment'],
            'explanation': result['explanation'],
            'evaluation_prompt': prompt
        }
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
        f.write(json.dumps(output_data, ensure_ascii=False) + '\n')

print(f"Saved {len(base_results)} results to {output_filename}")


Saved 1000 results to /Users/ryanarman/code/lab/arxiv_abstract/data/baseline_eval_qwen3_4b_test.jsonl


Score of base model: 49.7%

# Training

train_instruct_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct.jsonl"
val_instruct_path =   "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_val_instruct.jsonl"

./submit_training_rsync.sh \
  data/arxiv_summarization_train_instruct.jsonl \
  data/arxiv_summarization_val_instruct.jsonl \
  configs/qwen4b_train_lora.yaml \
  my_custom_output_name \
  ryan@exun \
  arxiv-abstract-qwen3-4b \
  my-team

# Trained on gpt-5 distilled on 1k train, 1k validation
   /home/ryan/code/oumi/lab/arxiv_abstract/output/arxiv_abstract_qwen3_4b_gpt5_lora_2807

In [None]:
# Training file for the "article" GPT-5 run, judging by the file name (TODO confirm);
# not referenced by any other cell visible in this notebook.
gen_train_path = "/Users/ryanarman/code/lab/arxiv_abstract/data/arxiv_summarization_train_instruct_article_gpt5.jsonl"


cd /Users/ryanarman/code/lab/arxiv_abstract/scripts
./submit_training_rsync.sh

# Eval trained model

In [None]:
cd /Users/ryanarman/code/lab/arxiv_abstract/scripts

# Argument order for each call:
#   <input jsonl> <inference config> <output name> <adapter checkpoint>

./submit_inference_rsync.sh \
  data/arxiv_summarization_test_instruct.jsonl \
  configs/4b_instruct_vllm_infer_checkpoint.yaml \
  fine_tuned_results \
  output/arxiv_abstract_qwen3_4b_lora_2800

./submit_inference_rsync.sh \
  data/arxiv_summarization_test_instruct.jsonl \
  configs/4b_instruct_vllm_infer_checkpoint.yaml \
  fine_tuned_gpt5_distilled_results \
  output/arxiv_abstract_qwen3_4b_gpt5_lora_2808

./submit_inference_rsync.sh \
  data/arxiv_summarization_test_instruct.jsonl \
  configs/4b_instruct_vllm_infer_checkpoint.yaml \
  fine_tuned_gpt5_distilled_article_results \
  output/arxiv_abstract_qwen3_4b_gpt5_article_lora_fixed_2820

# The adapter path needs the same output/ prefix as the other runs.
./submit_inference_rsync.sh \
  data/arxiv_summarization_test_instruct.jsonl \
  configs/4b_instruct_vllm_infer_checkpoint.yaml \
  fine_tuned_gpt5_distilled_article_results_v2 \
  output/arxiv_abstract_qwen3_4b_gpt5_article_lora_fixed_v2_2826

  ./submit_inference_rsync.sh ... output/arxiv_abstract_qwen3_4b_gpt5_article_lora_fixed_v2_2826/checkpoint-400

# Run judge

In [81]:
# Choose which fine-tuned model's inference output to judge. The commented-out
# paths are other runs (presumably earlier ones) kept for quick switching.
# tuned_qwen3_4b_test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_results_2801.jsonl"
# tuned_qwen3_4b_test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_gpt5_distilled_results_2809.jsonl"
tuned_qwen3_4b_test_path =  "/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_gpt5_distilled_article_results_2821.jsonl"
tuned_qwen3_4b_test_conversations = load_conversations(tuned_qwen3_4b_test_path)


In [98]:
# Show the assistant message (the generated summary) of one fine-tuned conversation.
display_message(tuned_qwen3_4b_test_conversations[5], role='assistant')


Role: ASSISTANT
Characters: 1,532 | Words: 234 | Lines: 1



In [83]:

# Batch evaluation of the fine-tuned model's test outputs.
# NOTE(review): the base-model run dropped the first message of each
# conversation (conv[1:]) before judging, while this run passes the loaded
# conversations unchanged — confirm both inputs have the shape the judge expects.
fine_tuned_results, fine_tuned_errors = evaluate_summaries_batch(
    tuned_qwen3_4b_test_conversations,
    model="gpt-5",
    temperature=1.0,
    max_workers=1000,  # Adjust based on your API rate limits
    show_progress=True
)

# Display results
print("\n" + "="*80)
print("BATCH EVALUATION RESULTS")
print("="*80)

for idx, result, prompt in fine_tuned_results:
    print(f"\nConversation {idx}:")
    print(f"  Judgment: {result['judgment']}")
    # Show only the first 200 characters of the explanation to keep the listing short
    explanation_preview = result['explanation'][:200] + "..." if len(result['explanation']) > 200 else result['explanation']
    print(f"  Explanation preview: {explanation_preview}")

if fine_tuned_errors:
    print(f"\nErrors encountered: {len(fine_tuned_errors)}")
    for idx, error in fine_tuned_errors:
        print(f"  Conversation {idx}: {error}")

# Summary statistics
judgments = [result['judgment'] for _, result, _ in fine_tuned_results]
yes_count = judgments.count('Yes')
no_count = judgments.count('No')
unknown_count = judgments.count('Unknown')
total = len(fine_tuned_results)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total evaluated: {total}")
for label, count in (("Yes", yes_count), ("No", no_count), ("Unknown", unknown_count)):
    # Report 0.0% instead of raising ZeroDivisionError when nothing was evaluated
    share = count / total * 100 if total else 0.0
    print(f"{label}: {count} ({share:.1f}%)")


Evaluating 1000 conversations with 1000 workers...
  Completed 1/1000
  Completed 2/1000
  Completed 3/1000
  Completed 4/1000
  Completed 5/1000
  Completed 6/1000
  Completed 7/1000
  Completed 8/1000
  Completed 9/1000
  Completed 10/1000
  Completed 11/1000
  Completed 12/1000
  Completed 13/1000
  Completed 14/1000
  Completed 15/1000
  Completed 16/1000
  Completed 17/1000
  Completed 18/1000
  Completed 19/1000
  Completed 20/1000
  Completed 21/1000
  Completed 22/1000
  Completed 23/1000
  Completed 24/1000
  Completed 25/1000
  Completed 26/1000
  Completed 27/1000
  Completed 28/1000
  Completed 29/1000
  Completed 30/1000
  Completed 31/1000
  Completed 32/1000
  Completed 33/1000
  Completed 34/1000
  Completed 35/1000
  Completed 36/1000
  Completed 37/1000
  Completed 38/1000
  Completed 39/1000
  Completed 40/1000
  Completed 41/1000
  Completed 42/1000
  Completed 43/1000
  Completed 44/1000
  Completed 45/1000
  Completed 46/1000
  Completed 47/1000
  Completed 48/100

In [84]:
# Persist the fine-tuned model's judge results as JSONL (one object per line)
import json

# Destination file for this evaluation run
output_filename = "/Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_eval_qwen3_4b_test_gpt5_instruct_article_lora_fixed.jsonl"

with open(output_filename, 'w', encoding='utf-8') as f:
    for idx, result, prompt in fine_tuned_results:
        output_data = dict(
            index=idx,
            judgment=result['judgment'],
            explanation=result['explanation'],
            evaluation_prompt=prompt,
        )
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
        print(json.dumps(output_data, ensure_ascii=False), file=f)

print("Saved {} results to {}".format(len(fine_tuned_results), output_filename))


Saved 1000 results to /Users/ryanarman/code/lab/arxiv_abstract/data/fine_tuned_eval_qwen3_4b_test_gpt5_instruct_article_lora_fixed.jsonl


## Score of fine-tuned model: 7%