In [63]:
import json
from data.prm800k_loader import load_prm800k
from data.reveal_loader import load_reveal
from data.eqasc_loader import load_eqasc

# --- Load prediction scores ---
# predictions.json is parsed into `predictions_data`; the sections below index it
# by position and read the 'predictions' and 'labels' entries of each record.
PREDICTIONS_PATH = "/workspace/logs/qualitative_analysis_100_samples/predictions.json"
with open(PREDICTIONS_PATH, 'r') as predictions_file:
    predictions_data = json.load(predictions_file)

# --- PRM800K sample ---
# Show prediction record 0 next to PRM800K test example 0.
# NOTE(review): this assumes predictions.json stores its records in the same order
# the loader returns examples, with PRM800K first -- confirm against the eval run.
PRM800K_SAMPLE_INDEX = 0       # position in the list returned by load_prm800k
PRM800K_PREDICTION_INDEX = 0   # position of the matching record in predictions.json

prm800k_scores = predictions_data[PRM800K_PREDICTION_INDEX]['predictions']
prm800k_labels = predictions_data[PRM800K_PREDICTION_INDEX]['labels']

# Three examples are requested; only PRM800K_SAMPLE_INDEX is displayed.
prm800k_test = load_prm800k("/workspace/datasets/PRM800K", split="test", phase=2, max_samples=3)

# The header is derived from the index so the label cannot drift from the data shown.
print(f"\n--- PRM800K Sample {PRM800K_SAMPLE_INDEX} (Prediction Index {PRM800K_PREDICTION_INDEX}) ---")

# Index into the loaded data only after checking the sample exists; indexing before
# the guard would raise IndexError on an empty or short load.
if len(prm800k_test) > PRM800K_SAMPLE_INDEX:
    # The historical variable name is kept in case other cells refer to it; it holds
    # the sample at PRM800K_SAMPLE_INDEX.
    prm800k_sample_2 = prm800k_test[PRM800K_SAMPLE_INDEX]
    print(f"PROBLEM: {prm800k_sample_2['problem']}")
    # zip stops at the shortest of steps / scores / labels, so a length mismatch
    # truncates the listing instead of raising IndexError part-way through.
    step_rows = zip(prm800k_sample_2['steps'], prm800k_scores, prm800k_labels)
    for step_no, (step, score, label) in enumerate(step_rows, start=1):
        print(f"  Step {step_no} Text: {step}")
        print(f"  --> Predicted p_t: {score:.3f} | True Label: {label:.1f}\n")


# --- REVEAL sample ---
# Prediction record 5 is displayed next to the first loaded REVEAL example.
# NOTE(review): presumably record 5 is the first REVEAL entry in predictions.json --
# confirm against the evaluation run.
reveal_record = predictions_data[5]
reveal_scores = reveal_record['predictions']
reveal_labels = reveal_record['labels']

# Only the first example is displayed, so a single sample is requested.
reveal_test = load_reveal("/workspace/datasets/reveal", split="open", max_samples=1)

print("\n--- REVEAL Sample 0 (1st Example) ---")
if reveal_test:
    reveal_example = reveal_test[0]
    print(f"QUESTION: {reveal_example['question']}")
    # Steps beyond the number of predicted scores are not shown.
    shown_steps = reveal_example['steps'][:len(reveal_scores)]
    for position, step_text in enumerate(shown_steps):
        print(f"  Step {position + 1} Text: {step_text}")
        print(f"  --> Predicted p_t: {reveal_scores[position]:.3f} | True Label: {reveal_labels[position]:.2f}\n")


# --- eQASC sample ---
# Show the eQASC chain at loader index 2 next to prediction record 8. The separate
# eQASC inspection cell in this notebook pairs loader index 1 with record 7 and
# loader index 2 with record 8.
EQASC_SAMPLE_INDEX = 2       # position in the list returned by load_eqasc
EQASC_PREDICTION_INDEX = 8   # position of the matching record in predictions.json

eqasc_scores = predictions_data[EQASC_PREDICTION_INDEX]['predictions']
eqasc_labels = predictions_data[EQASC_PREDICTION_INDEX]['labels']

# Three chains are requested so that index 2 exists.
# NOTE(review): the recorded output of the inspection cell reports 305 chains for
# max_samples=4, so the loader may not cap the result -- confirm.
eqasc_test = load_eqasc("/workspace/datasets/eQASC", split="test", max_samples=3)

# The header is derived from the indices so the label cannot drift from the data shown.
print(f"\n--- eQASC Sample {EQASC_SAMPLE_INDEX} (Prediction Index {EQASC_PREDICTION_INDEX}) ---")

# Guard the index, as the PRM800K / REVEAL sections do, so a short load prints
# nothing instead of raising IndexError.
if len(eqasc_test) > EQASC_SAMPLE_INDEX:
    # The historical variable name is kept in case other cells refer to it; it holds
    # the chain at EQASC_SAMPLE_INDEX.
    eqasc_sample_1 = eqasc_test[EQASC_SAMPLE_INDEX]
    print(f"QUESTION: {eqasc_sample_1['question']}")
    # zip stops at the shortest of steps / scores / labels, so a length mismatch
    # truncates the listing instead of raising IndexError part-way through.
    step_rows = zip(eqasc_sample_1['steps'], eqasc_scores, eqasc_labels)
    for step_no, (step, score, label) in enumerate(step_rows, start=1):
        print(f"  Step {step_no} Text: {step}")
        print(f"  --> Predicted p_t: {score:.3f} | True Label: {label:.3f}\n")


Loading PRM800K from /workspace/datasets/PRM800K/phase2_test.jsonl...
[DEBUG] PRM800K rating_to_confidence mapping: {1: 1.0, 0: 0.5, -1: 0.0}
Loaded 3 examples from PRM800K test set (phase 2)

--- PRM800K Sample 2 (The Third Example) ---
PROBLEM: A Senate committee has 5 Democrats, 5 Republicans, and 1 Independent.  In how many ways can they sit around a circular table if all the members of each party all sit next to each other?  (Two seatings are considered equivalent if one is a rotation of the other.)
  Step 1 Text: I notice that there are three groups of people: Democrats, Republicans, and Independent.
  --> Predicted p_t: 0.400 | True Label: 1.0

  Step 2 Text: I wonder how many ways I can arrange these groups around the circle, ignoring the order within each group for now.
  --> Predicted p_t: 0.424 | True Label: 0.5

  Step 3 Text: I recall that the number of ways to arrange n distinct objects around a circle is (n-1)!, since we can fix one object and then permute the rest.
  --

In [45]:
import json

# Sanity check: report how many prediction records the evaluation run saved.
PREDICTIONS_PATH = "/workspace/logs/qualitative_analysis_100_samples/predictions.json"
# If the absolute path is unavailable, a relative "predictions.json" in the working
# directory can be substituted for PREDICTIONS_PATH.

try:
    with open(PREDICTIONS_PATH, 'r') as predictions_file:
        predictions_data = json.load(predictions_file)
except FileNotFoundError:
    print(f"Error: Predictions file not found at {PREDICTIONS_PATH}")
except json.JSONDecodeError:
    print("Error: Could not decode JSON. File may be corrupted or empty.")
else:
    # Reached only when the file was opened and parsed without error.
    print(f"Total number of predictions (samples) saved: {len(predictions_data)}")

Total number of predictions (samples) saved: 10


In [44]:
import json

# Count the records in the REVEAL "open" split by parsing the file one line at a
# time, i.e. treating it as JSON Lines even though the file is named open.json.
FILE_PATH = "/workspace/datasets/reveal/open.json"
sample_count = 0

try:
    with open(FILE_PATH, 'r') as reveal_file:
        for raw_line in reveal_file:
            # Blank lines are neither counted nor reported.
            if not raw_line.strip():
                continue
            try:
                json.loads(raw_line)
            except json.JSONDecodeError:
                # A line that is not valid JSON is reported and left out of the count.
                print(f"Warning: Skipping invalid JSON line.")
            else:
                sample_count += 1

    print(f"Total number of samples found in 'open.json' (reading line-by-line): {sample_count}")

except FileNotFoundError:
    print("Error: 'open.json' file not found at the specified path.")

Total number of samples found in 'open.json' (reading line-by-line): 1146


In [53]:
import json
from data.eqasc_loader import load_eqasc

# Inspect how eQASC chains line up with the saved prediction records.
PREDICTIONS_PATH = "/workspace/logs/qualitative_analysis_100_samples/predictions.json"
with open(PREDICTIONS_PATH, 'r') as predictions_file:
    predictions_data = json.load(predictions_file)

# Four chains are requested. The recorded output of this cell reports 305 loaded
# chains, so max_samples did not cap the result in that run.
eqasc_test = load_eqasc("/workspace/datasets/eQASC", split="test", max_samples=4)

loaded_count = len(eqasc_test)
print(f"Loaded EQASC samples count: {loaded_count}")

# Chain at loader index 1, reported alongside prediction record 7.
if loaded_count >= 2:
    print("\n--- Sample 1 (Index 1) ---")
    print(f"Prediction Index: 7")
    second_chain = eqasc_test[1]
    print(f"Question: {second_chain['question']}")

# Chain at loader index 2, reported alongside prediction record 8, with its scores.
if loaded_count >= 3:
    print("\n--- Sample 2 (Index 2) ---")
    print(f"Prediction Index: 8")
    third_chain = eqasc_test[2]
    print(f"Question: {third_chain['question']}")

    # Scores and labels come from prediction record 8.
    record_8 = predictions_data[8]
    eqasc_scores = record_8['predictions']
    eqasc_labels = record_8['labels']

    print("REASONING STEPS:")
    # Steps beyond the number of predicted scores are not shown.
    shown_steps = third_chain['steps'][:len(eqasc_scores)]
    for position, step_text in enumerate(shown_steps):
        print(f"  Step {position + 1} Text: {step_text}")
        print(f"  --> Predicted p_t: {eqasc_scores[position]:.3f} | True Label: {eqasc_labels[position]:.3f}\n")

Loading eQASC from /workspace/datasets/eQASC/eqasc_test_grc.json...
Loaded 305 reasoning chains from eQASC test set
  Total steps: 610, Average steps per chain: 2.00
  Score range: 0.04 - 0.48
  Mean score: 0.16
Loaded EQASC samples count: 305

--- Sample 1 (Index 1) ---
Prediction Index: 7
Question: What type of birth do therian mammals have?

--- Sample 2 (Index 2) ---
Prediction Index: 8
Question: What type of birth do therian mammals have?
REASONING STEPS:
  Step 1 Text: Mammals that are viviparous are called therian mammals.
  --> Predicted p_t: 0.453 | True Label: 1.000

  Step 2 Text: Blennius, silures, and others bring forth living young, and are called viviparous.
  --> Predicted p_t: 0.609 | True Label: 1.000

