In [1]:
"""
RE-PROCESS CSV WITH IMPROVED ANSWER EXTRACTION - COLAB VERSION
"""

# First, install required packages
!pip install -q pandas

import pandas as pd
import re
import json
from datetime import datetime

# ============================================================
# IMPROVED ANSWER EXTRACTION FUNCTION
# ============================================================

def extract_final_answer(text):
    """
    Extract the final numeric answer from a model response.

    The search concentrates on the last 500 characters of the response,
    where the final answer usually appears. Strategies are tried in order
    and the first hit wins:

    1. The last LaTeX "boxed" expression (a leading minus sign is kept).
    2. Phrases such as "Final Answer: 18", "Therefore ... 18",
       "So ... 18", a trailing "= 18", or "Answer: 18".
    3. The result of the last simple equation ("9 × 2 = 18").
    4. The last number anywhere in the text that is greater than 1
       (or exactly "0"), to skip likely step numbers.

    Args:
        text: The raw model response. ``None``, NaN and empty values are
            treated as "no answer".

    Returns:
        The answer as a string of digits with thousands separators
        removed (for example ``"70000"`` or ``"16.00"``), or ``None``
        when nothing usable is found.
    """
    if not text or pd.isna(text):
        return None

    text = str(text).strip()

    # STRATEGY: Focus on the LAST part where final answers usually appear
    last_part = text[-500:]

    # Unsigned number: comma-grouped thousands ("70,000", "1,234.5") or a
    # plain integer/decimal ("18", "16.00"). The decimal part requires
    # digits, so a sentence-ending period is never captured.
    number = r'(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)'

    # 1. Look for boxed answers (most reliable for math models)
    boxed_patterns = [
        r'\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}',
        r'\\boxed\{([^}]+)\}',
    ]

    for pattern in boxed_patterns:
        matches = re.findall(pattern, last_part)
        if matches:
            # Take the LAST boxed answer and drop thousands separators.
            boxed = matches[-1].replace(",", "")
            # Keep a minus sign only when it is a sign, not a subtraction
            # operator following an operand ("10-3", "x-3", "(a)-3").
            num_match = re.search(r'(?:(?<![\w)])-)?\d+(?:\.\d+)?', boxed)
            if num_match:
                return num_match.group(0)

    # 2. Look for final answer patterns in the last part.
    # Word boundaries stop "so"/"therefore" from matching inside other
    # words such as "also", "some" or "reason".
    final_patterns = [
        r'[Ff]inal\s*[Aa]nswer\s*[:\-]\s*\$?' + number,
        r'\b[Tt]herefore\b[^.,;]*?' + number,
        r'\b[Ss]o\b[^.,;]*?' + number,
        # Allow one closing period after a trailing "= value".
        r'=\s*\$?' + number + r'\.?\s*$',
        r'[Aa]nswer\s*[:\-]\s*\$?' + number,
    ]

    for pattern in final_patterns:
        match = re.search(pattern, last_part)
        if match:
            return match.group(1).replace(",", "")

    # 3. Look for calculations that show the final result, e.g.
    # "9 × 2 = 18" or "200,000 - 130,000 = 70,000". A single pattern covers
    # every operator so that the LAST equation wins regardless of its type.
    calculation_pattern = (
        r'\d[\d,]*\s*[-+×*]\s*\d[\d,]*\s*=\s*(\d[\d,]*(?:\.\d+)?)'
    )
    calc_results = re.findall(calculation_pattern, last_part)
    if calc_results:
        return calc_results[-1].replace(",", "")

    # 4. Last resort: find all numbers and take the last reasonable one
    numbers = re.findall(r'\b(\d+\.?\d*)\b', text.replace(",", ""))
    if numbers:
        # Filter out small numbers that are likely step numbers
        reasonable = [n for n in numbers if float(n) > 1 or n == '0']
        if reasonable:
            return reasonable[-1]

    return None

def normalize_answer(answer):
    """
    Normalize an answer so that two answers can be compared.

    Surrounding whitespace, dollar signs, commas and inner spaces are
    removed. The remainder is parsed as a number when possible.

    Args:
        answer: The raw answer (string, number, or ``None``).

    Returns:
        ``None`` if ``answer`` is ``None``; a ``float`` if the cleaned
        text parses as a number; otherwise the cleaned text in lower case.
    """
    if answer is None:
        return None
    cleaned = str(answer).strip().replace('$', '').replace(',', '').replace(' ', '')
    try:
        return float(cleaned)
    except ValueError:
        # Not numeric: fall back to case-insensitive text comparison.
        # Only ValueError is caught so that unrelated failures (including
        # KeyboardInterrupt) are no longer silently swallowed.
        return cleaned.lower()

def compare_answers(pred, true, tolerance=1e-3):
    """
    Decide whether a predicted answer matches the reference answer.

    Both values are passed through ``normalize_answer`` first. Two numeric
    values match when they differ by less than ``tolerance``: relative to
    the reference when its magnitude exceeds 1, absolute otherwise. Any
    other combination is compared as lower-cased text. A missing value on
    either side never matches.
    """
    predicted = normalize_answer(pred)
    expected = normalize_answer(true)

    if predicted is None or expected is None:
        return False

    numeric_types = (int, float)
    both_numeric = isinstance(predicted, numeric_types) and isinstance(expected, numeric_types)
    if not both_numeric:
        return str(predicted).lower() == str(expected).lower()

    gap = abs(predicted - expected)
    magnitude = abs(expected)
    if magnitude > 1:
        # Large reference values are judged by relative error.
        return gap / magnitude < tolerance
    return gap < tolerance

# ============================================================
# RE-PROCESS YOUR CSV
# ============================================================

def reprocess_csv(csv_file_path):
    """
    Re-run answer extraction over a results CSV and report the accuracy change.

    For every row, the answer is re-extracted from ``full_response`` with
    ``extract_final_answer`` and checked against ``true_answer`` with
    ``compare_answers``. Per-row results are written to a timestamped
    ``improved_results_<timestamp>.csv`` in the working directory and a
    summary is printed.

    Args:
        csv_file_path: Path of a CSV with the columns ``question``,
            ``true_answer``, ``full_response``, ``predicted_answer`` and
            ``correct``.

    Returns:
        A ``(results_df, new_accuracy)`` tuple: the per-row results as a
        DataFrame and the new accuracy as a percentage (0.0 for an empty
        CSV).

    Raises:
        KeyError: If any required column is missing from the CSV.
    """
    # Load your CSV
    df = pd.read_csv(csv_file_path)
    print(f"✅ Loaded CSV with {len(df)} rows")

    # Fail early with a readable message instead of part-way through the loop.
    required_columns = ['question', 'true_answer', 'full_response',
                        'predicted_answer', 'correct']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"CSV is missing required column(s): {', '.join(missing_columns)}"
        )

    results = []
    correct_count = 0
    regressed_count = 0

    print(f"\n{'='*70}")
    print("RE-PROCESSING WITH IMPROVED ANSWER EXTRACTION")
    print(f"{'='*70}\n")

    for idx, row in df.iterrows():
        question = row['question']
        true_answer = row['true_answer']
        full_response = row['full_response']
        original_predicted = row['predicted_answer']
        original_correct = row['correct']

        # Re-extract the answer with improved function
        new_predicted = extract_final_answer(full_response)

        # Check correctness
        is_correct = compare_answers(new_predicted, true_answer)
        if is_correct:
            correct_count += 1

        # Only strings can be truncated; a missing question arrives as NaN.
        if isinstance(question, str) and len(question) > 80:
            question = question[:80] + "..."

        results.append({
            'index': idx,
            'question': question,
            'true_answer': true_answer,
            'original_predicted': original_predicted,
            'new_predicted': new_predicted,
            'original_correct': original_correct,
            'new_correct': is_correct,
            'improved': is_correct and not original_correct
        })

        # Show every row whose verdict changed, in either direction
        if is_correct != original_correct:
            if is_correct:
                status = "✅ IMPROVED"
            else:
                status = "⚠️  REGRESSION"
                regressed_count += 1
            print(f"{status} - Row {idx}:")
            print(f"  True: {true_answer} | Original: {original_predicted} | New: {new_predicted}")

    # Calculate accuracies (an empty CSV yields 0% rather than dividing by zero)
    total_rows = len(df)
    if total_rows:
        original_accuracy = (df['correct'].sum() / total_rows) * 100
        new_accuracy = (correct_count / total_rows) * 100
    else:
        original_accuracy = 0.0
        new_accuracy = 0.0
    improvement = new_accuracy - original_accuracy

    # Create results DataFrame
    results_df = pd.DataFrame(results)

    # Save improved results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"improved_results_{timestamp}.csv"
    results_df.to_csv(output_file, index=False)

    # Rows that were wrong before and are right now. This is reported
    # instead of the net change, which hides regressions and can be negative.
    improvements = [r for r in results if r['improved']]

    # Print summary
    print(f"\n{'='*70}")
    print("RESULTS SUMMARY")
    print(f"{'='*70}")
    print(f"Original Accuracy: {original_accuracy:.2f}%")
    print(f"New Accuracy:      {new_accuracy:.2f}%")
    print(f"Improvement:       {improvement:+.2f}%")
    print(f"\nImproved {len(improvements)} predictions!")
    if regressed_count:
        print(f"Regressed {regressed_count} predictions.")
    print(f"Results saved to: {output_file}")

    # Show some examples of improvements
    if improvements:
        print(f"\n{'='*70}")
        print("IMPROVEMENT EXAMPLES")
        print(f"{'='*70}")
        for i, imp in enumerate(improvements[:3]):
            print(f"\nExample {i+1}:")
            print(f"Question: {imp['question']}")
            print(f"True: {imp['true_answer']}")
            print(f"Was: {imp['original_predicted']} ❌")
            print(f"Now: {imp['new_predicted']} ✅")

    return results_df, new_accuracy

# ============================================================
# UPLOAD YOUR CSV FILE TO COLAB
# ============================================================

# Step 1: ask the user for the results CSV through the Colab upload widget.
print("Step 1: Upload your CSV file")
from google.colab import files
uploaded = files.upload()

# The upload mapping is keyed by filename; use the first uploaded file.
csv_filename = list(uploaded)[0]
print(f"📁 Uploaded file: {csv_filename}")

# ============================================================
# RUN THE RE-PROCESSING
# ============================================================

print("\nStep 2: Re-processing with improved answer extraction...")
results_df, new_accuracy = reprocess_csv(csv_filename)

# Side-by-side view of the old and new verdicts for the first 10 rows.
separator = '=' * 70
print(f"\n{separator}")
print("DETAILED COMPARISON (First 10 rows)")
print(separator)

for position, (_, record) in enumerate(results_df.head(10).iterrows()):
    before_mark = "✅" if record['original_correct'] else "❌"
    after_mark = "✅" if record['new_correct'] else "❌"
    change_note = "🔄 IMPROVED" if record['improved'] else ""

    print(f"\nRow {position}: {before_mark}→{after_mark} {change_note}")
    print(
        f"True: {record['true_answer']} | "
        f"Was: {record['original_predicted']} | "
        f"Now: {record['new_predicted']}"
    )

print("\n🎉 Re-processing complete! Download your improved results from the file browser on the left.")

Step 1: Upload your CSV file


Saving results_20251021_085011.csv to results_20251021_085011.csv
📁 Uploaded file: results_20251021_085011.csv

Step 2: Re-processing with improved answer extraction...
✅ Loaded CSV with 50 rows

RE-PROCESSING WITH IMPROVED ANSWER EXTRACTION

✅ IMPROVED - Row 0:
  True: 18 | Original: 13 | New: 18
✅ IMPROVED - Row 1:
  True: 3 | Original: 1 | New: 3
✅ IMPROVED - Row 2:
  True: 70000 | Original: 130 | New: 70000
✅ IMPROVED - Row 6:
  True: 260 | Original: 2 | New: 260
✅ IMPROVED - Row 11:
  True: 694 | Original: 204 | New: 694
✅ IMPROVED - Row 17:
  True: 57500 | Original: 35 | New: 57500
✅ IMPROVED - Row 18:
  True: 7 | Original: 28 | New: 7
✅ IMPROVED - Row 20:
  True: 15 | Original: 6 | New: 15
✅ IMPROVED - Row 22:
  True: 7 | Original: 3 | New: 7
✅ IMPROVED - Row 25:
  True: 2 | Original: 15 | New: 2
✅ IMPROVED - Row 27:
  True: 16 | Original: 4 | New: 16.00
✅ IMPROVED - Row 30:
  True: 109 | Original: 18 | New: 109
✅ IMPROVED - Row 31:
  True: 80 | Original: 40 | New: 80
✅ IMPROVED

In [2]:
"""
SIMPLE MODEL EVALUATION - JUST METRICS
"""

import pandas as pd
import numpy as np

def evaluate_model_simple(csv_file_path):
    """
    Print and return headline accuracy metrics for a re-processed results CSV.

    Args:
        csv_file_path: Path of a CSV with a ``new_correct`` column (truthy
            for correct rows) and a ``new_predicted`` column (missing when
            no answer was extracted).

    Returns:
        A dict with ``total_samples``, ``correct_predictions``,
        ``accuracy`` and ``extraction_success_rate`` (percentages), and
        ``confidence_interval`` (a ``(lower, upper)`` percentage tuple).
        All values are built-in ``int``/``float`` so the dict can be
        passed to ``json.dump``. An empty CSV yields zeros.
    """

    # Load the CSV
    df = pd.read_csv(csv_file_path)

    # Basic counts, converted from numpy scalars to built-in ints
    total_samples = len(df)
    correct_predictions = int(df['new_correct'].sum())
    extracted_count = int(df['new_predicted'].notna().sum())

    if total_samples:
        accuracy = (correct_predictions / total_samples) * 100
        extraction_success_rate = (extracted_count / total_samples) * 100

        # 95% interval from the normal approximation, in percentage points.
        # The approximation can overshoot for small samples, so the bounds
        # are clamped to the valid 0-100 range.
        std_error = float(np.sqrt(accuracy * (100 - accuracy) / total_samples))
        confidence_interval = (
            max(0.0, accuracy - 1.96 * std_error),
            min(100.0, accuracy + 1.96 * std_error),
        )
    else:
        # No rows: report zeros instead of dividing by zero.
        accuracy = 0.0
        extraction_success_rate = 0.0
        confidence_interval = (0.0, 0.0)

    print("📊 MODEL EVALUATION METRICS")
    print("=" * 50)
    print(f"Total Questions:      {total_samples}")
    print(f"Correct Predictions:  {correct_predictions}")
    print(f"Accuracy:             {accuracy:.2f}%")
    print(f"Extraction Success:   {extraction_success_rate:.2f}%")
    print(f"95% Confidence:       ({confidence_interval[0]:.2f}% - {confidence_interval[1]:.2f}%)")
    print("=" * 50)

    return {
        'total_samples': total_samples,
        'correct_predictions': correct_predictions,
        'accuracy': accuracy,
        'extraction_success_rate': extraction_success_rate,
        'confidence_interval': confidence_interval
    }

# ============================================================
# UPLOAD AND EVALUATE
# ============================================================

# Ask for the improved-results CSV through the Colab upload widget.
print("Upload your improved CSV file:")
from google.colab import files
uploaded = files.upload()

# The upload mapping is keyed by filename; evaluate the first uploaded file.
csv_filename = list(uploaded)[0]
print(f"📁 Evaluating: {csv_filename}")

# Compute and print the metrics for that file.
metrics = evaluate_model_simple(csv_filename)

Upload your improved CSV file:


Saving improved_results_20251021_101414.csv to improved_results_20251021_101414 (1).csv
📁 Evaluating: improved_results_20251021_101414 (1).csv
📊 MODEL EVALUATION METRICS
Total Questions:      50
Correct Predictions:  28
Accuracy:             56.00%
Extraction Success:   100.00%
95% Confidence:       (42.24% - 69.76%)


In [3]:
"""
EXPORT YOUR MODEL RESULTS
"""

import pandas as pd
import numpy as np
import json
from datetime import datetime

# Headline numbers of the evaluation run, recorded by hand for export.
results = dict(
    model_name='Qwen2.5-Math-1.5B',
    evaluation_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    dataset='GSM8K (50 samples)',
    total_questions=50,
    correct_predictions=28,
    accuracy=56.00,
    extraction_success_rate=100.00,
    confidence_interval=dict(lower_bound=42.24, upper_bound=69.76),
    performance_summary='Model achieved 56% accuracy on mathematical reasoning tasks with perfect answer extraction',
)

# Export as JSON
with open('model_performance_results.json', 'w') as f:
    f.write(json.dumps(results, indent=2))

# Export as CSV summary: one (Metric, Value, Details) row per headline number
summary_rows = [
    ('Accuracy', '56.00%', '28/50 correct predictions'),
    ('Extraction Success', '100.00%', 'All answers successfully extracted'),
    ('95% Confidence Interval', '42.24% - 69.76%', 'Statistical confidence range'),
    ('Dataset Size', '50 questions', 'GSM8K test samples'),
]
summary_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value', 'Details'])
summary_df.to_csv('model_performance_summary.csv', index=False)

# Export detailed results: keep only the per-question prediction columns
df = pd.read_csv('improved_results_20251021_101414 (1).csv')
detail_columns = ['index', 'question', 'true_answer', 'new_predicted', 'new_correct']
detailed_results = df[detail_columns]
detailed_results.to_csv('detailed_predictions.csv', index=False)

# Tell the user which files were written, then repeat the headline numbers.
print("✅ RESULTS EXPORTED:")
for exported_name in (
    'model_performance_results.json',
    'model_performance_summary.csv',
    'detailed_predictions.csv',
):
    print(f"   - {exported_name}")
print("\n🎯 YOUR MODEL PERFORMANCE:")
print("   Accuracy: 56.00% (28/50)")
print("   Extraction Rate: 100.00%")
print("   Confidence: 42.24% - 69.76%")

✅ RESULTS EXPORTED:
   - model_performance_results.json
   - model_performance_summary.csv
   - detailed_predictions.csv

🎯 YOUR MODEL PERFORMANCE:
   Accuracy: 56.00% (28/50)
   Extraction Rate: 100.00%
   Confidence: 42.24% - 69.76%
