# Sanity Test

Here, I am performing a small sanity test to check that the model isn't overfitting.  
I'm using a completely different author, from whose work I will sample 20 human-written paragraphs, 20 AI-generated paragraphs, and 20 AI-mimicked paragraphs.

We will see how the model performs on these.

I'm choosing Agatha Christie for this.

In [1]:
import os
import random
import numpy as np
import torch
from pathlib import Path
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the trained model
print("Loading tier_c_final_model...")
peft_model_id = "../tier_c_final_model"

# The adapter config names the base checkpoint that the base model and
# tokenizer below are loaded from.
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

# Base sequence classifier with a three-label head, plus its tokenizer.
base_model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Wrap the base model with the saved adapter, then switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.eval()

print("✓ Model loaded successfully")

def load_class_samples(class_dir, label, class_name):
    """Read every ``*.txt`` file in *class_dir* into a labelled sample dict.

    Args:
        class_dir: Directory holding one text file per sample (``str`` or
            ``pathlib.Path``).
        label: Integer class index stored under ``'actual_class'``.
        class_name: Display name stored under ``'class_name'``.

    Returns:
        A list of dicts with keys ``'file'``, ``'text'``, ``'actual_class'``
        and ``'class_name'``, ordered by file path. The text is stripped of
        leading/trailing whitespace.
    """
    # glob() yields files in a filesystem-dependent order; sorting makes the
    # seeded shuffle further down produce the same order on every machine.
    return [
        {
            'file': txt_file.name,
            'text': txt_file.read_text(encoding='utf-8').strip(),
            'actual_class': label,
            'class_name': class_name,
        }
        for txt_file in sorted(Path(class_dir).glob("*.txt"))
    ]


# Load all test files
mini_dataset_dir = Path("mini-dataset")

# One sub-directory per class; the integer label is the index the classifier
# is compared against.
class1_dir = mini_dataset_dir / "class-1"  # human-written -> label 0
class2_dir = mini_dataset_dir / "class-2"  # AI-written    -> label 1
class3_dir = mini_dataset_dir / "class-3"  # AI-mimicry    -> label 2

test_samples = []
for class_dir, label, display_name in (
    (class1_dir, 0, 'Class 1: Human-written'),
    (class2_dir, 1, 'Class 2: AI-written'),
    (class3_dir, 2, 'Class 3: AI-mimicry'),
):
    test_samples.extend(load_class_samples(class_dir, label, display_name))

print(f"\nLoaded {len(test_samples)} test samples:")
print(f"  Class 1: {sum(1 for s in test_samples if s['actual_class'] == 0)}")
print(f"  Class 2: {sum(1 for s in test_samples if s['actual_class'] == 1)}")
print(f"  Class 3: {sum(1 for s in test_samples if s['actual_class'] == 2)}")

# Shuffle with a fixed seed so the classes are interleaved in a reproducible
# order. This only changes the order in which samples are evaluated; it has no
# effect on the predictions themselves.
random.seed(42)
random.shuffle(test_samples)
print("\n✓ Samples shuffled (no data leakage)")

# Run predictions
print("\nRunning predictions...")
print("=" * 80)

# Display name for each integer class index.
class_names = {
    0: 'Class 1: Human-written',
    1: 'Class 2: AI-written',
    2: 'Class 3: AI-mimicry'
}

results = []  # one record per evaluated sample
correct = 0   # number of samples whose prediction matched the label
total = 0     # number of samples evaluated
sample_count = len(test_samples)

for position, sample in enumerate(test_samples, start=1):
    # Encode a single text, truncated to at most 512 tokens.
    encoded = tokenizer(sample['text'], return_tensors="pt", truncation=True, max_length=512)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        class_probs = model(**encoded).logits.softmax(dim=1)[0]
        predicted_class = class_probs.argmax().item()
        confidence = class_probs[predicted_class].item()

    actual = sample['actual_class']
    is_correct = (actual == predicted_class)
    status = "✓ SUCCESS" if is_correct else "✗ FAILED"

    correct += int(is_correct)
    total += 1

    # Keep the full record so later cells can inspect the misclassifications.
    results.append({
        'file': sample['file'],
        'text': sample['text'],
        'actual': actual,
        'predicted': predicted_class,
        'confidence': confidence,
        'correct': is_correct
    })

    # Per-sample report
    print(f"[{position}/{sample_count}] {status}")
    print(f"  File: {sample['file']}")
    print(f"  Actual: {class_names[actual]}")
    print(f"  Predicted: {class_names[predicted_class]} (confidence: {confidence:.4f})")
    print()

print("=" * 80)
# An empty evaluation set (for example a wrong dataset path) leaves total at 0;
# report that explicitly instead of failing with ZeroDivisionError.
if total:
    accuracy_text = f"{100*correct/total:.2f}% accuracy"
else:
    accuracy_text = "no samples evaluated"
print(f"\nFinal Results: {correct}/{total} correct ({accuracy_text})")
print()

# Count misclassifications by category
misclassified = [r for r in results if not r['correct']]
print(f"Misclassified: {len(misclassified)} samples")

if misclassified:
    # Tally every (actual, predicted) pair in a single pass over the errors.
    pair_counts = {}
    for r in misclassified:
        pair = (r['actual'], r['predicted'])
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    print("\nMisclassification breakdown:")
    # Sorted pairs are listed by actual class first, then predicted class.
    for (actual_cls, predicted_cls), count in sorted(pair_counts.items()):
        print(f"  {class_names[actual_cls]} → {class_names[predicted_cls]}: {count}")

  from .autonotebook import tqdm as notebook_tqdm


Loading tier_c_final_model...


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 526.65it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


✓ Model loaded successfully

Loaded 59 test samples:
  Class 1: 41
  Class 2: 9
  Class 3: 9

✓ Samples shuffled (no data leakage)

Running predictions...
[1/59] ✓ SUCCESS
  File: The_Murder_on_the_Links_T03_P01.txt
  Actual: Class 2: AI-written
  Predicted: Class 2: AI-written (confidence: 1.0000)

[2/59] ✓ SUCCESS
  File: The_Mysterious_Affair_at_Styles_09.txt
  Actual: Class 1: Human-written
  Predicted: Class 1: Human-written (confidence: 1.0000)

[3/59] ✓ SUCCESS
  File: CHRISTIE_The_Murder_on_the_Links_T02_P05.txt
  Actual: Class 3: AI-mimicry
  Predicted: Class 3: AI-mimicry (confidence: 0.9952)

[4/59] ✓ SUCCESS
  File: The_Murder_on_the_Links_09.txt
  Actual: Class 1: Human-written
  Predicted: Class 1: Human-written (confidence: 1.0000)

[5/59] ✓ SUCCESS
  File: The_Murder_on_the_Links_11.txt
  Actual: Class 1: Human-written
  Predicted: Class 1: Human-written (confidence: 1.0000)

[6/59] ✓ SUCCESS
  File: The_Mysterious_Affair_at_Styles_14.txt
  Actual: Class 1: Human-writte

In [2]:
def write_category_report(filepath, description, entries, class_names):
    """Write the report file for one misclassification category.

    Args:
        filepath: Destination path; any existing file is overwritten.
        description: Heading line describing the category.
        entries: Result dicts for this category, each with the keys
            ``'file'``, ``'actual'``, ``'predicted'``, ``'confidence'`` and
            ``'text'``. May be empty, in which case only the header is written.
        class_names: Mapping from integer class index to display name.
    """
    with open(filepath, 'w', encoding='utf-8') as outfile:
        outfile.write("=" * 80 + "\n")
        outfile.write(f"{description}\n")
        outfile.write(f"Total: {len(entries)} files\n")
        outfile.write("=" * 80 + "\n")
        if entries:
            # Blank line separating the header from the first entry.
            outfile.write("\n")

        for result in entries:
            outfile.write("-" * 80 + "\n")
            outfile.write(f"File: {result['file']}\n")
            outfile.write(f"Actual: {class_names[result['actual']]}\n")
            outfile.write(f"Predicted: {class_names[result['predicted']]}\n")
            outfile.write(f"Confidence: {result['confidence']:.4f}\n")
            outfile.write("-" * 80 + "\n")
            outfile.write(result['text'])
            outfile.write("\n\n")


# Save misclassified samples to sanity-test_misclassified/
output_dir = 'sanity-test_misclassified'
os.makedirs(output_dir, exist_ok=True)

# Define misclassification categories:
# (actual class, predicted class, report filename, heading)
categories = [
    (0, 1, 'class1_as_class2.txt', 'Class 1 (Human) misclassified as Class 2 (AI)'),
    (0, 2, 'class1_as_class3.txt', 'Class 1 (Human) misclassified as Class 3 (AI-mimicry)'),
    (1, 0, 'class2_as_class1.txt', 'Class 2 (AI) misclassified as Class 1 (Human)'),
    (1, 2, 'class2_as_class3.txt', 'Class 2 (AI) misclassified as Class 3 (AI-mimicry)'),
    (2, 0, 'class3_as_class1.txt', 'Class 3 (AI-mimicry) misclassified as Class 1 (Human)'),
    (2, 1, 'class3_as_class2.txt', 'Class 3 (AI-mimicry) misclassified as Class 2 (AI)')
]

total_saved = 0

for actual_class, predicted_class, filename, description in categories:
    # Filter misclassified examples for this category
    category_misclassified = [
        r for r in results
        if r['actual'] == actual_class and r['predicted'] == predicted_class
    ]

    # Rewrite the file on every run, even when the category is empty, so a
    # report left over from an earlier run is never mistaken for a current one.
    write_category_report(
        os.path.join(output_dir, filename),
        description,
        category_misclassified,
        class_names,
    )

    if not category_misclassified:
        continue

    total_saved += len(category_misclassified)
    print(f"Saved {len(category_misclassified)} files to {filename}")

if total_saved > 0:
    print(f"\n✓ Total: {total_saved} misclassified text files saved to {output_dir}/")
else:
    print("\n✓ No misclassifications to save - perfect accuracy!")
    print(f"Created empty category files in {output_dir}/ for consistency")

Saved 1 files to class2_as_class3.txt

✓ Total: 1 misclassified text files saved to sanity-test_misclassified/
