# Task-3

Here, we want to know why the model thinks a text is AI generated.  

We are required to use either [SHAP](https://shap.readthedocs.io/en/latest/) or [Captum](https://captum.ai/) for this task. I have decided to use SHAP because:
1. It seems to have nicer visualisations
2. Captum seems to be primarily for PyTorch.

> highlight the words in an **"Imposter" paragraph** that most strongly signaled "AI" to your Tier C model.

This is a bit of a problem: I do not have the 3 such paragraphs that were recommended. I have only 1 paragraph that belongs to class-1 but was misclassified as class-3.

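Instead, I have decided to use SHAP to analyse, for each class, the three samples to which the model assigned the lowest probability for their true class (i.e. the ones classified with the least confidence).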

In [None]:
import pandas as pd
import torch
import shap
import glob
import os
from pathlib import Path
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig
from tqdm import tqdm

# Directory containing the saved PEFT adapter (read by PeftConfig/PeftModel below).
MODEL_PATH = "tier_c_final_model"
# Dataset root, given relative to the current working directory.
DATASET_DIR = Path('../dataset')
# Destination for the low-confidence text reports and the SHAP HTML export.
OUTPUT_DIR = "low_confidence_analysis"

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Loading model for inference...")
# The adapter config names the base checkpoint, so the base model and the
# tokenizer are both loaded from config.base_model_name_or_path.
config = PeftConfig.from_pretrained(MODEL_PATH)
base_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=3
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Attach the adapter weights from MODEL_PATH on top of the base classifier.
model = PeftModel.from_pretrained(base_model, MODEL_PATH)
# Inference only: put the model in evaluation mode.
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def load_and_scan():
    """Score every dataset paragraph and record the model's confidence in its true class.

    For each of the three class folders under ``DATASET_DIR``, every ``*.txt``
    file is read, tokenized (truncated to 512 tokens) and passed through the
    module-level ``model``. The softmax probability assigned to the file's own
    label is stored as its confidence.

    Returns:
        dict: maps each label (0, 1, 2) to a list of dicts with the keys
        ``'confidence'`` (float), ``'text'`` (stripped file content),
        ``'file'`` (base name of the file) and ``'probs'`` (numpy array of
        the class probabilities).

    Files that are empty after stripping are skipped. A file that cannot be
    read or scored is reported and skipped; it does not abort the scan.
    """
    results = {0: [], 1: [], 2: []}

    author_dirs = ['01-arthur-conan-doyle', '02-pg-wodehouse', '03-mark-twain', '04-william-shakespeare']
    # label -> (class folder, sub-folders to visit, optional extra sub-directory)
    paths = {
        0: (DATASET_DIR / 'class1-human-written', author_dirs, 'extracted_paragraphs'),
        1: (DATASET_DIR / 'class2-ai-written', ['ai-generated-paragraphs'], ''),
        2: (DATASET_DIR / 'class3-ai-mimicry', author_dirs, ''),
    }

    print("\nScanning dataset for low-confidence samples...")

    for label, (base_path, subfolders, suffix) in paths.items():
        files = []
        for sub in subfolders:
            # Only class 0 keeps its paragraphs one level deeper (in `suffix`).
            search_path = (base_path / sub / suffix) if suffix else (base_path / sub)
            # glob returns matches in arbitrary order; sorting keeps the scan
            # order (and therefore tie-breaking downstream) reproducible.
            files.extend(sorted(glob.glob(os.path.join(str(search_path), '*.txt'))))

        print(f"Scanning Class {label} ({len(files)} files)...")

        skipped = 0
        for file_path in tqdm(files):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read().strip()

                if not text:
                    continue

                inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
                with torch.no_grad():
                    logits = model(**inputs).logits
                    # Single-text batch: take row 0 of the probabilities.
                    probs = torch.softmax(logits, dim=1)[0]

                # Confidence = probability the model gives to the TRUE label.
                confidence = probs[label].item()

                results[label].append({
                    'confidence': confidence,
                    'text': text,
                    'file': os.path.basename(file_path),
                    'probs': probs.cpu().numpy()
                })

            except Exception as exc:
                # Stay best-effort (one bad file must not stop the scan), but
                # report the failure instead of silently dropping the sample.
                skipped += 1
                tqdm.write(f"Skipping {file_path}: {type(exc).__name__}: {exc}")

        if skipped:
            print(f"  {skipped} file(s) in Class {label} could not be processed.")

    return results

# Score the whole dataset once; everything below works on these results.
data_map = load_and_scan()

print("\n--- RESULTS: LOWEST CONFIDENCE SAMPLES ---")

# Report file name for each integer class label.
file_names = {
    0: "low-confidence-class-1-human.txt",
    1: "low-confidence-class-2-generic.txt",
    2: "low-confidence-class-3-mimic.txt"
}

# label -> the (up to) three samples with the lowest true-class confidence.
lowest_samples = {}
divider = "-" * 50 + "\n"

for label in (0, 1, 2):
    # Ascending by confidence, so the weakest predictions come first.
    ranked = list(data_map[label])
    ranked.sort(key=lambda entry: entry['confidence'])
    lowest_samples[label] = ranked[:3]

    report_path = os.path.join(OUTPUT_DIR, file_names[label])
    with open(report_path, 'w', encoding='utf-8') as report:
        print(f"\n[Class {label}] Lowest Confidence:")
        for rank, entry in enumerate(lowest_samples[label], start=1):
            header = f"Sample {rank} | Conf: {entry['confidence']:.2%} | File: {entry['file']}"
            print(f"  {header}")
            # Each record: header line, full paragraph text, then a divider.
            report.write(f"{header}\n")
            report.write(f"{entry['text']}\n")
            report.write(divider)

print("\nRunning SHAP Analysis on these 9 samples...")

def predict_shap(texts):
    """Return class probabilities for a batch of texts (model function for SHAP).

    Args:
        texts: A single string or an iterable of strings. When called by
            SHAP this may be a numpy array of strings rather than a list.

    Returns:
        numpy array of softmax probabilities with one row per input text and
        one column per class.
    """
    # Hugging Face tokenizers accept str or a list of str, not numpy arrays,
    # so normalise whatever SHAP hands over into a plain list of str.
    if isinstance(texts, str):
        batch = [texts]
    else:
        batch = [str(t) for t in texts]

    # max_length=512 matches the truncation used when the dataset was scanned,
    # so the explained probabilities correspond to the reported confidences.
    inputs = tokenizer(
        batch, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        scores = torch.softmax(logits, dim=1)
    return scores.cpu().numpy()

# Build the explainer from the prediction function, passing the tokenizer as
# the second argument.
explainer = shap.Explainer(predict_shap, tokenizer)

# Flatten the per-class selections into parallel lists: the texts to explain
# and a descriptive tag for each of them.
texts_to_explain = []
labels_for_plot = []
for label in (0, 1, 2):
    picked = lowest_samples[label]
    texts_to_explain.extend(entry['text'] for entry in picked)
    labels_for_plot.extend(
        f"Class_{label}_Conf_{entry['confidence']:.2f}" for entry in picked
    )

shap_values = explainer(texts_to_explain)

# display=False makes the plot call return its HTML instead of rendering it,
# so the markup can be written to a standalone file.
html_path = os.path.join(OUTPUT_DIR, "low_confidence_shap_map.html")
with open(html_path, "w", encoding='utf-8') as html_file:
    rendered = shap.plots.text(shap_values, display=False)
    html_file.write(rendered)

print(f"\nDONE! \n1. Text files saved in '{OUTPUT_DIR}'\n2. SHAP visualization saved to '{html_path}'")

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 