In [6]:
import json
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_recall_curve, average_precision_score, roc_curve, auc
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
def preprocess_text_nb(text):
    """Normalise raw text for the Naive Bayes (TF-IDF) model.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, and runs of whitespace are collapsed so that the
    remaining tokens are separated by exactly one space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, single-space-separated string (empty if nothing is left).
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    # split() with no argument drops leading/trailing whitespace and
    # collapses interior runs, so the join yields a normalised string.
    return " ".join(alnum_only.split())

def preprocess_text_bert_input(text):
    """Prepare raw text for the BERT classifier.

    The text is lower-cased, anything that looks like an angle-bracket tag
    (``<...>``) is replaced by a space, and all whitespace runs are collapsed
    to a single space with the ends trimmed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    without_tags = re.sub(r"<[^>]+>", " ", text.lower())
    return re.sub(r"\s+", " ", without_tags).strip()

def load_dev_data_from_single_file(filepath):
    """Load human/machine text pairs from one JSON-lines dev file.

    Every non-blank line is expected to hold a JSON object. Its
    ``"human_text"`` value (if a non-empty string) is added with label 0 and
    its ``"machine_text"`` value (if a non-empty string) with label 1. The
    record's ``"prompt"`` (default ``"N/A"``) is stored once per added text.

    Blank lines are skipped silently. Lines that are not valid JSON, or whose
    JSON value is not an object, are reported with a warning and skipped.

    Args:
        filepath: Path to the ``.jsonl`` file (read as UTF-8).

    Returns:
        A ``(texts, labels, prompts)`` tuple of three equally long lists.

    Raises:
        OSError: If the file cannot be opened.
    """
    texts = []
    labels = []
    processed_prompts = []
    # (JSON key, class label) pairs: 0 = human-written, 1 = machine-generated.
    sources = (("human_text", 0), ("machine_text", 1))

    with open(filepath, 'r', encoding='utf-8') as f:
        # enumerate gives the real 1-based file line number for diagnostics,
        # independent of how many earlier lines failed to parse.
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # blank line (e.g. trailing newline) is not an error

            try:
                data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode JSON from line in {filepath}: {stripped}")
                continue

            if not isinstance(data, dict):
                print(f"Warning: Error processing line {line_number} in {filepath}: {stripped} - expected a JSON object")
                continue

            prompt = data.get("prompt", "N/A")  # Get prompt if available
            for key, label in sources:
                value = data.get(key)
                if isinstance(value, str) and value.strip():
                    texts.append(value)
                    labels.append(label)
                    processed_prompts.append(prompt)

    return texts, labels, processed_prompts

# Evaluation Function
def _resolve_positive_label(id2label):
    """Return the label string a model config maps to class id 1 (AI-generated).

    Falls back to the second configured label when the mapping has no key 1,
    and to None when fewer than two labels are configured.
    """
    if 1 in id2label:
        return id2label[1]
    label_names = list(id2label.values())
    return label_names[1] if len(label_names) > 1 else None


def _predict_with_bert(bert_pipeline, texts, chunk_size=32, max_length=256):
    """Run the text-classification pipeline in chunks; return (preds, positive-class probs)."""
    id2label = bert_pipeline.model.config.id2label
    label_to_int = {name: idx for idx, name in id2label.items()}
    positive_label = _resolve_positive_label(id2label)
    if positive_label is None:
        # Keep going (best effort), but make the degenerate scores visible.
        print("Warning: could not determine the positive-class label from id2label; "
              "positive-class probabilities will be reported as 0.0.")

    cleaned_texts = [preprocess_text_bert_input(t) for t in texts]
    preds = []
    positive_probs = []
    for start in range(0, len(cleaned_texts), chunk_size):
        batch_texts = cleaned_texts[start:start + chunk_size]
        # top_k=None asks the pipeline for the score of every label; it is the
        # replacement for the deprecated return_all_scores=True.
        batch_scores = bert_pipeline(batch_texts, truncation=True, padding=True,
                                     max_length=max_length, top_k=None)
        for label_scores in batch_scores:
            best = max(label_scores, key=lambda entry: entry['score'])
            preds.append(label_to_int[best['label']])
            positive_probs.append(next(
                (entry['score'] for entry in label_scores if entry['label'] == positive_label),
                0.0,
            ))
    return preds, positive_probs


def _predict_with_nb(nb_classifier, nb_vectorizer, texts):
    """Vectorise texts and classify them; return (preds, positive-class probs)."""
    features = nb_vectorizer.transform([preprocess_text_nb(t) for t in texts])
    preds = nb_classifier.predict(features)
    probs = nb_classifier.predict_proba(features)
    # predict_proba columns follow classifier.classes_; locate class 1 explicitly
    # and only fall back to column 1 when that cannot be determined.
    known_classes = list(getattr(nb_classifier, "classes_", []))
    positive_col = known_classes.index(1) if 1 in known_classes else 1
    return preds, probs[:, positive_col].tolist()


def _save_confusion_matrix(labels, preds, model_name_prefix, output_dir):
    """Save a 2x2 Human/AI confusion-matrix heatmap as a PNG."""
    # labels=[0, 1] pins the matrix to 2x2 so it always matches the tick
    # labels, even if a class is absent from both labels and predictions.
    cm = confusion_matrix(labels, preds, labels=[0, 1])
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted Human', 'Predicted AI'],
                yticklabels=['Actual Human', 'Actual AI'])
    plt.title(f'Confusion Matrix for {model_name_prefix}')
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name_prefix}_confusion_matrix.png"))
    plt.close()


def _save_roc_curve(labels, positive_probs, model_name_prefix, output_dir):
    """Save the ROC curve as a PNG and return the area under it."""
    fpr, tpr, _ = roc_curve(labels, positive_probs, pos_label=1)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC for {model_name_prefix}')
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name_prefix}_roc_curve.png"))
    plt.close()
    return roc_auc


def _save_pr_curve(labels, positive_probs, model_name_prefix, output_dir):
    """Save the precision-recall curve as a PNG and return the average precision."""
    precision_vals, recall_vals, _ = precision_recall_curve(labels, positive_probs, pos_label=1)
    ap = average_precision_score(labels, positive_probs, pos_label=1)
    plt.figure()
    plt.plot(recall_vals, precision_vals, lw=2, color='blue', label=f'PR curve (AP = {ap:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'Precision-Recall Curve for {model_name_prefix}')
    plt.legend(loc="best")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name_prefix}_pr_curve.png"))
    plt.close()
    return ap


def _save_prob_distribution(labels, positive_probs, model_name_prefix, output_dir):
    """Save overlaid histograms of predicted AI probability, split by true class."""
    probs_human_true = [p for p, label in zip(positive_probs, labels) if label == 0]
    probs_ai_true = [p for p, label in zip(positive_probs, labels) if label == 1]
    plt.figure()
    if probs_human_true:
        sns.histplot(probs_human_true, bins=30, alpha=0.6, label='True Human', color='skyblue', kde=False)
    if probs_ai_true:
        sns.histplot(probs_ai_true, bins=30, alpha=0.6, label='True AI', color='salmon', kde=False)
    plt.title(f'Distribution of Predicted AI Probabilities ({model_name_prefix})')
    plt.xlabel('Predicted Probability of Being AI-Generated')
    plt.ylabel('Frequency')
    if probs_human_true or probs_ai_true:
        plt.legend()  # a legend with no labelled artists only triggers a warning
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name_prefix}_prob_distribution.png"))
    plt.close()


def evaluate_model_on_data(texts, labels, prompts, model_name_prefix, model_obj,
                           is_bert=False, bert_tokenizer_obj=None, nb_vectorizer_obj=None,
                           output_dir="evaluation_outputs"):
    """Evaluate one classifier on labelled texts and save diagnostic plots.

    Labels use 0 for human-written and 1 for AI-generated text; class 1 is
    treated as the positive class for every metric.

    Args:
        texts: Raw input strings.
        labels: True labels (0 or 1), aligned with ``texts``.
        prompts: Prompt associated with each text, aligned with ``texts``.
        model_name_prefix: Name used in log messages, plot titles and the
            output file names.
        model_obj: A text-classification pipeline when ``is_bert`` is true,
            otherwise a classifier exposing ``predict`` and ``predict_proba``.
        is_bert: Selects the BERT pipeline path instead of the TF-IDF path.
        bert_tokenizer_obj: Currently unused; kept so existing callers work.
        nb_vectorizer_obj: Fitted vectorizer exposing ``transform``; required
            when ``is_bert`` is false.
        output_dir: Directory (created if missing) that receives the PNGs.

    Returns:
        ``(metrics, misclassified_examples)``: ``metrics`` is a dict with
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc`` and
        ``average_precision`` (the last two are NaN when ``labels`` holds a
        single class); ``misclassified_examples`` is a list of dicts
        describing each wrongly classified text.

    Raises:
        ValueError: If the three input sequences differ in length, or if
            ``is_bert`` is false and no vectorizer was supplied.
    """
    if not (len(texts) == len(labels) == len(prompts)):
        raise ValueError(
            f"texts, labels and prompts must have the same length "
            f"(got {len(texts)}, {len(labels)}, {len(prompts)})"
        )

    print(f"\nEvaluating {model_name_prefix}...")
    os.makedirs(output_dir, exist_ok=True)

    if is_bert:
        preds, all_probs_positive_class = _predict_with_bert(model_obj, texts)
    else:
        if nb_vectorizer_obj is None:
            raise ValueError("nb_vectorizer_obj is required when is_bert is False")
        preds, all_probs_positive_class = _predict_with_nb(model_obj, nb_vectorizer_obj, texts)

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0)
    metrics = {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}

    misclassified_examples = [
        {
            "prompt": prompts[i],
            "text": text,
            "text_length_words": len(text.split()),
            "true_label": "Human" if labels[i] == 0 else "AI",
            "predicted_label": "Human" if preds[i] == 0 else "AI",
            "prob_ai_generated": all_probs_positive_class[i],  # Prob of being AI
        }
        for i, text in enumerate(texts)
        if preds[i] != labels[i]
    ]

    print(f"Results for {model_name_prefix} → Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}")

    _save_confusion_matrix(labels, preds, model_name_prefix, output_dir)

    # ROC and PR curves are only computed when both classes are present.
    if len(np.unique(labels)) > 1:
        metrics['roc_auc'] = _save_roc_curve(
            labels, all_probs_positive_class, model_name_prefix, output_dir)
        metrics['average_precision'] = _save_pr_curve(
            labels, all_probs_positive_class, model_name_prefix, output_dir)
    else:
        metrics['roc_auc'] = np.nan
        metrics['average_precision'] = np.nan

    _save_prob_distribution(labels, all_probs_positive_class, model_name_prefix, output_dir)

    return metrics, misclassified_examples

if __name__ == "__main__":
    # Driver: load both trained models, evaluate each on every dev subset,
    # then print a per-dataset table and per-model averages.
    DEV_SET_ROOT_DIR = "./devset/"
    NAIVE_BAYES_MODEL_PATH = "baseline_saved_model/naive_bayes_model.joblib"
    NAIVE_BAYES_VECTORIZER_PATH = "baseline_saved_model/tfidf_vectorizer.joblib"
    BERT_MODEL_DIR = "./bert_ai_detector_final"
    SUMMARY_METRICS = ['accuracy', 'precision', 'recall', 'f1']

    # Load Naive Bayes Model and Vectorizer
    # NOTE(security): joblib.load unpickles arbitrary Python objects; only
    # load model files that come from a trusted source.
    nb_clf_loaded = joblib.load(NAIVE_BAYES_MODEL_PATH)
    tfidf_vec_loaded = joblib.load(NAIVE_BAYES_VECTORIZER_PATH)

    # Load BERT Model and Tokenizer
    device = 0 if torch.cuda.is_available() else -1  # pipeline convention: -1 means CPU
    print(f"Using device: {'cuda:0' if device == 0 else 'cpu'} for BERT.")

    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_DIR)
    bert_pipeline_loaded = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )

    dev_filenames = [
        "arxiv_chatGPT.jsonl",
        "arxiv_cohere.jsonl",
        "reddit_chatGPT.jsonl",
        "reddit_cohere.jsonl"
    ]

    def _print_top_errors(tag, dataset_name, errors, limit=3):
        """Print the first `limit` misclassified examples for one model/dataset pair."""
        print(f"Top {limit} {tag} Misclassified for {dataset_name}:")
        for rank, err in enumerate(errors[:limit], start=1):
            print(f"  {tag}_Err {rank}: True={err['true_label']}, Pred={err['predicted_label']}, Text='{err['text'][:100]}...'")

    all_results_summary = []  # To store dicts for final DataFrame

    for filename in dev_filenames:
        filepath = os.path.join(DEV_SET_ROOT_DIR, filename)
        dataset_short_name = filename.replace(".jsonl", "")
        print(f"\n\n--- Processing Dev File: {filepath} ---")

        # One missing or empty subset should not abort the whole evaluation.
        if not os.path.isfile(filepath):
            print(f"Warning: Dev file not found, skipping: {filepath}")
            continue
        texts, labels, prompts = load_dev_data_from_single_file(filepath)
        if not texts:
            print(f"Warning: No usable examples in {filepath}, skipping.")
            continue

        # Evaluate Naive Bayes
        if nb_clf_loaded is not None and tfidf_vec_loaded is not None:
            nb_metrics, nb_errors = evaluate_model_on_data(
                texts, labels, prompts,
                model_name_prefix=f"NaiveBayes_{dataset_short_name}",
                model_obj=nb_clf_loaded,
                is_bert=False,
                nb_vectorizer_obj=tfidf_vec_loaded
            )
            nb_metrics['model'] = "NaiveBayes"
            nb_metrics['dataset'] = dataset_short_name
            all_results_summary.append(nb_metrics)
            _print_top_errors("NB", dataset_short_name, nb_errors)

        # Evaluate BERT
        if bert_pipeline_loaded is not None:
            bert_metrics, bert_errors = evaluate_model_on_data(
                texts, labels, prompts,
                model_name_prefix=f"BERT_{dataset_short_name}",
                model_obj=bert_pipeline_loaded,
                is_bert=True
            )
            bert_metrics['model'] = "BERT"
            bert_metrics['dataset'] = dataset_short_name
            all_results_summary.append(bert_metrics)
            _print_top_errors("BERT", dataset_short_name, bert_errors)

    # Print Results Table
    if all_results_summary:
        results_df = pd.DataFrame(all_results_summary)
        print("\n\n--- Overall Dev Set Performance Summary ---")
        print(results_df[['model', 'dataset'] + SUMMARY_METRICS])
        print("\n\n--- Average Performance Across Dev Subsets (per model) ---")
        for col in SUMMARY_METRICS:
            results_df[col] = pd.to_numeric(results_df[col], errors='coerce')
        avg_performance = results_df.groupby('model')[SUMMARY_METRICS].mean()
        print(avg_performance)

Device set to use cuda:0


Naive Bayes model and vectorizer loaded successfully.
Using device: cuda:0 for BERT.


--- Processing Dev File: ./devset/arxiv_chatGPT.jsonl ---

Evaluating NaiveBayes_arxiv_chatGPT...
Results for NaiveBayes_arxiv_chatGPT → Acc: 0.5095, Prec: 0.5718, Rec: 0.0757, F1: 0.1336
Top 3 NB Misclassified for arxiv_chatGPT:
  NB_Err 1: True=AI, Pred=Human, Text='In this paper, we investigate the continuum limit of polymer quantum mechanics. The aim of our work ...'
  NB_Err 2: True=AI, Pred=Human, Text='In this paper, we present the results of our analysis of the Serpens star-forming region using data ...'
  NB_Err 3: True=AI, Pred=Human, Text='In this work, we present a new method of integrating stochastic differential equations on Lie groups...'

Evaluating BERT_arxiv_chatGPT...




Results for BERT_arxiv_chatGPT → Acc: 0.8642, Prec: 0.7987, Rec: 0.9737, F1: 0.8776
Top 3 BERT Misclassified for arxiv_chatGPT:
  BERT_Err 1: True=Human, Pred=AI, Text='  A rather non-standard quantum representation of the canonical commutation
relations of quantum mec...'
  BERT_Err 2: True=Human, Pred=AI, Text='  We present Lie group integrators for nonlinear stochastic differential
equations with non-commutat...'
  BERT_Err 3: True=Human, Pred=AI, Text='  The multisite phosphorylation-dephosphorylation cycle is a motif repeatedly
used in cell signaling...'


--- Processing Dev File: ./devset/arxiv_cohere.jsonl ---

Evaluating NaiveBayes_arxiv_cohere...
Results for NaiveBayes_arxiv_cohere → Acc: 0.6597, Prec: 0.8690, Rec: 0.3760, F1: 0.5249
Top 3 NB Misclassified for arxiv_cohere:
  NB_Err 1: True=AI, Pred=Human, Text='
We consider a system of many polymers in solution that interact via an external force that is appli...'
  NB_Err 2: True=AI, Pred=Human, Text='

Spectroscopic Observa



Results for BERT_arxiv_cohere → Acc: 0.8147, Prec: 0.7810, Rec: 0.8747, F1: 0.8252
Top 3 BERT Misclassified for arxiv_cohere:
  BERT_Err 1: True=Human, Pred=AI, Text='  A rather non-standard quantum representation of the canonical commutation
relations of quantum mec...'
  BERT_Err 2: True=AI, Pred=Human, Text='

We present a catalog of 66 YSOs in the Serpens cloud, as observed with IRAC and MIPS, and discuss ...'
  BERT_Err 3: True=Human, Pred=AI, Text='  We present Lie group integrators for nonlinear stochastic differential
equations with non-commutat...'


--- Processing Dev File: ./devset/reddit_chatGPT.jsonl ---

Evaluating NaiveBayes_reddit_chatGPT...
Results for NaiveBayes_reddit_chatGPT → Acc: 0.4757, Prec: 0.4657, Rec: 0.3300, F1: 0.3863
Top 3 NB Misclassified for reddit_chatGPT:
  NB_Err 1: True=Human, Pred=AI, Text='In 1801, James Monroe and Robert R. Livingston (the R. also stood for Robert, oddly enough) were sen...'
  NB_Err 2: True=Human, Pred=AI, Text='Good question! [I



Results for BERT_reddit_chatGPT → Acc: 0.8938, Prec: 0.8267, Rec: 0.9967, F1: 0.9037
Top 3 BERT Misclassified for reddit_chatGPT:
  BERT_Err 1: True=Human, Pred=AI, Text='Henry died in a joust against the captain of his Scottish Guard, Gabriel, the Count of Montgomery.  ...'
  BERT_Err 2: True=Human, Pred=AI, Text='Watergate is an incredibly interesting period of political history that I feel is greatly misunderst...'
  BERT_Err 3: True=Human, Pred=AI, Text='No, in medieval Europe,  there were no restrictions on common soldiers killing a king or any other a...'


--- Processing Dev File: ./devset/reddit_cohere.jsonl ---

Evaluating NaiveBayes_reddit_cohere...
Results for NaiveBayes_reddit_cohere → Acc: 0.5998, Prec: 0.3699, Rec: 0.5467, F1: 0.4413
Top 3 NB Misclassified for reddit_cohere:
  NB_Err 1: True=AI, Pred=Human, Text='

The English king Henry II was famously involved in a 1559 jousting accident in which he accidental...'
  NB_Err 2: True=Human, Pred=AI, Text='In 1801, James Mo



Results for BERT_reddit_cohere → Acc: 0.8457, Prec: 0.6561, Rec: 0.9803, F1: 0.7861
Top 3 BERT Misclassified for reddit_cohere:
  BERT_Err 1: True=Human, Pred=AI, Text='Henry died in a joust against the captain of his Scottish Guard, Gabriel, the Count of Montgomery.  ...'
  BERT_Err 2: True=Human, Pred=AI, Text='Watergate is an incredibly interesting period of political history that I feel is greatly misunderst...'
  BERT_Err 3: True=Human, Pred=AI, Text='No, in medieval Europe,  there were no restrictions on common soldiers killing a king or any other a...'


--- Overall Dev Set Performance Summary ---
        model         dataset  accuracy  precision    recall        f1
0  NaiveBayes   arxiv_chatGPT  0.509500   0.571788  0.075667  0.133647
1        BERT   arxiv_chatGPT  0.864167   0.798742  0.973667  0.877572
2  NaiveBayes    arxiv_cohere  0.659667   0.869029  0.376000  0.524895
3        BERT    arxiv_cohere  0.814667   0.780952  0.874667  0.825157
4  NaiveBayes  reddit_chatGPT  0.