In [13]:
import json
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_recall_curve, average_precision_score, roc_curve, auc, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, pipeline
import seaborn as sns
import numpy as np
import joblib
import torch
import re
import pandas as pd
import matplotlib.pyplot as plt
import textstat
from tqdm import tqdm

In [14]:
def preprocess_text_nb(text):
    """Normalize text for the Naive Bayes / TF-IDF model.

    The input is lowercased, every character that is not ``a-z``, ``0-9`` or
    whitespace is deleted, and runs of whitespace are collapsed to single
    spaces with no leading or trailing whitespace.
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    tokens = alnum_only.split()
    return " ".join(tokens)

def preprocess_text_bert_input(text):
    """Normalize text before it is handed to the BERT tokenizer.

    The input is lowercased, anything shaped like ``<...>`` is replaced by a
    space, and whitespace runs are collapsed to a single space. Leading and
    trailing whitespace is stripped; punctuation is left in place.
    """
    lowered = text.lower()
    without_tags = re.sub(r"<[^>]+>", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", without_tags)
    return single_spaced.strip()

def load_dev_data_from_single_file(filepath):
    """Load a JSON Lines dev file into parallel text / label / prompt lists.

    Every line is parsed as one JSON object. A truthy ``"human_text"`` value
    contributes an example labelled 0 and a truthy ``"machine_text"`` value
    contributes an example labelled 1; both carry the record's ``"prompt"``
    (``"N/A"`` when the key is absent).

    Loading is best-effort: blank lines are ignored, and lines that are not
    valid JSON or do not decode to a JSON object are skipped. The number of
    skipped lines is printed so that data problems are visible rather than
    silently swallowed.

    Args:
        filepath: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A tuple ``(texts, labels, prompts)`` of three equal-length lists.

    Raises:
        OSError: If the file cannot be opened.
    """
    texts, labels, prompts = [], [], []
    skipped_lines = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank separators / trailing newlines are not data errors.
                continue
            try:
                data = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                skipped_lines += 1
                continue
            if not isinstance(data, dict):
                # Valid JSON but not a record (e.g. a bare list or number).
                skipped_lines += 1
                continue

            prompt = data.get("prompt", "N/A")
            for text_key, label in (("human_text", 0), ("machine_text", 1)):
                if data.get(text_key):
                    texts.append(data[text_key])
                    labels.append(label)
                    prompts.append(prompt)

    if skipped_lines:
        print(f"Warning: skipped {skipped_lines} malformed line(s) in {filepath}")
    return texts, labels, prompts

In [15]:
# Column order of the vector returned by extract_stylometric_features.
FEATURE_NAMES = [
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'gunning_fog',
    'smog_index',
    'automated_readability_index',
    'coleman_liau_index',
    'lexicon_count',
    'sentence_count',
    'avg_sentence_length',
    'avg_word_length',
    'type_token_ratio',
]


def extract_stylometric_features(text: str) -> np.ndarray:
    """Compute the stylometric feature vector for one text.

    The first ten entries are the textstat measures listed in
    ``FEATURE_NAMES`` (same order); the last entry is the type-token ratio of
    the lowercased, whitespace-split text.

    Args:
        text: Raw text to describe.

    Returns:
        A 1-D array with ``len(FEATURE_NAMES)`` entries. It is a float32 array
        when the features were computed. For empty input, input with fewer
        than three whitespace-separated tokens, or when any computation
        raises, an all-zero array from ``np.zeros`` (default dtype) is
        returned instead.
    """
    n_features = len(FEATURE_NAMES)
    if not text:
        return np.zeros(n_features)
    if len(text.split()) < 3:
        return np.zeros(n_features)

    try:
        # Kept in the same order as FEATURE_NAMES[:-1].
        readability_fns = (
            textstat.flesch_reading_ease,
            textstat.flesch_kincaid_grade,
            textstat.gunning_fog,
            textstat.smog_index,
            textstat.automated_readability_index,
            textstat.coleman_liau_index,
            textstat.lexicon_count,
            textstat.sentence_count,
            textstat.avg_sentence_length,
            textstat.avg_word_length,
        )
        values = [measure(text) for measure in readability_fns]

        tokens = text.lower().split()
        if tokens:
            type_token_ratio = len(set(tokens)) / len(tokens)
        else:
            type_token_ratio = 0
        values.append(type_token_ratio)
        return np.array(values, dtype=np.float32)
    except Exception:
        # Best-effort: any failure yields the neutral all-zero vector.
        return np.zeros(n_features)

In [16]:
def evaluate_model_on_data(texts, labels, prompts, model_name_prefix, preds, probs, output_dir="evaluation_outputs"):
    """Score one model's predictions, save diagnostic plots and collect errors.

    Accuracy and binary precision / recall / F1 (class 1 as the positive
    class) are computed and printed. Up to four PNG files named
    ``{model_name_prefix}_<plot>.png`` are written into ``output_dir``, which
    is created when missing: confusion matrix, ROC curve, precision-recall
    curve, and a histogram of ``probs`` split by true label. The ROC and PR
    plots are produced only when ``labels`` contains more than one distinct
    value; otherwise ``roc_auc`` and ``average_precision`` are set to NaN.

    Args:
        texts: Evaluated texts; its length drives the misclassification scan.
        labels: True labels (0 = human, 1 = AI), indexed like ``texts``.
        prompts: Prompt for each text, indexed like ``texts``.
        model_name_prefix: Name used in printed messages, titles and filenames.
        preds: Predicted labels, indexed like ``texts``.
        probs: Predicted score for class 1, indexed like ``texts``.
        output_dir: Directory that receives the PNG files.

    Returns:
        ``(metrics, misclassified_examples)``: a dict of metric name to value,
        and a list with one dict per example where ``preds[i] != labels[i]``.
    """

    def save_and_close(plot_name):
        # Shared tail of every plot: tidy the layout, write the PNG, release the figure.
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f"{model_name_prefix}_{plot_name}.png"))
        plt.close()

    def label_name(value):
        return "Human" if value == 0 else "AI"

    print(f"\nEvaluating {model_name_prefix}")
    os.makedirs(output_dir, exist_ok=True)

    accuracy = accuracy_score(labels, preds)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )
    metrics = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f_score}
    print(f"Results for {model_name_prefix} → Acc: {accuracy:.4f}, Prec: {precision:.4f}, Rec: {recall:.4f}, F1: {f_score:.4f}")

    # Confusion matrix
    matrix = confusion_matrix(labels, preds)
    plt.figure(figsize=(6,5))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Predicted Human', 'Predicted AI'],
        yticklabels=['Actual Human', 'Actual AI'],
    )
    plt.title(f'Confusion Matrix for {model_name_prefix}')
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    save_and_close("confusion_matrix")

    # Threshold-free curves are only defined when both classes are present.
    has_both_classes = len(np.unique(labels)) > 1
    if has_both_classes:
        # ROC curve
        false_pos_rate, true_pos_rate, _ = roc_curve(labels, probs, pos_label=1)
        roc_auc = auc(false_pos_rate, true_pos_rate)
        metrics['roc_auc'] = roc_auc
        plt.figure()
        plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC for {model_name_prefix}')
        plt.legend(loc="lower right")
        save_and_close("roc_curve")

        # Precision-recall curve
        curve_precision, curve_recall, _ = precision_recall_curve(labels, probs, pos_label=1)
        avg_precision = average_precision_score(labels, probs, pos_label=1)
        metrics['average_precision'] = avg_precision
        plt.figure()
        plt.plot(curve_recall, curve_precision, lw=2, color='blue', label=f'PR curve (AP = {avg_precision:.2f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title(f'Precision-Recall Curve for {model_name_prefix}')
        plt.legend(loc="best")
        save_and_close("pr_curve")
    else:
        metrics['roc_auc'] = np.nan
        metrics['average_precision'] = np.nan

    # Distribution of predicted scores, split by the true class.
    human_scores, ai_scores = [], []
    for idx, true_label in enumerate(labels):
        if true_label == 0:
            human_scores.append(probs[idx])
        elif true_label == 1:
            ai_scores.append(probs[idx])

    plt.figure()
    histogram_specs = (
        (human_scores, 'True Human', 'skyblue'),
        (ai_scores, 'True AI', 'salmon'),
    )
    for scores, legend_label, colour in histogram_specs:
        if scores:
            sns.histplot(scores, bins=30, alpha=0.6, label=legend_label, color=colour, kde=False)
    plt.title(f'Distribution of Predicted AI Probabilities ({model_name_prefix})')
    plt.xlabel('Predicted Probability of Being AI-Generated')
    plt.ylabel('Frequency')
    plt.legend()
    save_and_close("prob_distribution")

    # One record per example whose prediction disagrees with the true label.
    misclassified_examples = [
        {
            "prompt": prompts[idx],
            "text": texts[idx],
            "true_label": label_name(labels[idx]),
            "predicted_label": label_name(preds[idx]),
            "prob_ai_generated": probs[idx],
        }
        for idx in range(len(texts))
        if preds[idx] != labels[idx]
    ]
    return metrics, misclassified_examples

In [17]:
# Locations of the evaluation data, the output folder and the saved model artifacts.
DEV_SET_ROOT_DIR = "./devset/"
OUTPUT_DIR = "evaluation_outputs_final"
BERT_MODEL_DIR = "./bert_ai_detector_final"
NAIVE_BAYES_MODEL_PATH = "baseline_saved_model/naive_bayes_model.joblib"
NAIVE_BAYES_VECTORIZER_PATH = "baseline_saved_model/tfidf_vectorizer.joblib"
HYBRID_CLASSIFIER_PATH = "hybrid_model/hybrid_classifier.joblib"
SCALER_PATH = "hybrid_model/feature_scaler.joblib"

# Use a real torch.device instead of the pipeline-style integer (0 / -1).
# `device` is also passed to `.to(...)` on the encoder and on tokenized
# inputs, and -1 is not a valid torch device, so the integer form fails on
# CPU-only machines. The pipeline accepts a torch.device as well.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}.")

# Load All Models
# NOTE: joblib.load unpickles Python objects; only load artifacts you trust.
nb_clf = joblib.load(NAIVE_BAYES_MODEL_PATH)
tfidf_vec = joblib.load(NAIVE_BAYES_VECTORIZER_PATH)
hybrid_clf = joblib.load(HYBRID_CLASSIFIER_PATH)
scaler = joblib.load(SCALER_PATH)

# Fine-tuned classifier wrapped in a text-classification pipeline.
bert_pipeline = pipeline("text-classification", model=BERT_MODEL_DIR, tokenizer=BERT_MODEL_DIR, device=device)

# Headless encoder + tokenizer from the same directory, used to extract
# [CLS] embeddings for the hybrid model; eval() disables dropout.
bert_base_model = AutoModel.from_pretrained(BERT_MODEL_DIR).to(device).eval()
bert_base_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_DIR)

Using device: cuda:0.


Device set to use cuda:0


In [18]:
dev_filenames = ["arxiv_chatGPT.jsonl", "arxiv_cohere.jsonl", "reddit_chatGPT.jsonl", "reddit_cohere.jsonl"]
ethics_set_filenames = ["german_wikipedia.jsonl", "hewlett.json", "toefl.json"]
all_results_summary = []

# Label lookup for the fine-tuned classifier (loop-invariant, so resolved once).
# Class id 1 is the positive ("AI") class, matching the labels built by the loader.
id2label = bert_pipeline.model.config.id2label
positive_class_label_str = id2label.get(1) or 'LABEL_1'
label_to_id = {label: int(class_id) for class_id, label in id2label.items()}

for filename in dev_filenames:
    filepath = os.path.join(DEV_SET_ROOT_DIR, filename)
    dataset_short_name = filename.replace(".jsonl", "")
    print(f"\n\nProcessing Dev File: {dataset_short_name}")
    texts, labels, prompts = load_dev_data_from_single_file(filepath)
    if not texts:
        # Nothing to score; the vectorizer and metrics cannot handle an empty set.
        print(f"No usable examples found in {filepath}; skipping.")
        continue

    # Evaluate Naive Bayes
    nb_processed_texts = [preprocess_text_nb(t) for t in texts]
    X_dev_tfidf = tfidf_vec.transform(nb_processed_texts)
    nb_preds = nb_clf.predict(X_dev_tfidf)
    nb_probs = nb_clf.predict_proba(X_dev_tfidf)[:, 1]
    nb_metrics, nb_errors = evaluate_model_on_data(texts, labels, prompts, f"NaiveBayes_{dataset_short_name}", nb_preds, nb_probs, OUTPUT_DIR)
    all_results_summary.append({**nb_metrics, 'model': 'NaiveBayes', 'dataset': dataset_short_name})

    # Evaluate BERT
    bert_processed_texts = [preprocess_text_bert_input(t) for t in texts]
    # top_k=None asks for the scores of every class; it replaces the
    # deprecated return_all_scores=True.
    bert_preds_raw = bert_pipeline(bert_processed_texts, truncation=True, padding=True, max_length=256, top_k=None)

    # Read scores by label name rather than by list position: the order of the
    # returned entries is not guaranteed to follow class ids, and a per-example
    # lookup fails loudly (KeyError) instead of silently misaligning with `labels`.
    bert_probs, bert_preds = [], []
    for label_scores in bert_preds_raw:
        score_by_label = {entry['label']: entry['score'] for entry in label_scores}
        bert_probs.append(score_by_label[positive_class_label_str])
        bert_preds.append(label_to_id[max(score_by_label, key=score_by_label.get)])
    bert_metrics, bert_errors = evaluate_model_on_data(texts, labels, prompts, f"BERT_{dataset_short_name}", bert_preds, bert_probs, OUTPUT_DIR)
    all_results_summary.append({**bert_metrics, 'model': 'BERT', 'dataset': dataset_short_name})

    # Evaluate Hybrid Model
    print(f"\nExtracting hybrid features for {dataset_short_name}...")
    bert_feats_dev = []
    with torch.no_grad():
        # Reuse the texts already preprocessed for the BERT classifier above.
        for processed_text in tqdm(bert_processed_texts, desc="Hybrid BERT Features"):
            inputs = bert_base_tokenizer(processed_text, return_tensors="pt", truncation=True, max_length=256)
            # Send the inputs to wherever the encoder actually lives.
            inputs = inputs.to(bert_base_model.device)
            outputs = bert_base_model(**inputs)
            # First-token ([CLS]) embedding of the single sequence in the batch.
            bert_feats_dev.append(outputs.last_hidden_state[0, 0, :].cpu().numpy())

    style_feats_dev = np.array([extract_stylometric_features(t) for t in texts])
    # Hybrid input = [CLS] embedding followed by the stylometric features.
    X_hybrid_dev = np.concatenate([np.array(bert_feats_dev), style_feats_dev], axis=1)
    X_hybrid_dev_scaled = scaler.transform(X_hybrid_dev)

    hybrid_preds = hybrid_clf.predict(X_hybrid_dev_scaled)
    hybrid_probs = hybrid_clf.predict_proba(X_hybrid_dev_scaled)[:, 1]
    hybrid_metrics, hybrid_errors = evaluate_model_on_data(texts, labels, prompts, f"Hybrid_BERT_{dataset_short_name}", hybrid_preds, hybrid_probs, OUTPUT_DIR)
    all_results_summary.append({**hybrid_metrics, 'model': 'Hybrid_BERT', 'dataset': dataset_short_name})

# Final Results
if all_results_summary:
    results_df = pd.DataFrame(all_results_summary)
    print("\n\nOverall Dev Set Performance Summary (F1-Score):")
    print(results_df.pivot_table(index='model', columns='dataset', values='f1').round(4))
    print("\n\nAverage Performance Across Dev Subsets:")
    avg_performance = results_df.groupby('model')[['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'average_precision']].mean()
    print(avg_performance.round(4))



Processing Dev File: arxiv_chatGPT

Evaluating NaiveBayes_arxiv_chatGPT
Results for NaiveBayes_arxiv_chatGPT → Acc: 0.5095, Prec: 0.5718, Rec: 0.0757, F1: 0.1336





Evaluating BERT_arxiv_chatGPT
Results for BERT_arxiv_chatGPT → Acc: 0.9555, Prec: 0.9627, Rec: 0.9477, F1: 0.9551

Extracting hybrid features for arxiv_chatGPT...


Hybrid BERT Features: 100%|██████████| 6000/6000 [01:24<00:00, 71.37it/s]
  textstat.avg_sentence_length(text),



Evaluating Hybrid_BERT_arxiv_chatGPT
Results for Hybrid_BERT_arxiv_chatGPT → Acc: 0.9502, Prec: 0.9409, Rec: 0.9607, F1: 0.9507


Processing Dev File: arxiv_cohere

Evaluating NaiveBayes_arxiv_cohere
Results for NaiveBayes_arxiv_cohere → Acc: 0.6597, Prec: 0.8690, Rec: 0.3760, F1: 0.5249





Evaluating BERT_arxiv_cohere
Results for BERT_arxiv_cohere → Acc: 0.8843, Prec: 0.9565, Rec: 0.8053, F1: 0.8744

Extracting hybrid features for arxiv_cohere...


Hybrid BERT Features: 100%|██████████| 6000/6000 [01:21<00:00, 73.57it/s]
  textstat.avg_sentence_length(text),



Evaluating Hybrid_BERT_arxiv_cohere
Results for Hybrid_BERT_arxiv_cohere → Acc: 0.8963, Prec: 0.9339, Rec: 0.8530, F1: 0.8916


Processing Dev File: reddit_chatGPT

Evaluating NaiveBayes_reddit_chatGPT
Results for NaiveBayes_reddit_chatGPT → Acc: 0.4757, Prec: 0.4657, Rec: 0.3300, F1: 0.3863





Evaluating BERT_reddit_chatGPT
Results for BERT_reddit_chatGPT → Acc: 0.8777, Prec: 0.9926, Rec: 0.7610, F1: 0.8615

Extracting hybrid features for reddit_chatGPT...


Hybrid BERT Features: 100%|██████████| 6000/6000 [01:26<00:00, 69.69it/s]
  textstat.avg_sentence_length(text),



Evaluating Hybrid_BERT_reddit_chatGPT
Results for Hybrid_BERT_reddit_chatGPT → Acc: 0.9252, Prec: 0.9822, Rec: 0.8660, F1: 0.9205


Processing Dev File: reddit_cohere

Evaluating NaiveBayes_reddit_cohere
Results for NaiveBayes_reddit_cohere → Acc: 0.5998, Prec: 0.3699, Rec: 0.5467, F1: 0.4413





Evaluating BERT_reddit_cohere
Results for BERT_reddit_cohere → Acc: 0.9164, Prec: 0.9811, Rec: 0.7246, F1: 0.8336

Extracting hybrid features for reddit_cohere...


Hybrid BERT Features: 100%|██████████| 4220/4220 [00:57<00:00, 72.91it/s]
  textstat.avg_sentence_length(text),



Evaluating Hybrid_BERT_reddit_cohere
Results for Hybrid_BERT_reddit_cohere → Acc: 0.9374, Prec: 0.9552, Rec: 0.8221, F1: 0.8837


Overall Dev Set Performance Summary (F1-Score):
dataset      arxiv_chatGPT  arxiv_cohere  reddit_chatGPT  reddit_cohere
model                                                                  
BERT                0.9551        0.8744          0.8615         0.8336
Hybrid_BERT         0.9507        0.8916          0.9205         0.8837
NaiveBayes          0.1336        0.5249          0.3863         0.4413


Average Performance Across Dev Subsets:
             accuracy  precision  recall      f1  roc_auc  average_precision
model                                                                       
BERT           0.9085     0.9732  0.8096  0.8812   0.9762             0.9761
Hybrid_BERT    0.9273     0.9531  0.8754  0.9116   0.9811             0.9778
NaiveBayes     0.5611     0.5691  0.3321  0.3715   0.6028             0.5493
