In [1]:
import json
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import re
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_text_nb(text): # For Naive Bayes
    """Normalize raw text for the Naive Bayes / TF-IDF model.

    The text is lowercased, every character that is not a-z, 0-9 or
    whitespace is dropped, and runs of whitespace are collapsed so the
    result is a single-space-separated string of tokens.
    """
    cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
    # split() with no argument discards leading/trailing/repeated whitespace.
    return " ".join(cleaned.split())

def preprocess_text_bert_input(text):
    """Normalize raw text before it is handed to the BERT pipeline.

    Lowercases the text, replaces anything that looks like an HTML/XML tag
    (``<...>``) with a space, then collapses whitespace runs to one space
    and trims both ends.
    """
    without_tags = re.sub(r"<[^>]+>", " ", text.lower())
    single_spaced = re.sub(r"\s+", " ", without_tags)
    return single_spaced.strip()

def load_dev_data_from_single_file(filepath):
    """Load human/machine text pairs from one JSONL dev file.

    Each non-blank line is parsed as a JSON object. A non-empty string under
    "human_text" is emitted with label 0 and a non-empty string under
    "machine_text" with label 1; both carry the record's "prompt" (or "N/A"
    when the key is absent).

    Lines that cannot be parsed or processed are reported with a warning
    (including their real 1-based line number) and skipped. Blank lines are
    skipped silently.

    Args:
        filepath: Path to a UTF-8 encoded JSON-lines file.

    Returns:
        A tuple ``(texts, labels, processed_prompts)`` of three equally long
        lists.

    Raises:
        OSError: If the file cannot be opened.
    """
    texts = []
    labels = []
    processed_prompts = []

    def _is_usable(value):
        # Only non-empty, non-whitespace strings count as a usable sample.
        return isinstance(value, str) and len(value.strip()) > 0

    with open(filepath, 'r', encoding='utf-8') as f:
        # enumerate gives the true position in the file, so warnings stay
        # accurate even after earlier lines were skipped or failed.
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # blank lines (e.g. a trailing newline) are not errors
            try:
                data = json.loads(stripped)
                # Read every field before appending so that a malformed
                # record can never be added half-way.
                human_text = data.get("human_text")
                machine_text = data.get("machine_text")
                prompt = data.get("prompt", "N/A") # Get prompt if available

                if _is_usable(human_text):
                    texts.append(human_text)
                    labels.append(0)
                    processed_prompts.append(prompt)

                if _is_usable(machine_text):
                    texts.append(machine_text)
                    labels.append(1)
                    processed_prompts.append(prompt)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode JSON from line in {filepath}: {stripped}")
            except Exception as e:
                # Best-effort loader: report the record (e.g. JSON that is
                # not an object) and keep going.
                print(f"Warning: Error processing line {line_no} in {filepath}: {stripped} - {e}")
    return texts, labels, processed_prompts

# Evaluation Function
def evaluate_model_on_data(texts, labels, prompts, model_name_prefix, model_obj, 
                           is_bert=False, bert_tokenizer_obj=None, nb_vectorizer_obj=None,
                           bert_batch_size=64, bert_max_length=256):
    """Evaluate one classifier on a labelled set of texts.

    Args:
        texts: Raw input texts.
        labels: Ground-truth labels aligned with ``texts`` (0 = Human, 1 = AI).
        prompts: Prompts aligned with ``texts``; only used to annotate the
            misclassified examples.
        model_name_prefix: Name used in the progress/result messages.
        model_obj: A text-classification pipeline when ``is_bert`` is true,
            otherwise a fitted classifier exposing ``predict``.
        is_bert: Selects the BERT branch (True) or the Naive Bayes branch.
        bert_tokenizer_obj: Accepted for backward compatibility; not used.
        nb_vectorizer_obj: Fitted vectorizer exposing ``transform``; required
            when ``is_bert`` is false.
        bert_batch_size: Number of texts the pipeline forwards per batch.
        bert_max_length: Truncation length passed to the pipeline tokenizer.

    Returns:
        ``(metrics, misclassified_examples)`` where ``metrics`` is a dict with
        "accuracy", "precision", "recall" and "f1" (binary, positive class 1)
        and ``misclassified_examples`` is a list of dicts with "prompt",
        "text", "true_label" and "predicted_label".

    Raises:
        ValueError: If ``texts``, ``labels`` and ``prompts`` differ in length,
            or if the Naive Bayes branch is selected without a vectorizer.
    """
    if not (len(texts) == len(labels) == len(prompts)):
        raise ValueError(
            "texts, labels and prompts must have the same length "
            f"(got {len(texts)}, {len(labels)}, {len(prompts)})"
        )

    print(f"\nEvaluating {model_name_prefix}...")

    if is_bert:
        # Preprocess texts for BERT input
        processed_texts_for_bert = [preprocess_text_bert_input(t) for t in texts]

        # Using the passed model_obj
        bert_pipeline = model_obj
        # Invert id2label so pipeline label strings map back to class ids.
        # NOTE(review): assumes the id2label keys equal the 0/1 label ids used
        # in `labels` - confirm against the fine-tuned model config.
        label_to_int = {name: idx for idx, name in bert_pipeline.model.config.id2label.items()}

        if processed_texts_for_bert:
            # One call with an explicit batch_size lets the pipeline batch the
            # forward passes itself; calling it chunk by chunk without
            # batch_size still runs the texts one at a time.
            raw_preds = bert_pipeline(
                processed_texts_for_bert,
                batch_size=bert_batch_size,
                truncation=True,
                padding=True,
                max_length=bert_max_length,
            )
        else:
            raw_preds = []
        preds = [label_to_int[p['label']] for p in raw_preds]

    else: # Naive Bayes
        if nb_vectorizer_obj is None:
            raise ValueError("nb_vectorizer_obj is required when is_bert is False")
        processed_texts_for_nb = [preprocess_text_nb(t) for t in texts]
        X_dev_tfidf = nb_vectorizer_obj.transform(processed_texts_for_nb)
        nb_classifier = model_obj
        preds = nb_classifier.predict(X_dev_tfidf)

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", pos_label=1, zero_division=0)

    # Collect every example whose prediction disagrees with the ground truth.
    misclassified_examples = [
        {
            "prompt": prompt,
            "text": text,
            "true_label": "Human" if truth == 0 else "AI",
            "predicted_label": "Human" if pred == 0 else "AI",
        }
        for prompt, text, truth, pred in zip(prompts, texts, labels, preds)
        if pred != truth
    ]

    print(f"Results for {model_name_prefix} → Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}")
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}, misclassified_examples

if __name__ == "__main__":
    # Driver: load both saved models, evaluate each of them on every dev
    # file, print a few misclassified examples and a final summary table.

    # Locations of the dev set and of the previously saved models.
    DEV_SET_ROOT_DIR = "./devset/"
    NAIVE_BAYES_MODEL_PATH = "baseline_saved_model/naive_bayes_model.joblib"
    NAIVE_BAYES_VECTORIZER_PATH = "baseline_saved_model/tfidf_vectorizer.joblib"
    BERT_MODEL_DIR = "./bert_ai_detector_final"

    def _print_top_errors(tag, dataset_name, errors, limit=3):
        """Print the first `limit` misclassified examples for one model/dataset."""
        print(f"Top {limit} {tag} Misclassified for {dataset_name}:")
        for i, err in enumerate(errors[:limit]):
            print(f"  {tag}_Err {i+1}: True={err['true_label']}, Pred={err['predicted_label']}, Text='{err['text'][:100]}...'")

    # Load Naive Bayes Model and Vectorizer
    # NOTE(review): joblib.load is pickle-based and can execute arbitrary code
    # while loading; only point these paths at artifacts you produced yourself.
    nb_clf_loaded = joblib.load(NAIVE_BAYES_MODEL_PATH)
    tfidf_vec_loaded = joblib.load(NAIVE_BAYES_VECTORIZER_PATH)
    print("Naive Bayes model and vectorizer loaded successfully.")

    # Load BERT Model and Tokenizer
    # Check if CUDA is available (pipeline convention: 0 = first GPU, -1 = CPU)
    device = 0 if torch.cuda.is_available() else -1
    print(f"Using device: {'cuda:0' if device == 0 else 'cpu'} for BERT.")

    # Load tokenizer and model first to ensure they exist
    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL_DIR)

    bert_pipeline_loaded = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )

    dev_filenames = [
        "arxiv_chatGPT.jsonl",
        "arxiv_cohere.jsonl",
        "reddit_chatGPT.jsonl",
        "reddit_cohere.jsonl"
    ]

    all_results_summary = [] # To store dicts for final DataFrame

    for filename in dev_filenames:
        filepath = os.path.join(DEV_SET_ROOT_DIR, filename)
        dataset_short_name = filename.replace(".jsonl", "")
        print(f"\n\n--- Processing Dev File: {filepath} ---")
        texts, labels, prompts = load_dev_data_from_single_file(filepath)

        # Metrics over zero examples are meaningless; skip the file instead
        # of feeding empty inputs to the metric functions.
        if not texts:
            print(f"Warning: No usable examples loaded from {filepath}; skipping evaluation.")
            continue

        # Evaluate Naive Bayes
        # Compare against None explicitly: truthiness of a model object may
        # depend on __len__ and is not a reliable "was it loaded" check.
        if nb_clf_loaded is not None and tfidf_vec_loaded is not None:
            nb_metrics, nb_errors = evaluate_model_on_data(
                texts, labels, prompts,
                model_name_prefix=f"NaiveBayes_{dataset_short_name}",
                model_obj=nb_clf_loaded,
                is_bert=False,
                nb_vectorizer_obj=tfidf_vec_loaded
            )
            nb_metrics['model'] = "NaiveBayes" # For grouping
            nb_metrics['dataset'] = dataset_short_name
            all_results_summary.append(nb_metrics)
            _print_top_errors("NB", dataset_short_name, nb_errors)

        # Evaluate BERT
        if bert_pipeline_loaded is not None:
            bert_metrics, bert_errors = evaluate_model_on_data(
                texts, labels, prompts,
                model_name_prefix=f"BERT_{dataset_short_name}",
                model_obj=bert_pipeline_loaded,
                is_bert=True
            )
            bert_metrics['model'] = "BERT" # For grouping
            bert_metrics['dataset'] = dataset_short_name
            all_results_summary.append(bert_metrics)
            _print_top_errors("BERT", dataset_short_name, bert_errors)


    # Print Results Table
    if all_results_summary:
        metric_columns = ['accuracy', 'precision', 'recall', 'f1']
        results_df = pd.DataFrame(all_results_summary)
        print("\n\n--- Overall Dev Set Performance Summary ---")
        print(results_df[['model', 'dataset'] + metric_columns])
        print("\n\n--- Average Performance Across Dev Subsets (per model) ---")
        # Force the metric columns to numeric so the per-model mean is well defined.
        for col in metric_columns:
            results_df[col] = pd.to_numeric(results_df[col], errors='coerce')
        avg_performance = results_df.groupby('model')[metric_columns].mean()
        print(avg_performance)

Naive Bayes model and vectorizer loaded successfully.
Using device: cuda:0 for BERT.


Device set to use cuda:0




--- Processing Dev File: ./devset/arxiv_chatGPT.jsonl ---

Evaluating NaiveBayes_arxiv_chatGPT...
Results for NaiveBayes_arxiv_chatGPT → Acc: 0.5312, Prec: 0.6866, Rec: 0.1147, F1: 0.1965
Top 3 NB Misclassified for arxiv_chatGPT:
  NB_Err 1: True=AI, Pred=Human, Text='In this paper, we investigate the continuum limit of polymer quantum mechanics. The aim of our work ...'
  NB_Err 2: True=AI, Pred=Human, Text='In this work, we present the results of high-resolution spectroscopic observations of the intermedia...'
  NB_Err 3: True=AI, Pred=Human, Text='In this work, we present a new method of integrating stochastic differential equations on Lie groups...'

Evaluating BERT_arxiv_chatGPT...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Results for BERT_arxiv_chatGPT → Acc: 0.5333, Prec: 0.7128, Rec: 0.1117, F1: 0.1931
Top 3 BERT Misclassified for arxiv_chatGPT:
  BERT_Err 1: True=AI, Pred=Human, Text='In this paper, we investigate the continuum limit of polymer quantum mechanics. The aim of our work ...'
  BERT_Err 2: True=AI, Pred=Human, Text='In this paper, we present the results of our analysis of the Serpens star-forming region using data ...'
  BERT_Err 3: True=AI, Pred=Human, Text='In this work, we present the results of high-resolution spectroscopic observations of the intermedia...'


--- Processing Dev File: ./devset/arxiv_cohere.jsonl ---

Evaluating NaiveBayes_arxiv_cohere...
Results for NaiveBayes_arxiv_cohere → Acc: 0.6577, Prec: 0.8754, Rec: 0.3677, F1: 0.5178
Top 3 NB Misclassified for arxiv_cohere:
  NB_Err 1: True=AI, Pred=Human, Text='
We consider a system of many polymers in solution that interact via an external force that is appli...'
  NB_Err 2: True=AI, Pred=Human, Text='

Spectroscopic Observa