In [None]:
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import random
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def preprocess_text_for_bert(text):
    """Normalize a raw string before it is passed to the tokenizer.

    The text is lower-cased, every ``<...>`` span (markup-style tag) is
    replaced by a single space, and each run of whitespace is collapsed to
    one space with leading/trailing whitespace stripped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Tags become a space (not an empty string) so neighbouring words stay separated.
    without_tags = re.sub(r"<[^>]+>", " ", text.lower())
    return re.sub(r"\s+", " ", without_tags).strip()

def get_data_from_jsonl_for_bert(path, n_per_split=10000, seed=None):
    """Build a shuffled, labelled text dataset from a JSON-lines file.

    The first ``2 * n_per_split`` records of the file are read. From each
    record the ``"real"`` field yields a human sample (label 0) and the
    ``"gpt2"`` field yields an AI sample (label 1). A sample is kept only if,
    after ``preprocess_text_for_bert``, it contains at least 5
    whitespace-separated tokens. Collection for a class stops once
    ``n_per_split`` samples of that class have been kept.

    Args:
        path: Location of the JSON-lines file, passed to ``load_dataset`` as
            ``data_files``.
        n_per_split: Maximum number of samples to keep per class.
        seed: Optional seed for the final shuffle. With ``None`` (default) the
            module-level ``random`` state is used, as before; with a value the
            shuffle is reproducible and does not touch the global RNG state.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. Both lists are
        empty when no record passes the filters.
    """
    raw_ds = load_dataset("json", data_files=path, split=f"train[:{n_per_split*2}]")

    # (record field, class label) pairs, checked in this order for every record.
    sources = (("real", 0), ("gpt2", 1))  # 0 for human, 1 for AI
    kept_per_label = {label: 0 for _, label in sources}

    texts = []
    labels = []

    for item in raw_ds:
        for field, label in sources:
            if not item.get(field) or kept_per_label[label] >= n_per_split:
                continue
            processed_text = preprocess_text_for_bert(item[field])
            # Drop very short fragments (fewer than 5 words).
            if len(processed_text.split()) >= 5:
                texts.append(processed_text)
                labels.append(label)
                kept_per_label[label] += 1
        # Stop reading as soon as every class has reached its quota.
        if all(count >= n_per_split for count in kept_per_label.values()):
            break

    # zip(*[]) yields nothing to unpack, so an empty result must be returned
    # explicitly instead of falling through to the unzip below.
    if not texts:
        return [], []

    # Combine and shuffle so the two classes are interleaved.
    combined = list(zip(texts, labels))
    if seed is None:
        random.shuffle(combined)
    else:
        random.Random(seed).shuffle(combined)
    texts, labels = zip(*combined)

    return list(texts), list(labels)

# Name of the pretrained checkpoint; used for the tokenizer here and for the
# sequence-classification model loaded in the __main__ block.
MODEL_NAME = "bert-base-uncased"
# Module-level tokenizer, created once and read by tokenize_function.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is asked to truncate and to pad to ``max_length`` (256), so
    it is intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: A mapping with a ``"text"`` key holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for that input.
    """
    # Adjust max_length to trade sequence coverage against memory/speed.
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 256}
    return tokenizer(examples["text"], **encode_options)

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth returned element (support) is not needed here.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )[:3]
    scores = (accuracy_score(y_true, y_pred), f1, precision, recall)
    return dict(zip(('accuracy', 'f1', 'precision', 'recall'), scores))

if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------
    SEED = 42
    NUM_EPOCHS = 3
    TRAIN_BATCH_SIZE = 16  # Adjust based on GPU memory
    EVAL_BATCH_SIZE = 32
    WARMUP_FRACTION = 0.1  # share of the optimizer steps spent warming up

    # Seed every RNG up front: the data helper shuffles with `random`, and the
    # freshly initialised classifier head depends on the torch RNG state at
    # model-construction time. Without this a re-run gives different results.
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # ------------------------------------------------------------------
    # Load and prepare data
    # ------------------------------------------------------------------
    texts, labels = get_data_from_jsonl_for_bert("data/train.jsonl", n_per_split=1000)

    # Stratified 80/20 split keeps the class balance in both partitions.
    train_texts, test_texts, train_labels, test_labels = sk_train_test_split(
        texts, labels, test_size=0.2, random_state=SEED, stratify=labels
    )

    # Create Hugging Face Dataset objects
    train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
    test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})

    # Tokenize
    tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Remove the raw text column; only the encoded inputs and labels are kept.
    tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"]).with_format("torch")
    tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"]).with_format("torch")

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    # ------------------------------------------------------------------
    # Training arguments
    # ------------------------------------------------------------------
    # Derive the warm-up length from the real number of optimizer steps. A
    # fixed 500-step warm-up is longer than the whole run on a dataset of this
    # size (e.g. 1600 examples / 16 per batch * 3 epochs = 300 steps), which
    # keeps the learning rate on the warm-up ramp for the entire training.
    # NOTE: assumes a single device and no gradient accumulation.
    steps_per_epoch = -(-len(tokenized_train_dataset) // TRAIN_BATCH_SIZE)  # ceil division
    total_train_steps = steps_per_epoch * NUM_EPOCHS
    warmup_steps = int(WARMUP_FRACTION * total_train_steps)

    training_args = TrainingArguments(
        output_dir="./bert_ai_detector_checkpoints",
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="tensorboard",
        fp16=torch.cuda.is_available(),
        seed=SEED,
    )

    # ------------------------------------------------------------------
    # Trainer
    # ------------------------------------------------------------------
    # NOTE: the same held-out split drives checkpoint selection
    # (load_best_model_at_end) and the final report below, so the printed
    # numbers are development-set scores, not an unbiased test estimate.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        compute_metrics=compute_metrics,
    )

    # Train
    print("Starting BERT model training...")
    trainer.train()

    # Evaluate on dev set
    print("\nEvaluating on the development set:")
    eval_results = trainer.evaluate()
    print(f"BERT Dev Set → Acc: {eval_results['eval_accuracy']:.4f}, Prec: {eval_results['eval_precision']:.4f}, Rec: {eval_results['eval_recall']:.4f}, F1: {eval_results['eval_f1']:.4f}")

    # Save the model and tokenizer
    print("\nSaving the fine-tuned model...")
    trainer.save_model("./bert_ai_detector_final")
    tokenizer.save_pretrained("./bert_ai_detector_final")
    print("Model and tokenizer saved to ./bert_ai_detector_final")

Map: 100%|██████████| 1600/1600 [00:00<00:00, 7830.80 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 7855.68 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting BERT model training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6891,0.658841,0.6575,0.632708,0.682081,0.59
2,0.597,0.623373,0.64,0.727273,0.585366,0.96
3,0.4263,0.552195,0.7325,0.689855,0.82069,0.595



Evaluating on the development set:


BERT Dev Set → Acc: 0.6400, Prec: 0.5854, Rec: 0.9600, F1: 0.7273

Saving the fine-tuned model...
Model and tokenizer saved to ./bert_ai_detector_final
