In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn

In [2]:
# Load the HC3 dataset
# The Hub repository id and configuration name are kept as named values so they are easy to spot and change.
hc3_repo_id = "Hello-SimpleAI/HC3"
hc3_config_name = "all"
dataset = load_dataset(hc3_repo_id, hc3_config_name)

README.md:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

HC3.py:   0%|          | 0.00/9.47k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/39.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24322 [00:00<?, ? examples/s]

In [4]:
# Show the available splits, their feature names and row counts for the loaded dataset.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'human_answers', 'chatgpt_answers', 'source'],
        num_rows: 24322
    })
})


In [5]:
# Create a custom Dataset class
class HC3Dataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label."""

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Store the samples and the tokenization settings.

        Args:
            texts: Indexable collection of input texts.
            labels: Indexable collection of integer class IDs, aligned with ``texts``.
            tokenizer: Callable invoked with ``truncation``, ``padding``, ``max_length``
                and ``return_tensors`` keyword arguments; must return a mapping with
                ``"input_ids"`` and ``"attention_mask"`` tensors.
            max_length: Value forwarded to the tokenizer as ``max_length``.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with flattened ``"input_ids"`` and ``"attention_mask"`` tensors and a
            scalar ``torch.long`` ``"labels"`` tensor.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer is asked for "pt" tensors; flatten them so each item is 1-D.
        sample = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [6]:
# Prepare the dataset
def prepare_hc3_data(dataset):
    """Flatten the HC3 ``train`` split into parallel lists of texts and integer labels.

    For each entry, the first human answer and the first ChatGPT answer are used.
    Entries whose answer list is empty (or missing a value) are skipped for that
    source instead of raising ``IndexError``.

    Label IDs:
        0 -- human-written answer
        2 -- "GPT-4" (simulated: the first half of the ChatGPT answers)
        3 -- "Claude" (simulated: the remaining ChatGPT answers)

    ID 1 ("GPT-3") is never emitted: every ChatGPT answer is assigned to one of
    the two simulated classes.

    NOTE: The GPT-4 / Claude split is for demonstration only. Both classes come
    from the same ChatGPT answers; in practice, use real data from these models.

    Args:
        dataset: Mapping with a ``"train"`` iterable of entries, each providing
            ``"human_answers"`` and ``"chatgpt_answers"`` lists.

    Returns:
        tuple: ``(texts, labels)`` -- all human texts first, then all ChatGPT
        texts, with ``labels`` aligned index by index.
    """
    human_texts = []
    chatgpt_texts = []

    # Single pass over the split; each source is only used when it has an answer.
    for entry in dataset["train"]:
        if entry["human_answers"]:
            human_texts.append(entry["human_answers"][0])  # Take first human answer
        if entry["chatgpt_answers"]:
            chatgpt_texts.append(entry["chatgpt_answers"][0])  # Take first GPT answer

    # First half of the ChatGPT answers stands in for "GPT-4", the rest for "Claude".
    gpt4_count = len(chatgpt_texts) // 2
    claude_count = len(chatgpt_texts) - gpt4_count

    texts = human_texts + chatgpt_texts
    labels = [0] * len(human_texts) + [2] * gpt4_count + [3] * claude_count

    return texts, labels

In [7]:
# Initialize tokenizer
# Checkpoint name is kept in a variable so it is easy to compare with the encoder checkpoint.
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Prepare data
texts, labels = prepare_hc3_data(dataset)

# One seed drives both the train/validation split and the training shuffle order,
# so a fresh "Restart & Run All" visits the same batches in the same order.
seed = 42

# Split data into training and validation sets (stratified to keep class proportions)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=seed, stratify=labels
)

# Create datasets
train_dataset = HC3Dataset(train_texts, train_labels, tokenizer)
val_dataset = HC3Dataset(val_texts, val_labels, tokenizer)

# Create data loaders
batch_size = 16
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    # Dedicated, seeded generator: shuffling no longer depends on the global RNG state.
    generator=torch.Generator().manual_seed(seed),
)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [12]:
# Report how many samples ended up in each split.
for caption, split in (("Train Samples: ", train_dataset), ("Val Samples: ", val_dataset)):
    print(caption, len(split))

Train Samples:  38551
Val Samples:  9638


In [13]:
from transformers import BertModel, BertTokenizer
import torch.nn as nn


class LLMDetector(nn.Module):
    """BERT encoder followed by dropout and one linear layer over the pooled output."""

    def __init__(self, num_classes, model_name="bert-base-uncased"):
        """Build the encoder and the classification head.

        Args:
            num_classes: Number of output logits produced per input sequence.
            model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        encoder = BertModel.from_pretrained(model_name)
        hidden_size = encoder.config.hidden_size

        # Attribute names are part of the state_dict keys; keep them stable.
        self.bert = encoder
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_ids, attention_mask):
        """Return the raw (unnormalized) class logits for a batch.

        Args:
            input_ids: Token ID tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.classifier(regularized)

In [15]:
# Initialize model
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: ", device)

Device:  cuda


In [None]:
# The notebook uses a four-ID label scheme: 0 = Human, 1 = GPT-3, 2 = GPT-4, 3 = Claude.
# prepare_hc3_data emits IDs up to 3, so the classifier needs four logits; with only
# three, CrossEntropyLoss rejects target 3 as out of range.
num_classes = 4
model = LLMDetector(num_classes=num_classes)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Training parameters
epochs = 3
# Start below any reachable accuracy so the first epoch always writes "detector.pth".
best_accuracy = float("-inf")

# Class names in label-ID order (0..3); built once and reused for every epoch's report.
target_names = ["Human", "GPT-3", "GPT-4", "Claude"]
report_labels = list(range(len(target_names)))

# Set up optimizer and loss function
# NOTE(review): transformers' AdamW is deprecated upstream in favour of torch.optim.AdamW;
# consider switching when the notebook's imports are next revised.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=len(train_loader) * epochs
)

# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    train_preds = []
    train_labels_list = []

    # Training
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_labels so the notebook-level `labels` list is not overwritten.
        batch_labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, batch_labels)

        loss.backward()
        # Clip before the optimizer step to bound the size of a single update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch

        train_loss += loss.item()
        train_preds.extend(torch.argmax(outputs, dim=1).tolist())
        train_labels_list.extend(batch_labels.tolist())

        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    # Validation
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels_list = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, batch_labels)

            val_loss += loss.item()
            val_preds.extend(torch.argmax(outputs, dim=1).tolist())
            val_labels_list.extend(batch_labels.tolist())

    # Calculate metrics
    train_accuracy = accuracy_score(train_labels_list, train_preds)
    val_accuracy = accuracy_score(val_labels_list, val_preds)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss/len(train_loader):.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {val_accuracy:.4f}")

    # Print classification report for validation data.
    # `labels` pins the report to all four class IDs: without it, scikit-learn raises
    # when the number of classes seen differs from len(target_names), which happens
    # whenever a class has no validation samples and is never predicted.
    # `zero_division=0` reports 0 for such empty classes instead of emitting warnings.
    print("\nValidation Classification Report:")
    print(
        classification_report(
            val_labels_list,
            val_preds,
            labels=report_labels,
            target_names=target_names,
            zero_division=0,
        )
    )

    # Save best model
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), "detector.pth")
        print("Saved best model!")

In [None]:
# Load the best model for inference
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you produced yourself.
best_state_dict = torch.load("detector.pth", map_location=device)
model.load_state_dict(best_state_dict)
model.to(device)
model.eval()

In [None]:
# Function to predict the source of a text
def predict_text_source(text, max_length=512):
    """Classify a single text with the notebook-level ``model``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. The model is
    switched to evaluation mode before the forward pass so dropout cannot make
    predictions non-deterministic when this is called right after training.

    Args:
        text: The text to classify.
        max_length: Length passed to the tokenizer for truncation and padding.

    Returns:
        dict with
            ``"prediction"``: name of the highest-probability class, and
            ``"confidence"``: mapping of every class name to its softmax probability.

    Raises:
        ValueError: If the model's number of output logits does not match the
            number of class names.
    """
    # Class names in label-ID order (0..3).
    class_names = ["Human", "GPT-3", "GPT-4", "Claude"]

    encoding = tokenizer(
        text, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    model.eval()  # do not rely on an earlier cell having disabled dropout
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probs = torch.softmax(outputs, dim=1)

    # Fail with a clear message rather than an IndexError (or silently mislabelled
    # scores) when the model head and the class list disagree.
    if probs.shape[1] != len(class_names):
        raise ValueError(
            f"Model produced {probs.shape[1]} class scores but {len(class_names)} "
            f"class names are defined: {class_names}"
        )

    predicted_class = torch.argmax(probs, dim=1).item()
    probabilities = probs[0].cpu().tolist()

    return {
        "prediction": class_names[predicted_class],
        "confidence": dict(zip(class_names, probabilities)),
    }


# Example usage
sample_text = "This is a sample text to test the AI text detector."
result = predict_text_source(sample_text)

# Format the per-class scores first, then print the headline followed by the breakdown.
confidence_lines = [f"- {source}: {score:.4f}" for source, score in result["confidence"].items()]

print(f"Prediction: {result['prediction']}")
print("Confidence scores:")
for line in confidence_lines:
    print(line)