# VisioNova AI Detector Training

This notebook fine-tunes `microsoft/deberta-v3-base` as a two-class text classifier (human-written vs. AI-generated).
It saves the trained model and its tokenizer directly to your Google Drive.

In [None]:
# 1. Setup Environment & Drive
import os
from google.colab import drive

# Disable W&B logging to prevent blocking prompts
os.environ["WANDB_DISABLED"] = "true"

drive.mount('/content/drive')

!pip install transformers datasets scikit-learn accelerate sentencepiece pandas

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Settings
# Hugging Face identifiers: the training corpus and the base checkpoint.
DATASET_NAME = "artem9k/ai-text-detection-pile"
MODEL_NAME = "microsoft/deberta-v3-base"

# Training budget. MAX_SAMPLES is the total number of rows to collect;
# the balancing cell takes half of it per class.
BATCH_SIZE = 8
EPOCHS = 3
MAX_SAMPLES = 20000

# Save directly to Drive (the folder is created if it does not exist yet).
OUTPUT_DIR = "/content/drive/MyDrive/VisioNova_Model"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Model will be saved to: {OUTPUT_DIR}")

In [None]:
# 2. Download and Balance Data
def _infer_label(item):
    """Return 0 (human) or 1 (AI) for a dataset row, or None if undeterminable.

    Lookup order:
      1. an explicit 'label' column, accepted only if it converts to 0 or 1;
      2. a truthy/falsy 'generated' column;
      3. a 'source' column whose text contains 'ai' (-> 1) or 'human' (-> 0).

    NOTE(review): the 'source' values are presumably 'human' / 'ai' for this
    dataset -- confirm against the dataset card.
    """
    label = item.get('label')
    if label is not None:
        try:
            label = int(label)
        except (TypeError, ValueError):
            return None
        # Anything other than 0/1 cannot be counted in the two-class tally.
        return label if label in (0, 1) else None

    if 'generated' in item:
        return 1 if item['generated'] else 0

    if 'source' in item:
        source = str(item['source']).lower()
        if 'ai' in source:
            return 1
        # Human rows must map to 0; otherwise only the AI class is collected.
        if 'human' in source:
            return 0

    return None


print("Downloading dataset...")
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

rows = []
counts = {0: 0, 1: 0}
target = MAX_SAMPLES // 2  # rows wanted per class

print("Balancing...")
for item in dataset:
    text = item.get('text', item.get('content', ''))
    # Skip missing or very short texts (fewer than 50 characters).
    if not text or len(text) < 50:
        continue

    label = _infer_label(item)
    if label is None:
        continue

    if counts[label] < target:
        rows.append({'text': text, 'label': label})
        counts[label] += 1
    # Stop streaming as soon as both classes are full.
    if counts[0] >= target and counts[1] >= target:
        break

print(f"Collected {counts[0]} human and {counts[1]} AI samples (target: {target} each).")
if not counts[0] or not counts[1]:
    # A single-class dataset cannot train a two-class detector; fail before training.
    raise ValueError(
        f"Could not collect both classes from {DATASET_NAME}: counts={counts}"
    )

df = pd.DataFrame(rows)

In [None]:
# 3. Train
# Load the tokenizer and a sequence-classification model configured for two labels.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

# Hold out 20% of the rows for validation, stratified on the label column.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

def tokenize(df):
    """Tokenize the 'text' column of ``df`` and pair it with the 'label' column.

    The texts are encoded in one call to the module-level ``tokenizer`` with
    truncation, padding and ``max_length=512``.

    Returns:
        A ``torch.utils.data.Dataset`` whose items are dicts holding one
        tensor per tokenizer output field plus a 'labels' tensor.
    """
    class DS(torch.utils.data.Dataset):
        """Index-addressable view over the encodings and their labels."""

        def __init__(self, encs, labs):
            self.encs = encs
            self.labs = labs

        def __len__(self):
            return len(self.labs)

        def __getitem__(self, i):
            # Slice row ``i`` out of every encoded field, then attach the label.
            item = {}
            for field, values in self.encs.items():
                item[field] = torch.tensor(values[i])
            item['labels'] = torch.tensor(self.labs[i])
            return item

    passages = df['text'].tolist()
    targets = df['label'].tolist()
    encoded = tokenizer(
        passages,
        truncation=True,
        padding=True,
        max_length=512,
    )
    return DS(encoded, targets)

# Encode the training split first, then the validation split.
train_ds, val_ds = tokenize(train_df), tokenize(val_df)

def metrics(pred):
    """Compute classification metrics from a prediction object.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        dict with 'accuracy', plus 'precision', 'recall' and 'f1' computed
        for the positive class (label 1, the AI class in this notebook).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Accuracy alone hides which class the detector gets wrong, so also
    # report precision/recall/F1; zero_division=0 avoids warnings when a
    # class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    warmup_ratio=0.1,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Checkpoints are written into output_dir (the Drive folder); keep only
    # the most recent one so they do not pile up in Drive storage.
    save_total_limit=1,
    report_to="none"  # Disable W&B logging
)

trainer = Trainer(model, args, train_dataset=train_ds, eval_dataset=val_ds, compute_metrics=metrics)
trainer.train()

# Persist the final model and tokenizer before evaluating, so a failure
# during evaluation cannot lose the trained weights.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"DONE! Model saved to {OUTPUT_DIR}")

# Score the held-out validation split; without this call eval_dataset and
# compute_metrics are never used.
eval_results = trainer.evaluate()
print(f"Validation metrics: {eval_results}")