# Qwen3-0.6B Fine-tuning for AI/Human Text Detection

This notebook provides a generic training pipeline for fine-tuning Qwen3-0.6B on AI/Human text detection datasets.

**Features:**
- Dataset selection from 4 available sizes (10k, 100k, 1M, 2M)
- Automatic class weight calculation based on label distribution
- Configurable hyperparameters
- Weights & Biases integration for experiment tracking
- Optimized for RunPod A100 GPU
- PEFT/LoRA for efficient fine-tuning

**Model:** Qwen3-0.6B-Base

**Hardware:** Single A100 GPU (40GB/80GB)


## 1. Installation and Setup


In [None]:
# Install required packages
# Uncomment the following lines if packages are not installed.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned (-U pulls the latest release) — consider pinning for reproducible runs.

# %pip install -q -U transformers accelerate datasets evaluate peft bitsandbytes
# %pip install -q -U scikit-learn pandas numpy PyYAML
# %pip install -q -U wandb tensorboard
# %pip install -q -U ipywidgets


In [None]:
# Import libraries
import os
import sys
import yaml
import torch
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# HuggingFace
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
)

# Metrics
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)

# Process-wide environment settings for the tokenizers library and the
# PyTorch CUDA allocator.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
})

# Report the PyTorch build and, when a GPU is visible, device 0's name and memory.
_cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; divide by 1e9 to display GB
    _total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"CUDA memory: {_total_bytes / 1e9:.2f} GB")


## 2. Dataset Selection

Choose the dataset size for training:


In [None]:
# Catalogue of selectable datasets, keyed by size label.
# Each entry carries the HuggingFace dataset id, a human-readable size, the
# label balance, and per-class loss weights ordered (AI_Generated, Human_Written).
# NOTE(review): the non-unit weights match opposite-class share / own-class share
# (e.g. 47.83 / 52.17 ~= 0.9167) — confirm this is the intended weighting scheme.
DATASET_OPTIONS = {
    "10k": dict(
        name="codefactory4791/raid_aligned_10k",
        size="~10K samples",
        balance="Balanced (50/50)",
        class_weights=[1.0, 1.0],  # balanced data -> equal weights
    ),
    "100k": dict(
        name="codefactory4791/raid_aligned_100k",
        size="~100K samples",
        balance="Balanced (50/50)",
        class_weights=[1.0, 1.0],  # balanced data -> equal weights
    ),
    "1M": dict(
        name="codefactory4791/raid_aligned_1000k",
        size="~1M samples",
        balance="Slightly Imbalanced (AI: 52.17%, Human: 47.83%)",
        class_weights=[0.9167, 1.0909],  # inverse-frequency style weights
    ),
    "2M": dict(
        name="codefactory4791/raid_aligned_2000k",
        size="~2M samples",
        balance="Imbalanced (AI: 54.57%, Human: 45.43%)",
        class_weights=[0.8324, 1.2009],  # inverse-frequency style weights
    ),
}

# Print a summary of every option between two 80-character rules
_rule = "=" * 80
print("Available Datasets:")
print(_rule)
for key, info in DATASET_OPTIONS.items():
    print(
        f"\n{key}:",
        f"  Dataset: {info['name']}",
        f"  Size: {info['size']}",
        f"  Label Balance: {info['balance']}",
        f"  Class Weights (AI_Generated, Human_Written): {info['class_weights']}",
        sep="\n",
    )
print("\n" + _rule)


In [None]:
# SELECT YOUR DATASET HERE
# Accepted keys: "10k", "100k", "1M", "2M"

SELECTED_DATASET = "10k"  # Change this to your desired dataset

# Look the key up once; an unknown key is rejected with the list of valid ones
dataset_info = DATASET_OPTIONS.get(SELECTED_DATASET)
if dataset_info is None:
    raise ValueError(f"Invalid dataset selection. Choose from: {list(DATASET_OPTIONS)}")

# Expose the fields that the configuration cell reads
DATASET_NAME, CLASS_WEIGHTS = dataset_info["name"], dataset_info["class_weights"]

# Echo the selection
print(
    f"Selected Dataset: {SELECTED_DATASET}",
    f"HuggingFace Dataset: {DATASET_NAME}",
    f"Size: {dataset_info['size']}",
    f"Label Balance: {dataset_info['balance']}",
    f"Class Weights: {CLASS_WEIGHTS}",
    sep="\n",
)


## 3. Load Configuration


In [None]:
# Load the YAML configuration and specialise it for the selected dataset.
# Reads: config.yaml, SELECTED_DATASET, DATASET_NAME, CLASS_WEIGHTS.
# Produces: `config` (dict) with dataset, class-weight, path, run-name and
# wandb-tag entries overridden for this run.
config_path = "config.yaml"
# Decode explicitly as UTF-8 so parsing does not depend on the platform locale
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty document; fail early with a clear
# message instead of an opaque "NoneType is not subscriptable" further down
if not isinstance(config, dict):
    raise ValueError(
        f"{config_path} must contain a YAML mapping at the top level, "
        f"got {type(config).__name__}"
    )

# Update config with selected dataset
config['dataset']['dataset_name'] = DATASET_NAME
config['class_weights']['manual_weights'] = CLASS_WEIGHTS

# Update output directory based on dataset
config['training']['output_dir'] = f"./output/qwen3-0.6b-{SELECTED_DATASET}"
config['training']['logging_dir'] = f"./logs/qwen3-0.6b-{SELECTED_DATASET}"
config['misc']['tokenized_cache_dir'] = f"./tokenized_cache/{SELECTED_DATASET}"
config['misc']['sampled_cache_dir'] = f"./sampled_cache/{SELECTED_DATASET}"

# Update run name (timestamped so repeated runs do not collide)
timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
config['training']['run_name'] = f"qwen3-0.6b-{SELECTED_DATASET}-{timestamp}"

# Update wandb tags. A missing or null `tags` entry is treated as an empty
# list, and the dataset tag is not added twice if the YAML already lists it.
dataset_tag = f"dataset-{SELECTED_DATASET}"
wandb_tags = config['wandb'].get('tags') or []
if dataset_tag not in wandb_tags:
    wandb_tags.append(dataset_tag)
config['wandb']['tags'] = wandb_tags

print("Configuration loaded successfully!")
print(f"\nModel: {config['model']['name']}")
print(f"Dataset: {config['dataset']['dataset_name']}")
print(f"Output directory: {config['training']['output_dir']}")
print(f"Run name: {config['training']['run_name']}")


## 4. Hyperparameter Configuration

Modify hyperparameters as needed. Default values are loaded from `config.yaml`; to override one, edit the `config` dictionary in the optional cell below.


In [None]:
# Print a grouped, read-only summary of the hyperparameters held in `config`
print("Current Hyperparameters:")
print("=" * 80)

# Optimisation settings
print("\nTraining:")
_training = config['training']
print(f"  Epochs: {_training['num_train_epochs']}")
_per_device = _training['per_device_train_batch_size']
print(f"  Batch Size (per device): {_per_device}")
_accum_steps = _training['gradient_accumulation_steps']
print(f"  Gradient Accumulation Steps: {_accum_steps}")
# Effective batch = per-device batch size x gradient accumulation steps
print(f"  Effective Batch Size: {_per_device * _accum_steps}")
print(f"  Learning Rate: {_training['learning_rate']}")
print(f"  Weight Decay: {_training['weight_decay']}")
print(f"  Warmup Ratio: {_training['warmup_ratio']}")
print(f"  LR Scheduler: {_training['lr_scheduler_type']}")

# Adapter (PEFT / LoRA) settings
print("\nLoRA:")
_lora = config['peft']
print(f"  Enabled: {_lora['enabled']}")
print(f"  LoRA Rank: {_lora['lora_r']}")
print(f"  LoRA Alpha: {_lora['lora_alpha']}")
print(f"  LoRA Dropout: {_lora['lora_dropout']}")

# Tokenizer settings
print("\nTokenization:")
print(f"  Max Length: {config['tokenization']['max_length']}")

# Loss weighting
print("\nClass Weights:")
_weights = config['class_weights']
print(f"  Enabled: {_weights['enabled']}")
print(f"  Weights (AI_Generated, Human_Written): {_weights['manual_weights']}")

print("\n" + "=" * 80)
print("\nTo modify any hyperparameter, edit the config dictionary in the next cell.")


In [None]:
# OPTIONAL: Modify hyperparameters here
# Uncomment and modify as needed. Each line assigns into the `config`
# dictionary, replacing the value loaded from config.yaml.

# Example modifications:
# config['training']['num_train_epochs'] = 5
# config['training']['learning_rate'] = 2e-4
# config['training']['per_device_train_batch_size'] = 16
# config['peft']['lora_r'] = 32
# config['tokenization']['max_length'] = 512

# Confirmation message only; this line does not change `config`
print("Hyperparameter configuration complete.")
