In [None]:
# Установка зависимостей
!pip install -q transformers accelerate scikit-learn datasets

In [None]:
# Directory the model and tokenizer are written to by the final save cell
model_save_path = "./distilbert-ai-vs-human"

In [None]:
# Dataset preparation: flatten HC3 into (text, label) rows, balance the classes, shuffle.
from datasets import load_dataset, Dataset
import pandas as pd

# NOTE(review): HC3 may be served through a dataset loading script; some `datasets`
# releases require trust_remote_code=True or refuse scripts entirely — confirm.
raw = load_dataset("Hello-SimpleAI/HC3", "all", split="train")

# Each HC3 row carries a list of human answers and a list of ChatGPT answers;
# emit one record per answer. Label 0 = human-written, label 1 = ChatGPT-written.
data = []
for row in raw:
    data.extend([{"text": t, "label": 0} for t in row["human_answers"]])
    data.extend([{"text": t, "label": 1} for t in row["chatgpt_answers"]])

df = pd.DataFrame(data)
df_human = df[df['label'] == 0]
df_ai    = df[df['label'] == 1]

# Balance by downsampling both classes to the size of the smaller one. Previously the
# two frames were concatenated as-is, so `df_balanced` was neither balanced nor shuffled.
n_per_class = min(len(df_human), len(df_ai))
df_balanced = (
    pd.concat([
        df_human.sample(n=n_per_class, random_state=42),
        df_ai.sample(n=n_per_class, random_state=42),
    ])
    .sample(frac=1, random_state=42)  # shuffle rows; fixed seed keeps reruns reproducible
    .reset_index(drop=True)           # drop the filtered index so it is not exported as a column
)

# Sanity check: both classes must be equally represented after balancing.
assert (df_balanced["label"] == 0).sum() == (df_balanced["label"] == 1).sum()

full_dataset = Dataset.from_pandas(df_balanced, preserve_index=False)

In [None]:
# Split into train (80%), validation (10%) and test (10%).
# A fixed seed makes the shuffled splits identical across kernel restarts; without it
# the held-out test set changes on every run and reported metrics are not comparable.
split_seed = 42

# First carve off 20% of the data, then halve that part into validation and test.
train_valid = full_dataset.train_test_split(test_size=0.2, shuffle=True, seed=split_seed)
valid_test = train_valid["test"].train_test_split(test_size=0.5, shuffle=True, seed=split_seed)

train_dataset = train_valid["train"]
valid_dataset = valid_test["train"]
test_dataset  = valid_test["test"]

In [None]:
# Tokenization
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(example):
    """Tokenize the "text" field of a batch.

    Sequences are truncated and padded to a fixed length of 512 tokens, so every
    encoded example has the same length.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=512)
    return encoded

# Apply the tokenizer to each split in batched mode and rebind the split names.
train_dataset, valid_dataset, test_dataset = [
    split.map(tokenize, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
]


In [None]:
# Load the model
from transformers import DistilBertForSequenceClassification

# Same checkpoint name as the tokenizer; the classifier is configured for two labels.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)


In [None]:
# Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class of each
            example is the argmax of ``logits`` along axis 1.

    Returns:
        A dict with the keys "accuracy", "f1", "precision" and "recall".
        Precision, recall and F1 use sklearn's ``average="binary"`` mode.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)

    # precision_recall_fscore_support returns (precision, recall, fscore, support).
    prf = precision_recall_fscore_support(gold, predicted, average="binary")

    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }


In [None]:
# Training configuration
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate on the validation split every 1000 optimizer steps.
    eval_strategy="steps",
    eval_steps=1000,
    # Checkpoint on the same cadence as evaluation. load_best_model_at_end can only
    # restore a checkpoint that was actually written; with the previous save_steps=5000
    # any run shorter than 5000 steps never saved one, so no best model could be restored.
    save_strategy="steps",
    save_steps=1000,
    logging_dir="./logs",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    load_best_model_at_end=True,
    weight_decay=0.01,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
)

# Train on the training split, evaluate on the validation split; the test split
# is not passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Training: run the configured Trainer; the call's return value is the cell output
trainer.train()

In [None]:
# Final check on the test split, which was not passed to the Trainer for training or validation
metrics = trainer.evaluate(test_dataset)
print("Final test metrics:", metrics)

In [None]:

# Save the model and the tokenizer into the same directory (model_save_path)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)