In [1]:
## Imports
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer



In [2]:
def compute_metrics(eval_pred):
    """Compute ROC AUC from raw classifier logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            ``[:, 1]`` below, so it must be 2-D with at least two columns;
            column 1 is treated as the positive-class score.

    Returns:
        dict: ``{"roc_auc": auc}`` where ``auc`` is the value returned by
        ``roc_auc_score``.
    """
    logits, labels = eval_pred
    # Softmax is unchanged by subtracting a per-row constant. Shifting by the
    # row maximum keeps every exponent <= 0, so np.exp cannot overflow to inf
    # (which would otherwise turn the division below into NaN).
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)
    auc = roc_auc_score(labels, probs[:,1], multi_class='ovr')
    return {"roc_auc": auc}

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch, truncating to 128 tokens.

    Uses the module-level ``tokenizer``, which must be defined before this
    function is first called.

    No padding is applied here. The notebook hands a
    ``DataCollatorWithPadding`` to the ``Trainer``, which pads each
    mini-batch when it is assembled; padding at tokenization time as well
    would only store redundant pad tokens in the mapped dataset.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer output for ``examples["text"]``.
    """
    return tokenizer(examples["text"], max_length=128, truncation=True)

In [3]:
## Read data
# Files from the llm-detect-ai-generated-text input.
train_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
test_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
train_prompts = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")
# Additional essays from the llm-generated-essays input.
ai_generated_train_essays = pd.read_csv("/kaggle/input/llm-generated-essays/ai_generated_train_essays.csv")
ai_generated_train_essays_gpt4 = pd.read_csv("/kaggle/input/llm-generated-essays/ai_generated_train_essays_gpt-4.csv")

# Stack the three essay frames, copy `generated` into a trailing `label`
# column, then drop the identifier columns and the original `generated`.
train_essays = (
    pd.concat([train_essays, ai_generated_train_essays, ai_generated_train_essays_gpt4])
    .assign(label=lambda frame: frame["generated"])
    .drop(columns=["id", "prompt_id", "generated"])
)
print(train_essays.shape)
train_essays.head()

(2078, 2)


Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0


In [4]:
# ref: https://www.kaggle.com/datasets/thedrcat/daigt-proper-train-dataset/data
daigt_paths = [
    "/kaggle/input/daigt-proper-train-dataset/train_drcat_01.csv",
    "/kaggle/input/daigt-proper-train-dataset/train_drcat_02.csv",
    "/kaggle/input/daigt-proper-train-dataset/train_drcat_03.csv",
    "/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv",
]
(
    daigt_external_dataset_1,
    daigt_external_dataset_2,
    daigt_external_dataset_3,
    daigt_external_dataset_4,
) = map(pd.read_csv, daigt_paths)

# NOTE(review): if the four train_drcat_0N files share rows, the same text
# could end up on both sides of the fold-based split below -- confirm.
daigt_external_dataset = pd.concat(
    [
        daigt_external_dataset_1,
        daigt_external_dataset_2,
        daigt_external_dataset_3,
        daigt_external_dataset_4,
    ]
)
daigt_external_dataset = daigt_external_dataset.drop(columns=["source", "essay_id", "prompt"])

# Rows with fold == 0 form the validation part; every other row is training.
# The `fold` column is removed from both parts once it has been used.
is_val_fold = daigt_external_dataset.fold == 0
daigt_external_dataset_train = daigt_external_dataset[~is_val_fold].drop(columns="fold")
daigt_external_dataset_val = daigt_external_dataset[is_val_fold].drop(columns="fold")
print(daigt_external_dataset.shape)
daigt_external_dataset.head()

(159456, 3)


Unnamed: 0,text,label,fold
0,There are alot reasons to keep our the despise...,0,2
1,Driving smart cars that drive by themself has ...,0,4
2,"Dear Principal,\n\nI believe that students at ...",0,0
3,"Dear Principal,\n\nCommunity service should no...",0,0
4,My argument for the development of the driverl...,0,3


In [5]:
## Split the dataset
# A fixed seed makes the train/validation partition identical on every run,
# so validation scores are comparable between runs. Stratifying on `label`
# keeps the class ratio the same in both parts.
# NOTE: this cell rebinds `train_essays`; re-running it without first
# re-running the loading cell would split the already-merged frame.
SPLIT_SEED = 42
train_essays, val_essays = train_test_split(
    train_essays,
    test_size=0.33,
    random_state=SPLIT_SEED,
    stratify=train_essays["label"],
)
## Merge with external dataset
# ignore_index=True gives each merged frame a fresh 0..n-1 index instead of
# carrying over the repeated index labels of the frames being stacked.
train_essays = pd.concat([daigt_external_dataset_train, train_essays], ignore_index=True)
val_essays = pd.concat([daigt_external_dataset_val, val_essays], ignore_index=True)

In [6]:
## Tokenizer
# The tokenizer and the model are loaded from the same local checkpoint directory.
pretrained_dir = "/kaggle/input/distilbertbaseuncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_dir, num_labels=2)

## Dataset
# Wrap each pandas frame in a `Dataset`.
train_essay_dataset, val_essay_dataset, test_essay_dataset = (
    Dataset.from_pandas(frame) for frame in (train_essays, val_essays, test_essays)
)

## Tokenize datasets
# Only the train and validation sets are tokenized in this cell.
tokenized_train_essays, tokenized_val_essays = (
    split.map(preprocess_function, batched=True)
    for split in (train_essay_dataset, val_essay_dataset)
)

## Training
# Evaluate and save a checkpoint at the end of every epoch; no external
# experiment reporting.
training_config = {
    "output_dir": "/kaggle/working/",
    "learning_rate": 2e-5,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "report_to": 'none',
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_essays,
    eval_dataset=tokenized_val_essays,
    compute_metrics=compute_metrics,
)

# Left as the final expression so the notebook displays the returned value.
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/distilbertbaseuncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/142 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,0.0104,0.004961,0.999883
2,0.0008,0.005161,0.99991


TrainOutput(global_step=35394, training_loss=0.020358492946346955, metrics={'train_runtime': 2444.2216, 'train_samples_per_second': 115.845, 'train_steps_per_second': 14.481, 'total_flos': 9377035982361600.0, 'train_loss': 0.020358492946346955, 'epoch': 2.0})

In [7]:
## Make submission
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)
test_preds = trainer.predict(test_ds_enc)
logits = test_preds.predictions
# Numerically stable softmax: shifting each row by its maximum leaves the
# result unchanged but keeps np.exp from overflowing to inf, which would
# otherwise produce NaN probabilities in the submission.
exp_shifted = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)
# Column 1 of the probabilities is written out as the `generated` score.
sub = pd.DataFrame({'id': test['id'], 'generated': probs[:,1]})
sub.to_csv('submission.csv', index=False)
sub.head()

  0%|          | 0/1 [00:00<?, ?ba/s]

Unnamed: 0,id,generated
0,0000aaaa,0.997225
1,1111bbbb,0.999775
2,2222cccc,0.999643
