In [25]:
import numpy as np
import pandas as pd
import opendatasets as od

import os
import warnings
import torch

import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
np.random.seed(42)

In [26]:
data_path_dict = {
    'data_path': '../data',
    'model_data': '../data/model_data',
    'data_subset': '../data/model_data/data_subset',
    'models': '../data/models'
}

for file_path_key in data_path_dict:
    if not os.path.exists(data_path_dict[file_path_key]):
        print(f'Path does not Exist: {data_path_dict[file_path_key]}')

        os.makedirs(data_path_dict[file_path_key])

In [27]:
if not (os.path.exists('../data/human-vs-llm-text-corpus') or os.path.exists('../data/human_vs_llm_text_corpus')):
    od.download(dataset_id_or_url="https://www.kaggle.com/datasets/starblasters8/human-vs-llm-text-corpus", data_dir='../data/')
    os.rename('../data/human-vs-llm-text-corpus/', '../data/human_vs_llm_text_corpus/')

In [28]:
raw_data_path = '../data/human_vs_llm_text_corpus/data.parquet'
data_subset_path = '../data/model_data/data_subset/'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [46]:
raw_data = pd.read_parquet(raw_data_path)

raw_data = raw_data[raw_data['source'].isin(['Human', 'GPT-3.5'])]
raw_data = raw_data.groupby('source').apply(lambda row: row.sample(n=26000)).reset_index(drop=True)

raw_data['source'] = np.where(raw_data['source'] == 'Human', 0, 1)

X_train, X_test, y_train, y_test = train_test_split(raw_data['text'], raw_data['source'], test_size=0.02, stratify=raw_data['source'], random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.02, stratify=y_train, random_state=42)

X_train.shape, X_test.shape, X_val.shape

((1920,), (40,), (40,))

In [47]:
def count_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    non_trainable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return trainable_params, non_trainable_params

tokenizer = RobertaTokenizer.from_pretrained('roberta-large', max_length=512, is_split_into_words=True)
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

for param in model.parameters():
    param.requires_grad = False

for param in model.roberta.encoder.layer[-2:].parameters():
    param.requires_grad = True

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters: 355,361,794
Non-trainable parameters: 0
Trainable parameters: 25,192,448
Non-trainable parameters: 330,169,346


In [48]:
train_dataset = datasets.Dataset.from_dict({'text': X_train, 'labels': y_train})
val_dataset = datasets.Dataset.from_dict({'text': X_val, 'labels': y_val})
test_dataset = datasets.Dataset.from_dict({'text': X_test, 'labels': y_test})

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenization, batched=True, batch_size=128)
val_dataset = val_dataset.map(tokenization, batched=True, batch_size=128)
test_dataset = test_dataset.map(tokenization, batched=True, batch_size=128)

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [49]:
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [50]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [10]:
training_args = TrainingArguments(
    output_dir='../data/models/roberta_large_model_checkpoint', num_train_epochs=5, per_device_train_batch_size=8, per_device_eval_batch_size=8,
    gradient_accumulation_steps=8, disable_tqdm=False, load_best_model_at_end=True, overwrite_output_dir=True,
    logging_steps=8, fp16 = True, logging_dir='../model_logs', dataloader_num_workers = 8,
    run_name = 'roberta-large-classification-v1', evaluation_strategy='epoch', save_strategy='epoch')

In [11]:
trainer = Trainer(
    model=model, args=training_args, compute_metrics=compute_metrics,
    train_dataset=train_dataset, eval_dataset=val_dataset)

In [12]:
trainer.train()

  0%|          | 0/3900 [00:00<?, ?it/s]

{'loss': 0.6812, 'grad_norm': 1.3164498805999756, 'learning_rate': 4.9897435897435904e-05, 'epoch': 0.01}
{'loss': 0.443, 'grad_norm': 4.338006496429443, 'learning_rate': 4.979487179487179e-05, 'epoch': 0.02}
{'loss': 0.2587, 'grad_norm': 2.3948256969451904, 'learning_rate': 4.969230769230769e-05, 'epoch': 0.03}
{'loss': 0.1495, 'grad_norm': 4.941751956939697, 'learning_rate': 4.9589743589743594e-05, 'epoch': 0.04}
{'loss': 0.1376, 'grad_norm': 2.616694688796997, 'learning_rate': 4.948717948717949e-05, 'epoch': 0.05}
{'loss': 0.1548, 'grad_norm': 5.906073570251465, 'learning_rate': 4.9384615384615384e-05, 'epoch': 0.06}
{'loss': 0.1287, 'grad_norm': 2.816087484359741, 'learning_rate': 4.9282051282051285e-05, 'epoch': 0.07}
{'loss': 0.1129, 'grad_norm': 5.12425422668457, 'learning_rate': 4.917948717948718e-05, 'epoch': 0.08}
{'loss': 0.089, 'grad_norm': 2.638442039489746, 'learning_rate': 4.907692307692308e-05, 'epoch': 0.09}
{'loss': 0.0369, 'grad_norm': 1.6807336807250977, 'learning_r

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.07523393630981445, 'eval_accuracy': 0.9803921568627451, 'eval_f1': 0.9807321772639692, 'eval_precision': 0.9640151515151515, 'eval_recall': 0.9980392156862745, 'eval_runtime': 50.1274, 'eval_samples_per_second': 20.348, 'eval_steps_per_second': 2.553, 'epoch': 1.0}
{'loss': 0.0444, 'grad_norm': 0.2949162423610687, 'learning_rate': 3.994871794871795e-05, 'epoch': 1.0}
{'loss': 0.035, 'grad_norm': 1.9489147663116455, 'learning_rate': 3.984615384615385e-05, 'epoch': 1.01}
{'loss': 0.0559, 'grad_norm': 3.962026834487915, 'learning_rate': 3.974358974358974e-05, 'epoch': 1.03}
{'loss': 0.0169, 'grad_norm': 0.812520444393158, 'learning_rate': 3.964102564102564e-05, 'epoch': 1.04}
{'loss': 0.0253, 'grad_norm': 1.4716525077819824, 'learning_rate': 3.953846153846154e-05, 'epoch': 1.05}
{'loss': 0.0204, 'grad_norm': 0.26988303661346436, 'learning_rate': 3.943589743589744e-05, 'epoch': 1.06}
{'loss': 0.015, 'grad_norm': 0.19651570916175842, 'learning_rate': 3.933333333333333e-05, '

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.144891157746315, 'eval_accuracy': 0.9705882352941176, 'eval_f1': 0.9713740458015268, 'eval_precision': 0.9460966542750929, 'eval_recall': 0.9980392156862745, 'eval_runtime': 48.702, 'eval_samples_per_second': 20.944, 'eval_steps_per_second': 2.628, 'epoch': 2.0}
{'loss': 0.0135, 'grad_norm': 0.08073705434799194, 'learning_rate': 2.98974358974359e-05, 'epoch': 2.01}
{'loss': 0.0056, 'grad_norm': 0.7607877850532532, 'learning_rate': 2.9794871794871797e-05, 'epoch': 2.02}
{'loss': 0.0095, 'grad_norm': 2.5929064750671387, 'learning_rate': 2.969230769230769e-05, 'epoch': 2.03}
{'loss': 0.0233, 'grad_norm': 0.8164771199226379, 'learning_rate': 2.958974358974359e-05, 'epoch': 2.04}
{'loss': 0.0017, 'grad_norm': 0.3717249035835266, 'learning_rate': 2.948717948717949e-05, 'epoch': 2.05}
{'loss': 0.0202, 'grad_norm': 0.3227551281452179, 'learning_rate': 2.938461538461539e-05, 'epoch': 2.06}
{'loss': 0.0204, 'grad_norm': 0.20356117188930511, 'learning_rate': 2.9282051282051283e-05

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.074783556163311, 'eval_accuracy': 0.9833333333333333, 'eval_f1': 0.9835748792270531, 'eval_precision': 0.9695238095238096, 'eval_recall': 0.9980392156862745, 'eval_runtime': 48.7688, 'eval_samples_per_second': 20.915, 'eval_steps_per_second': 2.625, 'epoch': 3.0}
{'loss': 0.0014, 'grad_norm': 0.40378057956695557, 'learning_rate': 1.9974358974358975e-05, 'epoch': 3.0}
{'loss': 0.0026, 'grad_norm': 0.03732137009501457, 'learning_rate': 1.987179487179487e-05, 'epoch': 3.01}
{'loss': 0.0173, 'grad_norm': 0.39742687344551086, 'learning_rate': 1.976923076923077e-05, 'epoch': 3.02}
{'loss': 0.0027, 'grad_norm': 0.9576666355133057, 'learning_rate': 1.9666666666666666e-05, 'epoch': 3.03}
{'loss': 0.0047, 'grad_norm': 0.029697470366954803, 'learning_rate': 1.9564102564102564e-05, 'epoch': 3.04}
{'loss': 0.0036, 'grad_norm': 3.3737826347351074, 'learning_rate': 1.9461538461538462e-05, 'epoch': 3.05}
{'loss': 0.0094, 'grad_norm': 0.06182125583291054, 'learning_rate': 1.935897435897

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.18089893460273743, 'eval_accuracy': 0.9725490196078431, 'eval_f1': 0.9732313575525813, 'eval_precision': 0.9496268656716418, 'eval_recall': 0.9980392156862745, 'eval_runtime': 48.9245, 'eval_samples_per_second': 20.848, 'eval_steps_per_second': 2.616, 'epoch': 4.0}
{'loss': 0.0027, 'grad_norm': 2.871208906173706, 'learning_rate': 9.923076923076923e-06, 'epoch': 4.01}
{'loss': 0.0079, 'grad_norm': 0.2777670621871948, 'learning_rate': 9.820512820512821e-06, 'epoch': 4.02}
{'loss': 0.003, 'grad_norm': 0.006265921052545309, 'learning_rate': 9.71794871794872e-06, 'epoch': 4.03}
{'loss': 0.0053, 'grad_norm': 0.006836553569883108, 'learning_rate': 9.615384615384616e-06, 'epoch': 4.04}
{'loss': 0.0085, 'grad_norm': 0.0829005166888237, 'learning_rate': 9.512820512820514e-06, 'epoch': 4.05}
{'loss': 0.0019, 'grad_norm': 4.503619194030762, 'learning_rate': 9.41025641025641e-06, 'epoch': 4.06}
{'loss': 0.0005, 'grad_norm': 0.07351597398519516, 'learning_rate': 9.307692307692308e-06

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.13133424520492554, 'eval_accuracy': 0.9784313725490196, 'eval_f1': 0.9788461538461538, 'eval_precision': 0.960377358490566, 'eval_recall': 0.9980392156862745, 'eval_runtime': 48.1612, 'eval_samples_per_second': 21.179, 'eval_steps_per_second': 2.658, 'epoch': 5.0}
{'train_runtime': 10681.1667, 'train_samples_per_second': 23.378, 'train_steps_per_second': 0.365, 'train_loss': 0.024229471454253564, 'epoch': 5.0}


TrainOutput(global_step=3900, training_loss=0.024229471454253564, metrics={'train_runtime': 10681.1667, 'train_samples_per_second': 23.378, 'train_steps_per_second': 0.365, 'total_flos': 2.3259515738485555e+17, 'train_loss': 0.024229471454253564, 'epoch': 4.997597308986064})

In [7]:
test_data = pd.read_csv('../data/test_data/trail.csv')

In [10]:
test_dataset = datasets.Dataset.from_dict({'text': test_data['text'].values, 'labels': test_data['label'].values})

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding=True, truncation=True)

test_dataset = test_dataset.map(tokenization, batched=True, batch_size=128)
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

In [51]:
model = RobertaForSequenceClassification.from_pretrained('../data/models/roberta_large_model_checkpoint/checkpoint-3900')

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

for param in model.parameters():
    param.requires_grad = True

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

Trainable parameters: 355,361,794
Non-trainable parameters: 0
Trainable parameters: 355,361,794
Non-trainable parameters: 0


In [52]:
trainer = Trainer(model=model, compute_metrics=compute_metrics)

In [53]:
trainer.evaluate(test_dataset, metric_key_prefix='test')

  0%|          | 0/5 [00:00<?, ?it/s]

{'test_loss': 0.44105520844459534,
 'test_accuracy': 0.925,
 'test_f1': 0.926829268292683,
 'test_precision': 0.9047619047619048,
 'test_recall': 0.95,
 'test_runtime': 2.6514,
 'test_samples_per_second': 15.087,
 'test_steps_per_second': 1.886}

In [54]:
trainer.evaluate(train_dataset, metric_key_prefix='train')

  0%|          | 0/240 [00:00<?, ?it/s]

{'train_loss': 0.2171945869922638,
 'train_accuracy': 0.9671875,
 'train_f1': 0.967067433350758,
 'train_precision': 0.9706190975865687,
 'train_recall': 0.9635416666666666,
 'train_runtime': 123.8822,
 'train_samples_per_second': 15.499,
 'train_steps_per_second': 1.937}

In [16]:
trainer.predict(test_dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-0.86542106,  1.0865637 ],
       [ 2.1196542 , -1.585902  ],
       [-5.4527307 ,  5.5017896 ],
       [ 4.8288474 , -4.494965  ],
       [-2.2739596 ,  2.325448  ],
       [-5.4020505 ,  5.5328875 ],
       [ 5.023696  , -4.657509  ]], dtype=float32), label_ids=array([1, 0, 1, 0, 1, 1, 0], dtype=int64), metrics={'test_loss': 0.023893997073173523, 'test_accuracy': 1.0, 'test_f1': 1.0, 'test_precision': 1.0, 'test_recall': 1.0, 'test_runtime': 0.4375, 'test_samples_per_second': 16.001, 'test_steps_per_second': 2.286})

In [17]:
test_data['label'].values

array([1, 0, 1, 0, 1, 1, 0], dtype=int64)