In [1]:
!pip install datasets
!pip install accelerate -U
!pip install evaluate
!pip install sentencepiece

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict

### give your file path
# dataset_path = '/Users/rjaditya/Documents/NEU-SEM/Spring-2024/ML/LLM-vs-Human-Text-Detection/train_v2_drcat_02.csv'
dataset_path = "/content/drive/MyDrive/CS6140/project/train_v2_drcat_02.csv"
df = pd.read_csv(dataset_path, header = 0)

### get the labels
label = df['label']
label = label.astype("int")
# df.drop(columns='label', axis = 1, inplace = True)
df.drop(columns='RDizzl3_seven', axis = 1, inplace=True)
df.drop(columns='prompt_name', axis = 1, inplace=True)
df.drop(columns='source', axis = 1, inplace=True)
df.dropna(subset=['text'], inplace=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [6]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch

### testing
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels = 2)
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [7]:
def tokenize_func(dataset):
    return tokenizer(list(dataset['text']), padding = "max_length", truncation = True)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_tokenized_datasets = train_dataset.map(tokenize_func, batched = True)
test_tokenized_datasets = test_dataset.map(tokenize_func, batched = True)
train_tokenized_datasets.set_format(type='torch', columns = ['input_ids', 'attention_mask', 'label'])
test_tokenized_datasets.set_format(type='torch', columns = ['input_ids', 'attention_mask', 'label'])

print(train_tokenized_datasets)
print(test_tokenized_datasets)

# Drop columns from the tokenized datasets
columns_to_drop = ["text", "__index_level_0__", "token_type_ids"]  # Specify columns to drop
train_tokenized_datasets = train_tokenized_datasets.remove_columns(columns_to_drop)
test_tokenized_datasets = test_tokenized_datasets.remove_columns(columns_to_drop)

print(train_tokenized_datasets)
print(test_tokenized_datasets)

Map:   0%|          | 0/35894 [00:00<?, ? examples/s]

Map:   0%|          | 0/8974 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 35894
})
Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8974
})
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 35894
})
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 8974
})


In [8]:
from transformers import Trainer, TrainingArguments
import evaluate
import torch.nn.functional as F

batch_size = 20
args = TrainingArguments(output_dir = "LLMvHuman_finetune",
                               num_train_epochs = 2,
                               learning_rate = 2e-5,
                               per_device_train_batch_size = batch_size,
                               per_device_eval_batch_size = batch_size,
                               weight_decay = 0.01,
                               evaluation_strategy = "epoch",
                               disable_tqdm = False,
                               logging_steps = len(train_dataset),
                               push_to_hub = False,
                               log_level = "error",
                               fp16=True,
                               fp16_backend="auto"
                              )
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [9]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=args,                  # training arguments, defined above
    train_dataset=train_tokenized_datasets,         # training dataset
    eval_dataset=test_tokenized_datasets,             # evaluation dataset
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

# Step 8: Train the model
trainer.train()

# Step 9: Evaluate the model
trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.025607,0.995208


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.025607,0.995208
2,No log,0.014801,0.99766


{'eval_loss': 0.014801331795752048,
 'eval_accuracy': 0.9976599063962559,
 'eval_runtime': 217.9927,
 'eval_samples_per_second': 41.167,
 'eval_steps_per_second': 2.06,
 'epoch': 2.0}

In [13]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer
import torch

# Load the fine-tuned model
model_path = "LLMvHuman_finetune/checkpoint-1000"  # Specify the path to your fine-tuned model
model = AlbertForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer
tokenizer = AlbertTokenizer.from_pretrained(model_path)

# Example text for prediction
text = "Essay written by human. Will my model classify this correctly? I am a bit skeptical. See the word embeddings are positional encodings. Idk what else the model will learn apart from context w.r.t the content. Have to see!"
text = "fuck yes.....this is human"
text = """
  In the era of advancing technology, the emergence of Large Language Models (LLMs) like GPT has stirred debates on the potential rivalry between artificial and human intelligence.

LLMs, powered by vast datasets and complex algorithms, exhibit remarkable linguistic abilities. However, they lack the nuanced understanding, consciousness, and emotional depth inherent in human intelligence.

Despite disparities, LLMs and humans can collaborate effectively. LLMs excel in data processing, while humans contribute contextual understanding and ethical judgment.

Ethical concerns, including privacy and algorithmic bias, necessitate careful regulation and oversight. Furthermore, societal implications such as job displacement and inequality must be addressed.

In conclusion, while LLMs offer significant potential, their integration should be approached with caution. Collaborative efforts can ensure that AI enhances, rather than undermines, human well-being and autonomy.
"""
# Tokenize input text
inputs = tokenizer(text, return_tensors="pt")

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)

# Get predicted class probabilities
probs = torch.softmax(outputs.logits, dim=-1)

# Get predicted class label
predicted_class = torch.argmax(probs, dim=-1).item()

print("Predicted class label:", predicted_class) #it needs to print 0
print("Predicted class probabilities:", probs)


Predicted class label: 1
Predicted class probabilities: tensor([[7.5650e-05, 9.9992e-01]])
