<a href="https://colab.research.google.com/github/sanjanasajith/AI-Generated-Text-Detection/blob/main/AI_Generated_Text_Detection_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load data
# The CSV is treated as having exactly two columns: the text and its label.
data = pd.read_csv('/content/LLM.csv')
data.columns = ['text', 'label']

# Drop incomplete rows BEFORE encoding: once the labels have been turned into
# integer codes there are no NaNs left for dropna() to find, so filtering
# afterwards cannot remove rows with a missing label. Rows with missing text
# are dropped too, since they cannot be tokenized.
data = data.dropna(subset=['text', 'label']).reset_index(drop=True)
data['text'] = data['text'].astype(str)

# Convert textual labels to numerical values. The fitted encoder is kept so
# that predictions can be mapped back to the original label strings later.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# The model is created with num_labels=2, so keep only encoded classes 0 and 1.
data = data[data['label'].isin([0, 1])]

# Convert the DataFrame to a Hugging Face Dataset.
# preserve_index=False keeps the leftover pandas index out of the dataset
# columns after the row filtering above.
dataset = Dataset.from_pandas(data, preserve_index=False)

# Split the dataset (fixed seed so the train/eval split is reproducible)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Load tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing outright on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: mapping with a 'text' entry holding the raw text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        max_length=512,
        padding='max_length',
        truncation=True,
    )
    return encoded

# Tokenize each split in batches, then drop the raw "text" column, which is
# no longer needed once the token ids have been produced.
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=True).remove_columns(["text"])
    for split in (train_dataset, eval_dataset)
)

# Training arguments
# Checkpoints and logs go to directories relative to the working directory
# rather than the Kaggle-only '/kaggle/working/' path, so the same cell works
# on Colab (where the CSV is read from) and on a local machine. Output and
# logging directories are kept separate.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy`; switch the keyword if this one is rejected -- confirm
# against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",   # evaluate once at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,              # log the training loss every 10 steps
    report_to="none",              # no external experiment-tracking integration
)

# Compute metrics function
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each row
            is the index of its largest logit along the last axis.

    Returns:
        dict with a single key ``"accuracy"``: the fraction of rows whose
        predicted class equals the label.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    hits = predicted_classes == labels
    return {"accuracy": hits.mean()}

# Trainer setup (this is the trainer whose model is kept and used afterwards)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Estimate the full training time from a run on a small subset.
# The timing run uses its own freshly loaded throwaway model: running it on
# `model` would fine-tune the real model on the subset first, so the "full"
# training below would start from already-trained weights and its loss and
# accuracy curves would not reflect a clean run.
fraction = 0.1
subset_size = max(1, int(len(train_dataset) * fraction))  # never select an empty subset
train_subset = train_dataset.select(range(subset_size))

timing_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
timing_trainer = Trainer(
    model=timing_model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
start_time = time.time()
timing_trainer.train()
end_time = time.time()

# Release the throwaway model before the real training run.
del timing_trainer, timing_model
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Estimate full training time by scaling with the real subset/full size ratio.
# The measured time includes the per-epoch evaluation on the full eval set,
# which does not shrink with the subset, so this is a rough upper estimate.
training_time_fraction = end_time - start_time
estimated_full_training_time = training_time_fraction * len(train_dataset) / subset_size
print(f"Estimated full training time: {estimated_full_training_time:.2f} seconds")

# Train on the full dataset (starting from the untouched pretrained weights)
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Real-time input loop for predictions
print("\nEnter a sentence to classify (type 'exit' to quit):")
model.eval()

# Send the inputs to whichever device the model's weights are on, instead of
# assuming "cuda".
inference_device = model.device

while True:
    try:
        user_input = input("Your sentence: ").strip()
    except (EOFError, KeyboardInterrupt):
        # No interactive stdin, or the user interrupted: leave the loop cleanly.
        break

    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing to classify; ask again rather than predicting on empty text.
        continue

    # Tokenize the user input
    inputs = tokenizer(
        user_input,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    ).to(inference_device)

    # Make prediction (no gradients are needed for inference)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

    # Convert the predicted class index back to its original label string
    predicted_label = label_encoder.inverse_transform([prediction])[0]
    print(f"Predicted label: {predicted_label}\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/881 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.394587,1.0
2,0.495300,0.25462,1.0
3,0.495300,0.218867,1.0


Estimated full training time: 530.89 seconds


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0067,0.004342,1.0
2,0.002,0.002612,1.0
3,0.0016,0.002404,1.0


Evaluation results: {'eval_loss': 0.002403580117970705, 'eval_accuracy': 1.0, 'eval_runtime': 6.2271, 'eval_samples_per_second': 35.49, 'eval_steps_per_second': 2.248, 'epoch': 3.0}

Enter a sentence to classify (type 'exit' to quit):
Predicted label: student

Your sentence: The error "ModuleNotFoundError: 
Predicted label: ai



In [None]:
!pip install datasets



Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading