<a href="https://colab.research.google.com/github/sayantann7/ai-vs-human-text-classifier/blob/main/AI_VS_Human_Text_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================
# 1) Install & Import Libraries
# ==============================
# Installs the notebook's dependencies into the runtime; -q keeps pip's output quiet.
# No versions are pinned, so the installed releases depend on when this cell is run.
!pip install -q kagglehub tensorflow pandas scikit-learn

# NOTE(review): `np` and `models` are not referenced in the cells visible here —
# confirm whether a later cell needs them before removing.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

print("Libraries installed and imported.")

Libraries installed and imported.


In [2]:
# =========================================
# 2) Set Up Kaggle API Credentials (if needed)
# =========================================
# This cell expects a kaggle.json file in the current working directory.
# If you haven't uploaded your kaggle.json yet, run the following first:
#
# from google.colab import files
# files.upload()   # Upload your kaggle.json file here.
#
# Then run these commands to set it up:
# - create the ~/.kaggle directory if it does not exist yet,
# - copy kaggle.json into it,
# - restrict the file to owner read/write (mode 600) because it holds an API key.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/  # This copies your uploaded kaggle.json to the right folder
!chmod 600 ~/.kaggle/kaggle.json

# NOTE(review): this message is printed unconditionally; it does not check whether
# the shell commands above succeeded (e.g. when kaggle.json is missing).
print("Kaggle API credentials set up.")

Kaggle API credentials set up.


In [3]:
# ============================================
# 3) Download the Dataset Using Kaggle CLI
# ============================================
# "shanegerami/ai-vs-human-text" is the Kaggle dataset slug used by this notebook;
# replace it only if you want to download a different dataset.
# --unzip extracts the downloaded archive into the current working directory.
!kaggle datasets download -d shanegerami/ai-vs-human-text --unzip

# List files to confirm download
!ls -lh

Dataset URL: https://www.kaggle.com/datasets/shanegerami/ai-vs-human-text
License(s): other
Downloading ai-vs-human-text.zip to /content
 98% 342M/350M [00:01<00:00, 180MB/s]
100% 350M/350M [00:01<00:00, 192MB/s]
total 1.1G
-rw-r--r-- 1 root root 1.1G Mar 20 14:04 AI_Human.csv
-rw-r--r-- 1 root root   66 Mar 20 13:58 kaggle.json
drwxr-xr-x 1 root root 4.0K Mar 17 13:32 sample_data


In [27]:
# =====================================
# 4) Load and Limit the CSV Data
# =====================================
# The Kaggle download is expected to leave "AI_Human.csv" in the working directory.
# Only the first `max_rows` rows are kept to bound memory use and training time;
# adjust `max_rows` to train on more or less of the data.

csv_file = "AI_Human.csv"
max_rows = 150000

df = pd.read_csv(csv_file)
print(f"Original dataset shape: {df.shape}")

# Keep the first `max_rows` rows (head() takes rows in file order, not a random sample).
df = df.head(max_rows)
print(f"Dataset shape after limiting to {max_rows} rows: {df.shape}")

# Rows with a missing text or label cannot be vectorized or cast to int, so drop
# them up front. .copy() makes the result an independent frame so the column
# assignment below never writes into a slice of the full dataset.
df = df.dropna(subset=['text', 'generated']).copy()

# Convert the 'generated' column to int (0 => Human, 1 => AI)
df['generated'] = df['generated'].astype(int)
print("Value counts for 'generated':")
print(df['generated'].value_counts())

Original dataset shape: (487235, 2)
Dataset shape after limiting to 150000 rows: (150000, 2)
Value counts for 'generated':
generated
0    85336
1    64664
Name: count, dtype: int64


In [28]:
# ====================================
# 5) Split Data into Training and Validation
# ====================================
# Hold out 10% of the rows for validation. Stratifying on the label keeps the
# Human/AI ratio the same in both splits, and the fixed seed makes the split repeatable.
split_options = dict(
    test_size=0.1,
    random_state=42,
    stratify=df['generated'],
)
train_df, val_df = train_test_split(df, **split_options)

print("Training size:", len(train_df))
print("Validation size:", len(val_df))

Training size: 135000
Validation size: 15000


In [30]:
# ===============================
# 6) Create a Text Vectorization Layer
# ===============================
from tensorflow.keras.layers import TextVectorization

max_tokens = 40000   # Cap on the vocabulary size
max_len    = 512     # Length of every output token sequence

# Collect the layer settings in one place, then build the layer from them.
vectorizer_options = {
    'max_tokens': max_tokens,
    'output_mode': 'int',
    'output_sequence_length': max_len,
}
vectorize_layer = TextVectorization(**vectorizer_options)

# The vocabulary is learned from the training split only, so validation text
# does not influence it.
training_corpus = train_df['text'].values
vectorize_layer.adapt(training_corpus)
print("Text vectorization layer adapted.")

Text vectorization layer adapted.


In [31]:
# ============================================
# 7) Build tf.data Pipeline for Training & Validation
# ============================================
def make_dataset(texts, labels, batch_size=32, shuffle=False):
    """Build a batched, prefetched tf.data pipeline of (text, label) pairs.

    Args:
        texts: Sequence of examples; sliced along its first axis.
        labels: Sequence of labels aligned with `texts`.
        batch_size: Number of examples per batch.
        shuffle: When True, shuffle with a buffer as large as `texts`.

    Returns:
        A tf.data.Dataset yielding (texts, labels) batches.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        # A buffer covering every example shuffles over the whole dataset.
        pipeline = pipeline.shuffle(buffer_size=len(texts))
    batched = pipeline.batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

# Pull the raw strings and integer labels out of each split as arrays.
train_texts, train_labels = train_df['text'].values, train_df['generated'].values
val_texts, val_labels = val_df['text'].values, val_df['generated'].values

# Only the training stream is shuffled; validation keeps its original order.
train_ds = make_dataset(texts=train_texts, labels=train_labels, shuffle=True)
val_ds = make_dataset(texts=val_texts, labels=val_labels, shuffle=False)

# Map function that converts raw text into integer token sequences.
def vectorize_text(text, label):
    """Run `text` through the adapted `vectorize_layer`; pass `label` through unchanged."""
    return vectorize_layer(text), label

# Vectorize batches in parallel and prefetch *after* the map, so text
# preprocessing overlaps with model execution instead of running serially
# in front of every training step.
train_ds = train_ds.map(
    vectorize_text, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.map(
    vectorize_text, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
print("tf.data pipelines are ready.")

tf.data pipelines are ready.


In [32]:
# ============================================
# 8) Build a Deep Learning Model for Text Classification
# ============================================
# Simple 1-D CNN text classifier:
# token ids -> embeddings -> Conv1D -> global max pooling -> dense head -> sigmoid.
model = tf.keras.Sequential([
    # Declaring the input up front builds the model immediately, so
    # model.summary() can report output shapes and parameter counts.
    # This replaces the Embedding `input_length` argument, which newer Keras
    # releases deprecate.
    layers.Input(shape=(max_len,), dtype="int64"),
    layers.Embedding(input_dim=max_tokens, output_dim=128),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')  # Binary output: sigmoid for probability.
])

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()



In [34]:
# ======================
# 9) Train the Model
# ======================
EPOCHS = 3  # Number of passes over the training pipeline

# Fit on the training pipeline and score the validation pipeline after each epoch.
history = model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

print("Training complete.")

Epoch 1/3
[1m  51/4219[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10:23[0m 150ms/step - accuracy: 0.9999 - loss: 0.0011

KeyboardInterrupt: 

In [35]:
# ===================================
# 10) Evaluate the Model on Validation Set
# ===================================
# The model was compiled with one loss and one metric ('accuracy'), so
# evaluate() yields exactly those two values, in that order.
eval_results = model.evaluate(val_ds)
val_loss, val_acc = eval_results
print(
    f"Validation Loss: {val_loss:.4f}",
    f"Validation Accuracy: {val_acc:.4f}",
    sep="\n",
)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 37ms/step - accuracy: 0.9969 - loss: 0.0155
Validation Loss: 0.0157
Validation Accuracy: 0.9965


In [41]:
# ==========================================
# 11) Predict on New Text Examples
# ==========================================
def predict_text(text, threshold=0.5):
    """Classify a single piece of text as AI-generated or human-written.

    Args:
        text: The raw string to classify.
        threshold: Probability at or above which the text is labelled as
            AI-generated. Defaults to 0.5.

    Returns:
        A ``(label, prob)`` tuple: ``label`` is 1 for AI-generated and 0 for
        human-written; ``prob`` is the model's sigmoid output as a Python float.
    """
    # Wrap the string in a list so the model sees a batch of size 1.
    input_data = tf.convert_to_tensor([text])
    # Vectorize with the same layer that was adapted on the training text.
    input_data = vectorize_layer(input_data)
    # verbose=0 keeps predict() from printing a progress bar on every call.
    prob = float(model.predict(input_data, verbose=0)[0][0])
    label = 1 if prob >= threshold else 0  # 1 => AI-generated, 0 => Human-generated
    return label, prob

# A few short probe sentences to sanity-check the trained model.
sample_texts = [
    "A Sustainable Vision for Tomorrow In an era marked by rapid urbanization and escalating environment.",
    "An electoral College compromises between election of the president by vote in congress and election",
    "Greetings Mr. Binod I am going to be your mentor for today"
]

# Index 0 => Human, index 1 => AI, mirroring the label returned by predict_text.
label_names = ("Human", "AI")

for text in sample_texts:
    # The probability is unpacked for inspection but only the label is printed.
    lbl, probability = predict_text(text)
    label_str = label_names[lbl]
    print(f"\nText: {text}\nPrediction: {label_str}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step

Text: A Sustainable Vision for Tomorrow In an era marked by rapid urbanization and escalating environment.
Prediction: AI
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step

Text: An electoral College compromises between election of the president by vote in congress and election
Prediction: Human
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step

Text: Greetings Mr. Binod I am going to be your mentor for today
Prediction: AI
