In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for file_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# -------------------- Setup & Data --------------------
import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Download dataset
path = kagglehub.dataset_download("shanegerami/ai-vs-human-text")
print("📥 Downloaded dataset to:", path)

# Only the first 100,000 rows are used. Passing nrows lets pandas stop parsing
# at that point instead of loading the entire CSV and then discarding the rest
# with .head().
df = pd.read_csv(os.path.join(path, "AI_Human.csv"), nrows=100000)

# Preprocess labels: the source column "generated" becomes an integer "label".
df = df.rename(columns={"generated": "label"})
df["label"] = df["label"].astype(int)
print("✅ Loaded dataset. Shape:", df.shape)

# Train/Test split (70% / 30%). Stratifying on the label keeps the class
# proportions the same in both splits; the fixed random_state makes the split
# repeatable across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["label"],
    random_state=42,
)

# Save supporting files
output_dir = "/kaggle/working/ai_vs_human_split"
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(os.path.join(output_dir, "train_essays.csv"), index=False)
test_df.to_csv(os.path.join(output_dir, "test_essays.csv"), index=False)

print("📂 Files saved in:", output_dir)

# -------------------- TPU Setup --------------------
import tensorflow as tf

# Try to attach to a TPU. Any failure is reported and the notebook continues
# with TensorFlow's default distribution strategy instead of aborting.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')  # For TPU VM
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
except Exception as err:
    # Deliberate best-effort fallback: show the reason, then use the default strategy.
    print("🚫 TPU not found or failed to initialize. Using default strategy.\nError:", str(err))
    strategy = tf.distribute.get_strategy()
else:
    print("✅ TPU initialized.")

print("🔧 Strategy in use:", strategy)

# -------------------- BERT + Tokenization --------------------
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm
import numpy as np

# Directory (a Kaggle input dataset) holding the pretrained bert-base-uncased files.
local_model_path = '/kaggle/input/bert-base-uncased/bert-base-uncased'

# The encoder is instantiated inside the distribution strategy's scope.
with strategy.scope():
    bert_model = TFBertModel.from_pretrained(local_model_path)

# The tokenizer is loaded from the same directory; it does not need the scope.
tokenizer = BertTokenizer.from_pretrained(local_model_path)

# Function to get embeddings using batch + tf.data
def get_bert_embeddings_batched(texts, batch_size=32, max_len=128, split_name=None):
    """Embed texts with BERT and return one [CLS] vector per text.

    Every text is tokenized to exactly ``max_len`` tokens: longer inputs are
    truncated and shorter ones are padded (``padding='max_length'``). The token
    IDs and attention masks are saved as ``.npy`` files under ``output_dir``,
    then the texts are run through ``bert_model`` in batches. The embedding of
    a text is the final-layer hidden state at position 0 (the [CLS] token).

    Relies on the module-level ``tokenizer``, ``bert_model`` and ``output_dir``.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass through BERT.
        max_len: Fixed token length per text (the original author notes that
            BERT can handle up to 512 tokens).
        split_name: Optional tag (e.g. ``"train"``) prefixed to the saved token
            file names so that successive calls do not overwrite each other's
            files. When ``None`` the un-prefixed names ``input_ids.npy`` and
            ``attention_masks.npy`` are used.

    Returns:
        NumPy array of shape ``(len(texts), hidden_size)``. For an empty
        ``texts`` an empty ``(0, hidden_size)`` array is returned and no files
        are written.
    """
    print(f"🔄 Getting BERT embeddings for {len(texts)} texts...")     #BERT can handle up to 512 tokens

    if len(texts) == 0:
        # Nothing to tokenize or embed; concatenating an empty batch list would fail.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='tf'
    )

    # NumPy copies of the token IDs and masks, used for the preview and for saving.
    input_ids = encodings['input_ids'].numpy()
    attention_mask = encodings['attention_mask'].numpy()

    print("📥 Sample token IDs:\n", input_ids[:2])
    print("🧠 Sample attention masks:\n", attention_mask[:2])

    # Save token ids and attention masks. The optional prefix keeps the files
    # of different splits apart (without it, a later call replaces the files
    # written by an earlier one).
    file_prefix = f"{split_name}_" if split_name else ""
    np.save(f"{output_dir}/{file_prefix}input_ids.npy", input_ids)
    np.save(f"{output_dir}/{file_prefix}attention_masks.npy", attention_mask)

    dataset = tf.data.Dataset.from_tensor_slices({
        "input_ids": encodings['input_ids'],
        "attention_mask": encodings['attention_mask']
    }).batch(batch_size)

    cls_batches = []
    for batch in tqdm(dataset, desc="⚙️  Running BERT"):
        cls_vectors = bert_model(batch)['last_hidden_state'][:, 0, :]  # CLS token
        # Convert each batch to NumPy right away so the per-batch TF tensors do
        # not all have to stay alive until one final concatenation.
        cls_batches.append(cls_vectors.numpy())

    full_embeddings = np.concatenate(cls_batches, axis=0)
    print("📐 Embeddings shape:", full_embeddings.shape)
    print("🔢 Sample of first embedding (first 10 dims):", full_embeddings[0][:10])

    return full_embeddings

# Load train and test
train = pd.read_csv(f"{output_dir}/train_essays.csv")
test = pd.read_csv(f"{output_dir}/test_essays.csv")

# Generate embeddings. Each split gets its own tag so the saved token files are
# train_input_ids.npy / test_input_ids.npy (and likewise for the attention
# masks) rather than one shared pair that the second call would overwrite.
X_train_bert = get_bert_embeddings_batched(train['text'].tolist(), split_name="train")
X_test_bert = get_bert_embeddings_batched(test['text'].tolist(), split_name="test")
y_train = train['label'].values
y_test = test['label'].values

# Save embeddings for reuse
np.save(f"{output_dir}/X_train_bert.npy", X_train_bert)
np.save(f"{output_dir}/X_test_bert.npy", X_test_bert)

# -------------------- XGBoost Classifier --------------------
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

print("🚀 Training XGBoost classifier...")
# `use_label_encoder` is a deprecated XGBoost option and is intentionally not
# passed. random_state is pinned (same seed as the train/test split) so results
# stay repeatable if row/column subsampling is enabled later.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train_bert, y_train)

# -------------------- Evaluation --------------------
def _report_split(split_label, model, features, labels):
    """Predict on one split, print its accuracy and classification report.

    Args:
        split_label: Name shown in the printed headings (e.g. "Training").
        model: Fitted classifier exposing ``predict``.
        features: Feature matrix for the split.
        labels: True labels for the split.

    Returns:
        Tuple ``(predictions, accuracy)`` for the split.
    """
    preds = model.predict(features)
    acc = accuracy_score(labels, preds)
    print(f"📊 {split_label} Accuracy:", acc)
    print(f"🔍 {split_label} Classification Report:\n", classification_report(labels, preds))
    return preds, acc

# Evaluate on training data
train_preds, train_acc = _report_split("Training", xgb, X_train_bert, y_train)

# Evaluate on test data
test_preds, test_acc = _report_split("Test", xgb, X_test_bert, y_test)


📥 Downloaded dataset to: /kaggle/input/ai-vs-human-text
✅ Loaded dataset. Shape: (100000, 2)
📂 Files saved in: /kaggle/working/ai_vs_human_split


2025-05-19 13:30:03.211407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747661403.388639      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747661403.440049      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🚫 TPU not found or failed to initialize. Using default strategy.
Error: Please provide a TPU Name to connect to.
🔧 Strategy in use: <tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7edf033c97d0>


I0000 00:00:1747661425.818580      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertFo

🔄 Getting BERT embeddings for 70000 texts...
📥 Sample token IDs:
 [[  101  6203  3516  2110  5205  1010  1045  7475  1999  5684  1997  4363
   1996  6092  2267  1012  1996  6092  2267  2003  3214  2000  2022  1037
  12014  2090  2602  1997  1996  2343  2011  1037  3789  1999  3519  1998
   2602  1997  1996  2343  2011  1037  2759  3789  1997  4591  4480  1012
   1045  7475  2005  4363  1996  6092  2267  2138  1997  2195  4436  6153
   2039  2007  2592  1012 15847  1010  1045  1005  1040  2066  2000  2391
   2041  2008  1996  6092  2267  3084  2009  2061  2008  3469  2163  2123
   1005  1056  2031  3469  3747  2084  3760  2163  1012  2065  1996  6092
   2267  2003  3718  2059  3469  2163  4618  2031  2172  2062  2576  2373
   2084  3760  2163  1012  2029  2965  2008  1037  2235  2110  2453  2025
   2131  1996  2168  3815  1997  3086  2013   102]
 [  101  1999  3522  2086  1010  2116  2816  2031  5625  2000  3749  3454
   2005  3784  4083  2083  2678  8921  1998  3784 14799  1012  2116  

⚙️  Running BERT: 100%|██████████| 2188/2188 [06:33<00:00,  5.56it/s]


📐 Embeddings shape: (70000, 768)
🔢 Sample of first embedding (first 10 dims): [-0.0687438  -0.25372472 -0.8853341   0.06881139 -0.0010235  -0.5130544
  0.3910389   0.9081425  -0.16836181 -0.10408129]
🔄 Getting BERT embeddings for 30000 texts...
📥 Sample token IDs:
 [[  101  6203  3836  1035  2171  1010  1996  2095  2003  2249  1010  2471
   2296  4845  1999  1996  5893  3694  2038  1037  3526  3042  1012  2065
   2057  2071  2069  2224  2009  2043  2057  4995  1005  1056  1999  2082
   2084  2054  1005  1055  1996  2391  1029  2065  4268  2020  2583  2000
   2224  2037  3526 11640  1999  2465  1010  2009  2052  3499  2336  2000
   3579  2062  1999  2465  1998  2027  2052  4847  1996  3627  1997  3810
   2037 11640  2125  1999  2465  1012  1045  2903  2008  2009  2052  2022
   1037  2204  2801  2000  2991  5004  2083  2007  3343  2193  2028  1012
   2076  2489  6993  1998  6265  2847  1010  2045  2003  2053  7386  1999
   4352  2493  2000  2224  2037  3526 11640  1012  2138  2045  2003 

⚙️  Running BERT: 100%|██████████| 938/938 [02:53<00:00,  5.40it/s]


📐 Embeddings shape: (30000, 768)
🔢 Sample of first embedding (first 10 dims): [ 0.3196784  -0.26945573  0.16511464  0.28543472  0.31779614 -0.7717479
  0.5807484   0.6513076  -0.05484775 -0.5954517 ]
🚀 Training XGBoost classifier...
📊 Training Accuracy: 0.9998142857142858
🔍 Training Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     37041
           1       1.00      1.00      1.00     32959

    accuracy                           1.00     70000
   macro avg       1.00      1.00      1.00     70000
weighted avg       1.00      1.00      1.00     70000

📊 Test Accuracy: 0.9791333333333333
🔍 Test Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     15875
           1       0.98      0.97      0.98     14125

    accuracy                           0.98     30000
   macro avg       0.98      0.98      0.98     30000
weighted avg       0.98   