In [1]:
import json
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import textstat
import spacy
from nltk import word_tokenize, sent_tokenize
from collections import Counter
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load Spacy English tokenizer
nlp = spacy.load("en_core_web_sm")

# Load HC3 dataset
ds = load_dataset("Hello-SimpleAI/HC3", "all")
train_ds = ds['train']

# Load GPT2 for perplexity (we'll use distilgpt2 for speed)
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
# Perplexity function
def calculate_perplexity(text):
    encodings = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**encodings, labels=encodings.input_ids)
    loss = outputs.loss
    return torch.exp(loss).item()

def calculate_burstiness(text):
    doc = nlp(text)
    ppls = []
    for sent in doc.sents:
        try:
            ppl = calculate_perplexity(sent.text)
            ppls.append(ppl)
        except:
            continue
    if len(ppls) <= 1:
        return 0.0
    return np.std(ppls)

def type_token_ratio(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha]
    types = set(tokens)
    return len(types) / len(tokens) if len(tokens) > 0 else 0

def ngram_entropy(text, n=3):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha]
    ngrams = list(zip(*[tokens[i:] for i in range(n)]))
    counts = Counter(ngrams)
    total = sum(counts.values())
    probs = [count / total for count in counts.values()]
    entropy = -sum(p * np.log2(p) for p in probs if p > 0)
    return entropy

def extract_features(text):
    features = {}
    features['perplexity'] = calculate_perplexity(text)
    features['burstiness'] = calculate_burstiness(text)
    features['readability'] = textstat.flesch_reading_ease(text)
    features['sentence_length'] = np.mean([len(sent.text.split()) for sent in nlp(text).sents])
    features['type_token_ratio'] = type_token_ratio(text)
    features['ngram_entropy'] = ngram_entropy(text)
    return features

# === Train on HC3 ===
print("Loading HC3 training data...")
ds = load_dataset("Hello-SimpleAI/HC3", "all")
train_ds = ds['train']

Loading HC3 training data...


In [6]:
data = []
labels = []

for sample in tqdm(train_ds):
    if sample['human_answers']:
        text = sample['human_answers'][0]
        features = extract_features(text)
        data.append(features)
        labels.append(0)
    if sample['chatgpt_answers']:
        text = sample['chatgpt_answers'][0]
        features = extract_features(text)
        data.append(features)
        labels.append(1)

df = pd.DataFrame(data)
df['label'] = labels

# Train classifier
X = df.drop(columns=['label'])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("\n=== HC3 validation report ===")
print(classification_report(y_test, y_pred))

 98%|█████████▊| 23832/24322 [1:09:00<01:25,  5.76it/s]


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [9]:
df = pd.DataFrame(data)
df['label'] = labels

# Train classifier as usual
X = df.drop(columns=['label'])
y = df['label']
df_clean = df.dropna()
X = df_clean.drop(columns=['label'])
y = df_clean['label']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.98      0.98      4489
           1       0.98      0.97      0.98      4663

    accuracy                           0.98      9152
   macro avg       0.98      0.98      0.98      9152
weighted avg       0.98      0.98      0.98      9152



In [10]:
import os
import json
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report

# Directory containing all your .jsonl eval files
EVAL_DIR = "cs162-final-dev-main"

eval_texts = []
true_labels = []

# Loop over every file in the directory
for fname in os.listdir(EVAL_DIR):
    if not fname.endswith(".jsonl"):
        continue
    path = os.path.join(EVAL_DIR, fname)
    with open(path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            # human_text
            h = obj.get('human_text', "").strip()
            if h:
                eval_texts.append(h)
                true_labels.append(0)
            # machine_text
            m = obj.get('machine_text', "").strip()
            if m:
                eval_texts.append(m)
                true_labels.append(1)

# Extract features
eval_data = []
surviving_labels = []
for text, label in tqdm(zip(eval_texts, true_labels), total=len(eval_texts)):
    try:
        feats = extract_features(text)
        eval_data.append(feats)
        surviving_labels.append(label)
    except Exception as e:
        # skip any text that throws an error during feature extraction
        continue

# Build DataFrame and drop NaNs
eval_df = pd.DataFrame(eval_data).dropna().reset_index(drop=True)
surviving_labels = surviving_labels[: len(eval_df)]

# Predict and evaluate
eval_preds = clf.predict(eval_df)
print("\n=== Evaluation report ===")
print(classification_report(surviving_labels, eval_preds))


100%|██████████| 6000/6000 [11:15<00:00,  8.89it/s]


=== Evaluation report ===
              precision    recall  f1-score   support

           0       0.56      0.70      0.62      2995
           1       0.60      0.45      0.52      2995

    accuracy                           0.58      5990
   macro avg       0.58      0.58      0.57      5990
weighted avg       0.58      0.58      0.57      5990






In [11]:
# Create dataframe
df = pd.DataFrame(data)
df['label'] = labels

# Save to disk
df.to_csv("extracted_features.csv", index=False)


In [9]:
# Vocabulary richness
def type_token_ratio(text):
    tokens = word_tokenize(text.lower())
    types = set(tokens)
    return len(types) / len(tokens) if len(tokens) > 0 else 0

In [10]:
# N-gram entropy (approx)
def ngram_entropy(text, n=3):
    tokens = word_tokenize(text.lower())
    ngrams = list(zip(*[tokens[i:] for i in range(n)]))
    counts = Counter(ngrams)
    total = sum(counts.values())
    probs = [count / total for count in counts.values()]
    entropy = -sum(p * np.log2(p) for p in probs if p > 0)
    return entropy


In [11]:
# Extract features
def extract_features(text):
    features = {}
    features['perplexity'] = calculate_perplexity(text)
    features['burstiness'] = calculate_burstiness(text)
    features['readability'] = textstat.flesch_reading_ease(text)
    features['sentence_length'] = np.mean([len(word_tokenize(s)) for s in sent_tokenize(text)])
    features['type_token_ratio'] = type_token_ratio(text)
    features['ngram_entropy'] = ngram_entropy(text)
    return features

In [12]:
# Load your dataset
def load_dataset(jsonl_path):
    samples = []
    with open(jsonl_path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            samples.append((obj['human_text'], 0))  # human = 0
            samples.append((obj['machine_text'], 1))  # AI = 1
    return samples


In [13]:
# Process entire dataset
def build_feature_dataframe(samples):
    data = []
    labels = []
    for text, label in tqdm(samples):
        feats = extract_features(text)
        data.append(feats)
        labels.append(label)
    df = pd.DataFrame(data)
    df['label'] = labels
    return df

In [14]:
# Train & evaluate classifier
def train_classifier(df):
    X = df.drop(columns=['label'])
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    return clf

In [None]:
if __name__ == "__main__":
    jsonl_path = 'your_dataset.jsonl'  # Replace with your dataset path
    samples = load_dataset(jsonl_path)
    df = build_feature_dataframe(samples)
    clf = train_classifier(df)