In [None]:
# !pip install -q pandas numpy scikit-learn joblib streamlit


In [None]:
# !pip install kagglehub[pandas-datasets]

In [None]:
# pip install cuml-cu12 --index-url=https://pypi.nvidia.com

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

def load_first_tabular_file(dataset_name, force_download=True):
    """Download a Kaggle dataset and load its first supported tabular file.

    The download directory is walked in sorted order (so the chosen file is
    the same on every run) and the first file whose extension is supported
    is parsed with the pandas reader that matches that extension.

    Parameters
    ----------
    dataset_name : str
        Dataset handle passed straight to ``kagglehub.dataset_download``.
    force_download : bool, default True
        Forwarded to ``kagglehub.dataset_download``. The default keeps the
        original behaviour of requesting a fresh download on every call.

    Returns
    -------
    pandas.DataFrame
        Contents of the first supported file found.

    Raises
    ------
    ValueError
        If no file with a supported extension exists under the download path.

    Notes
    -----
    Parquet and spreadsheet formats rely on optional pandas engines; pandas
    raises its own ImportError if the required engine is not installed.
    """
    import os
    import pandas as pd

    # Extension -> (pandas reader, keyword arguments). Previously every file
    # was passed to pd.read_csv regardless of its extension, which mis-parses
    # or crashes on anything that is not comma-separated text.
    readers = {
        ".csv": (pd.read_csv, {}),
        ".tsv": (pd.read_csv, {"sep": "\t"}),
        ".json": (pd.read_json, {}),
        ".jsonl": (pd.read_json, {"lines": True}),
        ".parquet": (pd.read_parquet, {}),
        ".xls": (pd.read_excel, {}),
        ".xlsx": (pd.read_excel, {}),
        ".xlsm": (pd.read_excel, {}),
        ".xlsb": (pd.read_excel, {}),
        ".ods": (pd.read_excel, {}),
    }

    path = kagglehub.dataset_download(dataset_name, force_download=force_download)

    # Walk the download path to find the first suitable file.
    file_to_load = None
    file_ext = None
    for root, dirs, files in os.walk(path):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for fname in sorted(files):
            ext = os.path.splitext(fname)[1].lower()
            if ext in readers:
                file_to_load = os.path.join(root, fname)
                file_ext = ext
                break
        if file_to_load:
            break

    if file_to_load is None:
        raise ValueError(f"No supported tabular file found in dataset '{dataset_name}'")

    reader, reader_kwargs = readers[file_ext]
    return reader(file_to_load, **reader_kwargs)

# Kaggle dataset handle to download and load.
dataset_name = "shanegerami/ai-vs-human-text"
complete_data_set = load_first_tabular_file(dataset_name)

# Quick look at what was loaded: the column names, then the first rows.
print("Columns:", complete_data_set.columns.tolist())
print("First 5 records:", complete_data_set.head(), sep="\n")


In [None]:
# Draw a reproducible random subset (fixed seed) of at most 100 rows.
# The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the population size
# and replace=False.
data = complete_data_set.sample(
    n=min(100, len(complete_data_set)), replace=False, random_state=42
)


In [None]:
import pandas as pd
import random

# Check columns
print("Columns in df:", data.columns.tolist())

# Make sure 'generated' and 'text' exist
if "generated" in data.columns and "text" in data.columns:

    def get_random_text(label, random_state=None):
        """
        Get a random text for a given label.

        label: 0 for human, 1 for AI
        random_state: optional seed forwarded to pandas ``sample``; the
            default (None) keeps the draw unseeded, pass a value to get the
            same text on every call.

        Returns the selected text, or a "No texts found ..." message string
        when no row in ``data`` carries the requested label.
        """
        texts = data.loc[data["generated"] == label, "text"]
        if texts.empty:
            return f"No texts found for label {label}"
        return texts.sample(n=1, random_state=random_state).iloc[0]

    # Example usage (seeded so that re-running the notebook prints the same texts)
    random_human_text = get_random_text(0, random_state=42)
    random_ai_text = get_random_text(1, random_state=42)

    print("Random human text:", random_human_text)
    print("\n\nRandom AI text:", random_ai_text)

else:
    print("'generated' or 'text' column not found in the DataFrame")


In [None]:
import re, math, numpy as np

# Lower-case function/stop words; function_word_ratio() counts tokens whose
# lower-cased form is in this set.
STOP = set("""a an the in on at of for to and or but if while as is are was were be been being from with by than then so because about into over after before under between out up down off near far very just not no nor""".split())

# Sentence boundary: one or more consecutive '.', '!' or '?' characters.
_SENT_SPLIT = re.compile(r"[.!?]+")
# A token is a maximal run of word characters (letters, digits, underscore).
_WORD = re.compile(r"\b\w+\b", re.UNICODE)
# Matches any single code point in U+10000..U+10FFFF; used for emoji_rate.
# NOTE(review): this range also contains non-emoji characters and excludes
# emoji encoded below U+10000 (e.g. U+2764) - confirm this approximation is
# acceptable.
_EMOJI = re.compile(r"[\U00010000-\U0010ffff]", flags=re.UNICODE)

def shannon_entropy(s: str) -> float:
    """Return the character-level Shannon entropy of ``s`` in bits.

    Each distinct character contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in ``s``. An empty (falsy) input yields 0.0.
    """
    if not s:
        return 0.0

    # Tally how often each character occurs.
    frequency = {}
    for char in s:
        frequency[char] = frequency.get(char, 0) + 1

    total = len(s)
    probabilities = (count / total for count in frequency.values())
    return -sum(p * math.log2(p) for p in probabilities)

def sent_lengths(s: str):
    """Return ``(mean, variance)`` of the number of words per sentence.

    ``s`` is split on ``_SENT_SPLIT``; pieces that are empty after stripping
    whitespace are ignored, and the words of each remaining piece are counted
    with ``_WORD``. The variance is ``np.var`` with its default settings.
    Returns ``(0.0, 0.0)`` when no non-empty sentence is found.
    """
    word_counts = []
    for piece in _SENT_SPLIT.split(s):
        sentence = piece.strip()
        if sentence:
            word_counts.append(len(_WORD.findall(sentence)))

    if not word_counts:
        return (0.0, 0.0)

    mean_len = float(np.mean(word_counts))
    var_len = float(np.var(word_counts))
    return (mean_len, var_len)

def type_token_ratio(words):
    """Return distinct words divided by total words (0.0 for empty input).

    The comparison is case-sensitive: words are used exactly as given.
    """
    return len(set(words)) / len(words) if words else 0.0

def hapax_ratio(words):
    """Return the share of distinct words that occur exactly once.

    The denominator is the number of distinct words, not the total number of
    tokens. Empty input yields 0.0.
    """
    if not words:
        return 0.0
    from collections import Counter

    frequencies = Counter(words)
    singletons = [word for word, count in frequencies.items() if count == 1]
    return len(singletons) / len(frequencies)

def function_word_ratio(words):
    """Return the fraction of ``words`` whose lower-cased form is in ``STOP``.

    Empty input yields 0.0.
    """
    if not words:
        return 0.0
    matches = [word for word in words if word.lower() in STOP]
    return len(matches) / len(words)

def repetition_index(words, n=3):
    """Return the count of the most frequent n-gram divided by the n-gram total.

    N-grams are runs of ``n`` consecutive words joined with single spaces.
    Returns 0.0 when ``words`` holds fewer than ``n`` items.
    """
    if len(words) < n:
        return 0.0
    from collections import Counter

    window_starts = range(len(words) - n + 1)
    ngram_counts = Counter(" ".join(words[start:start + n]) for start in window_starts)
    total_ngrams = sum(ngram_counts.values())
    return max(ngram_counts.values()) / max(1, total_ngrams)

def extract_features(text: str) -> np.ndarray:
    """Compute the ten stylometric features of ``text`` as a float array.

    The text is stripped of surrounding whitespace first. Character-based
    rates are divided by the stripped length (or by 1 when it is empty).
    The repetition index is computed on lower-cased tokens; the type/token
    and hapax ratios use the tokens as they appear in the text.
    """
    s = text.strip()
    tokens = _WORD.findall(s)
    lowered_tokens = [token.lower() for token in tokens]
    n_chars = len(s) or 1  # avoid dividing by zero on empty text
    avg_len, var_len = sent_lengths(s)

    punct_count = sum(ch in ".,;:!?" for ch in s)
    digit_count = sum(ch.isdigit() for ch in s)
    emoji_count = len(_EMOJI.findall(s))

    return np.array(
        [
            shannon_entropy(s),                      # char_entropy
            type_token_ratio(tokens),                # ttr
            hapax_ratio(tokens),                     # hapax
            avg_len,                                 # avg_sent_len
            var_len,                                 # var_sent_len
            punct_count / n_chars,                   # punct_rate
            digit_count / n_chars,                   # digit_rate
            emoji_count / n_chars,                   # emoji_rate
            function_word_ratio(tokens),             # function_word_ratio
            repetition_index(lowered_tokens, n=3),   # repetition_index
        ],
        dtype=float,
    )

# Names of the values returned by extract_features(), in the same order.
FEATURE_NAMES = [
    "char_entropy",
    "ttr",
    "hapax",
    "avg_sent_len",
    "var_sent_len",
    "punct_rate",
    "digit_rate",
    "emoji_rate",
    "function_word_ratio",
    "repetition_index",
]
