# AI_Human + MultiSocial dataset builder

Merge AI_Human.csv and multisocial_anonymized.csv, clean the text, and generate a feature-rich table ready for ML training. The flow mirrors the feature engineering used in ai_vs_human_AI_Human.ipynb and adds metadata signals (language, source, model id, potential_noise).


In [None]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import display

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import textstat

import warnings
warnings.filterwarnings("ignore")

# Paths
# NOTE(review): DATA_DIR is not referenced by any of the path constants below,
# so every file is resolved relative to the current working directory.
# Confirm whether the inputs/outputs are meant to live under DATA_DIR.
DATA_DIR = Path("src/ai_vs_human")
AI_HUMAN_PATH = "AI_Human.csv"  # input: read with pd.read_csv in section 1
MULTI_PATH = "multisocial_anonymized.csv"  # input: read with pd.read_csv in section 1
RAW_OUTPUT = "merged_raw_clean.csv"  # output: cleaned merged rows (section 7)
FEATURE_OUTPUT = "merged_ai_human_multisocial_features.csv"  # output: final feature table (section 7)
FEATURE_CACHE = "cache_merged_text_features.csv"  # cache of text-derived features (section 5)
# Controls
RANDOM_STATE = 42  # seed shared by the optional sampling step and the train/val split
DROP_POTENTIAL_NOISE = True  # when True, keep only MultiSocial rows with potential_noise == 0
SAMPLE_ROWS = None  # set to an int (e.g., 200000) for quick dry runs
NORMALIZE_TEXT = True  # when True, run clean_text (whitespace collapsing) on every row
USE_FEATURE_CACHE = True  # when True, reuse/write FEATURE_CACHE instead of always recomputing
TOP_LANGS = 12  # number of most frequent languages kept before bucketing into "other"
TOP_SOURCES = 15  # number of most frequent sources kept before bucketing into "other"
TOP_MODELS = 20  # number of most frequent multi_label values kept before bucketing into "other"

# The VADER lexicon is required by nltk's SentimentIntensityAnalyzer used in section 4.
nltk.download("vader_lexicon")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ruben/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 1. Load and harmonize datasets
- Normalize column names
- Keep a shared schema (text, label, language, source, model id, potential_noise, length, dataset)
- Optionally drop noisy rows flagged by MultiSocial


In [13]:
# --- AI_Human -------------------------------------------------------------
# The source file calls its target column "generated"; expose it as "label"
# and attach the metadata columns that MultiSocial provides natively.
ai_df = (
    pd.read_csv(AI_HUMAN_PATH)
    .rename(columns={"generated": "label"})
    .assign(
        # Missing labels are treated as 0 before the integer cast.
        label=lambda d: d["label"].fillna(0).astype(int),
        text=lambda d: d["text"].fillna("").astype(str),
        dataset="ai_human",
        language="unknown",
        source="ai_human",
        # No generator name is available here, so label == 1 rows get a
        # placeholder model id and all other rows are tagged "human".
        multi_label=lambda d: np.where(d["label"] == 1, "unknown_model", "human"),
        potential_noise=0,
        split="unspecified",
        length=lambda d: d["text"].str.len(),
    )
)

# --- MultiSocial ----------------------------------------------------------
multi_df = pd.read_csv(MULTI_PATH).assign(
    text=lambda d: d["text"].fillna("").astype(str),
    label=lambda d: d["label"].astype(int),
    dataset="multisocial",
    multi_label=lambda d: d["multi_label"].fillna("unknown_model"),
)

# Optionally keep only the rows MultiSocial did not flag as noisy.
if DROP_POTENTIAL_NOISE and "potential_noise" in multi_df.columns:
    rows_before_filter = len(multi_df)
    is_clean = multi_df["potential_noise"] == 0
    multi_df = multi_df[is_clean].copy()
    print(f"Dropped potential_noise rows: {rows_before_filter - len(multi_df)}")

# Backfill any column of the shared schema that a frame is still missing.
# "length" defaults to None here; it is recomputed after text cleaning.
required_defaults = {
    "language": "unknown",
    "source": "unknown",
    "multi_label": "unknown_model",
    "potential_noise": 0,
    "split": "unspecified",
    "length": None,
}
for frame in (ai_df, multi_df):
    for column_name, fallback in required_defaults.items():
        if column_name not in frame.columns:
            frame[column_name] = fallback

combined = pd.concat([ai_df, multi_df], ignore_index=True)
print(f"Combined raw shape: {combined.shape}")


FileNotFoundError: [Errno 2] No such file or directory: 'src/ai_vs_human/AI_Human.csv'

## 2. Clean text and basic checks
- Strip whitespace and collapse multiple spaces
- Remove rows with empty text
- Drop duplicates by (text, label)
- Optional sampling for quick experimentation


In [None]:
def clean_text(text: str) -> str:
    """Return ``text`` as a string with whitespace normalised.

    The value is first coerced with ``str()``; every run of whitespace is
    then collapsed to a single space and leading/trailing whitespace is
    removed.
    """
    collapsed = re.sub(r"\s+", " ", str(text))
    return collapsed.strip()

# Normalise whitespace when requested; otherwise only guarantee str dtype.
combined["text"] = (
    combined["text"].apply(clean_text)
    if NORMALIZE_TEXT
    else combined["text"].astype(str)
)

# Drop rows whose text is empty, then refresh the length column so it
# reflects the cleaned text rather than the raw input.
has_text = combined["text"].str.len() > 0
combined = combined[has_text].copy()
combined["length"] = combined["text"].str.len()

# Rows sharing both text and label are considered duplicates.
rows_before_dedup = len(combined)
combined = combined.drop_duplicates(subset=["text", "label"])
print(f"Dropped duplicates: {rows_before_dedup - len(combined)}")

# Optional down-sampling for quick experiments (never more rows than exist).
if SAMPLE_ROWS:
    sample_size = min(SAMPLE_ROWS, len(combined))
    combined = combined.sample(n=sample_size, random_state=RANDOM_STATE)

# A clean 0..n-1 index keeps later positional concatenation aligned.
combined = combined.reset_index(drop=True)

print(f"Merged and cleaned shape: {combined.shape}")
print("Label distribution:", combined["label"].value_counts())
print("Rows by dataset:", combined["dataset"].value_counts())
print("Top languages:", combined["language"].value_counts().head(10))
display(combined.head())


SyntaxError: unterminated string literal (detected at line 24) (581094703.py, line 24)

## 3. Helper to cap high-cardinality categorical features
Keeps only the top-N most frequent categories and buckets the rest into `other`.


In [None]:
def cap_categories(series: pd.Series, top_k: int, other_label: str = "other") -> pd.Series:
    """Keep the ``top_k`` most frequent values of ``series`` and bucket the rest.

    Args:
        series: Categorical values to cap.
        top_k: Number of most frequent categories to keep unchanged.
        other_label: Replacement for every value outside the kept set.

    Returns:
        A series of the same length where infrequent values are replaced by
        ``other_label``.
    """
    frequent = series.value_counts().nlargest(top_k).index
    is_rare = ~series.isin(frequent)
    return series.mask(is_rare, other_label)


## 4. Feature engineering on text (same core block as ai_vs_human_AI_Human)
Creates readability, lexical diversity, burstiness, sentiment, punctuation ratio, etc.


In [None]:
# Module-level helpers shared by every extract_features call, built once
# instead of per row.

# VADER sentiment analyzer (relies on the "vader_lexicon" resource).
sia = SentimentIntensityAnalyzer()
# Any single character that is neither a word character nor whitespace.
punct_re = re.compile(r'[^\w\s]')
# Passive-voice heuristic: a form of "to be" followed by a word ending in
# "ed" (case-insensitive). Participles not ending in "ed" are not matched.
passive_re = re.compile(r'\b(?:is|are|was|were|be|been|being)\s+\w+ed\b', re.IGNORECASE)


def safe_readability(fn, text: str):
    """Apply a readability metric to ``text`` without ever raising.

    Args:
        fn: Callable taking the text and returning a value convertible to float.
        text: Text to score.

    Returns:
        The score as ``float``, or ``np.nan`` when either the call or the
        float conversion raises an ``Exception``.
    """
    try:
        score = fn(text)
        return float(score)
    except Exception:
        # Best-effort by design: a failed metric becomes a missing value.
        return np.nan


def extract_features(text: str) -> pd.Series:
    """Compute the text-derived feature vector for one document.

    Non-string input is treated as an empty document. Newlines are replaced
    by spaces and the result is stripped before any measurement.

    Returns:
        A ``pd.Series`` with, in order: word_count, character_count,
        sentence_count, lexical_diversity, avg_sentence_length,
        avg_word_length, punctuation_ratio, flesch_reading_ease,
        gunning_fog_index, grammar_errors, passive_voice_ratio,
        predictability_score, burstiness, sentiment_score.
    """
    body = text if isinstance(text, str) else ''
    body = body.replace('\n', ' ').strip()
    lowered = body.lower()

    # Sentences: split on runs of ., ! or ? and discard blank fragments.
    sentence_chunks = []
    for fragment in re.split(r'[.!?]+', body):
        fragment = fragment.strip()
        if fragment:
            sentence_chunks.append(fragment)
    # Floor at 1 so the per-sentence ratios below never divide by zero.
    n_sentences = max(len(sentence_chunks), 1)

    # Word-level statistics on the lower-cased text.
    tokens = re.findall(r'\b\w+\b', lowered)
    n_words = len(tokens)
    n_chars = len(body)

    if n_words:
        diversity = len(set(tokens)) / n_words
        mean_word_len = np.mean([len(tok) for tok in tokens])
        reading_ease = safe_readability(textstat.flesch_reading_ease, body)
        fog_index = safe_readability(textstat.gunning_fog, body)
        sentiment = sia.polarity_scores(body)['compound']
        # Share of repeated tokens, i.e. the complement of lexical diversity.
        predictability = 1 - diversity
    else:
        diversity = 0.0
        mean_word_len = 0.0
        reading_ease = 0.0
        fog_index = 0.0
        sentiment = 0.0
        predictability = 0.0

    words_per_sentence = n_words / n_sentences

    if n_chars:
        punct_share = len(punct_re.findall(body)) / n_chars
    else:
        punct_share = 0.0

    # Burstiness: coefficient of variation of whitespace-token counts per
    # sentence; the epsilon guards against a zero mean.
    per_sentence_lengths = [len(chunk.split()) for chunk in sentence_chunks]
    if not per_sentence_lengths:
        per_sentence_lengths = [0]
    burst = np.std(per_sentence_lengths) / (np.mean(per_sentence_lengths) + 1e-6)

    passive_share = len(passive_re.findall(body)) / n_sentences

    # Counts a small fixed list of non-standard phrases.
    informal_hits = re.findall(r"\b(?:ain't|could of|should of|would of)\b", lowered)

    features = {
        'word_count': n_words,
        'character_count': n_chars,
        'sentence_count': n_sentences,
        'lexical_diversity': diversity,
        'avg_sentence_length': words_per_sentence,
        'avg_word_length': mean_word_len,
        'punctuation_ratio': punct_share,
        'flesch_reading_ease': reading_ease,
        'gunning_fog_index': fog_index,
        'grammar_errors': len(informal_hits),
        'passive_voice_ratio': passive_share,
        'predictability_score': predictability,
        'burstiness': burst,
        'sentiment_score': sentiment,
    }
    return pd.Series(features)


## 5. Generate text feature matrix (with optional cache)
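`FEATURE_CACHE` stores only the text-derived features so recomputation can be skipped when the cleaned dataset length is unchanged. The cache is validated by row count only, so delete the cache file whenever the text content or sampling changes without changing the number of rows.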


In [None]:
# Register DataFrame/Series.progress_apply.
tqdm.pandas()

# Coerce to Path on every run: the constant starts out as a plain string and
# this cell may be re-executed after it was already converted.
FEATURE_CACHE = Path(FEATURE_CACHE)

text_features = None

# Try the cache first. It is accepted only when its row count equals the
# current dataset's row count.
cache_usable = USE_FEATURE_CACHE and FEATURE_CACHE.exists()
if cache_usable:
    cached = pd.read_csv(FEATURE_CACHE)
    if len(cached) != len(combined):
        print("Cache size mismatch, recalculating...")
    else:
        text_features = cached
        print(f"Loaded cached text features from {FEATURE_CACHE}")

# Cache miss (or caching disabled): compute features row by row.
if text_features is None:
    print("Building text features...")
    text_features = combined["text"].progress_apply(extract_features)
    if USE_FEATURE_CACHE:
        # Saving is best-effort; a failed write must not discard the features.
        try:
            text_features.to_csv(FEATURE_CACHE, index=False)
        except Exception as e:
            print(f"Could not save cache: {e}")
        else:
            print(f"Saved text feature cache to {FEATURE_CACHE}")

print("Text features shape:", text_features.shape)
display(text_features.head())


## 6. Add metadata signals and build final feature table
- Language, source, model id, dataset origin (capped to most common categories)
- Numeric metadata: text length and potential_noise flag
- Target column: `label`


In [None]:
# Cap the high-cardinality categoricals: (output column, input column,
# placeholder for missing values, number of categories to keep).
capping_plan = (
    ("language_norm", "language", "unknown", TOP_LANGS),
    ("source_norm", "source", "unknown", TOP_SOURCES),
    ("model_norm", "multi_label", "unknown_model", TOP_MODELS),
)
for out_col, in_col, missing_value, keep_n in capping_plan:
    combined[out_col] = cap_categories(combined[in_col].fillna(missing_value), keep_n)

# Dataset origin is not capped, only made non-null.
combined["dataset_norm"] = combined["dataset"].fillna("unknown")

# One-hot encode each normalised categorical under its own prefix.
lang_dummies, source_dummies, model_dummies, dataset_dummies = (
    pd.get_dummies(combined[column], prefix=tag)
    for column, tag in (
        ("language_norm", "lang"),
        ("source_norm", "src"),
        ("model_norm", "model"),
        ("dataset_norm", "ds"),
    )
)

meta_numeric = combined[["length", "potential_noise"]].copy()

# Reset every block to a 0..n-1 index so the column-wise concat aligns rows
# by position.
feature_blocks = [
    block.reset_index(drop=True)
    for block in (
        text_features,
        meta_numeric,
        lang_dummies,
        source_dummies,
        model_dummies,
        dataset_dummies,
    )
]
final_features = pd.concat(feature_blocks, axis=1)

# Raw text goes first and the target last; .values bypasses index alignment.
final_features.insert(0, "text", combined["text"].values)
final_features["label"] = combined["label"].astype(int).values

print("Final feature matrix:", final_features.shape)
display(final_features.head())


## 7. Persist cleaned raw data and features
Both CSVs keep the exact row order used for feature generation (safe to reload and align on index).


In [None]:
# Columns of the cleaned, merged dataset that are written to RAW_OUTPUT.
raw_export_cols = [
    "text",
    "label",
    "language",
    "source",
    "multi_label",
    "potential_noise",
    "length",
    "dataset",
]

# Both files are written without the index; rows stay in the order they have
# in `combined` / `final_features`.
combined[raw_export_cols].to_csv(RAW_OUTPUT, index=False)
final_features.to_csv(FEATURE_OUTPUT, index=False)

print(f"Saved cleaned raw text to {RAW_OUTPUT}")
print(f"Saved feature table to {FEATURE_OUTPUT}")
# The line break must be an escape sequence: a literal newline inside a
# single-quoted string is a SyntaxError (unterminated string literal).
print("Label ratio in features:\n", final_features["label"].value_counts(normalize=True))


## 8. Quick train/validation split example
Use the feature matrix directly with classical models (LR/XGB/RF, etc.).


In [None]:
from sklearn.model_selection import train_test_split

# Columns that must not reach the model:
# - "label" is the target.
# - "text" is the raw string column kept in the table for traceability; it
#   is not a numeric feature.
# - "model_*" one-hot columns are derived from multi_label, which for AI_Human
#   rows is set to "human" exactly when label != 1, so they reveal the target.
#   NOTE(review): MultiSocial's multi_label presumably names the generator as
#   well, which would make it equally leaky there - confirm.
excluded_columns = ["label", "text"] + [
    column for column in final_features.columns if str(column).startswith("model_")
]
X = final_features.drop(columns=excluded_columns)
y = final_features["label"]

# Stratify on the label so both partitions keep the overall class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
print("Train/val shapes:", X_train.shape, X_val.shape)
print("Class balance train:", y_train.value_counts(normalize=True))
