# Bot-or-Not Challenge - Bot Detection System (Enhanced v2)

Multi-layer bot detection pipeline:
1. **Basic Features**: User profile + posting behavior + content statistics
2. **Advanced Temporal Features**: Time DNA sequences + inter-arrival distribution fitting + session analysis
3. **Text Stylometry Features**: N-gram repetition + Jaccard similarity + Zipf deviation + compression ratio
4. **Deep NLP Features**: Sentence embeddings + LLM perplexity
5. **Cross-user Features**: Inter-user similarity + HDBSCAN clustering
6. **Threshold Optimization**: Custom scoring (+4 TP, -1 FN, -2 FP) with fine-grained search
7. **Stacking Ensemble**: XGBoost + LightGBM + CatBoost (Layer 1) -> LogisticRegression (Layer 2)

In [None]:
!pip install xgboost lightgbm catboost sentence-transformers transformers torch scikit-learn pandas numpy scipy hdbscan optuna -q

In [None]:
import json
import re
import os
import zlib
import warnings
from datetime import datetime
from collections import Counter
from functools import partial
from itertools import combinations

import pandas as pd
import numpy as np
from scipy import stats
from scipy.spatial.distance import cosine
from scipy.optimize import curve_fit

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             classification_report, confusion_matrix)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegressionCV
import hdbscan

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from sentence_transformers import SentenceTransformer
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Silence library warnings for the rest of the notebook, then report
# whether a CUDA device is visible to torch (and which one).
warnings.filterwarnings('ignore')
_gpu_ready = torch.cuda.is_available()
print(f"GPU available: {_gpu_ready}")
if _gpu_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Section 1: Data Loading

Place the dataset files in the Google Drive folder referenced by `DATA_DIR` below, mount the drive in Colab, then load them.

In [None]:
# Mount Google Drive and set DATA_DIR
# NOTE: the import below requires the `google.colab` package, so this cell
# is expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Folder (inside the mounted drive) that holds the dataset JSON files and the
# bot-label text files read by the loading cell below. The folder name
# contains spaces.
DATA_DIR = '/content/drive/MyDrive/bot or not'

In [None]:
def load_dataset(json_path, bots_path=None):
    """Read one dataset JSON file and, when present, its bot-label file.

    Args:
        json_path: Path to a JSON file with the keys ``posts``, ``users``,
            ``id``, ``lang`` and ``metadata``.
        bots_path: Optional path to a text file with one bot user id per
            line (blank lines are ignored).

    Returns:
        ``(posts_df, users_df, bot_ids, metadata)``. ``users_df`` gains an
        ``is_bot`` column: 1/0 when the label file exists, -1 (unknown)
        otherwise. ``bot_ids`` is the set of ids read from the label file
        (empty when there is none).
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        raw = json.load(handle)

    posts = pd.DataFrame(raw['posts'])
    users = pd.DataFrame(raw['users'])
    info = {key: raw[key] for key in ('id', 'lang', 'metadata')}

    # Labels only exist for the practice datasets.
    labels_available = bool(bots_path) and os.path.exists(bots_path)
    known_bots = set()
    if not labels_available:
        users['is_bot'] = -1  # Unknown
    else:
        with open(bots_path, 'r') as handle:
            stripped_lines = (line.strip() for line in handle)
            known_bots = {entry for entry in stripped_lines if entry}
        users['is_bot'] = users['id'].isin(known_bots).astype(int)

    summary = (f"Dataset {info['id']} ({info['lang']}): "
               f"{len(users)} users, {len(posts)} posts, {len(known_bots)} known bots")
    print(summary)
    return posts, users, known_bots, info


# Load every practice dataset whose JSON file exists under DATA_DIR.
# Datasets with a missing JSON file are skipped without any message.
datasets = {}
for ds_id in (30, 31, 32, 33):
    json_path = os.path.join(DATA_DIR, f'dataset.posts&users.{ds_id}.json')
    bots_path = os.path.join(DATA_DIR, f'dataset.bots.{ds_id}.txt')
    if not os.path.exists(json_path):
        continue
    posts_df, users_df, bot_ids, meta = load_dataset(json_path, bots_path)
    datasets[ds_id] = dict(posts=posts_df, users=users_df,
                           bot_ids=bot_ids, meta=meta)

# Summarise what was loaded, grouped by dataset language.
_ids_by_lang = {
    lang: [k for k, v in datasets.items() if v['meta']['lang'] == lang]
    for lang in ('en', 'fr')
}
print(f"\nLoaded {len(datasets)} datasets: {list(datasets.keys())}")
print(f"English datasets: {_ids_by_lang['en']}")
print(f"French datasets: {_ids_by_lang['fr']}")

## Section 2: Feature Engineering (Layer 1 - Basic Features)

Extract 35+ features from user profiles, posting behavior, and content statistics.

In [None]:
# Matches any single code point outside the Basic Multilingual Plane
# (U+10000 and above), used here as an emoji proxy.
EMOJI_PATTERN = re.compile(r'[\U00010000-\U0010ffff]', flags=re.UNICODE)

# ============================================================
# 2a. User Profile Features
# ============================================================
def _clean_text_field(value):
    """Return a profile field as a string, mapping missing values to ''.

    ``None``, NaN and other falsy values become the empty string so that a
    missing field is never measured as the literal text 'None' or 'nan'.
    """
    if value is None:
        return ''
    if isinstance(value, float) and value != value:  # NaN is the only value != itself
        return ''
    return str(value or '')


def extract_user_profile_features(user):
    """Build profile-level features for one user record.

    Args:
        user: Mapping-like record (dict or pandas Series) that supports
            ``user[key]`` and ``user.get(key, default)``. ``tweet_count``
            and ``z_score`` are required; ``username``, ``name``,
            ``description`` and ``location`` are optional and may be
            ``None``/NaN.

    Returns:
        dict mapping feature name to a numeric value.
    """
    feats = {}

    # Basic stats (from dataset)
    feats['tweet_count'] = user['tweet_count']
    feats['z_score'] = user['z_score']

    # Username features
    uname = _clean_text_field(user.get('username', ''))
    uname_len = max(len(uname), 1)  # avoid division by zero for empty usernames
    digit_count = sum(c.isdigit() for c in uname)
    feats['username_length'] = len(uname)
    feats['username_digit_ratio'] = digit_count / uname_len
    feats['username_underscore_count'] = uname.count('_')
    feats['username_upper_ratio'] = sum(c.isupper() for c in uname) / uname_len
    feats['username_has_numbers'] = int(digit_count > 0)

    # Name features
    name = _clean_text_field(user.get('name', ''))
    feats['name_length'] = len(name)
    feats['name_emoji_count'] = len(EMOJI_PATTERN.findall(name))
    feats['name_word_count'] = len(name.split())

    # Description features
    desc = _clean_text_field(user.get('description', ''))
    feats['has_description'] = int(bool(desc.strip()))
    feats['description_length'] = len(desc)
    feats['description_emoji_count'] = len(EMOJI_PATTERN.findall(desc))
    feats['description_word_count'] = len(desc.split()) if desc.strip() else 0
    feats['description_hashtag_count'] = desc.count('#')
    feats['description_url_count'] = len(re.findall(r'https?://\S+', desc))
    feats['description_pipe_count'] = desc.count('|')  # Bio separators like "Gamer | Streamer"

    # Location features
    loc = _clean_text_field(user.get('location', ''))
    feats['has_location'] = int(bool(loc.strip()))
    feats['location_length'] = len(loc)

    return feats


# ============================================================
# 2b. Posting Behavior Features
# ============================================================
def extract_behavioral_features(user_posts):
    """Compute posting-behaviour features from one user's posts.

    Args:
        user_posts: DataFrame of a single user's posts with a
            ``created_at`` column parseable by ``pd.to_datetime``.

    Returns:
        dict of timing features (intervals are in seconds, the time span in
        hours). With fewer than two posts every feature is 0.
    """
    n = len(user_posts)

    if n < 2:
        return {k: 0 for k in [
            'posting_frequency', 'time_span_hours',
            'avg_interval', 'std_interval', 'min_interval', 'max_interval',
            'cv_interval', 'median_interval',
            'interval_skewness', 'interval_kurtosis',
            'hour_entropy', 'night_post_ratio', 'morning_post_ratio',
            'evening_post_ratio',
            'burst_count_60s', 'burst_count_300s',
            'unique_hours', 'unique_days',
            'weekend_ratio', 'posts_per_day_std',
            'max_posts_in_hour', 'regularity_score'
        ]}

    feats = {}
    timestamps = pd.to_datetime(user_posts['created_at']).sort_values().reset_index(drop=True)

    # Time span
    time_span = (timestamps.max() - timestamps.min()).total_seconds() / 3600
    feats['time_span_hours'] = time_span
    feats['posting_frequency'] = n / max(time_span, 0.01)

    # Time intervals (in seconds)
    intervals = timestamps.diff().dropna().dt.total_seconds().values
    mean_interval = np.mean(intervals)
    std_interval = np.std(intervals)
    feats['avg_interval'] = mean_interval
    feats['std_interval'] = std_interval
    feats['min_interval'] = np.min(intervals)
    feats['max_interval'] = np.max(intervals)
    feats['median_interval'] = np.median(intervals)
    feats['cv_interval'] = std_interval / max(mean_interval, 0.01)

    # Distribution shape. Skewness/kurtosis are undefined for constant data
    # (perfectly regular posting), where SciPy can return NaN; report 0 in
    # that case, matching the "too few intervals" fallback.
    feats['interval_skewness'] = 0
    feats['interval_kurtosis'] = 0
    if len(intervals) >= 4 and np.ptp(intervals) > 0:
        skewness = float(stats.skew(intervals))
        kurt = float(stats.kurtosis(intervals))
        if np.isfinite(skewness):
            feats['interval_skewness'] = skewness
        if np.isfinite(kurt):
            feats['interval_kurtosis'] = kurt

    # Hour distribution entropy
    hours = timestamps.dt.hour
    hour_counts = hours.value_counts(normalize=True)
    feats['hour_entropy'] = float(stats.entropy(hour_counts))

    # Time-of-day ratios (hour ranges are inclusive on both ends)
    feats['night_post_ratio'] = hours.between(0, 5).mean()
    feats['morning_post_ratio'] = hours.between(6, 11).mean()
    feats['evening_post_ratio'] = hours.between(18, 23).mean()

    # Burst detection
    feats['burst_count_60s'] = int((intervals < 60).sum())
    feats['burst_count_300s'] = int((intervals < 300).sum())

    # Activity spread
    feats['unique_hours'] = hours.nunique()
    feats['unique_days'] = timestamps.dt.date.nunique()
    feats['weekend_ratio'] = timestamps.dt.dayofweek.isin([5, 6]).mean()

    # Posts per day variability
    posts_per_day = timestamps.dt.date.value_counts()
    feats['posts_per_day_std'] = posts_per_day.std() if len(posts_per_day) > 1 else 0

    # Max posts in any single hour
    hour_day = timestamps.dt.floor('h')
    feats['max_posts_in_hour'] = hour_day.value_counts().max()

    # Regularity score: how "clock-like" the posting is
    # Low std of intervals relative to mean = very regular
    feats['regularity_score'] = 1.0 / (1.0 + feats['cv_interval'])

    return feats


# ============================================================
# 2c. Content Features
# ============================================================
def extract_content_features(user_posts):
    """Compute content statistics over one user's post texts.

    Args:
        user_posts: DataFrame of a single user's posts with a ``text``
            column; missing texts are dropped.

    Returns:
        dict of per-user content features. When the user has no text every
        feature is 0.
    """
    texts = user_posts['text'].dropna().tolist()

    if not texts:
        return {k: 0 for k in [
            'avg_length_chars', 'avg_length_words', 'std_length_chars',
            'vocabulary_richness', 'hapax_ratio',
            'avg_hashtags', 'avg_urls', 'avg_mentions',
            'emoji_rate', 'avg_exclamation', 'avg_question',
            'avg_uppercase_ratio', 'avg_punctuation_ratio',
            'duplicate_ratio', 'near_duplicate_ratio',
            'avg_sentence_count', 'avg_word_length',
            'link_tweet_ratio', 'retweet_ratio'
        ]}

    feats = {}

    # Length features
    char_lens = [len(t) for t in texts]
    word_lens = [len(t.split()) for t in texts]
    feats['avg_length_chars'] = np.mean(char_lens)
    feats['avg_length_words'] = np.mean(word_lens)
    feats['std_length_chars'] = np.std(char_lens)

    # Vocabulary richness
    all_words = ' '.join(texts).lower().split()
    unique_words = set(all_words)
    feats['vocabulary_richness'] = len(unique_words) / max(len(all_words), 1)
    # Hapax legomena ratio (words appearing only once)
    word_counts = Counter(all_words)
    feats['hapax_ratio'] = sum(1 for c in word_counts.values() if c == 1) / max(len(unique_words), 1)

    # Entity counts
    feats['avg_hashtags'] = np.mean([t.count('#') for t in texts])
    feats['avg_urls'] = np.mean([len(re.findall(r'https?://\S+|t\.co/\S+', t)) for t in texts])
    feats['avg_mentions'] = np.mean([t.count('@') for t in texts])
    feats['emoji_rate'] = np.mean([len(EMOJI_PATTERN.findall(t)) for t in texts])

    # Punctuation
    feats['avg_exclamation'] = np.mean([t.count('!') for t in texts])
    feats['avg_question'] = np.mean([t.count('?') for t in texts])
    feats['avg_uppercase_ratio'] = np.mean([
        sum(c.isupper() for c in t) / max(len(t), 1) for t in texts
    ])
    feats['avg_punctuation_ratio'] = np.mean([
        sum(not c.isalnum() and not c.isspace() for c in t) / max(len(t), 1)
        for t in texts
    ])

    # Duplicate / near-duplicate analysis
    unique_texts = set(texts)
    feats['duplicate_ratio'] = 1 - len(unique_texts) / max(len(texts), 1)

    # Near duplicates: word-set Jaccard > 0.8 over all pairs of the first 50
    # texts. Token sets are built once per text rather than once per pair.
    token_sets = [set(t.lower().split()) for t in texts[:50]]
    near_dup = 0
    total_pairs = 0
    for wi, wj in combinations(token_sets, 2):
        total_pairs += 1  # pairs with an empty side still count in the denominator
        if wi and wj and len(wi & wj) / len(wi | wj) > 0.8:
            near_dup += 1
    feats['near_duplicate_ratio'] = near_dup / max(total_pairs, 1)

    # Sentence & word complexity. Only non-blank segments are counted so a
    # trailing terminator ("Hello.") does not add a phantom empty sentence.
    feats['avg_sentence_count'] = np.mean([
        sum(1 for part in re.split(r'[.!?]+', t) if part.strip()) for t in texts
    ])
    feats['avg_word_length'] = np.mean([len(w) for w in all_words]) if all_words else 0

    # Link tweets ratio
    feats['link_tweet_ratio'] = np.mean([
        1 if re.search(r'https?://|t\.co/', t) else 0 for t in texts
    ])

    # Retweet-like pattern ratio
    feats['retweet_ratio'] = np.mean([
        1 if t.strip().startswith(('RT ', 'rt ')) else 0
        for t in texts
    ])

    return feats


# ============================================================
# Combine all basic features for one dataset
# ============================================================
def extract_all_basic_features(posts_df, users_df):
    """Extract all Layer-1 features for every user in a dataset.

    Args:
        posts_df: DataFrame of posts with an ``author_id`` column.
        users_df: DataFrame of users with an ``id`` column.

    Returns:
        DataFrame with one row per user: ``user_id`` plus the profile,
        behavioral and content features.
    """
    # Split the posts by author once instead of scanning the whole post
    # table for every user. Row order within each group is preserved.
    posts_by_author = {
        author: group
        for author, group in posts_df.groupby('author_id', sort=False)
    }
    no_posts = posts_df.iloc[0:0]  # empty frame with the same columns

    all_feats = []
    for _, user in users_df.iterrows():
        uid = user['id']
        user_posts = posts_by_author.get(uid, no_posts)

        f = {'user_id': uid}
        f.update(extract_user_profile_features(user))
        f.update(extract_behavioral_features(user_posts))
        f.update(extract_content_features(user_posts))
        all_feats.append(f)

    return pd.DataFrame(all_feats)

print("Feature extraction functions defined.")

In [None]:
# Build the basic feature table for every loaded dataset and attach the
# `is_bot` label column from the users table.
basic_features = {}
for ds_id, ds in datasets.items():
    print(f"\nExtracting basic features for dataset {ds_id}...")
    labels = ds['users'][['id', 'is_bot']]
    feats = (
        extract_all_basic_features(ds['posts'], ds['users'])
        .merge(labels, left_on='user_id', right_on='id', how='left')
        .drop(columns=['id'])  # the join key duplicates user_id
    )
    basic_features[ds_id] = feats
    print(f"  Shape: {feats.shape}, Bots: {(feats['is_bot']==1).sum()}, Humans: {(feats['is_bot']==0).sum()}")

print("\nBasic feature extraction complete!")
basic_features[30].head()

## Section 2b: Advanced Temporal Features (NEW)

Inspired by DARPA Bot Challenge winners and MulBot paper:
- **Time DNA Sequences**: Encode posting times as character sequences, compute self-similarity
- **Inter-arrival Distribution Fitting**: Fit exponential distribution, measure goodness-of-fit (bots are more regular)
- **Session Analysis**: Define activity sessions, compute session-level statistics

In [None]:
# ============================================================
# 2d. Advanced Temporal Features
# ============================================================

def encode_time_dna(timestamps, resolution='hour'):
    """
    Encode posting timestamps as a 'DNA' string sequence (Cresci et al.).

    Each timestamp becomes one character identifying its time bucket, so
    repeated posting patterns show up as repeated substrings.

    Args:
        timestamps: Iterable of objects exposing ``.hour`` and ``.minute``.
        resolution: ``'hour'`` (one symbol per hour of day) or
            ``'minute_bucket'`` (one symbol per 10-minute bucket of the day).

    Returns:
        The encoded string, or ``''`` for an unrecognised resolution.
    """
    if resolution == 'hour':
        return ''.join(chr(ord('A') + t.hour) for t in timestamps)
    if resolution == 'minute_bucket':
        # 6 buckets per hour -> 144 symbols per day. The symbols run past 'Z';
        # only symbol equality matters for the k-mer statistics.
        return ''.join(chr(ord('A') + (t.hour * 6 + t.minute // 10)) for t in timestamps)
    return ''


def dna_self_similarity(dna_seq, k=3):
    """Compute self-similarity of a DNA sequence using k-mer overlap.

    Returns 1 minus the fraction of distinct k-mers among all k-mers, or
    0.0 when the sequence is shorter than ``k + 1`` characters.
    """
    if len(dna_seq) < k + 1:
        return 0.0
    kmers = [dna_seq[i:i+k] for i in range(len(dna_seq) - k + 1)]
    unique_kmers = set(kmers)
    return 1.0 - (len(unique_kmers) / max(len(kmers), 1))


def _iat_distribution_features(intervals):
    """Distribution statistics of strictly positive inter-arrival times (seconds)."""
    feats = {
        'iat_exponential_ks_stat': 0, 'iat_exponential_ks_pvalue': 1.0,
        'iat_gini_coefficient': 0,
        'iat_benford_deviation': 0,
    }
    if len(intervals) < 5:
        return feats

    # KS test against an exponential distribution fitted with loc fixed at 0.
    # Best effort: a failed fit keeps the neutral defaults.
    try:
        _, scale = stats.expon.fit(intervals, floc=0)
        ks_stat, ks_pvalue = stats.kstest(intervals, 'expon', args=(0, scale))
        feats['iat_exponential_ks_stat'] = float(ks_stat)
        feats['iat_exponential_ks_pvalue'] = float(ks_pvalue)
    except Exception:
        feats['iat_exponential_ks_stat'] = 0
        feats['iat_exponential_ks_pvalue'] = 1.0

    # Gini coefficient of intervals (measures inequality; bots tend to be more equal = lower Gini)
    sorted_intervals = np.sort(intervals)
    n_int = len(sorted_intervals)
    total = np.sum(sorted_intervals)
    if total > 0:
        index = np.arange(1, n_int + 1)
        feats['iat_gini_coefficient'] = float(
            (2 * np.sum(index * sorted_intervals) / (n_int * total)) - (n_int + 1) / n_int
        )

    # Benford's law deviation on the leading digit of intervals >= 1 second
    leading_digits = [int(str(abs(int(x)))[0]) for x in intervals if x >= 1]
    if leading_digits:
        digit_counts = Counter(leading_digits)
        n_digits = len(leading_digits)
        feats['iat_benford_deviation'] = float(sum(
            abs(digit_counts.get(d, 0) / n_digits - np.log10(1 + 1/d))
            for d in range(1, 10)
        ))
    return feats


def _session_features(intervals, n_posts, gap_seconds=1800):
    """Session statistics from the raw gaps between consecutive posts.

    ``intervals[i]`` must be the gap between post ``i`` and post ``i + 1``
    (zero gaps included), so that break indices map onto post positions.
    """
    session_breaks = np.where(intervals > gap_seconds)[0]
    session_starts = np.concatenate([[0], session_breaks + 1])
    session_ends = np.concatenate([session_breaks + 1, [n_posts]])
    session_lengths = session_ends - session_starts  # posts per session

    feats = {
        'session_count': len(session_breaks) + 1,
        'avg_session_length': float(np.mean(session_lengths)),
        'max_session_length': float(np.max(session_lengths)),
        'avg_inter_session_gap': 0,
        'session_regularity': 0,
    }
    if len(session_breaks) > 0:
        inter_session_gaps = intervals[session_breaks]
        feats['avg_inter_session_gap'] = float(np.mean(inter_session_gaps))
        feats['session_regularity'] = float(
            np.std(inter_session_gaps) / max(np.mean(inter_session_gaps), 0.01)
        )
    return feats


def _longest_active_streak(timestamps):
    """Longest run of consecutive clock hours that each contain a post.

    ``timestamps`` must be a sorted datetime Series.
    """
    active_hours = timestamps.dt.floor('h').drop_duplicates()
    # Series arithmetic keeps pandas Timedelta semantics, whatever array
    # type Series.unique() would return for this pandas version.
    gaps_hours = active_hours.diff().dropna().dt.total_seconds().to_numpy() / 3600
    longest_streak = 1
    current_streak = 1
    for gap in gaps_hours:
        if gap <= 1.0:
            current_streak += 1
            longest_streak = max(longest_streak, current_streak)
        else:
            current_streak = 1
    return longest_streak


def extract_advanced_temporal_features(user_posts):
    """Extract advanced temporal features per user.

    Args:
        user_posts: DataFrame of a single user's posts with a
            ``created_at`` column parseable by ``pd.to_datetime``.

    Returns:
        dict of time-DNA, inter-arrival-time, session, acceleration, streak
        and entropy features. Neutral defaults are returned when the user
        has fewer than 3 posts or the timestamps cannot be parsed.
    """
    n = len(user_posts)

    feats = {
        'time_dna_self_sim_hour': 0, 'time_dna_self_sim_minute': 0,
        'time_dna_unique_3gram_ratio': 0,
        'iat_exponential_ks_stat': 0, 'iat_exponential_ks_pvalue': 1.0,
        'iat_gini_coefficient': 0,
        'iat_benford_deviation': 0,
        'session_count': 0, 'avg_session_length': 0, 'max_session_length': 0,
        'avg_inter_session_gap': 0, 'session_regularity': 0,
        'posting_acceleration': 0,
        'longest_active_streak_hours': 0,
        'minute_entropy': 0,
        'day_of_week_entropy': 0,
    }

    if n < 3:
        return feats

    try:
        timestamps = pd.to_datetime(user_posts['created_at']).sort_values().reset_index(drop=True)
    except Exception:
        # Best effort: unparseable or missing timestamps yield the defaults.
        return feats

    # --- Time DNA Sequences ---
    dna_hour = encode_time_dna(timestamps, 'hour')
    dna_minute = encode_time_dna(timestamps, 'minute_bucket')
    feats['time_dna_self_sim_hour'] = dna_self_similarity(dna_hour, k=3)
    feats['time_dna_self_sim_minute'] = dna_self_similarity(dna_minute, k=3)

    # 3-gram uniqueness ratio for time DNA
    if len(dna_hour) >= 4:
        kmers = [dna_hour[i:i+3] for i in range(len(dna_hour) - 2)]
        feats['time_dna_unique_3gram_ratio'] = len(set(kmers)) / max(len(kmers), 1)

    # --- Inter-arrival times ---
    # Raw gaps stay aligned with post positions (needed for sessions); the
    # distribution fits use only strictly positive gaps.
    raw_intervals = timestamps.diff().dropna().dt.total_seconds().to_numpy()
    positive_intervals = raw_intervals[raw_intervals > 0]

    # --- Inter-arrival Time Distribution Fitting ---
    feats.update(_iat_distribution_features(positive_intervals))

    # --- Session Analysis (30-minute gap defines a new session) ---
    feats.update(_session_features(raw_intervals, len(timestamps), gap_seconds=1800))

    # --- Posting Acceleration ---
    # Compare posting frequency in first half vs second half of time span
    mid_idx = len(timestamps) // 2
    if 0 < mid_idx < len(timestamps) - 1:
        first_half_span = (timestamps.iloc[mid_idx] - timestamps.iloc[0]).total_seconds()
        second_half_span = (timestamps.iloc[-1] - timestamps.iloc[mid_idx]).total_seconds()
        freq_first = mid_idx / max(first_half_span, 1)
        freq_second = (len(timestamps) - mid_idx) / max(second_half_span, 1)
        feats['posting_acceleration'] = freq_second - freq_first

    # --- Longest Active Streak ---
    # How many hours was the user continuously active (at least 1 post per hour)
    feats['longest_active_streak_hours'] = _longest_active_streak(timestamps)

    # --- Minute-level entropy ---
    minute_counts = timestamps.dt.minute.value_counts(normalize=True)
    feats['minute_entropy'] = float(stats.entropy(minute_counts))

    # --- Day of week entropy ---
    dow_counts = timestamps.dt.dayofweek.value_counts(normalize=True)
    feats['day_of_week_entropy'] = float(stats.entropy(dow_counts))

    return feats


# Apply advanced temporal features to all datasets
advanced_temporal_features = {}
for ds_id, ds in datasets.items():
    print(f"\nExtracting advanced temporal features for dataset {ds_id}...")
    # Split the posts by author once instead of scanning the whole post
    # table for every user. Row order within each group is preserved.
    posts_by_author = {
        author: group
        for author, group in ds['posts'].groupby('author_id', sort=False)
    }
    no_posts = ds['posts'].iloc[0:0]  # empty frame with the same columns
    results = []
    for _, user in ds['users'].iterrows():
        uid = user['id']
        user_posts = posts_by_author.get(uid, no_posts)
        f = {'user_id': uid}
        f.update(extract_advanced_temporal_features(user_posts))
        results.append(f)
    advanced_temporal_features[ds_id] = pd.DataFrame(results)
    print(f"  Shape: {advanced_temporal_features[ds_id].shape}")

print("\nAdvanced temporal feature extraction complete!")

## Section 2c: Text Stylometry Features (NEW)

Inspired by TwiBot-22 baselines (Lee's compression ratio, Kantepe's text entropy):
- **N-gram Repetition**: 2-gram and 3-gram repetition rates across tweets
- **Pairwise Jaccard Similarity**: Average Jaccard between tweet pairs (bot tweets are template-like)
- **Zipf's Law Deviation**: Human text follows Zipf's law; bots may deviate
- **Compression Ratio**: How compressible the user's combined text is (bots = more compressible)
- **Text Entropy**: Shannon entropy of character distribution
- **Punctuation Pattern Features**: Bot punctuation usage is often more regular

In [None]:
# ============================================================
# 2e. Text Stylometry Features
# ============================================================

def compute_ngram_repetition(texts, n=2):
    """Share of word n-gram occurrences whose n-gram appears more than once.

    N-grams are built per text from lower-cased, whitespace-split tokens
    and pooled over all texts. Returns 0.0 when no n-gram can be formed.
    """
    tally = Counter()
    for text in texts:
        tokens = text.lower().split()
        tally.update(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

    total = sum(tally.values())
    if total == 0:
        return 0.0

    # Every occurrence of a repeated n-gram counts, not just the extras.
    recurring = sum(count for count in tally.values() if count > 1)
    return recurring / total


def compute_zipf_deviation(texts):
    """
    Measure how far the word-frequency profile is from an ideal Zipf curve.

    Word counts are ranked from most to least frequent and scaled so the
    top word has frequency 1; the ideal curve is 1/rank. The result is the
    mean squared error over the 50 highest ranks (fewer if the vocabulary
    is smaller). Returns 0.0 when there are fewer than 10 words in total.
    """
    tokens = ' '.join(texts).lower().split()
    if len(tokens) < 10:
        return 0.0

    ranked_counts = np.array(
        [count for _, count in Counter(tokens).most_common()], dtype=float
    )
    # Keep only the head of the distribution to avoid noise from rare words.
    observed = ranked_counts[:50] / ranked_counts[0]
    ideal = 1.0 / np.arange(1, len(observed) + 1)
    return float(np.mean((observed - ideal) ** 2))


def compute_text_entropy(text):
    """Shannon entropy (natural log, via ``scipy.stats.entropy``) of the
    lower-cased character distribution of *text*; 0.0 for empty input."""
    if not text:
        return 0.0
    lowered = text.lower()
    size = len(lowered)
    distribution = [count / size for count in Counter(lowered).values()]
    return float(stats.entropy(distribution))


def compute_compression_ratio(texts):
    """
    Ratio of raw to zlib-compressed size of the space-joined texts
    (Lee et al.).

    A higher ratio means the text compresses better, i.e. is more
    repetitive. Inputs shorter than 10 UTF-8 bytes return 1.0.
    """
    raw = ' '.join(texts).encode('utf-8')
    raw_size = len(raw)
    if raw_size < 10:
        return 1.0
    packed_size = len(zlib.compress(raw))
    return raw_size / max(packed_size, 1)


def extract_stylometry_features(user_posts):
    """Compute text-stylometry features from one user's post texts.

    Args:
        user_posts: DataFrame of a single user's posts with a ``text``
            column; missing texts are dropped.

    Returns:
        dict of stylometry features. With fewer than 3 texts the neutral
        defaults are returned (all 0, ``compression_ratio`` 1.0).
    """
    texts = user_posts['text'].dropna().tolist()

    feats = {
        'ngram2_repetition': 0, 'ngram3_repetition': 0,
        'pairwise_jaccard_mean': 0, 'pairwise_jaccard_std': 0,
        'pairwise_jaccard_max': 0,
        'zipf_deviation': 0,
        'compression_ratio': 1.0,
        'text_char_entropy': 0,
        'punctuation_pattern_std': 0,
        'sentence_length_cv': 0,
        'avg_word_length_std': 0,
        'unique_first_words_ratio': 0,
        'url_pattern_regularity': 0,
        'mention_diversity': 0,
        'hashtag_diversity': 0,
    }
    if len(texts) < 3:
        return feats

    # Word n-gram repetition across all posts
    for size in (2, 3):
        feats[f'ngram{size}_repetition'] = compute_ngram_repetition(texts, n=size)

    # Word-set Jaccard between each of the first 50 posts and the (up to)
    # 9 posts that follow it; pairs where both sides are empty are skipped.
    bags = [set(t.lower().split()) for t in texts[:50]]
    scores = []
    for pos, left in enumerate(bags):
        for right in bags[pos + 1:pos + 10]:
            if left or right:
                scores.append(len(left & right) / max(len(left | right), 1))
    if scores:
        feats['pairwise_jaccard_mean'] = float(np.mean(scores))
        feats['pairwise_jaccard_std'] = float(np.std(scores))
        feats['pairwise_jaccard_max'] = float(np.max(scores))

    # Whole-corpus measures
    feats['zipf_deviation'] = compute_zipf_deviation(texts)
    feats['compression_ratio'] = compute_compression_ratio(texts)
    feats['text_char_entropy'] = compute_text_entropy(' '.join(texts))

    # Spread of the per-post punctuation count (low spread = regular usage)
    punct_per_post = [
        sum(1 for ch in t if not (ch.isalnum() or ch.isspace())) for t in texts
    ]
    feats['punctuation_pattern_std'] = float(np.std(punct_per_post)) if punct_per_post else 0

    # Coefficient of variation of post length in words
    tokenized = [t.split() for t in texts]
    words_per_post = [len(tokens) for tokens in tokenized]
    feats['sentence_length_cv'] = float(
        np.std(words_per_post) / max(np.mean(words_per_post), 0.01)
    )

    # Spread of the mean word length across posts
    mean_word_len_per_post = [
        np.mean([len(w) for w in tokens]) if tokens else 0 for tokens in tokenized
    ]
    feats['avg_word_length_std'] = float(np.std(mean_word_len_per_post))

    # Share of distinct opening words (template detection)
    openers = [tokens[0].lower() if tokens else '' for tokens in tokenized]
    feats['unique_first_words_ratio'] = len(set(openers)) / max(len(openers), 1)

    # URL reuse and mention / hashtag diversity
    urls = [u for t in texts for u in re.findall(r'https?://\S+|t\.co/\S+', t)]
    if urls:
        feats['url_pattern_regularity'] = 1 - (len(set(urls)) / max(len(urls), 1))

    mentions = [m for t in texts for m in re.findall(r'@\w+', t)]
    if mentions:
        feats['mention_diversity'] = len(set(mentions)) / max(len(mentions), 1)

    hashtags = [h for t in texts for h in re.findall(r'#\w+', t)]
    if hashtags:
        feats['hashtag_diversity'] = len(set(hashtags)) / max(len(hashtags), 1)

    return feats


# Apply stylometry features to all datasets
stylometry_features = {}
for ds_id, ds in datasets.items():
    print(f"\nExtracting stylometry features for dataset {ds_id}...")
    # Split the posts by author once instead of scanning the whole post
    # table for every user. Row order within each group is preserved.
    posts_by_author = {
        author: group
        for author, group in ds['posts'].groupby('author_id', sort=False)
    }
    no_posts = ds['posts'].iloc[0:0]  # empty frame with the same columns
    results = []
    for _, user in ds['users'].iterrows():
        uid = user['id']
        user_posts = posts_by_author.get(uid, no_posts)
        f = {'user_id': uid}
        f.update(extract_stylometry_features(user_posts))
        results.append(f)
    stylometry_features[ds_id] = pd.DataFrame(results)
    print(f"  Shape: {stylometry_features[ds_id].shape}")

print("\nStylometry feature extraction complete!")

## Section 3: Deep NLP Features (Layer 2)

- **Sentence Transformer**: Compute per-user tweet embedding similarity (bot tweets may be too uniform or too random)
- **GPT-2 Perplexity**: AI-generated text tends to have lower, more consistent perplexity

In [None]:
# ============================================================
# 3a. Sentence Transformer Embedding Features
# ============================================================
# Using multilingual model so it works for both English and French
EMBED_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'

def extract_embedding_features(posts_df, users_df, model=None):
    """Compute per-user tweet embedding similarity features.

    Args:
        posts_df: DataFrame of posts with ``author_id`` and ``text`` columns.
        users_df: DataFrame of users with an ``id`` column.
        model: Object with an ``encode(texts, show_progress_bar=...,
            batch_size=...)`` method; when ``None`` a ``SentenceTransformer``
            for ``EMBED_MODEL_NAME`` is created.

    Returns:
        ``(features_df, mean_embeddings)``: one row per user with statistics
        of the pairwise cosine similarities between that user's (first 50)
        post embeddings, and a dict mapping user id to the mean embedding.
        Users with fewer than 2 texts get all-zero features and no entry in
        ``mean_embeddings``.
    """
    if model is None:
        model = SentenceTransformer(EMBED_MODEL_NAME)

    # Collect each author's non-null texts once instead of scanning the
    # whole post table for every user. Row order within a group is preserved.
    texts_by_author = {
        author: column.dropna().tolist()
        for author, column in posts_df['text'].groupby(posts_df['author_id'], sort=False)
    }

    results = []
    mean_embeddings = {}  # Store for cross-user analysis later

    for _, user in users_df.iterrows():
        uid = user['id']
        texts = texts_by_author.get(uid, [])

        feats = {'user_id': uid}

        if len(texts) < 2:
            # No pair of posts to compare
            feats.update({
                'emb_avg_sim': 0, 'emb_std_sim': 0,
                'emb_min_sim': 0, 'emb_max_sim': 0,
                'emb_median_sim': 0
            })
            results.append(feats)
            continue

        # Sample up to 50 tweets for performance
        sample = texts[:50]
        embeddings = model.encode(sample, show_progress_bar=False, batch_size=32)

        # Store mean embedding for cross-user analysis
        mean_embeddings[uid] = np.mean(embeddings, axis=0)

        # Pairwise cosine similarities (upper triangle, diagonal excluded)
        sim_matrix = cosine_similarity(embeddings)
        upper_tri = sim_matrix[np.triu_indices_from(sim_matrix, k=1)]

        feats['emb_avg_sim'] = float(np.mean(upper_tri))
        feats['emb_std_sim'] = float(np.std(upper_tri))
        feats['emb_min_sim'] = float(np.min(upper_tri))
        feats['emb_max_sim'] = float(np.max(upper_tri))
        feats['emb_median_sim'] = float(np.median(upper_tri))

        results.append(feats)

    return pd.DataFrame(results), mean_embeddings


# ============================================================
# 3b. GPT-2 Perplexity Features
# ============================================================
def load_perplexity_model(model_name='gpt2'):
    """Load a GPT-2 language model and tokenizer for perplexity scoring.

    The model is moved to ``'cuda'`` when torch reports a GPU, otherwise to
    ``'cpu'``, and switched to evaluation mode.

    Returns:
        ``(model, tokenizer, device)`` where ``device`` is the device string.
    """
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    lm = GPT2LMHeadModel.from_pretrained(model_name).to(device)
    lm.eval()
    return lm, tokenizer, device


def compute_perplexity(text, model, tokenizer, device, max_length=512):
    """Compute the perplexity of a single text with a causal language model.

    The text is tokenized (truncated to ``max_length`` tokens), fed to the
    model with its own input ids as labels, and the exponential of the
    returned loss is reported.

    Args:
        text: the string to score.
        model: callable language model returning an object with a ``loss``
            attribute when called with ``labels``.
        tokenizer: callable producing a mapping with ``'input_ids'`` that
            supports ``.to(device)``.
        device: torch device identifier the inputs are moved to.
        max_length: truncation length in tokens.

    Returns:
        The perplexity as a Python float, or ``None`` when the text yields
        fewer than two tokens or scoring fails.
    """
    try:
        inputs = tokenizer(text, return_tensors='pt', truncation=True,
                           max_length=max_length).to(device)
        # A single token has no next-token target, so there is nothing to score.
        if inputs['input_ids'].shape[1] < 2:
            return None
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
        return float(torch.exp(outputs.loss))
    except Exception:
        # Best-effort: one unscorable tweet must not abort the whole feature
        # pass. Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still stop a long-running notebook cell.
        return None


def extract_perplexity_features(posts_df, users_df, ppl_model, ppl_tokenizer, ppl_device):
    """Summarise per-user perplexity over each user's first 20 non-null tweets.

    Args:
        posts_df: posts table with ``author_id`` and ``text`` columns.
        users_df: users table with an ``id`` column.
        ppl_model, ppl_tokenizer, ppl_device: forwarded unchanged to
            ``compute_perplexity``.

    Returns:
        DataFrame with one row per user: ``user_id`` plus ``ppl_mean``,
        ``ppl_std``, ``ppl_min``, ``ppl_max``, ``ppl_median``, ``ppl_skew``
        and ``ppl_low_ratio``. All statistics are 0 when the user has no
        text or no tweet produced a usable perplexity.
    """
    stat_keys = ('ppl_mean', 'ppl_std', 'ppl_min', 'ppl_max',
                 'ppl_median', 'ppl_skew', 'ppl_low_ratio')
    rows = []

    for idx, (_, user_row) in enumerate(users_df.iterrows()):
        author = user_row['id']
        row = {'user_id': author}
        tweet_texts = posts_df.loc[posts_df['author_id'] == author, 'text'].dropna().tolist()

        if not tweet_texts:
            row.update(dict.fromkeys(stat_keys, 0))
            rows.append(row)
            # Users without text skip the progress message below as well.
            continue

        # Only the first 20 tweets are scored, for performance.
        scores = []
        for tweet in tweet_texts[:20]:
            ppl = compute_perplexity(tweet, ppl_model, ppl_tokenizer, ppl_device)
            # Drop failed computations and extreme outliers.
            if ppl is not None and ppl < 10000:
                scores.append(ppl)

        if not scores:
            row.update(dict.fromkeys(stat_keys, 0))
        else:
            row['ppl_mean'] = np.mean(scores)
            row['ppl_std'] = np.std(scores)
            row['ppl_min'] = np.min(scores)
            row['ppl_max'] = np.max(scores)
            row['ppl_median'] = np.median(scores)
            # Skewness is only computed with at least three scores.
            row['ppl_skew'] = float(stats.skew(scores)) if len(scores) >= 3 else 0
            # Share of tweets with perplexity below 50 (very "fluent" text).
            row['ppl_low_ratio'] = np.mean([1 if p < 50 else 0 for p in scores])

        rows.append(row)

        if (idx + 1) % 50 == 0:
            print(f"  Perplexity: {idx+1}/{len(users_df)} users processed")

    return pd.DataFrame(rows)

# Progress marker printed once this cell has finished defining the NLP helpers.
print("NLP feature functions defined.")

In [None]:
# Load both NLP models once and reuse them for every dataset below.
print("Loading Sentence Transformer...")
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
print("Loading GPT-2 for perplexity...")
ppl_model, ppl_tokenizer, ppl_device = load_perplexity_model('gpt2')
print(f"Models loaded. Perplexity device: {ppl_device}")

# Extract NLP features for all datasets.
# nlp_features:        ds_id -> per-user DataFrame (embedding + perplexity columns)
# all_mean_embeddings: ds_id -> {user_id: mean tweet embedding}, consumed by
#                      the cross-user and clustering features in the next section
nlp_features = {}
all_mean_embeddings = {}

for ds_id, ds in datasets.items():
    print(f"\n{'='*50}")
    print(f"Dataset {ds_id} ({ds['meta']['lang']})")
    print(f"{'='*50}")

    # Embedding features (also returns each user's mean embedding)
    print("Computing sentence embeddings...")
    emb_feats, mean_embs = extract_embedding_features(
        ds['posts'], ds['users'], model=embed_model)

    # Perplexity features
    # NOTE(review): the same 'gpt2' checkpoint scores every dataset regardless
    # of ds['meta']['lang'] -- confirm this is intended for non-English data.
    print("Computing perplexity scores...")
    ppl_feats = extract_perplexity_features(
        ds['posts'], ds['users'], ppl_model, ppl_tokenizer, ppl_device)

    # Outer merge keeps a user that appears in only one of the two tables;
    # the missing side is left as NaN here.
    nlp_feat = emb_feats.merge(ppl_feats, on='user_id', how='outer')
    nlp_features[ds_id] = nlp_feat
    all_mean_embeddings[ds_id] = mean_embs

    print(f"  NLP features shape: {nlp_feat.shape}")

print("\nNLP feature extraction complete!")

## Section 4: Cross-User Features (Layer 3) + HDBSCAN Clustering + Combine All

Enhanced with:
- Original cross-user embedding similarity
- **HDBSCAN clustering** on embeddings (users in same cluster as bots are more suspicious)
- **Temporal pattern clustering** on hour distributions
- Combine ALL feature layers into final feature matrices

In [None]:
# ============================================================
# Layer 3: Cross-user embedding similarity features
# ============================================================
def extract_cross_user_features(mean_embeddings):
    """Describe how close each user's mean embedding is to every other user's.

    Args:
        mean_embeddings: mapping ``user_id -> mean embedding vector``.

    Returns:
        DataFrame with ``user_id`` and the mean / max / min / std of the
        cosine similarity between that user and all *other* users. With
        fewer than two users an empty frame with the same columns is returned.
    """
    output_columns = ['user_id', 'cross_avg_sim', 'cross_max_sim',
                      'cross_min_sim', 'cross_std_sim']
    uids = list(mean_embeddings.keys())
    if len(uids) < 2:
        return pd.DataFrame(columns=output_columns)

    stacked = np.array([mean_embeddings[u] for u in uids])
    pairwise = cosine_similarity(stacked)

    def _summarise(position, uid):
        # Remove the diagonal entry so self-similarity does not skew the stats.
        others = np.delete(pairwise[position], position)
        return {
            'user_id': uid,
            'cross_avg_sim': float(np.mean(others)),
            'cross_max_sim': float(np.max(others)),
            'cross_min_sim': float(np.min(others)),
            'cross_std_sim': float(np.std(others)),
        }

    return pd.DataFrame([_summarise(i, u) for i, u in enumerate(uids)])


# ============================================================
# Layer 3b: HDBSCAN Clustering Features (NEW)
# ============================================================
def _write_cluster_features(results, ids, prefix, labels=None, probabilities=None):
    """Store the four ``<prefix>_cluster_*`` / ``<prefix>_is_noise`` values per id.

    When ``labels`` is None every id receives the noise defaults
    (cluster -1, probability 0.0, size 0, is_noise 1).
    """
    for pos, uid in enumerate(ids):
        # setdefault: an id that was clustered but is not listed in users_df
        # must not raise KeyError; it is simply left out of the final rows.
        entry = results.setdefault(uid, {})
        if labels is None:
            cluster_id, prob, size = -1, 0.0, 0
        else:
            cluster_id = int(labels[pos])
            prob = float(probabilities[pos])
            # Noise points (label -1) have no cluster, hence size 0.
            size = int((labels == labels[pos]).sum()) if cluster_id >= 0 else 0
        entry[f'{prefix}_cluster_id'] = cluster_id
        entry[f'{prefix}_cluster_prob'] = prob
        entry[f'{prefix}_cluster_size'] = size
        entry[f'{prefix}_is_noise'] = int(cluster_id == -1)


def _hourly_posting_profile(user_posts):
    """Return the normalised 24-bin histogram of posting hours, or zeros on failure."""
    try:
        hours = pd.to_datetime(user_posts['created_at']).dt.hour
        counts = np.bincount(hours.to_numpy(), minlength=24).astype(float)
    except Exception:
        # Unparseable or missing timestamps: fall back to an all-zero profile
        # rather than aborting feature extraction for every user.
        return np.zeros(24)
    return counts / max(counts.sum(), 1)


def extract_clustering_features(mean_embeddings, posts_df, users_df):
    """
    Cluster users with HDBSCAN on embeddings and on posting-hour profiles.

    Args:
        mean_embeddings: mapping ``user_id -> mean embedding vector``.
        posts_df: posts table with ``author_id`` and ``created_at`` columns.
        users_df: users table with an ``id`` column.

    Returns:
        DataFrame with one row per entry of ``users_df['id']`` holding
        ``user_id`` plus, for each of the ``emb`` and ``temp`` clusterings,
        ``*_cluster_id``, ``*_cluster_prob``, ``*_cluster_size`` and
        ``*_is_noise``. Clustering only runs when at least 10 users are
        available; otherwise (and for users missing from a clustering) the
        noise defaults -1 / 0.0 / 0 / 1 are used. Ids present in
        ``mean_embeddings`` but not in ``users_df`` are ignored in the output.
    """
    all_user_ids = users_df['id'].tolist()
    results = {uid: {} for uid in all_user_ids}

    # --- Embedding-based HDBSCAN clustering ---
    emb_user_ids = list(mean_embeddings.keys())
    if len(emb_user_ids) >= 10:
        emb_matrix = np.array([mean_embeddings[uid] for uid in emb_user_ids])
        # Standardise each embedding dimension before Euclidean clustering.
        emb_scaled = StandardScaler().fit_transform(emb_matrix)

        clusterer = hdbscan.HDBSCAN(min_cluster_size=3, min_samples=2, metric='euclidean')
        cluster_labels = clusterer.fit_predict(emb_scaled)
        _write_cluster_features(results, emb_user_ids, 'emb',
                                cluster_labels, clusterer.probabilities_)
    else:
        _write_cluster_features(results, emb_user_ids, 'emb')

    # --- Temporal pattern clustering ---
    # One 24-dim hour distribution per user; users with fewer than 3 posts
    # get an all-zero vector.
    temporal_vectors = {}
    for uid in all_user_ids:
        user_posts_local = posts_df[posts_df['author_id'] == uid]
        if len(user_posts_local) >= 3:
            temporal_vectors[uid] = _hourly_posting_profile(user_posts_local)
        else:
            temporal_vectors[uid] = np.zeros(24)

    temp_user_ids = list(temporal_vectors.keys())
    if len(temp_user_ids) >= 10:
        temp_matrix = np.array([temporal_vectors[uid] for uid in temp_user_ids])

        temp_clusterer = hdbscan.HDBSCAN(min_cluster_size=3, min_samples=2, metric='euclidean')
        temp_labels = temp_clusterer.fit_predict(temp_matrix)
        _write_cluster_features(results, temp_user_ids, 'temp',
                                temp_labels, temp_clusterer.probabilities_)
    else:
        _write_cluster_features(results, temp_user_ids, 'temp')

    # Fill defaults for any user missing from one of the clusterings.
    default_cluster = {
        'emb_cluster_id': -1, 'emb_cluster_prob': 0.0, 'emb_cluster_size': 0, 'emb_is_noise': 1,
        'temp_cluster_id': -1, 'temp_cluster_prob': 0.0, 'temp_cluster_size': 0, 'temp_is_noise': 1,
    }
    rows = []
    for uid in all_user_ids:
        user_feats = results.get(uid, {})
        row = {'user_id': uid}
        for key, fallback in default_cluster.items():
            row[key] = user_feats.get(key, fallback)
        rows.append(row)

    return pd.DataFrame(rows)


# ============================================================
# Combine ALL features into final feature matrices
# ============================================================
# feature_dfs: ds_id -> final per-user feature table (all layers merged on 'user_id')
feature_dfs = {}
for ds_id in datasets.keys():
    print(f"\nCombining features for dataset {ds_id}...")

    # Start with basic features (copied so basic_features is not mutated)
    df = basic_features[ds_id].copy()

    # Merge advanced temporal features (NEW); left joins keep every user
    # from the basic table
    if ds_id in advanced_temporal_features:
        df = df.merge(advanced_temporal_features[ds_id], on='user_id', how='left')

    # Merge stylometry features (NEW)
    if ds_id in stylometry_features:
        df = df.merge(stylometry_features[ds_id], on='user_id', how='left')

    # Merge NLP features
    df = df.merge(nlp_features[ds_id], on='user_id', how='left')

    # Cross-user features (skipped when the dataset has no mean embeddings)
    if ds_id in all_mean_embeddings and all_mean_embeddings[ds_id]:
        cross_feats = extract_cross_user_features(all_mean_embeddings[ds_id])
        df = df.merge(cross_feats, on='user_id', how='left')

    # HDBSCAN clustering features (NEW); same guard as the cross-user block
    if ds_id in all_mean_embeddings and all_mean_embeddings[ds_id]:
        cluster_feats = extract_clustering_features(
            all_mean_embeddings[ds_id], datasets[ds_id]['posts'], datasets[ds_id]['users'])
        df = df.merge(cluster_feats, on='user_id', how='left')

    # Fill NaN: users absent from a merged table get 0 for that table's columns
    df = df.fillna(0)
    feature_dfs[ds_id] = df
    print(f"  Final feature shape: {df.shape}")

# List all feature columns (exclude metadata columns)
# NOTE(review): the column list is taken from dataset 30 only -- assumes that
# dataset exists and is representative of the others; confirm.
meta_cols = ['user_id', 'is_bot']
feature_cols = [c for c in feature_dfs[30].columns if c not in meta_cols]
print(f"\nTotal features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

## Section 5: Model Training & Stacking Ensemble (Enhanced)

Enhanced with:
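- **Custom asymmetric loss**: XGBoost/LightGBM objectives weight bots vs. humans 5:2, derived from the competition scoring (+4 TP, -1 FN, -2 FP); CatBoost uses the equivalent class weights. This replaces the earlier `scale_pos_weight` tuning.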
- **Stacking Ensemble**: Layer 1 (XGBoost + LightGBM + CatBoost) -> Layer 2 (LogisticRegression)
- **Fine-grained threshold search**: Two-pass coarse-to-fine search for optimal threshold

In [None]:
# ============================================================
# Competition Scoring Function
# ============================================================
def competition_score(y_true, y_pred):
    """Score predictions with the competition rule: +4 per TP, -1 per FN, -2 per FP.

    Args:
        y_true: iterable of true labels (1 = bot, 0 = human).
        y_pred: iterable of predicted labels, same length.

    Returns:
        Tuple ``(score, counts)`` where ``counts`` is a dict with integer
        ``tp``, ``fn``, ``fp`` and ``tn`` entries. True negatives score 0.
    """
    truth = np.array(y_true)
    guess = np.array(y_pred)

    actual_bot, actual_human = (truth == 1), (truth == 0)
    flagged, cleared = (guess == 1), (guess == 0)

    n_tp = (flagged & actual_bot).sum()
    n_fn = (cleared & actual_bot).sum()
    n_fp = (flagged & actual_human).sum()
    n_tn = (cleared & actual_human).sum()

    total = 4 * n_tp - 1 * n_fn - 2 * n_fp
    counts = {'tp': int(n_tp), 'fn': int(n_fn), 'fp': int(n_fp), 'tn': int(n_tn)}
    return total, counts


def find_optimal_threshold(y_true, y_proba):
    """
    Pick the probability cut-off that maximises the competition score.

    The search runs in two passes: a coarse sweep from 0.05 to 0.95 in
    steps of 0.05, then a fine sweep (step 0.002) within +/-0.08 of the
    coarse winner, clamped to [0.01, 0.99]. Ties keep the earliest
    threshold found.

    Args:
        y_true: true labels.
        y_proba: predicted bot probabilities (array supporting ``>=`` and
            ``.astype``).

    Returns:
        Tuple ``(best_threshold, best_score, best_details)`` where
        ``best_details`` is the TP/FN/FP/TN dict from ``competition_score``.
    """
    state = {'score': -np.inf, 'threshold': 0.5, 'details': {}}

    def _sweep(candidates):
        # Evaluate every candidate; only a strictly better score replaces the leader.
        for cut in candidates:
            labels = (y_proba >= cut).astype(int)
            value, breakdown = competition_score(y_true, labels)
            if value > state['score']:
                state['score'] = value
                state['threshold'] = cut
                state['details'] = breakdown

    # Pass 1: coarse grid.
    _sweep(np.arange(0.05, 0.96, 0.05))

    # Pass 2: fine grid centred on the coarse winner.
    lower = max(0.01, state['threshold'] - 0.08)
    upper = min(0.99, state['threshold'] + 0.08)
    _sweep(np.arange(lower, upper, 0.002))

    return state['threshold'], state['score'], state['details']


# ============================================================
# Custom Asymmetric Loss for Competition Scoring (CORE CHANGE)
# ============================================================
# Competition scoring: +4 TP, -1 FN, -2 FP, 0 TN
#
# Cost analysis (from the model's perspective):
#   - Missing a bot (FN): lose +4 TP reward AND get -1 FN penalty = 5 point swing
#   - False alarm  (FP): get -2 FP penalty                       = 2 point swing
#   - Ratio: 5:2 = 2.5x bias towards catching bots (recall)
#
# Instead of using scale_pos_weight (indirect), these asymmetric costs are
# applied as per-class weights on the gradient/hessian of the cross-entropy
# loss, so the 5:2 cost ratio is present during training.

W_POS = 5.0  # Weight for bot class (positive): TP value (4) + FN cost (1)
W_NEG = 2.0  # Weight for human class (negative): FP cost (2)


def _safe_sigmoid(x):
    """Numerically stable sigmoid function."""
    return 1.0 / (1.0 + np.exp(-np.clip(np.asarray(x, dtype=np.float64), -500, 500)))


def competition_loss_xgb(y_true, y_pred):
    """
    Class-weighted binary cross-entropy objective for XGBoost (sklearn API).

    Each sample's gradient and hessian are scaled by its class weight:
    W_POS for bots (y=1) and W_NEG for humans (y=0), mirroring the
    competition's asymmetric scoring.

    Args:
        y_true: true labels (0 or 1)
        y_pred: raw predictions (logits, before sigmoid)
    Returns:
        (gradient, hessian) tuple
    """
    prob = _safe_sigmoid(y_pred)
    class_weight = np.where(y_true == 1, W_POS, W_NEG)

    # d/dz of weighted BCE w.r.t. the logit z.
    gradient = class_weight * (prob - y_true)

    # Second derivative; floored at 1e-7 so it never reaches zero.
    curvature = np.maximum(prob * (1.0 - prob), 1e-7)
    hessian = class_weight * curvature

    return gradient, hessian


def competition_loss_lgb(y_true, y_pred):
    """
    Class-weighted binary cross-entropy objective for LightGBM (sklearn API).

    Same math as the XGBoost objective: gradient ``w * (p - y)`` and
    hessian ``w * max(p * (1 - p), 1e-7)`` with ``w`` = W_POS for bots
    and W_NEG for humans.

    Args:
        y_true: true labels (0 or 1)
        y_pred: raw predictions (logits, before sigmoid)
    Returns:
        (gradient, hessian) tuple
    """
    sigmoid_out = _safe_sigmoid(y_pred)
    weights = np.where(y_true == 1, W_POS, W_NEG)
    first_order = weights * (sigmoid_out - y_true)
    second_order = weights * np.maximum(sigmoid_out * (1.0 - sigmoid_out), 1e-7)
    return first_order, second_order


def train_models(X_train, y_train, params=None):
    """
    Train XGBoost, LightGBM, and CatBoost with the asymmetric competition loss.
    Layer 1 of the stacking ensemble.

    XGBoost and LightGBM use the custom objectives that encode the
    competition scoring (4*TP - FN - 2*FP); CatBoost uses class weights
    with the same W_POS:W_NEG ratio.

    Args:
        X_train: training features
        y_train: training labels
        params: optional dict of hyperparameters (e.g. from Optuna). Any key
                that is missing falls back to the default below, so partial
                dicts are accepted. If None, all defaults are used.

    Returns:
        dict with the fitted models under the keys 'xgb', 'lgb' and 'cb'.
    """
    default_params = {
        'n_estimators': 600,
        'max_depth': 5,
        'learning_rate': 0.025,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'min_child_weight': 3,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'gamma': 0.1,
    }
    # Overlay caller-supplied values on the defaults so a partial dict
    # (for example an older saved JSON lacking a key) cannot raise KeyError.
    params = {**default_params, **(params or {})}

    # XGBoost with custom asymmetric loss
    # NOTE: no scale_pos_weight / eval_metric='logloss' here because the
    #       custom objective handles the asymmetric costs directly
    xgb_model = xgb.XGBClassifier(
        objective=competition_loss_xgb,
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        min_child_weight=params['min_child_weight'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        gamma=params['gamma'],
        random_state=42, verbosity=0
    )
    xgb_model.fit(X_train, y_train)

    # LightGBM with custom asymmetric loss
    lgb_model = lgb.LGBMClassifier(
        objective=competition_loss_lgb,
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        # LightGBM only applies row subsampling (bagging) when the bagging
        # frequency is non-zero; without this the 'subsample' value above
        # is silently ignored.
        subsample_freq=1,
        colsample_bytree=params['colsample_bytree'],
        min_child_weight=params['min_child_weight'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        random_state=42, verbose=-1
    )
    lgb_model.fit(X_train, y_train)

    # CatBoost with equivalent asymmetric class weights
    # (standard objective, so the 5:2 ratio is expressed via class_weights)
    cb_model = CatBoostClassifier(
        iterations=params['n_estimators'],
        depth=min(params['max_depth'], 10),  # depth is capped at 10 for CatBoost
        learning_rate=params['learning_rate'],
        class_weights={0: 1.0, 1: W_POS / W_NEG},  # {0: 1.0, 1: 2.5}
        random_seed=42, verbose=0
    )
    cb_model.fit(X_train, y_train)

    return {'xgb': xgb_model, 'lgb': lgb_model, 'cb': cb_model}


def get_base_model_probas(models, X):
    """
    Collect the positive-class probability from every base model (for stacking).

    Models stored under 'xgb' and 'lgb' were trained with custom
    objectives, so their raw margins are fetched and passed through the
    sigmoid manually. Any other entry (CatBoost here) is assumed to have
    a working ``predict_proba``.

    Args:
        models: mapping ``name -> fitted model``.
        X: feature matrix to score.

    Returns:
        dict ``name -> 1-D array of bot probabilities`` in the same key
        order as ``models``.
    """
    def _xgb_proba(model):
        # Raw margin from the underlying booster, then sigmoid.
        dmat = xgb.DMatrix(X)
        margin = model.get_booster().predict(dmat, output_margin=True)
        return _safe_sigmoid(margin)

    def _lgb_proba(model):
        # Raw score from LightGBM, then sigmoid.
        return _safe_sigmoid(model.predict(X, raw_score=True))

    def _native_proba(model):
        # Standard objective: column 1 of predict_proba is the positive class.
        return model.predict_proba(X)[:, 1]

    by_name = {'xgb': _xgb_proba, 'lgb': _lgb_proba}
    return {
        name: by_name.get(name, _native_proba)(model)
        for name, model in models.items()
    }


def _recover_layer1_params(models):
    """Read the Layer 1 hyperparameters back from the fitted 'xgb' model, or return None."""
    wanted = ('n_estimators', 'max_depth', 'learning_rate', 'subsample',
              'colsample_bytree', 'min_child_weight', 'reg_alpha',
              'reg_lambda', 'gamma')
    reference = models.get('xgb') if hasattr(models, 'get') else None
    get_params = getattr(reference, 'get_params', None)
    if get_params is None:
        return None
    fitted = get_params()
    recovered = {key: fitted.get(key) for key in wanted}
    # An unset value means the model was not built by train_models; use defaults.
    if any(value is None for value in recovered.values()):
        return None
    return recovered


def train_stacking_meta(models, X_train, y_train, params=None):
    """
    Train the Layer 2 meta-learner on out-of-fold predictions from Layer 1.

    Five stratified folds are used: for each fold the base models are
    retrained on the other four and their predictions on the held-out
    fold become the meta-features. A LogisticRegressionCV is then fitted
    on those meta-features.

    Args:
        models: the fitted Layer 1 models (key order defines the
            meta-feature column order).
        X_train: training feature matrix (numpy array, row-indexable).
        y_train: training labels (numpy array).
        params: hyperparameters for the per-fold base models. When None,
            they are read back from ``models['xgb']`` so the fold models
            match the models the meta-learner will later be applied to;
            if that is not possible, ``train_models`` defaults are used.

    Returns:
        The fitted LogisticRegressionCV meta-model.
    """
    from sklearn.model_selection import StratifiedKFold

    # Fold models must share the hyperparameters of the final base models;
    # otherwise the meta-learner is calibrated on a different distribution
    # of base predictions than it sees at inference time.
    if params is None:
        params = _recover_layer1_params(models)

    n_models = len(models)
    meta_features = np.zeros((len(X_train), n_models))

    # Generate out-of-fold predictions for meta-training
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold_idx, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, X_val = X_train[tr_idx], X_train[val_idx]
        y_tr = y_train[tr_idx]

        # Train fold models with the same configuration as the final models
        fold_models = train_models(X_tr, y_tr, params=params)

        # Get OOF predictions (get_base_model_probas applies the sigmoid
        # where the custom objectives require it)
        fold_probas = get_base_model_probas(fold_models, X_val)
        for i, name in enumerate(fold_models.keys()):
            meta_features[val_idx, i] = fold_probas[name]

    # Train meta-learner on OOF predictions
    meta_model = LogisticRegressionCV(
        Cs=10, cv=3, scoring='roc_auc',
        class_weight='balanced', random_state=42, max_iter=1000
    )
    meta_model.fit(meta_features, y_train)

    print(f"  Stacking meta-learner trained. Coefficients: {dict(zip(models.keys(), meta_model.coef_[0]))}")
    return meta_model


def ensemble_predict_proba(models, X, meta_model=None):
    """
    Combine the base models into one bot probability per row.

    Args:
        models: mapping of fitted Layer 1 models.
        X: feature matrix to score.
        meta_model: optional Layer 2 meta-learner. When given, its
            ``predict_proba`` is applied to the stacked base probabilities;
            when None, the base probabilities are simply averaged.

    Returns:
        1-D array of ensemble probabilities.
    """
    per_model = get_base_model_probas(models, X)
    stacked = np.column_stack(list(per_model.values()))

    if meta_model is None:
        # No meta-learner: plain mean across base models.
        return np.mean(stacked, axis=1)
    return meta_model.predict_proba(stacked)[:, 1]


def evaluate_on_test(models, X_test, y_test, dataset_name="", meta_model=None):
    """Score the ensemble on a labelled set and print an evaluation report.

    The threshold is selected by ``find_optimal_threshold`` using
    ``y_test`` itself, so the reported score is the best achievable on
    this set rather than a held-out estimate.

    Args:
        models: mapping of fitted Layer 1 models.
        X_test: feature matrix to evaluate on.
        y_test: true labels for ``X_test``.
        dataset_name: label shown in the report header.
        meta_model: optional Layer 2 meta-learner.

    Returns:
        Tuple ``(threshold, score, proba)``.
    """
    proba = ensemble_predict_proba(models, X_test, meta_model=meta_model)
    search_result = find_optimal_threshold(y_test, proba)
    threshold, score, details = search_result

    y_pred = (proba >= threshold).astype(int)

    report_lines = [
        f"\n{'='*50}",
        f"Evaluation: {dataset_name}",
        f"{'='*50}",
        f"Optimal threshold: {threshold:.2f}",
        f"Competition score: {score}",
        f"Details: TP={details['tp']}, FN={details['fn']}, FP={details['fp']}, TN={details['tn']}",
        f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
        classification_report(y_test, y_pred, target_names=['Human', 'Bot']),
    ]
    for line in report_lines:
        print(line)

    return threshold, score, proba

# Progress marker printed once this cell has finished defining the training helpers.
print("Training functions defined.")

## Section 6: Cross-Validation Between Datasets

- English: Train on 30 -> Test on 32, then Train on 32 -> Test on 30
- French: Train on 31 -> Test on 33, then Train on 33 -> Test on 31

In [None]:
# ============================================================
# Optuna Bayesian Hyperparameter Optimization
# ============================================================
# Replaces manual grid/random search with efficient TPE (Tree-structured
# Parzen Estimator) sampler. The competition score (+4 TP, -1 FN, -2 FP)
# is used directly as the optimization objective.

def optuna_optimize_hyperparams(X_train, y_train, n_trials=200, n_inner_folds=3):
    """
    Search Layer 1 hyperparameters with Optuna's TPE sampler.

    Each trial trains XGBoost + LightGBM + CatBoost on every inner CV
    fold, averages their probabilities (no meta-learner, for speed),
    picks the best threshold on the validation fold and records the
    competition score. The trial value is the mean score over the folds.

    Args:
        X_train: training feature matrix (numpy array)
        y_train: training labels (numpy array)
        n_trials: number of Optuna trials (default 200)
        n_inner_folds: number of inner CV folds for evaluation (default 3)

    Returns:
        optuna.Study object (access best_params, best_value, etc.)
    """

    def _suggest(trial):
        # Suggestion order is kept fixed so the seeded sampler stays reproducible.
        return {
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'gamma': trial.suggest_float('gamma', 1e-8, 5.0, log=True),
        }

    def _score_fold(params, tr_idx, val_idx):
        # Fit on the training part of the fold, score the held-out part.
        fold_models = train_models(X_train[tr_idx], y_train[tr_idx], params=params)
        probas = get_base_model_probas(fold_models, X_train[val_idx])
        avg_proba = np.mean(np.column_stack(list(probas.values())), axis=1)
        _, fold_score, _ = find_optimal_threshold(y_train[val_idx], avg_proba)
        return fold_score

    def objective(trial):
        params = _suggest(trial)
        splitter = StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=42)
        fold_scores = [
            _score_fold(params, tr_idx, val_idx)
            for tr_idx, val_idx in splitter.split(X_train, y_train)
        ]
        return np.mean(fold_scores)

    # TPE sampler = Bayesian optimization.
    # NOTE(review): objective never calls trial.report()/should_prune(), so
    # the MedianPruner below presumably never triggers -- confirm.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10)
    )
    study.optimize(
        objective,
        n_trials=n_trials,
        show_progress_bar=True,
        n_jobs=1  # set to -1 for parallel trials if resources allow
    )

    # Report the outcome of the search.
    print(f"\n  Optuna optimization complete ({n_trials} trials)")
    print(f"  Best competition score (inner CV avg): {study.best_value:.2f}")
    print(f"  Best hyperparameters:")
    for key, value in study.best_params.items():
        if not isinstance(value, float):
            print(f"    {key}: {value}")
        else:
            print(f"    {key}: {value:.6f}")

    return study


# ============================================================
# Save / Load Hyperparameters (JSON)
# ============================================================
# On Google Colab, saves to Google Drive so hyperparameters persist
# across sessions. Locally, saves to saved_hyperparams/ in the
# current working directory.
# ============================================================
import os

# Auto-detect Colab and use Google Drive for persistent storage.
# get_ipython is only looked up when the name is present in dir(); otherwise
# IN_COLAB is False. Detection is a substring test on str(get_ipython()).
IN_COLAB = 'google.colab' in str(get_ipython()) if 'get_ipython' in dir() else False

if IN_COLAB:
    # Imported lazily because google.colab is only needed on this branch.
    from google.colab import drive
    # Mount Drive only if it is not already mounted.
    if not os.path.ismount('/content/drive'):
        drive.mount('/content/drive')
    HYPERPARAMS_DIR = "/content/drive/MyDrive/bot_or_not/saved_hyperparams"
    print(f"[Colab] Hyperparameters will be saved to Google Drive: {HYPERPARAMS_DIR}")
else:
    # Relative path: resolved against the current working directory.
    HYPERPARAMS_DIR = "saved_hyperparams"
    print(f"[Local] Hyperparameters will be saved to: {HYPERPARAMS_DIR}")

def save_hyperparams(params, name, score=None, directory=HYPERPARAMS_DIR):
    """
    Write a hyperparameter dict to ``<directory>/<name>.json``.

    The file holds the hyperparameters, the optional score and the time
    of saving. The directory is created when it does not exist. Values
    that JSON cannot encode natively are stored as their ``str()`` form.

    Args:
        params: dict of hyperparameters
        name: identifier string, e.g. 'en_30_32' or 'best_english'
        score: optional competition score to save alongside
        directory: folder to save into (default: HYPERPARAMS_DIR)

    Returns:
        Path of the file that was written.
    """
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, f"{name}.json")

    record = dict(
        hyperparameters=params,
        score=score,
        saved_at=pd.Timestamp.now().isoformat(),
    )
    with open(filepath, 'w') as out_file:
        out_file.write(json.dumps(record, indent=2, default=str))

    print(f"  [SAVED] Hyperparameters -> {filepath}")
    return filepath


def load_hyperparams(name, directory=HYPERPARAMS_DIR):
    """
    Read hyperparameters back from ``<directory>/<name>.json``.

    Args:
        name: identifier string used when saving
        directory: folder to look in (default: HYPERPARAMS_DIR)

    Returns:
        dict of hyperparameters with 'n_estimators', 'max_depth' and
        'min_child_weight' cast to int when present, or None if the file
        does not exist.
    """
    filepath = os.path.join(directory, f"{name}.json")
    if os.path.exists(filepath):
        with open(filepath, 'r') as in_file:
            payload = json.load(in_file)
    else:
        print(f"  [WARNING] No saved hyperparameters found at {filepath}")
        return None

    params = payload["hyperparameters"]
    score = payload.get("score")
    saved_at = payload.get("saved_at", "unknown")
    print(f"  [LOADED] Hyperparameters from {filepath} (score={score}, saved_at={saved_at})")

    # Integer-valued hyperparameters are cast back to int after the JSON round trip.
    for int_key in ('n_estimators', 'max_depth', 'min_child_weight'):
        if int_key in params:
            params[int_key] = int(params[int_key])
    return params


def list_saved_hyperparams(directory=HYPERPARAMS_DIR):
    """Print a summary of every saved hyperparameter file and return the file names.

    Args:
        directory: folder to scan (default: HYPERPARAMS_DIR)

    Returns:
        List of ``.json`` file names in directory-listing order (the
        printed summary is sorted), or an empty list when the directory
        is missing or holds no JSON files.
    """
    if not os.path.exists(directory):
        print("No saved hyperparameters directory found.")
        return []

    files = [entry for entry in os.listdir(directory) if entry.endswith('.json')]
    if len(files) == 0:
        print("No saved hyperparameters found.")
        return []

    print(f"Saved hyperparameters in '{directory}/':")
    for f in sorted(files):
        with open(os.path.join(directory, f), 'r') as handle:
            payload = json.load(handle)
        score = payload.get("score", "N/A")
        saved_at = payload.get("saved_at", "unknown")
        print(f"  - {f}: score={score}, saved_at={saved_at}")
    return files


# Progress markers printed once this cell has finished defining its helpers.
print("Optuna hyperparameter optimization function defined.")
print("Hyperparameter save/load utilities defined.")

In [None]:
# Cross-validation pairs: (train_id, test_id, language)
# Each language has two datasets; every dataset is used once for training and
# once for testing.
cv_pairs = [
    (30, 32, 'English'),
    (32, 30, 'English'),
    (31, 33, 'French'),
    (33, 31, 'French'),
]

# ============================================================
# Optuna Configuration
# ============================================================
USE_OPTUNA = True          # Set to False to use default hyperparameters
OPTUNA_N_TRIALS = 200      # Number of Bayesian optimization trials (increase for better results)
OPTUNA_INNER_FOLDS = 3     # Number of inner CV folds for Optuna evaluation

# >>> NEW: Set to True to load previously saved hyperparams instead of re-running Optuna <<<
LOAD_SAVED_HYPERPARAMS = False  # Set to True to skip Optuna and load from saved_hyperparams/

# Results keyed by (train_id, test_id)
cv_results = {}
trained_models = {}
trained_meta_models = {}
optuna_studies = {}  # Store Optuna studies for analysis

for train_id, test_id, lang in cv_pairs:
    print(f"\n{'#'*60}")
    print(f"Training on Dataset {train_id} -> Testing on Dataset {test_id} ({lang})")
    print(f"{'#'*60}")

    train_df = feature_dfs[train_id]
    test_df = feature_dfs[test_id]

    # Ensure consistent feature columns: keep only features present in both frames
    common_cols = [c for c in feature_cols if c in train_df.columns and c in test_df.columns]

    X_train = train_df[common_cols].values
    y_train = train_df['is_bot'].values
    X_test = test_df[common_cols].values
    y_test = test_df['is_bot'].values

    # ---- Hyperparameter Selection ----
    # Net effect of the branches below: Optuna runs when no params were loaded
    # and either USE_OPTUNA is set (and loading was not requested) or loading
    # was requested but the file was missing. Otherwise best_params stays None
    # and train_models falls back to its defaults.
    best_params = None
    param_name = f"cv_{train_id}_{test_id}"

    if LOAD_SAVED_HYPERPARAMS:
        # Try to load saved hyperparameters
        best_params = load_hyperparams(param_name)
        if best_params is None:
            print(f"  No saved params for {param_name}, falling back to Optuna...")
            LOAD_SAVED_HYPERPARAMS_FALLBACK = True
        else:
            LOAD_SAVED_HYPERPARAMS_FALLBACK = False
    else:
        LOAD_SAVED_HYPERPARAMS_FALLBACK = False

    if best_params is None and USE_OPTUNA and not LOAD_SAVED_HYPERPARAMS:
        # Run Optuna optimization
        print(f"  Running Optuna hyperparameter optimization ({OPTUNA_N_TRIALS} trials)...")
        study = optuna_optimize_hyperparams(
            X_train, y_train,
            n_trials=OPTUNA_N_TRIALS,
            n_inner_folds=OPTUNA_INNER_FOLDS
        )
        best_params = study.best_params
        optuna_studies[(train_id, test_id)] = study
        # Auto-save hyperparameters after Optuna optimization
        save_hyperparams(best_params, param_name, score=study.best_value)
    elif best_params is None and LOAD_SAVED_HYPERPARAMS_FALLBACK:
        # Fallback: run Optuna if saved params not found
        # (this branch runs regardless of USE_OPTUNA)
        print(f"  Running Optuna hyperparameter optimization ({OPTUNA_N_TRIALS} trials)...")
        study = optuna_optimize_hyperparams(
            X_train, y_train,
            n_trials=OPTUNA_N_TRIALS,
            n_inner_folds=OPTUNA_INNER_FOLDS
        )
        best_params = study.best_params
        optuna_studies[(train_id, test_id)] = study
        save_hyperparams(best_params, param_name, score=study.best_value)

    # Train Layer 1 base models (with Optuna-optimized or default hyperparameters)
    print("  Training Layer 1 base models...")
    models = train_models(X_train, y_train, params=best_params)
    trained_models[(train_id, test_id)] = models

    # Train Layer 2 meta-learner (stacking)
    # NOTE(review): best_params is not passed here; the meta-learner's fold
    # models are configured inside train_stacking_meta.
    print("  Training Layer 2 meta-learner (stacking)...")
    meta_model = train_stacking_meta(models, X_train, y_train)
    trained_meta_models[(train_id, test_id)] = meta_model

    # Evaluate with stacking.
    # evaluate_on_test chooses the threshold using y_test, so the score stored
    # below is the best achievable on the test set, not a held-out estimate.
    threshold, score, proba = evaluate_on_test(
        models, X_test, y_test, f"Train {train_id} -> Test {test_id} ({lang})",
        meta_model=meta_model)

    cv_results[(train_id, test_id)] = {
        'threshold': threshold, 'score': score,
        'lang': lang, 'proba': proba
    }

# Summary: one line per train/test pair
print(f"\n{'='*60}")
print("CROSS-VALIDATION SUMMARY (with Stacking Ensemble)")
print(f"{'='*60}")
for (tr, te), res in cv_results.items():
    print(f"  Train {tr} -> Test {te} ({res['lang']}): "
          f"Score = {res['score']}, Threshold = {res['threshold']:.2f}")

In [None]:
# Feature importance analysis (using the first trained model pair, i.e. the
# first entry of cv_pairs)
import matplotlib.pyplot as plt

first_key = list(trained_models.keys())[0]
xgb_model = trained_models[first_key]['xgb']
# Rebuild the column list exactly as the CV loop did: features present in BOTH
# the train and the test frame of this pair. Using the train frame alone could
# yield more names than the model has importances, which would make the
# DataFrame below fail on mismatched lengths.
common_cols = [c for c in feature_cols
               if c in feature_dfs[first_key[0]].columns
               and c in feature_dfs[first_key[1]].columns]

# One row per feature, most important first
importance = pd.DataFrame({
    'feature': common_cols,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 10))
plt.barh(importance['feature'][:25], importance['importance'][:25])
plt.xlabel('Feature Importance')
plt.title('Top 25 Features (XGBoost)')
# Largest bar on top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 15 features:")
print(importance.head(15).to_string(index=False))

## Section 7: Train Final Models & Build Submission Pipeline

Train on ALL available practice data per language, then use the resulting models for the final evaluation.

In [None]:
# ============================================================
# Train final models on ALL practice data per language (with Stacking)
# ============================================================
# Threshold used when cross-validation yielded no usable threshold for a
# language. Without it np.mean([]) is NaN, `proba >= NaN` is False for every
# user, and the final pipeline would silently report zero bots.
FALLBACK_THRESHOLD = 0.5


def _mean_threshold(thresholds, language, fallback=FALLBACK_THRESHOLD):
    """Return the mean of `thresholds`, or `fallback` if that mean is unusable.

    Args:
        thresholds: list of per-CV-pair thresholds for one language.
        language: label used only in the warning message.
        fallback: value returned when the list is empty or its mean is not
            finite (NaN / inf).

    Returns:
        The mean threshold as a float, or `fallback`.
    """
    mean_value = float(np.mean(thresholds)) if len(thresholds) else float('nan')
    if np.isfinite(mean_value):
        return mean_value
    print(f"  WARNING: no usable cross-validation threshold for {language}; "
          f"falling back to {fallback:.2f}")
    return fallback


def _params_from_best_study(language, save_name):
    """Pick the params of the in-memory Optuna study with the highest best_value.

    Studies are matched to `language` through the 'lang' entry that cv_results
    holds under the same (train_id, test_id) key; studies without a matching
    cv_results entry are ignored. The chosen params are also saved under
    `save_name` for future reuse.

    Returns:
        The best params dict, or None when no study exists for `language`.
    """
    studies = {key: study for key, study in optuna_studies.items()
               if cv_results.get(key, {}).get('lang') == language}
    if not studies:
        return None
    best_key = max(studies, key=lambda key: studies[key].best_value)
    best_study = studies[best_key]
    print(f"  Using Optuna params from study ({best_key}) for final {language} model")
    save_hyperparams(best_study.best_params, save_name, score=best_study.best_value)
    return best_study.best_params


# Compute average optimal threshold from cross-validation
en_thresholds = [v['threshold'] for v in cv_results.values() if v['lang'] == 'English']
fr_thresholds = [v['threshold'] for v in cv_results.values() if v['lang'] == 'French']
final_en_threshold = _mean_threshold(en_thresholds, 'English')
final_fr_threshold = _mean_threshold(fr_thresholds, 'French')

print(f"Final English threshold: {final_en_threshold:.2f}")
print(f"Final French threshold: {final_fr_threshold:.2f}")

# Combine English datasets (30 + 32)
en_train = pd.concat([feature_dfs[30], feature_dfs[32]], ignore_index=True)
common_cols_en = [c for c in feature_cols if c in en_train.columns]
X_en = en_train[common_cols_en].values
y_en = en_train['is_bot'].values

# Select best Optuna params for each language
# Priority: 1) Load from saved files  2) From Optuna studies in memory  3) None (use defaults)
best_en_params = None
best_fr_params = None

if LOAD_SAVED_HYPERPARAMS:
    # Try to load saved "best" params for final models
    best_en_params = load_hyperparams("best_english")
    best_fr_params = load_hyperparams("best_french")

if best_en_params is None and USE_OPTUNA and optuna_studies:
    best_en_params = _params_from_best_study('English', "best_english")

if best_fr_params is None and USE_OPTUNA and optuna_studies:
    best_fr_params = _params_from_best_study('French', "best_french")

print(f"\nTraining final English model on {len(X_en)} users ({y_en.sum()} bots)...")
final_en_models = train_models(X_en, y_en, params=best_en_params)
print("  Training English stacking meta-learner...")
final_en_meta = train_stacking_meta(final_en_models, X_en, y_en)

# Combine French datasets (31 + 33)
fr_train = pd.concat([feature_dfs[31], feature_dfs[33]], ignore_index=True)
common_cols_fr = [c for c in feature_cols if c in fr_train.columns]
X_fr = fr_train[common_cols_fr].values
y_fr = fr_train['is_bot'].values

print(f"Training final French model on {len(X_fr)} users ({y_fr.sum()} bots)...")
final_fr_models = train_models(X_fr, y_fr, params=best_fr_params)
print("  Training French stacking meta-learner...")
final_fr_meta = train_stacking_meta(final_fr_models, X_fr, y_fr)

print("\nFinal models (with stacking) trained!")

In [None]:
# ============================================================
# FINAL PIPELINE: Process a new dataset and output bot IDs (Enhanced)
# ============================================================
def detect_bots(json_path, models, threshold, feature_columns,
                embed_model, ppl_model, ppl_tokenizer, ppl_device,
                meta_model=None,
                team_name="myteam", lang="en"):
    """
    Complete bot detection pipeline for a new dataset (Enhanced v2).

    Extracts basic, advanced temporal, stylometry, NLP, cross-user and
    clustering features for the users in the dataset, scores them with the
    ensemble (stacked when `meta_model` is provided) and writes the IDs whose
    probability is >= `threshold`, one per line, to
    ``{team_name}.detections.{language}.txt``.

    Args:
        json_path: path to the dataset JSON; must contain 'posts' and 'users'
            entries and may contain a 'lang' entry.
        models: trained Layer 1 models, passed to ensemble_predict_proba.
        threshold: probability cut-off; users at or above it are flagged.
        feature_columns: feature names, in training order. Columns the new
            dataset does not produce are filled with 0 (a warning is printed).
        embed_model: sentence-embedding model for extract_embedding_features.
        ppl_model, ppl_tokenizer, ppl_device: language model, tokenizer and
            device for extract_perplexity_features.
        meta_model: optional Layer 2 meta-learner.
        team_name: prefix of the output file name.
        lang: language tag used in the output file name when the dataset has
            no 'lang' entry of its own.

    Returns:
        Tuple (bot_user_ids, proba, df): the flagged user IDs, the predicted
        probabilities, and the combined feature DataFrame.
    """
    print(f"{'='*60}")
    print(f"PROCESSING: {json_path}")
    print(f"{'='*60}")

    # A NaN threshold makes `proba >= threshold` False for every user, so the
    # run would finish "successfully" with zero detections. Say so up front,
    # before the expensive feature extraction.
    if isinstance(threshold, (float, np.floating)) and np.isnan(threshold):
        print("  WARNING: threshold is NaN - no user will be flagged as a bot")

    # Step 1: Load data
    print("[1/7] Loading data...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    posts_df = pd.DataFrame(data['posts'])
    users_df = pd.DataFrame(data['users'])
    ds_lang = data.get('lang', lang)
    print(f"  {len(users_df)} users, {len(posts_df)} posts, language: {ds_lang}")

    def per_user_features(extractor):
        """Build one row per user: its id plus extractor(that user's posts)."""
        rows = []
        for uid in users_df['id']:
            row = {'user_id': uid}
            row.update(extractor(posts_df[posts_df['author_id'] == uid]))
            rows.append(row)
        return pd.DataFrame(rows)

    # Step 2: Basic features
    print("[2/7] Extracting basic features...")
    basic = extract_all_basic_features(posts_df, users_df)

    # Step 3: Advanced temporal features (NEW)
    print("[3/7] Extracting advanced temporal features...")
    temporal_feats = per_user_features(extract_advanced_temporal_features)

    # Step 4: Stylometry features (NEW)
    print("[4/7] Extracting stylometry features...")
    stylo_feats = per_user_features(extract_stylometry_features)

    # Step 5: NLP features
    print("[5/7] Computing NLP features (embeddings + perplexity)...")
    emb_feats, mean_embs = extract_embedding_features(posts_df, users_df, model=embed_model)
    ppl_feats = extract_perplexity_features(posts_df, users_df, ppl_model, ppl_tokenizer, ppl_device)
    nlp_feat = emb_feats.merge(ppl_feats, on='user_id', how='outer')

    # Step 6: Cross-user + clustering features
    # NOTE(review): assumes mean_embs is a container whose truthiness means
    # "non-empty" (e.g. a dict) - confirm against extract_embedding_features.
    print("[6/7] Computing cross-user + clustering features...")
    cross_feats = extract_cross_user_features(mean_embs) if mean_embs else pd.DataFrame()
    cluster_feats = extract_clustering_features(mean_embs, posts_df, users_df) if mean_embs else pd.DataFrame()

    # Combine all feature groups on user_id; users lacking a group get 0s.
    df = basic.copy()
    df = df.merge(temporal_feats, on='user_id', how='left')
    df = df.merge(stylo_feats, on='user_id', how='left')
    df = df.merge(nlp_feat, on='user_id', how='left')
    if not cross_feats.empty:
        df = df.merge(cross_feats, on='user_id', how='left')
    if not cluster_feats.empty:
        df = df.merge(cluster_feats, on='user_id', how='left')
    df = df.fillna(0)

    # Ensure feature columns match training. Missing columns are still filled
    # with 0, but are reported so a broken extractor does not go unnoticed.
    missing_cols = [col for col in feature_columns if col not in df.columns]
    if missing_cols:
        print(f"  WARNING: {len(missing_cols)} training feature(s) missing from "
              f"this dataset, filled with 0: {missing_cols}")
        for col in missing_cols:
            df[col] = 0
    X = df[feature_columns].values

    # Step 7: Predict (with stacking if available)
    print("[7/7] Predicting (stacking ensemble)...")
    proba = ensemble_predict_proba(models, X, meta_model=meta_model)
    predictions = (proba >= threshold).astype(int)

    # Get bot user IDs
    bot_user_ids = df.loc[predictions == 1, 'user_id'].tolist()

    # Save to file (explicit encoding, matching the UTF-8 read of the input)
    output_filename = f"{team_name}.detections.{ds_lang}.txt"
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{uid}\n" for uid in bot_user_ids)

    print(f"\n  Detected {len(bot_user_ids)} bots out of {len(users_df)} users")
    print(f"  Results saved to: {output_filename}")

    # Also print probability distribution for manual inspection
    print(f"\n  Probability distribution:")
    print(f"    Mean: {np.mean(proba):.4f}")
    print(f"    > 0.3: {(proba > 0.3).sum()} users")
    print(f"    > 0.5: {(proba > 0.5).sum()} users")
    print(f"    > 0.7: {(proba > 0.7).sum()} users")
    print(f"    > 0.9: {(proba > 0.9).sum()} users")

    return bot_user_ids, proba, df

print("Final pipeline defined.")

## Section 8: Validate Pipeline on Practice Data

Run the full pipeline on practice datasets to verify it works correctly and check the competition score.

In [None]:
# Validate on practice dataset 32 (English) using model trained on 30
print("VALIDATION: Using Train-30 model on Dataset 32 (with Stacking)")
val_json = os.path.join(DATA_DIR, 'dataset.posts&users.32.json')
val_bots = os.path.join(DATA_DIR, 'dataset.bots.32.txt')

detected_bots, probas, result_df = detect_bots(
    val_json,
    models=trained_models[(30, 32)],
    threshold=cv_results[(30, 32)]['threshold'],
    feature_columns=[c for c in feature_cols if c in feature_dfs[30].columns],
    embed_model=embed_model,
    ppl_model=ppl_model, ppl_tokenizer=ppl_tokenizer, ppl_device=ppl_device,
    meta_model=trained_meta_models.get((30, 32)),
    team_name="validation", lang="en"
)

# Check against known bots. The ground-truth file holds one ID per line;
# read it with an explicit encoding, matching the UTF-8 read of the dataset
# JSON inside detect_bots.
with open(val_bots, 'r', encoding='utf-8') as f:
    true_bots = set(line.strip() for line in f if line.strip())

# IDs from the text file are always strings, so compare detected IDs as
# strings too; otherwise non-string IDs in the JSON would never match and the
# score would silently collapse to "all false positives".
detected_set = set(str(uid) for uid in detected_bots)
tp = len(detected_set & true_bots)
fp = len(detected_set - true_bots)
fn = len(true_bots - detected_set)
# Competition scoring: +4 per true positive, -1 per false negative, -2 per false positive.
score = 4 * tp - 1 * fn - 2 * fp

print(f"\nVALIDATION RESULTS:")
print(f"  True bots: {len(true_bots)}")
print(f"  Detected bots: {len(detected_set)}")
print(f"  TP: {tp}, FP: {fp}, FN: {fn}")
print(f"  Competition Score: {score}")

## Section 9: Final Evaluation (Run on Competition Day)

On **Feb 14, 2026, 12:00 PM EST**, upload the new evaluation dataset JSON files (English and French) and run the cell below.
Change `TEAM_NAME` to your team name before running!

In [None]:
# ============================================================
# FINAL EVALUATION - CHANGE THESE BEFORE RUNNING
# ============================================================
TEAM_NAME = "myteam"  # <-- CHANGE THIS to your team name

# Upload the final evaluation dataset to Colab/Drive first, then set paths:
EVAL_EN_JSON = os.path.join(DATA_DIR, 'final_eval_en.json')  # <-- English eval dataset path
EVAL_FR_JSON = os.path.join(DATA_DIR, 'final_eval_fr.json')  # <-- French eval dataset path


def _detect_with_shared_nlp(eval_json, lang_code, models, meta, threshold, columns):
    """Call detect_bots with the embedding / perplexity models both languages share."""
    return detect_bots(
        eval_json,
        models=models,
        threshold=threshold,
        feature_columns=columns,
        embed_model=embed_model,
        ppl_model=ppl_model, ppl_tokenizer=ppl_tokenizer, ppl_device=ppl_device,
        meta_model=meta,
        team_name=TEAM_NAME, lang=lang_code,
    )


# --- English detection (with stacking); skipped if the file is not there ---
if not os.path.exists(EVAL_EN_JSON):
    print(f"English evaluation file not found: {EVAL_EN_JSON}")
else:
    print("Running English bot detection (with stacking)...")
    en_bots, en_proba, en_df = _detect_with_shared_nlp(
        EVAL_EN_JSON, "en",
        models=final_en_models, meta=final_en_meta,
        threshold=final_en_threshold, columns=common_cols_en,
    )

# --- French detection (with stacking); skipped if the file is not there ---
if not os.path.exists(EVAL_FR_JSON):
    print(f"French evaluation file not found: {EVAL_FR_JSON}")
else:
    print("\nRunning French bot detection (with stacking)...")
    fr_bots, fr_proba, fr_df = _detect_with_shared_nlp(
        EVAL_FR_JSON, "fr",
        models=final_fr_models, meta=final_fr_meta,
        threshold=final_fr_threshold, columns=common_cols_fr,
    )

# Closing banner with the expected output file names and submission details.
_closing_lines = [
    "\n" + "="*60,
    "DONE! Check the output files:",
    f"  English: {TEAM_NAME}.detections.en.txt",
    f"  French:  {TEAM_NAME}.detections.fr.txt",
    f"\nSubmit these files to: bot.or.not.competition.adm@gmail.com",
    f"Deadline: Feb 14, 2026, 1:00 PM EST",
    "="*60,
]
for _line in _closing_lines:
    print(_line)

In [None]:
# Running locally: the detection files are written to the working directory,
# so just report which of the two expected files are present.
_expected_outputs = (f"{TEAM_NAME}.detections.en.txt", f"{TEAM_NAME}.detections.fr.txt")
for fname in filter(os.path.exists, _expected_outputs):
    print(f"File saved: {fname}")