In [None]:
from datasets import load_dataset

# Load the dataset identified by "nahiar/twitter_bot_detection" via the `datasets` library.
data = load_dataset("nahiar/twitter_bot_detection")

# Pull out the "train" split and convert it to a pandas DataFrame so that
# pandas helpers such as head() are available.
train_split = data["train"]
df = train_split.to_pandas()

# Preview the first rows of the frame.
df.head()

# Data Exploration

## Summary

In [None]:
# DataFrame.info() prints its report directly and returns None, so capturing
# its return value and printing it only ever showed "Dataset Columns: None".
# Print the report first, then list the actual column names.
df.info()
columns = df.columns.tolist()
print("Dataset Columns:", columns)

In [None]:
# Count the missing (null) entries in every column.
null_mask = df.isna()
missing_values = null_mask.sum()
print("Missing Values:\n", missing_values)

In [None]:
# Number of rows in the training frame.
print(f"Train samples: {df.shape[0]}")

In [None]:
# Show, for every column, how many distinct values it holds.
# dropna=False keeps a missing value counted as one distinct value.
for column in df.columns:
    print(f"{column}: {df[column].nunique(dropna=False)}")

# Preprocessing

In [None]:
# --- EXISTING FEATURES (kept as they were) ---
# Binary target: 0 for "human", 1 for "bot" (any other account_type maps to NaN).
df["is_bot"] = df["account_type"].map({"human": 0, "bot": 1})
# A friends_count of 0 is replaced by 1 so the ratio never divides by zero.
df["follower_following_ratio"] = df["followers_count"] / df["friends_count"].replace(
    0, 1
)

# --- IMPROVEMENTS & NEW FEATURES ---

# 1. From the 'description' column
# Length of the bio. Missing bios are blanked first so they count as length 0;
# calling str() on a missing value would yield "None"/"nan" and report a bogus
# length of 4/3, hiding exactly the "no bio" signal this feature is meant to expose.
df["bio_length"] = df["description"].fillna("").astype(str).str.len()


# 2. From the 'screen_name' column
# Username length and the number of digits it contains (the original author
# notes these as classic signals for bot/spam accounts).
df["username_length"] = df["screen_name"].apply(lambda x: len(str(x)))
df["username_digit_count"] = df["screen_name"].apply(
    lambda x: sum(c.isdigit() for c in str(x))
)

# 3. From boolean / other columns
# 1 when the account does NOT use the default profile image, 0 when it does.
df["has_custom_profile_image"] = df["default_profile_image"].apply(
    lambda x: 0 if x else 1
)
# 1 when the background URL is not one of the recognised default images.
df["has_custom_background"] = df["profile_background_image_url"].apply(
    lambda x: 0 if "default_profile" in str(x) or "theme1/bg.png" in str(x) else 1
)
# 1 when a location is given. Missing values are blanked and surrounding
# whitespace stripped before comparing, so NaN/None and whitespace-only
# entries are treated as "no location" alongside the placeholder strings.
location_text = df["location"].fillna("").astype(str).str.strip().str.lower()
df["has_location"] = (~location_text.isin(["unknown", "none", ""])).astype(int)

# NOTE(review): the polarity is inverted relative to the column name — this is
# 0 when default_profile is True and 1 otherwise. The behaviour is kept because
# the trained model and the sample input further down rely on it; consider
# renaming the feature (e.g. "has_custom_profile") in a coordinated change.
df["is_default_profile"] = df["default_profile"].apply(lambda x: 0 if x else 1)

# 1 when geo_enabled is truthy, otherwise 0.
df["is_geo_enabled"] = df["geo_enabled"].apply(lambda x: 1 if x else 0)

# 1 when verified is truthy, otherwise 0.
df["is_verified"] = df["verified"].apply(lambda x: 1 if x else 0)

# Show the first 10 rows including the new features.
df.head(10)

## Remove Unused Columns

In [None]:
# Raw columns that have either been turned into engineered features above or
# are not used as model inputs.
unused_columns = [
    "location",
    "profile_background_image_url",
    "default_profile_image",
    "screen_name",
    "description",
    "Unnamed: 0",
    "created_at",
    "id",
    "lang",
    "geo_enabled",
    "verified",
    "default_profile",
    "profile_image_url",
]

# errors="ignore" makes the cell safe to re-run: an in-place drop raised KeyError
# on the second execution (columns already gone) or when an optional column such
# as "Unnamed: 0" is absent from the loaded data.
df = df.drop(columns=unused_columns, errors="ignore")
print("Columns after removal:", df.columns)
df.head(10)

# Data Visualization & Analysis

Let's create comprehensive visualizations to understand our bot detection dataset better.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# modelling cells below — consider narrowing the filter.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib's default style with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette(palette="husl")

In [None]:
# Feature groups shared by the analysis and modelling cells.

# Count-like / continuous inputs (raw dataset columns plus engineered ones).
numeric_features = [
    'favourites_count',
    'followers_count',
    'friends_count',
    'statuses_count',
    'average_tweets_per_day',
    'account_age_days',
    'follower_following_ratio',
    'bio_length',
    'username_length',
    'username_digit_count',
]

# 0/1 flags created in the preprocessing cell.
binary_features = [
    'has_custom_profile_image',
    'has_custom_background',
    'has_location',
    'is_default_profile',
    'is_geo_enabled',
    'is_verified',
]

# Model input order: numeric features first, then the binary flags.
all_features = [*numeric_features, *binary_features]

# Report each group's size followed by its members.
for heading, group in (
    ("Numeric Features:", numeric_features),
    ("\nBinary Features:", binary_features),
):
    print(heading, len(group))
    print(group)
print(f"\nTotal Features for Model: {len(all_features)}")

In [None]:
# 1. Bot vs Human Distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# value_counts() orders its result by frequency, not by class, while the labels
# and colours below are hard-coded as [Human, Bot]. Pin the order to
# [0 (human), 1 (bot)] so bars and pie slices always carry the right label,
# even if bots outnumber humans; fill_value=0 covers a class that is absent.
bot_counts = df['is_bot'].value_counts().reindex([0, 1], fill_value=0)
labels = ['Human', 'Bot']
colors = ['#2E86AB', '#A23B72']

# Bar plot with the raw count written above each bar
axes[0].bar(labels, bot_counts.values, color=colors, alpha=0.8)
axes[0].set_title('Distribution of Bots vs Humans', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count')
for i, v in enumerate(bot_counts.values):
    axes[0].text(i, v + 100, str(v), ha='center', fontweight='bold')

# Pie chart of the same counts as percentages
axes[1].pie(bot_counts.values, labels=labels, colors=colors, autopct='%1.1f%%',
           startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Percentage Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Text summary; bot_counts is indexed by class label (0 = human, 1 = bot).
print(f"Total samples: {len(df):,}")
print(f"Bots: {bot_counts[1]:,} ({bot_counts[1]/len(df)*100:.1f}%)")
print(f"Humans: {bot_counts[0]:,} ({bot_counts[0]/len(df)*100:.1f}%)")
print(f"Bot to Human ratio: 1:{bot_counts[0]/bot_counts[1]:.1f}")

In [None]:
# 2. Feature Distribution Analysis
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# Key numeric features comparison
key_features = ['followers_count', 'friends_count', 'statuses_count', 'account_age_days']

# axes.flat walks the 2x2 grid row by row, i.e. the same placement as
# indexing with (i // 2, i % 2).
for ax, feature in zip(axes.flat, key_features):
    pretty_name = feature.replace("_", " ").title()

    # Cap values at the 95th percentile (for display only) so extreme
    # outliers do not flatten the boxes; df itself is left untouched.
    cap_value = df[feature].quantile(0.95)
    df_plot = df[[feature, 'account_type']].assign(
        **{feature: df[feature].clip(upper=cap_value)}
    )

    # One box per account type
    sns.boxplot(data=df_plot, x='account_type', y=feature, ax=ax)
    ax.set_title(f'{pretty_name} Distribution', fontsize=14, fontweight='bold')
    ax.set_xlabel('Account Type')
    ax.set_ylabel(pretty_name)

plt.tight_layout()
plt.show()

# Statistical comparison on the uncapped data
print("=== STATISTICAL COMPARISON (HUMANS vs BOTS) ===")
human_rows = df['account_type'] == 'human'
bot_rows = df['account_type'] == 'bot'
for feature in key_features:
    human_data = df.loc[human_rows, feature]
    bot_data = df.loc[bot_rows, feature]
    human_mean = human_data.mean()
    bot_mean = bot_data.mean()

    print(f"\n{feature.upper()}:")
    print(f"  Humans - Mean: {human_mean:.2f}, Median: {human_data.median():.2f}")
    print(f"  Bots   - Mean: {bot_mean:.2f}, Median: {bot_data.median():.2f}")
    print(f"  Difference: {abs(human_mean - bot_mean):.2f}")

In [None]:
# 3. Binary Features Analysis
import pandas as pd

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# One panel per binary flag: within each account type, the percentage of
# accounts having the flag at 0 versus 1.
for ax, feature in zip(axes, binary_features):
    # normalize='columns' makes every account-type column sum to 100 %.
    crosstab = pd.crosstab(df[feature], df['account_type'], normalize='columns') * 100

    crosstab.plot(kind='bar', ax=ax, color=['#A23B72', '#2E86AB'])
    ax.set_title(f'{feature.replace("_", " ").title()}', fontsize=12, fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel('Percentage (%)')
    ax.legend(['Bot', 'Human'])
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Binary features statistics: the mean of a 0/1 flag is the share of accounts with it set.
print("=== BINARY FEATURES ANALYSIS ===")
human_rows = df[df['account_type'] == 'human']
bot_rows = df[df['account_type'] == 'bot']
for feature in binary_features:
    human_pct = human_rows[feature].mean() * 100
    bot_pct = bot_rows[feature].mean() * 100
    gap = abs(human_pct - bot_pct)
    print(f"{feature.replace('_', ' ').title()}:")
    print(f"  Humans: {human_pct:.1f}% | Bots: {bot_pct:.1f}% | Difference: {gap:.1f}%")

In [None]:
# 4. Correlation Heatmap
import numpy as np

plt.figure(figsize=(14, 10))

# Pairwise correlations between every model feature and the target.
correlation_features = all_features + ['is_bot']
correlation_matrix = df[correlation_features].corr()

# Hide the diagonal and the upper triangle, which only mirror the lower one.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)

plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Rank features by the absolute strength of their correlation with is_bot,
# and look the sign up separately to report the direction.
signed_correlations = correlation_matrix['is_bot'].drop('is_bot')
bot_correlations = signed_correlations.abs().sort_values(ascending=False)
print("=== TOP CORRELATIONS WITH BOT DETECTION ===")
for rank, (feature, corr) in enumerate(bot_correlations.head(8).items(), 1):
    direction = "positive" if signed_correlations[feature] > 0 else "negative"
    print(f"{rank}. {feature.replace('_', ' ').title()}: {corr:.3f} ({direction})")

# Model Training & Evaluation

Let's train multiple models and select the best one for bot detection.

In [None]:
# Prepare data for training
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Model inputs (numeric features followed by binary flags) and the 0/1 target.
feature_columns = numeric_features + binary_features
X = df[feature_columns]
y = df['is_bot']

print("=== DATA PREPARATION ===")
print(f"Features shape: {X.shape}")
print(f"Target distribution:")
print(y.value_counts())
print(f"Features used: {feature_columns}")

# Hold out 20 % of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Train target distribution:\n{y_train.value_counts()}")
print(f"Test target distribution:\n{y_test.value_counts()}")

# Fit the scaler on the training rows only, then apply the same fitted
# transform to both parts (RobustScaler is less sensitive to outliers).
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nData preparation completed!")

In [None]:
# Train multiple models including XGBoost and other advanced models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
from sklearn.pipeline import make_pipeline

# Candidate classifiers, all with default-ish settings and a fixed seed where supported.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss', n_jobs=-1),
    'LightGBM': LGBMClassifier(n_estimators=100, random_state=42, verbose=-1, n_jobs=-1),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(probability=True, random_state=42),
    'Naive Bayes': GaussianNB()
}

# Cross-validation results: per-fold scores and their mean, keyed by model name
cv_results = {}
cv_scores = {}

print("=== CROSS-VALIDATION RESULTS ===")
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\nTraining {name}...")

    # SVM, Logistic Regression and Naive Bayes are evaluated on scaled inputs;
    # tree-based models (RF, XGB, LightGBM, Extra Trees, GB, AdaBoost) use the raw data.
    # The scaler is wrapped in a pipeline so it is re-fitted on the training part
    # of every fold. Cross-validating on X_train_scaled instead would leak each
    # validation fold's statistics into the scaling, because that matrix was
    # scaled with a scaler fitted on the whole training set.
    if name in ['SVM', 'Logistic Regression', 'Naive Bayes']:
        estimator = make_pipeline(RobustScaler(), model)
    else:
        estimator = model

    # cross_val_score clones the estimator for each fold, so the instances
    # stored in `models` stay unfitted.
    cv_score = cross_val_score(estimator, X_train, y_train, cv=kfold, scoring='f1')
    cv_results[name] = cv_score
    cv_scores[name] = cv_score.mean()

    print(f"{name} - F1 Score: {cv_score.mean():.4f} (+/- {cv_score.std() * 2:.4f})")

# Sort models by mean F1, best first
sorted_models = sorted(cv_scores.items(), key=lambda x: x[1], reverse=True)
print(f"\n=== MODEL RANKING (by F1 Score) ===")
for i, (name, score) in enumerate(sorted_models, 1):
    print(f"{i}. {name}: {score:.4f}")

best_model_name = sorted_models[0][0]
print(f"\nBest model: {best_model_name}")

# Visualize the per-fold score spread of every model
plt.figure(figsize=(15, 8))
model_names = list(cv_results.keys())
cv_scores_list = [cv_results[name] for name in model_names]

# Create boxplot with one colour per model
box_plot = plt.boxplot(cv_scores_list, labels=model_names, patch_artist=True)
colors = plt.cm.Set3(range(len(model_names)))
for patch, color in zip(box_plot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

plt.title('Cross-Validation F1 Scores Comparison', fontsize=16, fontweight='bold')
plt.ylabel('F1 Score', fontsize=12)
plt.xlabel('Models', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Display top 3 models performance comparison
print(f"\n🏆 TOP 3 MODELS:")
for i, (name, score) in enumerate(sorted_models[:3], 1):
    print(f"{i}. {name}: {score:.4f} F1-Score")

In [None]:
# Fit the cross-validation winner on the full training set and evaluate it on the hold-out set.
best_model = models[best_model_name]

# SVM, Logistic Regression and Naive Bayes use the scaled matrices; every other model the raw frames.
needs_scaling = best_model_name in ['SVM', 'Logistic Regression', 'Naive Bayes']
X_train_final, X_test_final = (
    (X_train_scaled, X_test_scaled) if needs_scaling else (X_train, X_test)
)

print(f"=== TRAINING FINAL MODEL: {best_model_name} ===")
best_model.fit(X_train_final, y_train)

# Hard labels plus the predicted probability of the positive class (column 1).
y_pred = best_model.predict(X_test_final)
y_pred_proba = best_model.predict_proba(X_test_final)[:, 1]

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print(f"\n=== FINAL MODEL PERFORMANCE ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC: {auc:.4f}")

print(f"\n=== DETAILED CLASSIFICATION REPORT ===")
print(classification_report(y_test, y_pred, target_names=['Human', 'Bot']))

# Three panels side by side: confusion matrix, ROC curve, feature importance.
fig, (ax_cm, ax_roc, ax_importance) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: confusion matrix
cm = confusion_matrix(y_test, y_pred)
class_names = ['Human', 'Bot']
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')

# Panel 2: ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc:.2f})')
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")

# Panel 3: feature importance, taken from whichever attribute the model exposes
# (feature_importances_ is checked first, then the absolute coefficients).
if hasattr(best_model, 'feature_importances_'):
    importance_values = best_model.feature_importances_
    importance_title = 'Feature Importance'
    importance_xlabel = 'Importance'
elif hasattr(best_model, 'coef_'):
    importance_values = abs(best_model.coef_[0])
    importance_title = 'Feature Importance (|Coefficients|)'
    importance_xlabel = '|Coefficient|'
else:
    importance_values = None

if importance_values is not None:
    # Ascending sort so the most important feature ends up at the top of the bar chart.
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': importance_values
    }).sort_values('importance', ascending=True)

    bar_positions = range(len(feature_importance))
    ax_importance.barh(bar_positions, feature_importance['importance'])
    ax_importance.set_yticks(bar_positions)
    ax_importance.set_yticklabels(feature_importance['feature'])
    ax_importance.set_title(importance_title)
    ax_importance.set_xlabel(importance_xlabel)
else:
    ax_importance.text(0.5, 0.5, 'Feature importance\nnot available for\nthis model type',
                       ha='center', va='center', transform=ax_importance.transAxes)
    ax_importance.set_title('Feature Importance')

plt.tight_layout()
plt.show()

# 🚀 Model Deployment & Hugging Face Upload

Now let's save our best model and prepare it for deployment on Hugging Face Hub!

In [None]:
# Persist the fitted model, the scaler and a JSON description of the run
import json
import os
import pickle
from datetime import datetime

# Output directory (created if it does not exist yet) and the files written into it
model_dir = "twitter_bot_detection_model"
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, "best_model.pkl")
scaler_path = os.path.join(model_dir, "scaler.pkl")
metadata_path = os.path.join(model_dir, "model_metadata.json")

# Serialise the trained model first, then the scaler (kept for consistent preprocessing)
for artefact, target_path in ((best_model, model_path), (scaler, scaler_path)):
    joblib.dump(artefact, target_path)

# Feature names, hold-out metrics and run information stored next to the model
performance_metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'auc_roc': auc
}
metadata = {
    'model_name': best_model_name,
    'feature_columns': feature_columns,
    'numeric_features': numeric_features,
    'binary_features': binary_features,
    'performance_metrics': performance_metrics,
    'training_date': datetime.now().isoformat(),
    'dataset_size': len(df),
    'features_count': len(feature_columns)
}

with open(metadata_path, 'w') as f:
    f.write(json.dumps(metadata, indent=2))

print("✅ Model saved successfully!")
print(f"📁 Model directory: {model_dir}")
print(f"🤖 Best model: {best_model_name}")
print(f"📊 Performance: {f1:.4f} F1-Score")
print(f"💾 Files saved:")
for saved_path in (model_path, scaler_path, metadata_path):
    print(f"  - {saved_path}")

# Create a simple prediction function
def predict_bot(input_data):
    """
    Predict if a Twitter account is a bot or human.

    Uses the notebook-level ``best_model``, ``best_model_name``, ``scaler``
    and ``feature_columns`` produced by the training cells.

    Args:
        input_data: dict with keys matching feature_columns. Features that
            are missing from the dict default to 0; keys that are not in
            feature_columns are ignored.

    Returns:
        dict with 'prediction' ('Bot' or 'Human'), 'confidence' (the larger
        of the two class probabilities), 'bot_probability' and
        'human_probability'.
    """
    import pandas as pd

    # Models that the training cells fit on scaled inputs; keep this in sync
    # with the list used there.
    scaled_model_names = ('SVM', 'Logistic Regression', 'Naive Bayes')

    # Single-row frame restricted to the training columns, in training order;
    # absent features are filled with 0.
    df_input = pd.DataFrame([input_data]).reindex(columns=feature_columns, fill_value=0)

    # A model trained on scaled data must see scaled data at inference time too;
    # feeding it raw values would silently produce meaningless predictions.
    if best_model_name in scaled_model_names:
        model_input = scaler.transform(df_input)
    else:
        model_input = df_input

    # Make prediction
    prediction = best_model.predict(model_input)[0]
    probability = best_model.predict_proba(model_input)[0]

    return {
        'prediction': 'Bot' if prediction == 1 else 'Human',
        'confidence': max(probability),
        'bot_probability': probability[1],
        'human_probability': probability[0]
    }

# Smoke-test predict_bot with one hand-written account
sample_data = {
    # raw counts and activity
    'favourites_count': 1000,
    'followers_count': 500,
    'friends_count': 200,
    'statuses_count': 1500,
    'average_tweets_per_day': 2.5,
    'account_age_days': 365,
    # engineered numeric features
    'follower_following_ratio': 2.5,
    'bio_length': 120,
    'username_length': 12,
    'username_digit_count': 2,
    # binary flags
    'has_custom_profile_image': 1,
    'has_custom_background': 1,
    'has_location': 1,
    'is_default_profile': 1,
    'is_geo_enabled': 1,
    'is_verified': 0
}

test_prediction = predict_bot(sample_data)

print(f"\n🧪 Test Prediction:")
print(f"Sample account is predicted as: {test_prediction['prediction']}")
# The three numeric results share one format, so print them in a loop.
for caption, result_key in (
    ("Confidence", 'confidence'),
    ("Bot probability", 'bot_probability'),
    ("Human probability", 'human_probability'),
):
    print(f"{caption}: {test_prediction[result_key]:.3f}")