# Sprint 3: Linguistic & Behavioral Features for Misinformation Detection

**Objective**: Test whether psychologically-grounded linguistic features improve model performance beyond bag-of-words.

**Experiment Design**:
1. **Baseline**: TF-IDF only (ROC-AUC = 0.859)
2. **Behavioral Features Only**: Sentiment, subjectivity, readability, certainty language
3. **Hybrid Model**: TF-IDF + Behavioral Features

## 1. Setup & Data Loading

In [None]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, auc, precision_recall_curve, f1_score, accuracy_score
)

import warnings
# Notebook-wide display configuration.
# NOTE(review): this filter silences every warning category for the rest of
# the session, so any warnings raised by the models below will not be shown.
warnings.filterwarnings('ignore')

# Seaborn grid theme and a wide default figure size for every plot.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✅ Libraries loaded")

In [None]:
# Load the article table and derive a binary target: 1 = "fake", 0 = otherwise.
df = pd.read_parquet("../../data/processed/articles.parquet")
df["label_num"] = (df["label"] == "fake").astype(int)

# Load the pre-computed behavioral feature table.
# NOTE(review): assumes its rows are in the same order as the articles — confirm
# against the script that produced features.parquet.
feats = pd.read_parquet("../../data/processed/features.parquet")

# The two tables are joined purely by row position. If their lengths differ,
# pd.concat(axis=1) would silently pad the shorter one with NaN (including
# label_num), so fail loudly here instead.
if len(df) != len(feats):
    raise ValueError(
        f"Row count mismatch: {len(df)} articles vs {len(feats)} feature rows; "
        "cannot align features to articles by position"
    )

# Combine side by side on a fresh 0..n-1 index.
df = pd.concat([df.reset_index(drop=True), feats.reset_index(drop=True)], axis=1)

print(f"Loaded {len(df)} articles with {feats.shape[1]} behavioral features")
print(f"\nFeature columns: {feats.columns.tolist()}")
print(f"\nData shape: {df.shape}")

## 2. Feature Correlation with Label

In [None]:
# Pearson correlation of every behavioral feature with the binary fake label.
feature_cols = list(feats.columns)
corr_matrix = df[feature_cols + ['label_num']].corr()
# label_num is the last column of the selection, so dropping the final entry
# removes its trivial self-correlation.
corr_with_label = corr_matrix['label_num'].iloc[:-1].sort_values(ascending=False)

print("\nCorrelation of Behavioral Features with Fake Label:")
print(corr_with_label)

# Horizontal bar chart: green bars for positive correlations, red otherwise.
bar_colors = ['green' if value > 0 else 'red' for value in corr_with_label]
fig, ax = plt.subplots(figsize=(10, 6))
corr_with_label.plot.barh(ax=ax, color=bar_colors)
ax.set_title('Feature Correlation with Fake Label', fontweight='bold', fontsize=12)
ax.set_xlabel('Correlation Coefficient')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

## 3. Data Preparation

In [None]:
# 80/20 train/validation split, stratified on the label so both parts keep the
# same fake/real ratio; the fixed seed makes the split reproducible.
model_inputs = df[["title"] + feature_cols]
target = df["label_num"]

X_train, X_val, y_train, y_val = train_test_split(
    model_inputs,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

print(f"Train size: {len(X_train):,}")
print(f"Val size: {len(X_val):,}")

## 4. Behavioral-Only Model

In [None]:
# Standardize the behavioral columns. The scaler is fitted on the training
# split only and then applied unchanged to the validation split.
scaler = StandardScaler().fit(X_train[feature_cols])
X_train_behav = scaler.transform(X_train[feature_cols])
X_val_behav = scaler.transform(X_val[feature_cols])

# Class-weighted logistic regression on the behavioral features alone.
print("Training Behavioral-Only Model...")
model_behav = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    random_state=42,
).fit(X_train_behav, y_train)

# Hard predictions feed accuracy/F1; the class-1 probability feeds ROC-AUC.
y_pred_proba_behav = model_behav.predict_proba(X_val_behav)[:, 1]
y_pred_behav = model_behav.predict(X_val_behav)
acc_behav = accuracy_score(y_val, y_pred_behav)
f1_behav = f1_score(y_val, y_pred_behav)
roc_auc_behav = roc_auc_score(y_val, y_pred_proba_behav)

rule = "=" * 60
print(f"\n{rule}")
print("BEHAVIORAL-ONLY MODEL RESULTS")
print(rule)
print(f"Accuracy: {acc_behav:.4f}")
print(f"F1 Score: {f1_behav:.4f}")
print(f"ROC-AUC: {roc_auc_behav:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_behav, target_names=["Real", "Fake"]))

## 5. TF-IDF Model (Baseline - Load Pre-trained)

In [None]:
# Reuse the fitted TF-IDF vectorizer and logistic-regression model saved to disk.
# NOTE(review): joblib files are pickle-based — only load artifacts from a
# trusted location.
# NOTE(review): these validation scores are only a fair baseline if the saved
# model was never trained on rows that are in X_val here — confirm the earlier
# sprint used the same split.
vectorizer = joblib.load("../../models/tfidf_vectorizer.joblib")
model_tfidf = joblib.load("../../models/logistic_regression.joblib")

# Project the titles of both splits into the saved TF-IDF vocabulary.
X_train_tfidf, X_val_tfidf = (
    vectorizer.transform(split["title"]) for split in (X_train, X_val)
)

# Score the pre-trained model on the validation titles.
y_pred_proba_tfidf = model_tfidf.predict_proba(X_val_tfidf)[:, 1]
y_pred_tfidf = model_tfidf.predict(X_val_tfidf)
acc_tfidf = accuracy_score(y_val, y_pred_tfidf)
f1_tfidf = f1_score(y_val, y_pred_tfidf)
roc_auc_tfidf = roc_auc_score(y_val, y_pred_proba_tfidf)

rule = "=" * 60
print(rule)
print("TF-IDF BASELINE (SPRINT 2)")
print(rule)
print(f"Accuracy: {acc_tfidf:.4f}")
print(f"F1 Score: {f1_tfidf:.4f}")
print(f"ROC-AUC: {roc_auc_tfidf:.4f}")

## 6. Hybrid Model: TF-IDF + Behavioral Features

In [None]:
# Build the hybrid design matrices: sparse TF-IDF columns first, followed by
# the standardized behavioral columns.
from scipy.sparse import hstack

X_train_hybrid, X_val_hybrid = (
    hstack([tfidf_part, behav_part])
    for tfidf_part, behav_part in (
        (X_train_tfidf, X_train_behav),
        (X_val_tfidf, X_val_behav),
    )
)

n_tfidf_features = X_train_tfidf.shape[1]
n_behavioral_features = X_train_behav.shape[1]
print(f"Hybrid feature matrix shape: {X_train_hybrid.shape}")
print(f"  - TF-IDF features: {n_tfidf_features}")
print(f"  - Behavioral features: {n_behavioral_features}")

In [None]:
# Fit a class-weighted logistic regression on the combined feature matrix,
# using the same hyperparameters as the behavioral-only model.
print("\nTraining Hybrid Model (TF-IDF + Behavioral)...")
model_hybrid = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    random_state=42,
).fit(X_train_hybrid, y_train)

# Hard predictions feed accuracy/F1; the class-1 probability feeds ROC-AUC.
y_pred_proba_hybrid = model_hybrid.predict_proba(X_val_hybrid)[:, 1]
y_pred_hybrid = model_hybrid.predict(X_val_hybrid)
acc_hybrid = accuracy_score(y_val, y_pred_hybrid)
f1_hybrid = f1_score(y_val, y_pred_hybrid)
roc_auc_hybrid = roc_auc_score(y_val, y_pred_proba_hybrid)

rule = "=" * 60
print(f"\n{rule}")
print("HYBRID MODEL RESULTS")
print(rule)
print(f"Accuracy: {acc_hybrid:.4f}")
print(f"F1 Score: {f1_hybrid:.4f}")
print(f"ROC-AUC: {roc_auc_hybrid:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_hybrid, target_names=["Real", "Fake"]))

## 7. Model Comparison

In [None]:
# Side-by-side metrics for the three models evaluated on the validation split.
baseline_name = 'TF-IDF (Baseline)'
comparison = pd.DataFrame({
    'Model': ['Behavioral Only', baseline_name, 'Hybrid (TF-IDF + Behavioral)'],
    'Accuracy': [acc_behav, acc_tfidf, acc_hybrid],
    'F1 Score': [f1_behav, f1_tfidf, f1_hybrid],
    'ROC-AUC': [roc_auc_behav, roc_auc_tfidf, roc_auc_hybrid]
})

print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(comparison.to_string(index=False))

# Best model = highest validation ROC-AUC.
best_row = comparison.loc[comparison['ROC-AUC'].idxmax()]
print(f"\nBest Model: {best_row['Model']} (ROC-AUC: {best_row['ROC-AUC']:.4f})")

# Relative ROC-AUC change (in percent) of every non-baseline model.
# The baseline sits in the middle row, so it is excluded by name rather than
# by position; a positional "skip row 0" would drop the behavioral-only model
# and compare the baseline with itself.
print(f"\nImprovement over baseline:")
for _, row in comparison.iterrows():
    if row['Model'] == baseline_name:
        continue
    improvement = ((row['ROC-AUC'] - roc_auc_tfidf) / roc_auc_tfidf) * 100
    print(f"  {row['Model']}: {improvement:+.2f}%")

In [None]:
# Grouped bar chart: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(comparison))
width = 0.25

# Draw the three metrics at fixed offsets around each group centre.
for metric in ('Accuracy', 'F1 Score', 'ROC-AUC'):
    if metric == 'Accuracy':
        positions = x - width
    elif metric == 'F1 Score':
        positions = x
    else:
        positions = x + width
    ax.bar(positions, comparison[metric], width, label=metric, alpha=0.8)

ax.set_title('Model Performance Comparison', fontweight='bold', fontsize=14)
ax.set_ylabel('Score')
ax.set_ylim([0, 1])
ax.set_xticks(x)
ax.set_xticklabels(comparison['Model'], rotation=15, ha='right')
ax.grid(axis='y', alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

## 8. Feature Importance (Behavioral Features in Hybrid Model)

In [None]:
# The behavioral columns were appended after the TF-IDF columns when the hybrid
# matrix was built, so their coefficients are the trailing entries of coef_.
n_behavioral = len(feature_cols)
behavioral_coefs = model_hybrid.coef_[0][-n_behavioral:]

# Rank the features by coefficient magnitude, largest first.
behavioral_importance = (
    pd.DataFrame({'Feature': feature_cols, 'Coefficient': behavioral_coefs})
    .assign(Abs_Coefficient=lambda table: table['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)

print("\nBehavioral Feature Importance (in Hybrid Model):")
print(behavioral_importance)

# Signed coefficients as horizontal bars: green for positive, red otherwise.
colors = [
    'green' if coef > 0 else 'red'
    for coef in behavioral_importance['Coefficient']
]
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    behavioral_importance['Feature'],
    behavioral_importance['Coefficient'],
    color=colors,
)
ax.set_title(
    'Behavioral Feature Importance\n(Logistic Regression Coefficients)',
    fontweight='bold',
    fontsize=12,
)
ax.set_xlabel('Coefficient')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
plt.tight_layout()
plt.show()

## 9. ROC Curve Comparison

In [None]:
# False/true positive rates for each model; the thresholds are not needed.
fpr_behav, tpr_behav = roc_curve(y_val, y_pred_proba_behav)[:2]
fpr_tfidf, tpr_tfidf = roc_curve(y_val, y_pred_proba_tfidf)[:2]
fpr_hybrid, tpr_hybrid = roc_curve(y_val, y_pred_proba_hybrid)[:2]

fig, ax = plt.subplots(figsize=(10, 8))

# One curve per model, drawn in the same order as the legend entries.
ax.plot(
    fpr_behav, tpr_behav,
    color='purple', lw=2,
    label=f'Behavioral Only (AUC={roc_auc_behav:.3f})',
)
ax.plot(
    fpr_tfidf, tpr_tfidf,
    color='darkorange', lw=2,
    label=f'TF-IDF (AUC={roc_auc_tfidf:.3f})',
)
ax.plot(
    fpr_hybrid, tpr_hybrid,
    color='darkgreen', lw=2.5, linestyle='--',
    label=f'Hybrid (AUC={roc_auc_hybrid:.3f})',
)
# Diagonal reference line for a classifier that guesses at random.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')

ax.set_title('ROC Curve Comparison: All Models', fontweight='bold', fontsize=13)
ax.set_xlabel('False Positive Rate', fontsize=11)
ax.set_ylabel('True Positive Rate', fontsize=11)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.grid(alpha=0.3)
ax.legend(loc="lower right", fontsize=10)

plt.tight_layout()
plt.show()

## 10. Key Insights

In [None]:
# Text summary of the sprint: headline metrics, strongest behavioral features,
# and the conclusions that follow from the validation scores.
print("\n" + "="*70)
print("KEY INSIGHTS FROM SPRINT 3")
print("="*70)

# Relative ROC-AUC change (in percent) versus the TF-IDF baseline.
improvement_hybrid = ((roc_auc_hybrid - roc_auc_tfidf) / roc_auc_tfidf) * 100
improvement_behav = ((roc_auc_behav - roc_auc_tfidf) / roc_auc_tfidf) * 100

print(f"""
PERFORMANCE:
  • Behavioral-only achieves {roc_auc_behav:.4f} ROC-AUC ({improvement_behav:+.2f}% vs baseline)
  • TF-IDF baseline: {roc_auc_tfidf:.4f} ROC-AUC
  • Hybrid model: {roc_auc_hybrid:.4f} ROC-AUC ({improvement_hybrid:+.2f}% vs baseline)

MOST PREDICTIVE FEATURES:
""")

# Five behavioral features with the largest absolute coefficient.
# The target encodes "fake" as 1, so a positive coefficient raises the
# predicted probability of Fake and a negative one pushes toward Real.
top_features = behavioral_importance.head(5)
for _, row in top_features.iterrows():
    direction = "→ Fake" if row['Coefficient'] > 0 else "→ Real"
    print(f"  • {row['Feature']}: {row['Coefficient']:.4f} {direction}")

# Which signal dominates is judged by comparing the two single-source models
# (TF-IDF only vs behavioral only). The hybrid model contains the TF-IDF
# features itself, so it cannot be used to show that text is "secondary".
print(f"""
IMPLICATIONS:
  • Behavioral features alone are {'predictive' if roc_auc_behav > 0.65 else 'not very predictive'}
  • Hybrid model {'improves' if improvement_hybrid > 0 else 'does not improve'} over baseline
  • Text content (TF-IDF) remains {'the dominant signal' if roc_auc_tfidf > roc_auc_behav else 'secondary to linguistic style'}
  • Next: Transformer fine-tuning for semantic understanding
""")