## 1. Setup & Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
)
import warnings
warnings.filterwarnings('ignore')

# NLP preprocessing
import re
import string
from nltk.corpus import stopwords
import nltk
# Make sure every NLTK resource used by this notebook is installed.
# Each resource is probed on its own: probing only 'punkt' would skip the
# 'stopwords' download whenever punkt happened to be installed already,
# and the later `stopwords` usage would then fail with a LookupError.
for _nltk_path, _nltk_package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_nltk_path)
    except LookupError:
        nltk.download(_nltk_package)

# Set style: white grid background and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ All imports successful")

## 2. Create Synthetic Misinformation Dataset

*(In production: Download LIAR or FakeNewsNet)*

In [None]:
# Build a small synthetic corpus of labelled statements for the capstone.
# A production run would load the real LIAR or FakeNewsNet data instead.

np.random.seed(42)  # fixed seed so the sampled corpus is reproducible

# Statements labelled 'false' (misinformation, synthetic examples)
false_statements = [
    "5G towers cause COVID-19 transmission",
    "Vaccines contain microchips for tracking",
    "The moon landing was faked",
    "Climate change is a hoax",
    "Chemtrails control weather",
    "Earth is flat and NASA lies",
    "Vaccines caused the autism crisis",
    "JFK assassination was conspiracy",
]

# Statements labelled 'true' (synthetic examples)
true_statements = [
    "Water freezes at 0 degrees Celsius",
    "The Earth orbits the Sun",
    "Vaccines have saved millions of lives",
    "Carbon dioxide causes climate change",
    "DNA is the molecule of heredity",
    "Gravity pulls objects downward",
    "The sun is a star",
    "Sound travels slower than light",
]

# Assemble the corpus: one 'false' and one 'true' sample per variant number
n_samples = 500
texts = []
labels = []

for variant in range(n_samples // 2):
    # The false draw always precedes the true draw, so the random stream is
    # consumed in a fixed order and the classes stay exactly balanced.
    for pool, tag in ((false_statements, 'false'), (true_statements, 'true')):
        texts.append(np.random.choice(pool) + f" statement variant {variant}")
        labels.append(tag)

# Tabulate with 1-based ids, then shuffle the rows with a fixed seed and
# renumber the index from zero.
df_news = (
    pd.DataFrame({
        'text': texts,
        'label': labels,
        'id': range(1, len(texts) + 1)
    })
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"✓ Created synthetic misinformation dataset: {len(df_news)} samples")
print("\nLabel distribution:", df_news['label'].value_counts(), sep="\n")
print("\nFirst 5 examples:", df_news.head(), sep="\n")

## 3. Data Quality & Exploration

In [None]:
# Data-quality pass over df_news: missing values, text-size statistics and
# the ratio between the two label classes.
print("Missing data:")
print(df_news.isna().sum())

# Derive per-row size features (character count and whitespace-token count)
raw_text = df_news['text']
df_news['text_length'] = raw_text.str.len()
df_news['word_count'] = raw_text.str.split().str.len()

char_len = df_news['text_length']
n_words = df_news['word_count']

print("\nText Statistics:")
print(f"Mean length: {char_len.mean():.0f} chars, SD = {char_len.std():.0f}")
print(f"Mean word count: {n_words.mean():.1f}, SD = {n_words.std():.1f}")
print(f"Length range: {char_len.min()} - {char_len.max()} chars")

# Class balance: counts per label, then 'true' count divided by 'false' count
print("\nClass balance:")
class_dist = df_news['label'].value_counts()
print(class_dist)
balance_ratio = class_dist['true'] / class_dist['false']
print(f"Balance ratio: {balance_ratio:.2f}")

## 4. Text Preprocessing

## 5. Train-Test Split (Stratified)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the seed fixes the split.
# NOTE(review): 'text_cleaned' is not created in the visible cells —
# presumably the preprocessing section adds it; confirm it has run first.
target = df_news['label']
X_train, X_test, y_train, y_test = train_test_split(
    df_news['text_cleaned'],
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

# Report partition sizes first, then the per-class counts of each partition
for split_name, split_X in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} set: {len(split_X)} samples")

for split_name, split_y in (("Train", y_train), ("Test", y_test)):
    print(f"\n{split_name} class distribution:")
    print(split_y.value_counts())

## 6. Feature Engineering: TF-IDF Vectorization

## 7. Baseline Model 1: Logistic Regression + TF-IDF

# Naive Bayes baseline: fit on the TF-IDF training matrix, predict on the
# test matrix, and report the standard binary-classification metrics.
clf_nb = MultinomialNB().fit(X_train_tfidf, y_train)

# Hard labels plus the score of the second class column.
# NOTE(review): column 1 is presumably 'true' (classes_ sorted as
# ['false', 'true']) — confirm against clf_nb.classes_.
y_pred_nb = clf_nb.predict(X_test_tfidf)
y_pred_proba_nb = clf_nb.predict_proba(X_test_tfidf)[:, 1]

# Metrics, with 'true' as the positive class throughout
acc_nb = accuracy_score(y_test, y_pred_nb)
prec_nb, rec_nb, f1_nb = (
    metric(y_test, y_pred_nb, pos_label='true')
    for metric in (precision_score, recall_score, f1_score)
)
auc_nb = roc_auc_score(y_test.map({'true': 1, 'false': 0}), y_pred_proba_nb)

print("Naive Bayes + TF-IDF Performance:")
print("=" * 60)
# Captions are left-padded to a common width so the values line up
for caption, score in (
    ("Accuracy:", acc_nb),
    ("Precision:", prec_nb),
    ("Recall:", rec_nb),
    ("F1-Score:", f1_nb),
    ("ROC-AUC:", auc_nb),
):
    print(f"{caption:<10} {score:.4f}")

## 11. Confusion Matrices

# ROC curves: overlay the three baseline classifiers on one axis and save
# the figure under docs/figures/.
import os

fig, ax = plt.subplots(figsize=(10, 7))

# Encode the string labels as 0/1 with 'true' as the positive class
y_test_binary = y_test.map({'true': 1, 'false': 0})

# Logistic Regression
# (y_pred_proba_lr / auc_lr come from an earlier cell not shown here)
fpr_lr, tpr_lr, _ = roc_curve(y_test_binary, y_pred_proba_lr)
ax.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC={auc_lr:.3f})', linewidth=2)

# Naive Bayes
fpr_nb, tpr_nb, _ = roc_curve(y_test_binary, y_pred_proba_nb)
ax.plot(fpr_nb, tpr_nb, label=f'Naive Bayes (AUC={auc_nb:.3f})', linewidth=2)

# SVM: scored with y_decision_svm rather than a probability
# NOTE(review): presumably the LinearSVC decision_function output — confirm.
fpr_svm, tpr_svm, _ = roc_curve(y_test_binary, y_decision_svm)
ax.plot(fpr_svm, tpr_svm, label=f'Linear SVM (AUC={auc_svm:.3f})', linewidth=2)

# Diagonal (random classifier) as a visual reference
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')

ax.set_xlabel('False Positive Rate', fontsize=11)
ax.set_ylabel('True Positive Rate', fontsize=11)
ax.set_title('ROC Curves: Baseline Models', fontsize=12, fontweight='bold')
ax.legend(fontsize=10, loc='lower right')
ax.grid(alpha=0.3)

plt.tight_layout()
# savefig does not create missing folders; make sure the target exists so a
# fresh checkout does not fail with FileNotFoundError.
os.makedirs('docs/figures', exist_ok=True)
plt.savefig('docs/figures/07_baseline_roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: docs/figures/07_baseline_roc_curves.png")

# Feature importance: pair every feature name with its Logistic Regression
# coefficient, plot the 15 largest by magnitude, and list the strongest
# features for each class.
import os

feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coefficient': clf_lr.coef_[0]
})

# Top 15 features (by absolute coefficient). argsort yields row POSITIONS,
# so the rows are selected with iloc rather than a label-based reindex.
top_positions = feature_importance['coefficient'].abs().to_numpy().argsort()[-15:]
top_features = feature_importance.iloc[top_positions].sort_values('coefficient', ascending=True)

# Plot: negative coefficients in red, non-negative in green
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['red' if x < 0 else 'green' for x in top_features['coefficient']]
ax.barh(top_features['feature'], top_features['coefficient'], color=colors, alpha=0.7)
ax.set_xlabel('Coefficient (importance)', fontsize=11)
ax.set_title('Top 15 Features: Logistic Regression\n(Red=predict false, Green=predict true)', 
             fontsize=12, fontweight='bold')
ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
# savefig does not create missing folders; make sure the target exists so a
# fresh checkout does not fail with FileNotFoundError.
os.makedirs('docs/figures', exist_ok=True)
plt.savefig('docs/figures/08_baseline_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: docs/figures/08_baseline_feature_importance.png")

# The per-class lists are drawn from the FULL coefficient table, not from
# the 15-row plot subset, so each list can actually reach ten entries.
# In this dataset 'true' labels the credible statements and 'false' labels
# the misinformation statements.
# NOTE(review): assumes positive coefficients favour 'true', i.e. clf_lr was
# fit on the 'false'/'true' labels — confirm against clf_lr.classes_.
is_positive = feature_importance['coefficient'] > 0
is_negative = feature_importance['coefficient'] < 0
top_true = feature_importance[is_positive].nlargest(10, 'coefficient')
top_false = feature_importance[is_negative].nsmallest(10, 'coefficient')

print("\nTop 10 features predicting 'true' (credible):")
print(top_true[['feature', 'coefficient']].to_string(index=False))
print("\nTop 10 features predicting 'false' (misinformation):")
print(top_false[['feature', 'coefficient']].to_string(index=False))

# Error analysis for the Logistic Regression baseline: one row per test
# sample with its text, actual label, predicted label and the model's
# confidence (the larger of the predicted class probabilities).
errors_df = pd.DataFrame({
    'text': X_test.values,
    'true_label': y_test.values,
    'pred_label': y_pred_lr,
    'confidence': np.max(clf_lr.predict_proba(X_test_tfidf), axis=1)
})

# 'true' is the positive class here, so:
#   false positive = predicted 'true', actually 'false'
#   false negative = predicted 'false', actually 'true'
errors_df['is_correct'] = errors_df['true_label'] == errors_df['pred_label']
errors_df['is_false_positive'] = (errors_df['true_label'] == 'false') & (errors_df['pred_label'] == 'true')
errors_df['is_false_negative'] = (errors_df['true_label'] == 'true') & (errors_df['pred_label'] == 'false')

# Error statistics
print("Error Analysis (Logistic Regression):")
print("="*80)
print(f"Correct predictions: {errors_df['is_correct'].sum()} / {len(errors_df)}")
print(f"False Positives (predicted true, actually false): {errors_df['is_false_positive'].sum()}")
print(f"False Negatives (predicted false, actually true): {errors_df['is_false_negative'].sum()}")

# A false positive is a 'false' (misinformation) statement that the model
# accepted as 'true'; show the three it was most confident about.
print("\nExamples of False Positives (misinformation the model accepted as true):")
fp_examples = errors_df[errors_df['is_false_positive']].nlargest(3, 'confidence')
for _, row in fp_examples.iterrows():
    print(f"  Text: {row['text'][:60]}... | Confidence: {row['confidence']:.3f}")

# A false negative is a 'true' statement that the model flagged as 'false'.
# NOTE(review): this picks the three LEAST confident cases while the block
# above picks the MOST confident — confirm the asymmetry is intended.
print("\nExamples of False Negatives (true statements the model flagged as misinformation):")
fn_examples = errors_df[errors_df['is_false_negative']].nsmallest(3, 'confidence')
for _, row in fn_examples.iterrows():
    print(f"  Text: {row['text'][:60]}... | Confidence: {row['confidence']:.3f}")