# Fake News Detection - Model Training

This notebook trains and compares multiple models:

## Traditional ML Models:
- Logistic Regression
- Naive Bayes
- Support Vector Machine (SVM)
- Random Forest

## Deep Learning Models:
- LSTM (Long Short-Term Memory)
- BiLSTM (Bidirectional LSTM)
- CNN-LSTM (Hybrid model)

In [1]:
# Import libraries
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import issparse
import warnings

warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

%matplotlib inline

## Part 1: Traditional ML Models with TF-IDF Features

In [2]:
# Load TF-IDF features
print("Loading TF-IDF features...")

X_train = np.load('../data/processed/features/tfidf/X_train.npy', allow_pickle=True)
y_train = np.load('../data/processed/features/tfidf/y_train.npy', allow_pickle=True)
X_val = np.load('../data/processed/features/tfidf/X_val.npy', allow_pickle=True)
y_val = np.load('../data/processed/features/tfidf/y_val.npy', allow_pickle=True)
X_test = np.load('../data/processed/features/tfidf/X_test.npy', allow_pickle=True)
y_test = np.load('../data/processed/features/tfidf/y_test.npy', allow_pickle=True)

print(f"Training data: {X_train.shape}")
print(f"Validation data: {X_val.shape}")
print(f"Test data: {X_test.shape}")

Loading TF-IDF features...
Training data: ()
Validation data: ()
Test data: ()


### 1.1 Logistic Regression

In [3]:
from src.models.traditional_models import TraditionalModelTrainer

# Train Logistic Regression
lr_trainer = TraditionalModelTrainer(model_type='logistic')
lr_trainer.train(X_train, y_train)

# Evaluate
lr_train_metrics = lr_trainer.evaluate(X_train, y_train, 'Training')
lr_val_metrics = lr_trainer.evaluate(X_val, y_val, 'Validation')
lr_test_metrics = lr_trainer.evaluate(X_test, y_test, 'Test')

# Classification report
print("\nClassification Report:")
print(lr_trainer.get_classification_report(X_test, y_test))

ðŸš€ GPU Optimizations Enabled:
   âœ… Mixed Precision (FP16) - 2-3x faster training
   âœ… XLA compilation - Optimized GPU kernels
   âœ… Memory growth - Efficient GPU memory usage
âœ… Found 1 GPU(s): [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
   Using GPU for training (GTX 1650 with 4GB VRAM)
âœ… Found 1 GPU(s): ['/physical_device:GPU:0']
   Using GPU for training (GTX 1650)
Training logistic model...


IndexError: tuple index out of range

In [None]:
# Confusion Matrix
from sklearn.metrics import ConfusionMatrixDisplay

cm = lr_trainer.get_confusion_matrix(X_test, y_test)

fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake', 'Real'])
disp.plot(cmap='Blues', ax=ax)
plt.title('Logistic Regression - Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

### 1.2 Naive Bayes

In [None]:
# Train Naive Bayes
nb_trainer = TraditionalModelTrainer(model_type='naive_bayes')
nb_trainer.train(X_train, y_train)

# Evaluate
nb_train_metrics = nb_trainer.evaluate(X_train, y_train, 'Training')
nb_val_metrics = nb_trainer.evaluate(X_val, y_val, 'Validation')
nb_test_metrics = nb_trainer.evaluate(X_test, y_test, 'Test')

print("\nClassification Report:")
print(nb_trainer.get_classification_report(X_test, y_test))

### 1.3 Support Vector Machine

In [None]:
# Train SVM (may take longer)
print("Note: SVM training may take several minutes...")

svm_trainer = TraditionalModelTrainer(model_type='svm')
svm_trainer.train(X_train, y_train)

# Evaluate
svm_train_metrics = svm_trainer.evaluate(X_train, y_train, 'Training')
svm_val_metrics = svm_trainer.evaluate(X_val, y_val, 'Validation')
svm_test_metrics = svm_trainer.evaluate(X_test, y_test, 'Test')

print("\nClassification Report:")
print(svm_trainer.get_classification_report(X_test, y_test))

### 1.4 Random Forest

In [None]:
# Train Random Forest
rf_trainer = TraditionalModelTrainer(model_type='random_forest')
rf_trainer.train(X_train, y_train)

# Evaluate
rf_train_metrics = rf_trainer.evaluate(X_train, y_train, 'Training')
rf_val_metrics = rf_trainer.evaluate(X_val, y_val, 'Validation')
rf_test_metrics = rf_trainer.evaluate(X_test, y_test, 'Test')

print("\nClassification Report:")
print(rf_trainer.get_classification_report(X_test, y_test))

### 1.5 Compare Traditional ML Models

In [None]:
# Create comparison DataFrame
ml_results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Naive Bayes', 'SVM', 'Random Forest'],
    'Train Accuracy': [
        lr_train_metrics['accuracy'],
        nb_train_metrics['accuracy'],
        svm_train_metrics['accuracy'],
        rf_train_metrics['accuracy']
    ],
    'Val Accuracy': [
        lr_val_metrics['accuracy'],
        nb_val_metrics['accuracy'],
        svm_val_metrics['accuracy'],
        rf_val_metrics['accuracy']
    ],
    'Test Accuracy': [
        lr_test_metrics['accuracy'],
        nb_test_metrics['accuracy'],
        svm_test_metrics['accuracy'],
        rf_test_metrics['accuracy']
    ],
    'Test F1': [
        lr_test_metrics['f1'],
        nb_test_metrics['f1'],
        svm_test_metrics['f1'],
        rf_test_metrics['f1']
    ],
    'Test Precision': [
        lr_test_metrics['precision'],
        nb_test_metrics['precision'],
        svm_test_metrics['precision'],
        rf_test_metrics['precision']
    ],
    'Test Recall': [
        lr_test_metrics['recall'],
        nb_test_metrics['recall'],
        svm_test_metrics['recall'],
        rf_test_metrics['recall']
    ]
})

print("\n" + "="*80)
print("TRADITIONAL ML MODELS COMPARISON")
print("="*80)
print(ml_results.to_string(index=False))
print("="*80)

In [None]:
# Visualize comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics = ['Test Accuracy', 'Test F1', 'Test Precision', 'Test Recall']
colors = ['steelblue', 'coral', 'lightgreen', 'plum']

for idx, (metric, color) in enumerate(zip(metrics, colors)):
    ax = axes[idx // 2, idx % 2]
    ax.barh(ml_results['Model'], ml_results[metric], color=color, alpha=0.7, edgecolor='black')
    ax.set_xlabel(metric)
    ax.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
    ax.set_xlim([0, 1])
    ax.grid(axis='x', alpha=0.3)
    
    # Add value labels
    for i, v in enumerate(ml_results[metric]):
        ax.text(v + 0.01, i, f'{v:.4f}', va='center')

plt.tight_layout()
plt.show()

## Part 2: Deep Learning Models

In [None]:
# Load preprocessed text data
print("Loading preprocessed data for deep learning...")

df = pd.read_csv('../data/processed/processed_news.csv')

# Split data
from sklearn.model_selection import train_test_split

train_val, test = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])
train, val = train_test_split(train_val, test_size=0.15/0.85, random_state=42, stratify=train_val['label'])

texts_train = train['cleaned_text'].values
y_train_dl = train['label'].values
texts_val = val['cleaned_text'].values
y_val_dl = val['label'].values
texts_test = test['cleaned_text'].values
y_test_dl = test['label'].values

print(f"Training samples: {len(texts_train)}")
print(f"Validation samples: {len(texts_val)}")
print(f"Test samples: {len(texts_test)}")

### 2.1 LSTM Model

In [None]:
from src.models.deep_learning_models import LSTMModel

# Initialize LSTM
lstm_model = LSTMModel(max_features=10000, embedding_dim=128, max_length=500)

# Prepare data
print("Preparing data...")
X_train_lstm, y_train_lstm = lstm_model.prepare_data(texts_train, y_train_dl)
X_val_lstm = lstm_model.prepare_data(texts_val)
X_test_lstm = lstm_model.prepare_data(texts_test)

# Build and train
lstm_model.build_model()
print("\nLSTM Model Architecture:")
lstm_model.model.summary()

print("\nTraining LSTM...")
lstm_history = lstm_model.train(X_train_lstm, y_train_lstm, X_val_lstm, y_val_dl, epochs=10)

In [None]:
# Evaluate LSTM
lstm_test_metrics = lstm_model.evaluate(X_test_lstm, y_test_dl)

print("\nLSTM Test Metrics:")
print(f"  Accuracy:  {lstm_test_metrics['accuracy']:.4f}")
print(f"  Precision: {lstm_test_metrics['precision']:.4f}")
print(f"  Recall:    {lstm_test_metrics['recall']:.4f}")
print(f"  F1-Score:  {lstm_test_metrics['f1']:.4f}")

# Save model
lstm_model.save('lstm_model')

In [None]:
# Plot training history
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Accuracy
axes[0].plot(lstm_history.history['accuracy'], label='Train', marker='o')
axes[0].plot(lstm_history.history['val_accuracy'], label='Validation', marker='s')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('LSTM - Training & Validation Accuracy', fontweight='bold')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Loss
axes[1].plot(lstm_history.history['loss'], label='Train', marker='o')
axes[1].plot(lstm_history.history['val_loss'], label='Validation', marker='s')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].set_title('LSTM - Training & Validation Loss', fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

### 2.2 BiLSTM Model

In [None]:
from src.models.deep_learning_models import BiLSTMModel

# Initialize BiLSTM
bilstm_model = BiLSTMModel(max_features=10000, embedding_dim=128, max_length=500)

# Prepare data
X_train_bilstm, y_train_bilstm = bilstm_model.prepare_data(texts_train, y_train_dl)
X_val_bilstm = bilstm_model.prepare_data(texts_val)
X_test_bilstm = bilstm_model.prepare_data(texts_test)

# Build and train
bilstm_model.build_model()
print("\nBiLSTM Model Architecture:")
bilstm_model.model.summary()

print("\nTraining BiLSTM...")
bilstm_history = bilstm_model.train(X_train_bilstm, y_train_bilstm, X_val_bilstm, y_val_dl, epochs=10)

In [None]:
# Evaluate BiLSTM
bilstm_test_metrics = bilstm_model.evaluate(X_test_bilstm, y_test_dl)

print("\nBiLSTM Test Metrics:")
print(f"  Accuracy:  {bilstm_test_metrics['accuracy']:.4f}")
print(f"  Precision: {bilstm_test_metrics['precision']:.4f}")
print(f"  Recall:    {bilstm_test_metrics['recall']:.4f}")
print(f"  F1-Score:  {bilstm_test_metrics['f1']:.4f}")

# Save model
bilstm_model.save('bilstm_model')

### 2.3 CNN-LSTM Model

In [None]:
from src.models.deep_learning_models import CNNLSTMModel

# Initialize CNN-LSTM
cnn_lstm_model = CNNLSTMModel(max_features=10000, embedding_dim=128, max_length=500)

# Prepare data
X_train_cnn, y_train_cnn = cnn_lstm_model.prepare_data(texts_train, y_train_dl)
X_val_cnn = cnn_lstm_model.prepare_data(texts_val)
X_test_cnn = cnn_lstm_model.prepare_data(texts_test)

# Build and train
cnn_lstm_model.build_model()
print("\nCNN-LSTM Model Architecture:")
cnn_lstm_model.model.summary()

print("\nTraining CNN-LSTM...")
cnn_lstm_history = cnn_lstm_model.train(X_train_cnn, y_train_cnn, X_val_cnn, y_val_dl, epochs=10)

In [None]:
# Evaluate CNN-LSTM
cnn_lstm_test_metrics = cnn_lstm_model.evaluate(X_test_cnn, y_test_dl)

print("\nCNN-LSTM Test Metrics:")
print(f"  Accuracy:  {cnn_lstm_test_metrics['accuracy']:.4f}")
print(f"  Precision: {cnn_lstm_test_metrics['precision']:.4f}")
print(f"  Recall:    {cnn_lstm_test_metrics['recall']:.4f}")
print(f"  F1-Score:  {cnn_lstm_test_metrics['f1']:.4f}")

# Save model
cnn_lstm_model.save('cnn_lstm_model')

## Part 3: Overall Model Comparison

In [None]:
# Combine all results
all_results = pd.DataFrame({
    'Model': [
        'Logistic Regression', 'Naive Bayes', 'SVM', 'Random Forest',
        'LSTM', 'BiLSTM', 'CNN-LSTM'
    ],
    'Type': [
        'ML', 'ML', 'ML', 'ML',
        'DL', 'DL', 'DL'
    ],
    'Accuracy': [
        lr_test_metrics['accuracy'],
        nb_test_metrics['accuracy'],
        svm_test_metrics['accuracy'],
        rf_test_metrics['accuracy'],
        lstm_test_metrics['accuracy'],
        bilstm_test_metrics['accuracy'],
        cnn_lstm_test_metrics['accuracy']
    ],
    'F1-Score': [
        lr_test_metrics['f1'],
        nb_test_metrics['f1'],
        svm_test_metrics['f1'],
        rf_test_metrics['f1'],
        lstm_test_metrics['f1'],
        bilstm_test_metrics['f1'],
        cnn_lstm_test_metrics['f1']
    ],
    'Precision': [
        lr_test_metrics['precision'],
        nb_test_metrics['precision'],
        svm_test_metrics['precision'],
        rf_test_metrics['precision'],
        lstm_test_metrics['precision'],
        bilstm_test_metrics['precision'],
        cnn_lstm_test_metrics['precision']
    ],
    'Recall': [
        lr_test_metrics['recall'],
        nb_test_metrics['recall'],
        svm_test_metrics['recall'],
        rf_test_metrics['recall'],
        lstm_test_metrics['recall'],
        bilstm_test_metrics['recall'],
        cnn_lstm_test_metrics['recall']
    ]
})

# Sort by F1-Score
all_results = all_results.sort_values('F1-Score', ascending=False).reset_index(drop=True)

print("\n" + "="*100)
print("COMPLETE MODEL COMPARISON (Test Set Performance)")
print("="*100)
print(all_results.to_string(index=False))
print("="*100)

# Save results
all_results.to_csv('../results/model_comparison.csv', index=False)
print("\nâœ… Results saved to: results/model_comparison.csv")

In [None]:
# Visualize all models
fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(all_results))
width = 0.2

metrics_to_plot = ['Accuracy', 'F1-Score', 'Precision', 'Recall']
colors = ['steelblue', 'coral', 'lightgreen', 'plum']

for i, (metric, color) in enumerate(zip(metrics_to_plot, colors)):
    offset = width * (i - 1.5)
    ax.bar(x + offset, all_results[metric], width, label=metric, color=color, alpha=0.8)

ax.set_xlabel('Models', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Complete Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(all_results['Model'], rotation=45, ha='right')
ax.legend()
ax.set_ylim([0, 1.1])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("âœ… Comparison plot saved to: results/model_comparison.png")

## Summary

### Key Findings:

1. **Best Performing Model**: Check the sorted table above
2. **Traditional ML**: Fast training, good baseline performance
3. **Deep Learning**: Better at capturing sequential patterns

### Next Steps:

1. Phase 5: Model Evaluation & Analysis
2. Phase 6: Model Optimization & Hyperparameter Tuning
3. Phase 7: Deployment & Documentation