# Fake News Detection Analysis

This notebook provides a comprehensive analysis of fake news detection using machine learning techniques.

## 1. Import Libraries and Setup

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fake_news_detector import FakeNewsDetector
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas/scikit-learn — narrow it if a
# specific warning is the real target.
warnings.filterwarnings('ignore')

# Set plotting style
# 'seaborn-v0_8' is the matplotlib style-sheet name; sns.set_palette then sets
# the default colour cycle used by the plots that follow.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Visible confirmation that the setup cell ran to completion.
print("Libraries imported successfully!")

## 2. Initialize and Load Data

In [None]:
# Initialize the detector
# `detector` is reused by every later cell (feature preparation, training,
# evaluation, prediction).
detector = FakeNewsDetector()

# Load sample data
# load_data() is called with no arguments; where the data comes from is decided
# inside FakeNewsDetector. Later cells read the 'title', 'text', 'content' and
# 'label' columns from the returned frame.
df = detector.load_data()

# Display basic information
print(f"Dataset shape: {df.shape}")
print(f"\nDataset info:")
# df.info() prints its summary itself, so it is not wrapped in print().
df.info()

## 3. Exploratory Data Analysis

In [None]:
# Display first few rows
print("First 5 rows of the dataset:")
# Left as the bare last expression so the notebook renders the frame as a table.
df.head()

In [None]:
# Label distribution: a pie chart of class shares next to a bar chart of class counts.
# Labels are encoded 0 = Fake, 1 = Real (the encoding stated on the bar chart's x-axis).
label_names = {0: 'Fake', 1: 'Real'}

plt.figure(figsize=(10, 6))

# Pie chart
plt.subplot(1, 2, 1)
# value_counts() orders by frequency, not by label value, so a hard-coded
# ['Fake', 'Real'] list mislabels the wedges whenever Real is the majority class.
# Sort by label and derive each wedge's name from its own index instead.
label_counts = df['label'].value_counts().sort_index()
pie_labels = [label_names.get(label, str(label)) for label in label_counts.index]
plt.pie(label_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of News Labels')

# Bar plot
plt.subplot(1, 2, 2)
sns.countplot(data=df, x='label')
plt.title('Count of Real vs Fake News')
plt.xlabel('Label (0=Fake, 1=Real)')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

# With 0/1 labels the column sum is the number of Real articles; the remainder is Fake.
real_count = df['label'].sum()
print(f"Real news articles: {real_count}")
print(f"Fake news articles: {len(df) - real_count}")

In [None]:
# Text length analysis
# Each entry: (source column, derived length column, panel title).
length_panels = [
    ('title', 'title_length', 'Title Length by Label'),
    ('text', 'text_length', 'Text Length by Label'),
    ('content', 'content_length', 'Content Length by Label'),
]

# Add the character-count columns to df before any plotting happens.
for source_col, length_col, _ in length_panels:
    df[length_col] = df[source_col].str.len()

plt.figure(figsize=(15, 5))

# One boxplot per length column, side by side, split by label.
for position, (_, length_col, panel_title) in enumerate(length_panels, start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(data=df, x='label', y=length_col)
    plt.title(panel_title)
    plt.xlabel('Label (0=Fake, 1=Real)')

plt.tight_layout()
plt.show()

## 4. Text Preprocessing and Feature Extraction

In [None]:
# Prepare features
# After this call the cell reads detector.X_train and detector.X_test, so
# prepare_features() is expected to populate both (later cells also use
# detector.y_test and detector.vectorizer).
detector.prepare_features()

print(f"Training set shape: {detector.X_train.shape}")
print(f"Test set shape: {detector.X_test.shape}")
# The second dimension of the training matrix is reported as the feature count.
print(f"Number of features: {detector.X_train.shape[1]}")

In [None]:
# Show examples of processed text
# NOTE(review): 'processed_content' is not created anywhere in this notebook;
# presumably detector.prepare_features() adds it to df — confirm.
print("Original vs Processed Text Examples:")
print("="*50)

# Walk over at most the first three rows, so a dataset with fewer than three
# articles prints what it has instead of raising IndexError from df.iloc[i].
# Each row is also fetched once rather than once per printed field.
for i, (_, row) in enumerate(df.head(3).iterrows()):
    print(f"\nExample {i+1}:")
    # Only the first 100 characters are shown to keep the output compact.
    print(f"Original: {row['content'][:100]}...")
    print(f"Processed: {row['processed_content'][:100]}...")
    print(f"Label: {'Real' if row['label'] == 1 else 'Fake'}")

## 5. Model Training and Evaluation

In [None]:
# Train all models
# Later cells read the fitted estimators from detector.trained_models, keyed by
# model name (e.g. 'Random Forest'), so this call is expected to populate it.
detector.train_models()

In [None]:
# Evaluate models
# NOTE(review): `results` is not read by any later cell shown here; its structure
# is defined by FakeNewsDetector.evaluate_models() — confirm before relying on it.
results = detector.evaluate_models()

In [None]:
# Per-model classification reports on the held-out test set
from sklearn.metrics import classification_report

print("Detailed Classification Reports:")
print("="*60)

for model_name in detector.trained_models:
    # Predict with each fitted model on the same test matrix.
    predictions = detector.trained_models[model_name].predict(detector.X_test)
    print(f"\n{model_name}:")
    print("-" * 30)
    # target_names follows the 0 = Fake, 1 = Real label encoding.
    report_text = classification_report(
        detector.y_test,
        predictions,
        target_names=['Fake', 'Real'],
    )
    print(report_text)

## 6. Feature Analysis

In [None]:
# Rank vectorizer features by the Random Forest's importance scores.
# The whole cell is skipped when no model named 'Random Forest' was trained.
if 'Random Forest' in detector.trained_models:
    rf_model = detector.trained_models['Random Forest']

    # Pair every vocabulary term with its importance, most important first.
    feature_importance_df = pd.DataFrame(
        {
            'feature': detector.vectorizer.get_feature_names_out(),
            'importance': rf_model.feature_importances_,
        }
    )
    feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

    # Horizontal bar chart of the 20 highest-ranked features.
    top_features = feature_importance_df.head(20)
    plt.figure(figsize=(12, 8))
    sns.barplot(data=top_features, y='feature', x='importance')
    plt.title('Top 20 Most Important Features (Random Forest)')
    plt.xlabel('Feature Importance')
    plt.tight_layout()
    plt.show()

    # Text table of the top 10 for quick reading.
    print("Top 10 Most Important Features:")
    print(feature_importance_df.head(10))

## 7. Word Clouds Visualization

In [None]:
# Generate word clouds
# Rendering is delegated entirely to FakeNewsDetector.generate_word_clouds();
# presumably it draws one cloud per class from the processed text — confirm.
detector.generate_word_clouds()

## 8. Interactive Prediction Testing

In [None]:
# Run a few hand-written headlines through the trained models.
# The samples alternate between a sober headline and a sensational one.
test_articles = [
    "Scientists at Harvard University have published groundbreaking research on climate change in the journal Nature.",
    "SHOCKING: Local grandmother discovers aliens living in her attic for 20 years!",
    "The stock market experienced significant volatility today following the Federal Reserve's announcement.",
    "BREAKING: Time travel proven real by high school student using microwave and aluminum foil!"
]

# Models queried for every article, in print order.
models_to_try = ('Naive Bayes', 'Random Forest')

print("Testing Predictions on Sample Articles:")
print("=" * 50)

for article_number, article in enumerate(test_articles, start=1):
    print(f"\nTest Article {article_number}:")
    print(f"Text: {article}")

    for model_name in models_to_try:
        # predict_news returns a mapping with 'prediction' and 'confidence' entries.
        outcome = detector.predict_news(article, model_name)
        predicted_label = outcome['prediction']
        confidence = outcome['confidence']
        print(f"{model_name}: {predicted_label} (Confidence: {confidence:.2%})")

## 9. Model Performance Comparison

In [None]:
# Create comprehensive performance comparison
# accuracy_score is imported here with the other metrics: this cell calls it, but no
# earlier cell in the notebook imports it, so a fresh Restart & Run All raised NameError.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

performance_metrics = []

# Score every trained model on the same held-out test split.
for name, model in detector.trained_models.items():
    y_pred = model.predict(detector.X_test)

    # precision/recall/F1 are called with scikit-learn's defaults (binary averaging,
    # positive class = label 1), so they describe the class encoded as 1.
    metrics = {
        'Model': name,
        'Accuracy': accuracy_score(detector.y_test, y_pred),
        'Precision': precision_score(detector.y_test, y_pred),
        'Recall': recall_score(detector.y_test, y_pred),
        'F1-Score': f1_score(detector.y_test, y_pred)
    }
    performance_metrics.append(metrics)

# One row per model, one column per metric.
performance_df = pd.DataFrame(performance_metrics)

# Display performance table
print("Model Performance Comparison:")
print(performance_df.round(4))

# Plot performance metrics as a grouped bar chart: one group per model,
# one bar per metric within each group.
plt.figure(figsize=(12, 8))
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

x = np.arange(len(performance_df))
width = 0.2

for i, metric in enumerate(metrics_to_plot):
    # Shift each metric's bars by one bar width so they sit side by side.
    plt.bar(x + i*width, performance_df[metric], width, label=metric)

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
# width*1.5 centres the tick label under the middle of the four bars.
plt.xticks(x + width*1.5, performance_df['Model'], rotation=45)
plt.legend()
# Scores lie in [0, 1]; the extra headroom keeps the tallest bars clear of the frame.
plt.ylim(0, 1.1)
plt.tight_layout()
plt.show()

## 10. Conclusion and Next Steps

### Key Findings:
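1. **Model Performance**: Use the comparison table and chart in Section 9 to identify which models performed best on the fake news detection task
2. **Feature Importance**: Use the Random Forest ranking in Section 6 to identify the most important words/features for classification
3. **Text Patterns**: Use the length distributions in Section 3 and the word clouds in Section 7 to analyze differences between real and fake news content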

### Next Steps for Improvement:
1. **Larger Dataset**: Train on larger, more diverse datasets
2. **Advanced Features**: Include metadata, source credibility, social media signals
3. **Deep Learning**: Experiment with BERT, LSTM, or other neural networks
4. **Real-time Updates**: Implement online learning for adapting to new patterns
5. **Ensemble Methods**: Combine multiple models for better performance

### Production Considerations:
1. **Scalability**: Optimize for handling large volumes of articles
2. **Interpretability**: Provide explanations for predictions
3. **Bias Detection**: Monitor and mitigate potential biases
4. **Continuous Monitoring**: Track model performance over time
