# Fake News Detection - Preprocessing & EDA

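This notebook performs exploratory data analysis (EDA) on the preprocessed news dataset: class balance, text-length statistics, word clouds and top-word frequencies, sample articles, and feature correlations.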

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import warnings

# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (both touch the colour cycle, so the order is kept as-is)
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# pandas display: show every column, cap each cell's text at 100 characters
display_options = {
    'display.max_columns': None,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

%matplotlib inline

## 1. Load Preprocessed Data

In [None]:
# Read the processed dataset from its CSV file
processed_csv_path = '../data/processed/processed_news.csv'
df = pd.read_csv(processed_csv_path)

# Report the frame's dimensions and column names, then preview the first rows
frame_shape = df.shape
column_names = list(df.columns)
print(f"Dataset shape: {frame_shape}")
print(f"\nColumns: {column_names}")
df.head()

## 2. Basic Statistics

In [None]:
# Heading and separator rule, followed by pandas' per-column summary
# (dtypes, non-null counts, memory usage)
print("Dataset Information:", "="*50, sep="\n")
df.info()

In [None]:
# Descriptive statistics for the numeric text-length features
length_features = ['char_count', 'word_count', 'avg_word_length']
df.loc[:, length_features].describe()

## 3. Class Distribution Analysis

In [None]:
# Class distribution
# value_counts() orders by frequency, not by label. The 'Fake'/'Real' names
# below are assigned by position, so pin the order to label 0 (Fake) then
# label 1 (Real); otherwise the pie wedges are mislabelled whenever real
# articles outnumber fake ones. reindex() also yields 0 for an absent class
# instead of failing on lookup.
label_order = [0, 1]
label_names = ['Fake', 'Real']
class_counts = df['label'].value_counts().reindex(label_order, fill_value=0)
class_percentages = (
    df['label'].value_counts(normalize=True).reindex(label_order, fill_value=0.0) * 100
)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Count plot (bar order fixed so the tick labels always match the bars)
sns.countplot(data=df, x='label', order=label_order, ax=axes[0])
axes[0].set_title('Class Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Label (0=Fake, 1=Real)')
axes[0].set_ylabel('Count')
# Fix tick positions before assigning text labels to them
axes[0].set_xticks(range(len(label_order)))
axes[0].set_xticklabels(label_names)

# Percentage plot
axes[1].pie(class_counts, labels=label_names, autopct='%1.1f%%', startangle=90)
axes[1].set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# .loc makes the lookup explicitly label-based (0 = Fake, 1 = Real)
print(f"Fake news: {class_counts.loc[0]:,} ({class_percentages.loc[0]:.2f}%)")
print(f"Real news: {class_counts.loc[1]:,} ({class_percentages.loc[1]:.2f}%)")

## 4. Text Length Analysis

In [None]:
# Text length comparison: three overlaid histograms plus one box plot
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (axis, column, number of bins, panel title, x-axis label) for each histogram
histogram_panels = [
    (axes[0, 0], 'char_count', 50, 'Character Count Distribution', 'Character Count'),
    (axes[0, 1], 'word_count', 50, 'Word Count Distribution', 'Word Count'),
    (axes[1, 0], 'avg_word_length', 30, 'Average Word Length Distribution', 'Average Word Length'),
]

is_fake = df['label'] == 0
is_real = df['label'] == 1

for ax, column, n_bins, panel_title, x_label in histogram_panels:
    # Fake is drawn first, Real on top; alpha keeps both visible
    df.loc[is_fake, column].hist(bins=n_bins, alpha=0.6, label='Fake', ax=ax, color='red')
    df.loc[is_real, column].hist(bins=n_bins, alpha=0.6, label='Real', ax=ax, color='green')
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.legend()

# Box plot of word count split by label
box_ax = axes[1, 1]
df.boxplot(column='word_count', by='label', ax=box_ax)
box_ax.set_title('Word Count by Label', fontsize=12, fontweight='bold')
box_ax.set_xlabel('Label (0=Fake, 1=Real)')
box_ax.set_ylabel('Word Count')
# Clear the figure-level title after the grouped box plot
plt.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Descriptive statistics of the length features, printed once per class
summary_columns = ['char_count', 'word_count', 'avg_word_length']

print("Text Statistics by Label:")
print("="*70)
for heading, label_value in (("\nFAKE NEWS:", 0), ("\nREAL NEWS:", 1)):
    print(heading)
    print(df.loc[df['label'] == label_value, summary_columns].describe())

## 5. Word Cloud Visualization

In [None]:
# Word clouds for fake and real news
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Fake news word cloud
# dropna() first: astype(str) would turn a missing cleaned_text into the
# literal token "nan", which would then show up as a word in the cloud.
fake_text = ' '.join(df.loc[df['label'] == 0, 'cleaned_text'].dropna().astype(str))
# random_state fixes the layout so re-running the notebook gives the same figure
wordcloud_fake = WordCloud(width=800, height=400,
                           background_color='white',
                           colormap='Reds',
                           max_words=100,
                           random_state=42).generate(fake_text)
axes[0].imshow(wordcloud_fake, interpolation='bilinear')
axes[0].set_title('Fake News - Most Common Words', fontsize=16, fontweight='bold')
axes[0].axis('off')

# Real news word cloud (same handling of missing text and same seed)
real_text = ' '.join(df.loc[df['label'] == 1, 'cleaned_text'].dropna().astype(str))
wordcloud_real = WordCloud(width=800, height=400,
                           background_color='white',
                           colormap='Greens',
                           max_words=100,
                           random_state=42).generate(real_text)
axes[1].imshow(wordcloud_real, interpolation='bilinear')
axes[1].set_title('Real News - Most Common Words', fontsize=16, fontweight='bold')
axes[1].axis('off')

plt.tight_layout()
plt.show()

## 6. Most Common Words Analysis

In [None]:
def get_top_words(text_series, n=20):
    """Return the ``n`` most frequent whitespace-separated tokens.

    Args:
        text_series: pandas Series of texts. Missing entries are skipped
            rather than stringified, so they cannot contribute a spurious
            "nan" token to the counts.
        n: Number of (word, count) pairs to return.

    Returns:
        List of ``(word, count)`` tuples ordered from most to least frequent;
        empty if the series holds no text.
    """
    texts = text_series.dropna().astype(str)
    # Count tokens text by text instead of joining everything into one large
    # string first; the resulting tokens and their order are the same.
    word_freq = Counter(word for text in texts for word in text.split())
    return word_freq.most_common(n)

# Top-20 (word, count) pairs per class
top_fake = get_top_words(df.loc[df['label'] == 0, 'cleaned_text'], n=20)
top_real = get_top_words(df.loc[df['label'] == 1, 'cleaned_text'], n=20)

# Tabular form for plotting
word_columns = ['word', 'count']
df_fake_words = pd.DataFrame(top_fake, columns=word_columns)
df_real_words = pd.DataFrame(top_real, columns=word_columns)

# Side-by-side horizontal bar charts, most frequent word at the top
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
bar_panels = (
    (axes[0], df_fake_words, 'red', 'Top 20 Words in Fake News'),
    (axes[1], df_real_words, 'green', 'Top 20 Words in Real News'),
)
for ax, words_df, bar_color, panel_title in bar_panels:
    ax.barh(words_df['word'], words_df['count'], color=bar_color, alpha=0.7)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Frequency')
    # barh draws the first row at the bottom; flip so rank 1 is on top
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 7. Sample Texts Comparison

In [None]:
# Display sample fake news
print("FAKE NEWS SAMPLE (Cleaned):")
print("="*80)
# Fixed seed so the same article is shown on every run
sample_fake = df[df['label']==0].sample(1, random_state=42).iloc[0]

# pandas reads empty CSV fields as NaN (a float), and slicing a float raises
# TypeError, so treat a non-string cleaned_text as empty.
fake_cleaned = sample_fake['cleaned_text']
if not isinstance(fake_cleaned, str):
    fake_cleaned = ''
# Append the ellipsis only when the preview is actually truncated
fake_preview = fake_cleaned[:500] + ('...' if len(fake_cleaned) > 500 else '')

# 'title' may be absent from the processed file, hence .get with a fallback
print(f"Original Title: {sample_fake.get('title', 'N/A')}")
print(f"\nCleaned Text (first 500 chars):\n{fake_preview}")
print(f"\nWord Count: {sample_fake['word_count']}")
print(f"Character Count: {sample_fake['char_count']}")

In [None]:
# Display sample real news
print("REAL NEWS SAMPLE (Cleaned):")
print("="*80)
# Fixed seed so the same article is shown on every run
sample_real = df[df['label']==1].sample(1, random_state=42).iloc[0]

# pandas reads empty CSV fields as NaN (a float), and slicing a float raises
# TypeError, so treat a non-string cleaned_text as empty.
real_cleaned = sample_real['cleaned_text']
if not isinstance(real_cleaned, str):
    real_cleaned = ''
# Append the ellipsis only when the preview is actually truncated
real_preview = real_cleaned[:500] + ('...' if len(real_cleaned) > 500 else '')

# 'title' may be absent from the processed file, hence .get with a fallback
print(f"Original Title: {sample_real.get('title', 'N/A')}")
print(f"\nCleaned Text (first 500 chars):\n{real_preview}")
print(f"\nWord Count: {sample_real['word_count']}")
print(f"Character Count: {sample_real['char_count']}")

## 8. Correlations

In [None]:
# Pairwise correlation between the label and the text-length features
corr_cols = ['label', 'char_count', 'word_count', 'avg_word_length']
correlation = df.loc[:, corr_cols].corr()

# Annotated heatmap on an explicit figure/axes pair; colours diverge around 0
fig, ax = plt.subplots(figsize=(10, 8))
heatmap_style = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation, ax=ax, **heatmap_style)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

## 9. Key Insights Summary

In [None]:
print("KEY INSIGHTS FROM EDA:")
print("="*70)

# Class balance (fake count divided by real count)
n_fake = int((df['label'] == 0).sum())
n_real = int((df['label'] == 1).sum())
if n_real:
    balance_ratio = n_fake / n_real
else:
    # No real samples: report inf (or nan for an empty frame) instead of
    # dividing by zero; either value falls outside the balanced range below.
    balance_ratio = float('inf') if n_fake else float('nan')
print(f"\n1. Class Balance: {balance_ratio:.2f}:1 (Fake:Real)")
if 0.8 <= balance_ratio <= 1.2:
    print("   ✅ Dataset is well-balanced")
else:
    print("   ⚠️ Dataset is imbalanced - may need SMOTE or class weights")

# Text length differences
fake_avg_words = df[df['label']==0]['word_count'].mean()
real_avg_words = df[df['label']==1]['word_count'].mean()
print("\n2. Average Word Count:")
print(f"   Fake news: {fake_avg_words:.0f} words")
print(f"   Real news: {real_avg_words:.0f} words")
print(f"   Difference: {abs(fake_avg_words - real_avg_words):.0f} words")

# Data quality
missing_total = int(df.isnull().sum().sum())
print("\n3. Data Quality:")
print(f"   Total samples: {len(df):,}")
print(f"   Missing values: {missing_total}")
# Only declare the data ready when the missing-value count is actually zero
if missing_total == 0:
    print("   ✅ Ready for feature engineering")
else:
    print("   ⚠️ Missing values present - handle them before feature engineering")

print("\n" + "="*70)

## Next Steps

1. **Phase 3: Feature Engineering**
   - TF-IDF vectorization
   - Word2Vec embeddings
   - BERT embeddings

2. **Phase 4: Model Development**
   - Train traditional ML models
   - Train deep learning models
   
3. **Phase 5: Model Evaluation**
   - Compare model performance
   - Select best model