# FakeNewsNet Exploratory Data Analysis

**Sprint 1**: Data ingestion, cleaning, and exploration of the FakeNewsNet dataset.

Dataset: Labeled news articles from PolitiFact and GossipCop.

## 1. Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# Suppress all library warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure that does not override them.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Read the processed article table (path is relative to this notebook).
articles_path = "../../data/processed/articles.parquet"
df = pd.read_parquet(articles_path)
print(f"Loaded {len(df)} articles")

# Last expression of the cell: renders a preview of the first rows.
df.head()

In [None]:
# Structural overview of the frame: dimensions, column names, dtypes and
# per-column null counts.
frame_shape = df.shape
column_names = df.columns.tolist()
column_dtypes = df.dtypes
null_counts = df.isnull().sum()

print(f"Shape: {frame_shape}")
print(f"\nColumns: {column_names}")
print(f"\nData types:\n{column_dtypes}")
print(f"\nMissing values:\n{null_counts}")

## 2. Class Balance Analysis

In [None]:
# Absolute and relative frequency of each label. `class_counts` is reused by
# the plotting cell that follows.
label_column = df['label']
class_counts = label_column.value_counts()
class_props = label_column.value_counts(normalize=True)

print("Class Distribution:")
print(class_counts)
print("\nProportions:")
print(class_props)

# How many real articles exist for every fake one.
real_per_fake = class_counts['real'] / class_counts['fake']
print(f"\nImbalance ratio (Real:Fake): {real_per_fake:.2f}x")

In [None]:
# Class balance figure: bar chart of article counts (left) and pie chart of
# class shares (right).
#
# Colours are looked up per label instead of by position: value_counts()
# orders labels by frequency, so a positional colour list would paint 'fake'
# green whenever fake articles outnumber real ones.
label_colors = {'real': '#2ecc71', 'fake': '#e74c3c'}
colors = [label_colors.get(label, '#95a5a6') for label in class_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
class_counts.plot(kind='bar', ax=axes[0], color=colors, rot=0)
axes[0].set_title('Article Count by Label', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Label')
axes[0].set_ylabel('Count')

# Place each count just above its bar. The gap scales with the tallest bar
# so the annotation stays readable for any dataset size.
label_gap = class_counts.max() * 0.01
for i, v in enumerate(class_counts):
    axes[0].text(i, v + label_gap, str(v), ha='center', va='bottom',
                 fontweight='bold')

# Pie chart
axes[1].pie(class_counts, labels=class_counts.index, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('Class Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Label counts broken down by source dataset, with row/column totals.
rule = "=" * 50
print("\n" + rule)
print("Class Distribution by Dataset")
print(rule)

dataset_by_label = pd.crosstab(df['dataset'], df['label'], margins=True)
print(dataset_by_label)

## 3. Title Length Analysis

In [None]:
# Descriptive statistics of both title-length columns, split by label and
# rounded to two decimals.
length_columns = ['title_length', 'title_chars']
length_stats = df.groupby('label')[length_columns].describe().round(2)

print("Title Length Statistics by Label:")
print(length_stats)

In [None]:
# Title length figure: one row per length column, with an overlaid
# per-label histogram on the left and a per-label boxplot on the right.
#
# The label order and the label -> colour mapping are pinned explicitly.
# Without `order`, seaborn arranges the boxes in order of first appearance in
# `df`, so a positional palette could colour 'fake' green. The histograms use
# the same mapping so colours mean the same thing in every panel.
label_order = ['real', 'fake']
label_palette = {'real': '#2ecc71', 'fake': '#e74c3c'}

# (column, axis label, title stem) for each row of the grid.
panels = [
    ('title_length', 'Words in Title', 'Title Length'),
    ('title_chars', 'Characters in Title', 'Title Character Length'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for row, (column, axis_label, title_stem) in enumerate(panels):
    hist_ax, box_ax = axes[row, 0], axes[row, 1]

    # Histogram: one semi-transparent layer per label.
    for label in label_order:
        hist_ax.hist(df.loc[df['label'] == label, column],
                     alpha=0.6, label=label, bins=30,
                     color=label_palette[label])
    hist_ax.set_xlabel(axis_label)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'{title_stem} Distribution', fontweight='bold')
    hist_ax.legend()

    # Boxplot: same column, boxes always drawn real first, then fake.
    sns.boxplot(data=df, x='label', y=column, ax=box_ax,
                order=label_order, palette=label_palette)
    box_ax.set_title(f'{title_stem} by Label', fontweight='bold')
    box_ax.set_xlabel('Label')
    box_ax.set_ylabel(axis_label)

plt.tight_layout()
plt.show()

In [None]:
# Two-sample t-test: do real and fake articles differ in mean title length?
from scipy.stats import ttest_ind

# Drop missing lengths up front; a single NaN would otherwise turn both the
# statistic and the p-value into NaN.
real_titles = df.loc[df['label'] == 'real', 'title_length'].dropna()
fake_titles = df.loc[df['label'] == 'fake', 'title_length'].dropna()

# equal_var=False selects Welch's t-test. The two classes differ in size and
# nothing guarantees equal variances, which is the assumption the default
# (Student's) test relies on.
t_stat, p_value = ttest_ind(real_titles, fake_titles, equal_var=False)
print("\nT-test: Title Length (Real vs Fake)")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.2e}")
print(f"Significant? {p_value < 0.05}")
print(f"\nMean title length - Real: {real_titles.mean():.2f}, Fake: {fake_titles.mean():.2f}")

## 4. Most Common Words

In [None]:
# Word frequency analysis
def get_top_words(texts, n=20):
    """Return the ``n`` most frequent words in ``texts``.

    Text is lower-cased and English stop words are removed before counting.
    Only words that occur in at least two documents are considered, and the
    vocabulary is capped at the 1000 most frequent terms.

    Args:
        texts: Iterable of strings, e.g. a pandas Series of titles. Missing
            entries (None/NaN) are skipped.
        n: Number of words to return.

    Returns:
        DataFrame with columns ``word`` and ``frequency``, ordered from most
        to least frequent. It has fewer than ``n`` rows when the vocabulary
        is smaller than ``n``.
    """
    # CountVectorizer raises on None/NaN documents, so drop missing titles
    # instead of letting one bad row abort the whole analysis.
    documents = pd.Series(texts).dropna().astype(str)

    vectorizer = CountVectorizer(max_features=1000, stop_words='english',
                                 lowercase=True, min_df=2)
    X = vectorizer.fit_transform(documents)

    # Column sums of the document-term matrix give corpus-wide counts.
    word_freq = np.array(X.sum(axis=0)).flatten()
    words = vectorizer.get_feature_names_out()

    # Indices of the vocabulary sorted by descending frequency.
    sorted_idx = np.argsort(word_freq)[::-1]
    top_words = [(words[i], word_freq[i]) for i in sorted_idx[:n]]
    return pd.DataFrame(top_words, columns=['word', 'frequency'])

# Most frequent title words among articles labelled real.
is_real = df['label'] == 'real'
real_words = get_top_words(df.loc[is_real, 'title'], n=20)

print("\nTop 20 Words in REAL Articles:")
print(real_words)

In [None]:
# Most frequent title words among articles labelled fake.
is_fake = df['label'] == 'fake'
fake_words = get_top_words(df.loc[is_fake, 'title'], n=20)

print("\nTop 20 Words in FAKE Articles:")
print(fake_words)

In [None]:
# Side-by-side horizontal bar charts of the top title words per label.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (axis, word table, bar colour, panel title) for each chart.
word_panels = [
    (axes[0], real_words, '#2ecc71', 'Top 20 Words - REAL Articles'),
    (axes[1], fake_words, '#e74c3c', 'Top 20 Words - FAKE Articles'),
]

for panel_ax, word_table, bar_color, panel_title in word_panels:
    # barh draws the first row at the bottom, so reverse the table to put
    # the most frequent word at the top.
    bottom_up = word_table.iloc[::-1]
    panel_ax.barh(bottom_up['word'], bottom_up['frequency'], color=bar_color)
    panel_ax.set_title(panel_title, fontweight='bold', fontsize=12)
    panel_ax.set_xlabel('Frequency')

plt.tight_layout()
plt.show()

## 5. Data Quality Summary

In [None]:
# Final data-quality report: class sizes, null counts, source datasets and
# title-length ranges.
#
# The status markers were mojibake (UTF-8 emoji bytes decoded as cp1252);
# they are restored to the intended characters.
rule = "=" * 60
total_articles = len(df)
real_count = (df['label'] == 'real').sum()
fake_count = (df['label'] == 'fake').sum()

print("\n" + rule)
print("DATA QUALITY SUMMARY")
print(rule)

print(f"\n✅ Total articles: {total_articles:,}")
print(f"✅ Real: {real_count:,} ({real_count / total_articles * 100:.1f}%)")
print(f"✅ Fake: {fake_count:,} ({fake_count / total_articles * 100:.1f}%)")

print("\n✅ Missing values:")
print(df.isnull().sum())

print("\n✅ Dataset sources:")
print(df['dataset'].value_counts())

print(f"\n✅ Title length range: {df['title_length'].min():.0f} - {df['title_length'].max():.0f} words")
print(f"✅ Title character range: {df['title_chars'].min():.0f} - {df['title_chars'].max():.0f} chars")

print("\n" + rule)
print("Ready for modeling! 🚀")
print(rule)