# Exploratory Data Analysis

This notebook provides additional exploratory analysis of the news classification dataset.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


In [4]:
# Load data
train_df = pd.read_csv('data/train.csv')
val_df = pd.read_csv('data/val.csv')
test_df = pd.read_csv('data/test.csv')

print(f"Train: {len(train_df)} samples")
print(f"Val: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")


FileNotFoundError: [Errno 2] No such file or directory: 'data/train.csv'

In [None]:
# Class distribution
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (name, df) in enumerate([('Train', train_df), ('Val', val_df), ('Test', test_df)]):
    class_counts = df['label'].value_counts().sort_index()
    axes[idx].bar(class_names, class_counts.values, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
    axes[idx].set_title(f'{name} Set Class Distribution', fontsize=14, fontweight='bold')
    axes[idx].set_ylabel('Count', fontsize=12)
    axes[idx].tick_params(axis='x', rotation=45)
    
    # Add count labels on bars
    for i, v in enumerate(class_counts.values):
        axes[idx].text(i, v, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.savefig('data/class_distribution_detailed.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Text length analysis
train_df['text_length'] = train_df['text'].str.len()
train_df['word_count'] = train_df['text'].str.split().str.len()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Character length distribution
axes[0].hist(train_df['text_length'], bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(train_df['text_length'].mean(), color='r', linestyle='--', linewidth=2, label=f'Mean: {train_df["text_length"].mean():.0f}')
axes[0].axvline(train_df['text_length'].median(), color='g', linestyle='--', linewidth=2, label=f'Median: {train_df["text_length"].median():.0f}')
axes[0].set_xlabel('Character Count', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Text Length Distribution (Characters)', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Word count distribution
axes[1].hist(train_df['word_count'], bins=50, edgecolor='black', alpha=0.7, color='orange')
axes[1].axvline(train_df['word_count'].mean(), color='r', linestyle='--', linewidth=2, label=f'Mean: {train_df["word_count"].mean():.0f}')
axes[1].axvline(train_df['word_count'].median(), color='g', linestyle='--', linewidth=2, label=f'Median: {train_df["word_count"].median():.0f}')
axes[1].set_xlabel('Word Count', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_title('Text Length Distribution (Words)', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('data/length_distribution_detailed.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nStatistics:")
print(f"Character length - Mean: {train_df['text_length'].mean():.1f}, Median: {train_df['text_length'].median():.1f}, Std: {train_df['text_length'].std():.1f}")
print(f"Word count - Mean: {train_df['word_count'].mean():.1f}, Median: {train_df['word_count'].median():.1f}, Std: {train_df['word_count'].std():.1f}")


In [None]:
# Sample examples from each class
print("Sample Examples from Each Class:\n")
print("=" * 80)

for label, class_name in enumerate(class_names):
    sample = train_df[train_df['label'] == label].iloc[0]
    print(f"\n{class_name} (Label {label}):")
    print(f"Text: {sample['text'][:200]}...")
    print(f"Length: {len(sample['text'])} characters, {len(sample['text'].split())} words")
