# 1. Data Exploration and Initial Analysis

This notebook performs initial data loading, exploration, and visualization of the disaster tweet classification dataset.

## Objectives
- Load and examine the dataset structure
- Perform initial statistical analysis
- Visualize data distributions and patterns
- Identify data quality issues

In [None]:
# Import libraries
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Read the JSON configuration; later cells look up the dataset paths in it.
with open('../config/hyperparameters.json', 'r') as config_file:
    config = json.loads(config_file.read())

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

print("✅ Libraries imported successfully!")

## 1.1 Data Loading

In [None]:
# Load the raw train/test CSV files from the locations given in the config.
train_path = config['data_paths']['train_raw']
test_path = config['data_paths']['test_raw']

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
print(f"\nTrain columns: {list(df_train.columns)}")
print(f"Test columns: {list(df_test.columns)}")

# Memory usage in MB.
# deep=True is required here: without it pandas only counts the 8-byte pointer
# of each object (string) cell, which badly under-reports a text dataset.
BYTES_PER_MB = 1024**2
train_memory = df_train.memory_usage(deep=True).sum() / BYTES_PER_MB
test_memory = df_test.memory_usage(deep=True).sum() / BYTES_PER_MB
print(f"\nTrain memory usage: {train_memory:.2f} MB")
print(f"Test memory usage: {test_memory:.2f} MB")

## 1.2 Data Overview

In [None]:
# Show the first five rows of each split, each under its own heading.
sample_views = (
    ("Training data sample:", df_train),
    ("\nTest data sample:", df_test),
)
for heading, frame in sample_views:
    print(heading)
    display(frame.head(5))

In [None]:
# Column dtypes / non-null counts for the training split.
print("Training data info:")
df_train.info()

# Per-column count of missing values, one section per split.
section_break = "\n" + "="*50 + "\n"
missing_reports = (
    ("Missing values in train:", df_train),
    ("Missing values in test:", df_test),
)
for heading, frame in missing_reports:
    print(section_break)
    print(heading)
    print(frame.isna().sum())

## 1.3 Class Distribution

In [None]:
# Class distribution of the training target.
#
# value_counts() orders its result by frequency, not by class value. The tick
# and pie labels below are positional, so the index is pinned to [0, 1] here;
# otherwise the labels would be swapped whenever class 1 is the majority.
class_names = {0: 'Non-Disaster', 1: 'Disaster'}
class_labels = list(class_names.values())
class_colors = ['lightcoral', 'lightblue']

class_counts = (
    df_train['target']
    .value_counts()
    .reindex(list(class_names), fill_value=0)
)
# Percentages are taken over the two expected classes.
class_percentages = class_counts / class_counts.sum() * 100

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Bar plot
class_counts.plot(kind='bar', ax=ax1, color=class_colors)
ax1.set_title('Class Distribution')
ax1.set_xlabel('Class')
ax1.set_ylabel('Count')
ax1.set_xticks(range(len(class_labels)))
ax1.set_xticklabels(class_labels)
ax1.grid(alpha=0.3)

# Add count labels slightly above each bar
for i, count in enumerate(class_counts):
    ax1.text(i, count + 50, str(count), ha='center', va='bottom')

# Pie chart
ax2.pie(class_percentages, labels=class_labels, autopct='%1.1f%%',
        colors=class_colors, startangle=90)
ax2.set_title('Class Percentage')

plt.tight_layout()
plt.show()

print("Class distribution:")
for class_value, class_name in class_names.items():
    print(
        f"{class_name} ({class_value}): {class_counts.loc[class_value]} "
        f"({class_percentages.loc[class_value]:.1f}%)"
    )

## 1.4 Text Length Analysis

In [None]:
# Add two derived columns to both splits (in place):
#   text_length - number of characters in the tweet
#   word_count  - number of whitespace-separated tokens
for frame in (df_train, df_test):
    tweet_text = frame['text']
    frame['text_length'] = tweet_text.str.len()
    frame['word_count'] = tweet_text.str.split().str.len()

# Descriptive statistics for the training split only
length_reports = (
    ("Text length statistics (train):", 'text_length'),
    ("\nWord count statistics (train):", 'word_count'),
)
for heading, column in length_reports:
    print(heading)
    print(df_train[column].describe())

In [None]:
# Visualize text length / word count distributions per class.
#
# The class names are attached to the data (a mapped 'Class' column) instead of
# being written onto the legend and tick labels by position afterwards.
# Positional labels are only correct for one particular artist/draw order;
# letting seaborn derive them from the data keeps label and colour in sync.
class_names = {0: 'Non-Disaster', 1: 'Disaster'}
class_order = list(class_names.values())
plot_df = df_train.assign(Class=df_train['target'].map(class_names))

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, title stem, axis label) - one figure column per measure
panels = [
    ('text_length', 'Text Length', 'Character Count'),
    ('word_count', 'Word Count', 'Word Count'),
]

for col_idx, (column, title_stem, unit_label) in enumerate(panels):
    # Top row: histogram + KDE, coloured by class
    hist_ax = axes[0, col_idx]
    sns.histplot(data=plot_df, x=column, hue='Class', hue_order=class_order,
                 kde=True, ax=hist_ax)
    hist_ax.set_title(f'{title_stem} Distribution by Class')
    hist_ax.set_xlabel(unit_label)

    # Bottom row: box plot, one box per class in a fixed order
    box_ax = axes[1, col_idx]
    sns.boxplot(data=plot_df, x='Class', y=column, order=class_order, ax=box_ax)
    box_ax.set_title(f'{title_stem} by Class (Box Plot)')
    box_ax.set_xlabel('Class')
    box_ax.set_ylabel(unit_label)

plt.tight_layout()
plt.show()

## 1.5 Keyword Analysis

In [None]:
# Number of distinct keywords per split
print("Unique keywords in train:", df_train['keyword'].nunique())
print("Unique keywords in test:", df_test['keyword'].nunique())

# The 20 most frequent keywords within each class of the training split
is_disaster = df_train['target'] == 1
is_non_disaster = df_train['target'] == 0
disaster_keywords = df_train.loc[is_disaster, 'keyword'].value_counts().head(20)
non_disaster_keywords = df_train.loc[is_non_disaster, 'keyword'].value_counts().head(20)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# One bar chart per class: (axis, counts, bar colour, title)
keyword_panels = (
    (ax1, disaster_keywords, 'red', 'Top 20 Keywords in Disaster Tweets'),
    (ax2, non_disaster_keywords, 'blue', 'Top 20 Keywords in Non-Disaster Tweets'),
)
for panel_ax, keyword_counts, bar_color, panel_title in keyword_panels:
    keyword_counts.plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Keyword')
    panel_ax.set_ylabel('Count')
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 1.6 Location Analysis

In [None]:
# Location analysis
print("Unique locations in train:", df_train['location'].nunique())

# Share of tweets with a non-null location. The original print lacked the
# f-string prefix and therefore emitted the raw "{...}" expression text.
location_share = df_train['location'].notna().mean() * 100
print(f"Tweets with location info: {location_share:.1f}%")

# The 15 most frequent location strings
top_locations = df_train['location'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(12, 6))
top_locations.plot(kind='bar', ax=ax)
ax.set_title('Top 15 Locations')
ax.set_xlabel('Location')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=45)
ax.grid(alpha=0.3)
plt.show()

## 1.7 Sample Tweets by Class

In [None]:
def _print_numbered(tweets):
    """Print each tweet on its own line, prefixed with a 1-based index."""
    for position, tweet in enumerate(tweets, start=1):
        print(f"{position}. {tweet}")


# First five tweets of each class, in dataset order
disaster_samples = df_train.loc[df_train['target'] == 1, 'text'].head(5)
non_disaster_samples = df_train.loc[df_train['target'] == 0, 'text'].head(5)

print("Sample Disaster Tweets:")
_print_numbered(disaster_samples)

print("\n" + "="*50 + "\n")
print("Sample Non-Disaster Tweets:")
_print_numbered(non_disaster_samples)

## 1.8 Data Quality Assessment

In [None]:
# Rows that repeat an earlier row across every column
train_duplicates = df_train.duplicated().sum()
test_duplicates = df_test.duplicated().sum()

print(f"Duplicate rows in train: {train_duplicates}")
print(f"Duplicate rows in test: {test_duplicates}")

# Tweets that are empty once surrounding whitespace is removed
train_stripped = df_train['text'].str.strip()
test_stripped = df_test['text'].str.strip()
empty_train = (train_stripped == '').sum()
empty_test = (test_stripped == '').sum()

print(f"\nEmpty texts in train: {empty_train}")
print(f"Empty texts in test: {empty_test}")

# Tweets shorter than 10 characters
short_train = df_train['text'].str.len().lt(10).sum()
short_test = df_test['text'].str.len().lt(10).sum()

print(f"\nVery short texts (< 10 chars) in train: {short_train}")
print(f"Very short texts (< 10 chars) in test: {short_test}")

## 1.9 Summary Statistics

In [None]:
# Summary statistics: one row per split.
# The test split has no 'target' column in this notebook's usage, hence 'N/A'.
disaster_share = df_train['target'].mean() * 100

summary_stats = {
    'Dataset': ['Training', 'Test'],
    'Total Samples': [len(df_train), len(df_test)],
    'Disaster %': [disaster_share, 'N/A'],
    'Non-Disaster %': [(1 - df_train['target'].mean()) * 100, 'N/A'],
    'Avg Text Length': [df_train['text_length'].mean(), df_test['text_length'].mean()],
    'Avg Word Count': [df_train['word_count'].mean(), df_test['word_count'].mean()],
    'Unique Keywords': [df_train['keyword'].nunique(), df_test['keyword'].nunique()],
    'Unique Locations': [df_train['location'].nunique(), df_test['location'].nunique()]
}

summary_df = pd.DataFrame(summary_stats)


def _round_floats(value, ndigits=2):
    """Round float cells; leave ints and strings (e.g. 'N/A') untouched."""
    return round(value, ndigits) if isinstance(value, float) else value


# DataFrame.round() skips object-dtype columns, and the 'N/A' entries make the
# percentage columns object dtype - so round cell by cell for display instead.
# summary_df itself keeps full precision for anything that consumes it later.
summary_display = summary_df.apply(lambda column: column.map(_round_floats))

print("Dataset Summary Statistics:")
display(summary_display)

## 1.10 Save Enhanced Data

Save the train and test datasets, now including the derived `text_length` and `word_count` columns, together with the summary statistics table, for use in subsequent notebooks.

In [None]:
# Save the datasets (with the derived length columns) and the summary table.
# NOTE(review): these paths are relative to the working directory, whereas the
# config is read from '../config/...' - confirm the outputs are meant to land
# next to the notebook rather than at the project root.
train_out = 'Data/train_enhanced.csv'
test_out = 'Data/test_enhanced.csv'
summary_out = 'results/metrics/data_summary.csv'

# to_csv does not create missing directories, so make sure they exist first.
for out_path in (train_out, test_out, summary_out):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

df_train.to_csv(train_out, index=False)
df_test.to_csv(test_out, index=False)

print("✅ Enhanced datasets saved:")
print(f"- {train_out}")
print(f"- {test_out}")

# Save summary statistics
summary_df.to_csv(summary_out, index=False)
print(f"\n✅ Summary statistics saved to {summary_out}")

print("\n" + "="*60)
print("🎉 Data exploration completed successfully!")
print("Next: Feature Engineering (02_feature_engineering.ipynb)")
print("="*60)