# D4H Incident Data Exploration

This notebook explores Search and Rescue incident data exported from D4H (Decisions for Heroes), loaded from `incidents.json`.

## Overview
- Load and clean incident data using our custom data utilities
- Perform exploratory data analysis
- Visualize key patterns and insights

In [None]:
# Import required libraries
import html
from datetime import datetime

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from folium.plugins import HeatMap

# Import our custom data utilities
import data_utils

# Pandas display options: never truncate columns, cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Plot appearance: matplotlib's bundled seaborn style sheet, then the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Read incidents.json through the project helper and report basic facts about the result
print("Loading incidents data...")
df = data_utils.load_and_process_incidents('incidents.json')

incident_total = len(df)
print(f"Successfully loaded {incident_total} incidents")
print(f"Data shape: {df.shape}")

# Earliest and latest incident start, as stored in the 'startsAt' column
start_times = df['startsAt']
print(f"Date range: {start_times.min()} to {start_times.max()}")

## Data Exploration

Let's explore the structure and content of our incident data.

In [None]:
# Summary statistics for every numeric column
print("=== NUMERICAL STATISTICS ===")
numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_stats = df[numerical_cols].describe()
display(numeric_stats)

# Headline figures produced by the project helper, printed one per line
print("\n=== INCIDENT SUMMARY ===")
summary = data_utils.get_incident_summary(df)
for metric, metric_value in summary.items():
    print(f"{metric}: {metric_value}")

# Frequency tables for the categorical columns, when they are present
print("\n=== CATEGORICAL INSIGHTS ===")
if 'address.town' in df.columns:
    print("Top 5 towns by incident count:")
    town_frequencies = df['address.town'].value_counts()
    print(town_frequencies.head())

if 'night' in df.columns:
    print("\nDay vs Night incidents:")
    night_frequencies = df['night'].value_counts()
    print(night_frequencies)

In [None]:
# Size, schema and missing-value profile of the incident table
print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
total_bytes = df.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")

print("\n=== COLUMN INFORMATION ===")
df.info()

# Per-column null counts and percentages, worst columns first (top 10 shown)
print("\n=== MISSING VALUES ===")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(1)
missing_df = pd.concat(
    [missing.rename('Missing Count'), missing_pct.rename('Missing %')],
    axis=1,
).sort_values('Missing Count', ascending=False)
has_gaps = missing_df['Missing Count'].gt(0)
print(missing_df.loc[has_gaps].head(10))

In [None]:
# Preview the cleaned table: first five rows, then each column's dtype
print("First 5 incidents:")
first_rows = df.head()
display(first_rows)

print("\nColumn names and types:")
column_types = df.dtypes
print(column_types)

In [None]:
# Verify data cleaning results: coordinates present, date columns parsed,
# and no HTML markup left in the description text.
print("=== DATA CLEANING VERIFICATION ===")

# Coordinates: the columns must exist AND hold at least one usable pair;
# otherwise the min/max report below would just print "nan to nan" under a ✓.
if 'latitude' in df.columns and 'longitude' in df.columns:
    coord_rows = int(df[['latitude', 'longitude']].notna().all(axis=1).sum())
    if coord_rows > 0:
        print("✓ Coordinates successfully parsed")
        print(f"  Incidents with coordinates: {coord_rows} of {len(df)}")
        print(f"  Latitude range: {df['latitude'].min():.4f} to {df['latitude'].max():.4f}")
        print(f"  Longitude range: {df['longitude'].min():.4f} to {df['longitude'].max():.4f}")
    else:
        print("✗ Coordinate columns exist but contain no values")
else:
    print("✗ Coordinates not found")

# Date parsing: only report success when the column really has a datetime dtype,
# so an unparsed string/object column is flagged instead of ticked.
date_cols = ['createdAt', 'startsAt', 'endsAt']
for col in date_cols:
    if col not in df.columns:
        print(f"✗ {col}: not found")
    elif pd.api.types.is_datetime64_any_dtype(df[col]):
        print(f"✓ {col}: {df[col].dtype}")
    else:
        print(f"✗ {col}: {df[col].dtype} (not parsed as datetime)")

# HTML in descriptions: match an actual tag shape (<name ...> or </name>).
# A bare '<.*>' would also flag plain text such as "temp < 5 and > 2", and
# '.' does not cross newlines, so tags spanning several lines would be missed.
if 'description' in df.columns:
    html_tag_pattern = r'</?[A-Za-z][^>]*>'
    description_text = df['description'].astype(str)  # tolerate non-string cells
    html_count = int(description_text.str.contains(html_tag_pattern, regex=True).sum())
    print(f"HTML tags in descriptions: {html_count} (should be 0 after cleaning)")

## Temporal Analysis

Let's analyze incident patterns over time.

In [None]:
# Temporal analysis: incident counts by year, month, weekday and hour of day.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Incident Temporal Patterns', fontsize=16)

# A single guard covers all four panels, since each one is derived from 'startsAt'.
if 'startsAt' in df.columns:
    incident_starts = df['startsAt']

    # Derived columns are stored on df so that other cells can reuse them.
    df['year'] = incident_starts.dt.year
    df['month'] = incident_starts.dt.month
    df['day_of_week'] = incident_starts.dt.day_name()
    df['hour'] = incident_starts.dt.hour

    # Missing timestamps can make the numeric columns float; drop them and cast
    # to int before counting so the axes show whole years / months / hours.

    # Incidents by year
    yearly_counts = df['year'].dropna().astype(int).value_counts().sort_index()
    axes[0,0].bar(yearly_counts.index, yearly_counts.values)
    axes[0,0].set_title('Incidents by Year')
    axes[0,0].set_xlabel('Year')
    axes[0,0].set_ylabel('Number of Incidents')
    axes[0,0].set_xticks(yearly_counts.index)
    axes[0,0].tick_params(axis='x', rotation=45)

    # Incidents by month (all 12 months shown; months without incidents count 0)
    monthly_counts = (
        df['month'].dropna().astype(int).value_counts()
        .reindex(range(1, 13), fill_value=0)
    )
    axes[0,1].bar(monthly_counts.index, monthly_counts.values)
    axes[0,1].set_title('Incidents by Month')
    axes[0,1].set_xlabel('Month')
    axes[0,1].set_ylabel('Number of Incidents')
    axes[0,1].set_xticks(range(1, 13))

    # Incidents by day of week, in calendar order; fill_value=0 keeps a weekday
    # with no incidents as a zero-height bar instead of a NaN.
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_counts = df['day_of_week'].value_counts().reindex(day_order, fill_value=0)
    axes[1,0].bar(range(len(day_counts)), day_counts.values)
    axes[1,0].set_title('Incidents by Day of Week')
    axes[1,0].set_xlabel('Day of Week')
    axes[1,0].set_ylabel('Number of Incidents')
    axes[1,0].set_xticks(range(len(day_counts)))
    axes[1,0].set_xticklabels(day_counts.index, rotation=45)

    # Incidents by hour (all 24 hours shown; hours without incidents count 0)
    hourly_counts = (
        df['hour'].dropna().astype(int).value_counts()
        .reindex(range(24), fill_value=0)
    )
    axes[1,1].bar(hourly_counts.index, hourly_counts.values)
    axes[1,1].set_title('Incidents by Hour of Day')
    axes[1,1].set_xlabel('Hour')
    axes[1,1].set_ylabel('Number of Incidents')
else:
    print("'startsAt' column not found - temporal panels left empty")

plt.tight_layout()
plt.show()

## Geographical Analysis

Explore the spatial distribution of incidents.

In [None]:
# Geographical analysis: busiest towns, distance distribution, and a raw
# longitude/latitude scatter, followed by a short printed summary.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Geographical Distribution of Incidents', fontsize=16)

# Rows with a usable town name. Computed once so that the bar chart and the
# printed summary agree (blank or missing names are excluded from both).
has_town_column = 'address.town' in df.columns
if has_town_column:
    town_is_named = df['address.town'].notna() & (df['address.town'].astype(str).str.strip() != "")
    towns_with_name = df[town_is_named]
    top_towns = towns_with_name['address.town'].value_counts().head(10)

# Top locations by incident count
if has_town_column:
    axes[0].barh(range(len(top_towns)), top_towns.values)
    axes[0].set_yticks(range(len(top_towns)))
    axes[0].set_yticklabels(top_towns.index)
    axes[0].invert_yaxis()  # put the busiest town at the top of the chart
    axes[0].set_title('Top 10 Towns by Incident Count')
    axes[0].set_xlabel('Number of Incidents')

# Distance distribution
if 'distance' in df.columns:
    # Remove outliers (above the 99th percentile) for better visualization
    q99 = df['distance'].quantile(0.99)
    distance_filtered = df.loc[df['distance'] <= q99, 'distance']

    axes[1].hist(distance_filtered, bins=50, alpha=0.7, edgecolor='black')
    axes[1].set_title('Distribution of Incident Distances')
    axes[1].set_xlabel('Distance (meters)')
    axes[1].set_ylabel('Frequency')

# Scatter plot of coordinates (if available)
if 'latitude' in df.columns and 'longitude' in df.columns:
    valid_coords = df.dropna(subset=['latitude', 'longitude'])
    axes[2].scatter(valid_coords['longitude'], valid_coords['latitude'],
                    alpha=0.6, s=20)
    axes[2].set_title('Incident Locations')
    axes[2].set_xlabel('Longitude')
    axes[2].set_ylabel('Latitude')
    axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print geographical summary
print("=== GEOGRAPHICAL SUMMARY ===")
if has_town_column:
    print(f"Number of unique towns: {towns_with_name['address.town'].nunique()}")
    if not top_towns.empty:
        # value_counts() is sorted descending, so the first label is the busiest
        # town and matches the first bar of the chart.
        print(f"Most incidents in: {top_towns.index[0]}")
    else:
        print("No valid town data available.")

if 'distance' in df.columns:
    print(f"Average distance: {df['distance'].mean():.0f} meters")
    print(f"Max distance: {df['distance'].max():.0f} meters")

## Interactive Heatmap

Let's create two interactive Folium maps: a heatmap that visualizes the geographic density of Search and Rescue incidents, and a marker map showing each individual incident location.

In [None]:
# Create interactive maps using Folium: a density heatmap and a map with one
# clickable marker per incident. Both are centered on the team's home base.

def _make_base_map(center, tiles):
    """Return a folium.Map centered on `center` ([lat, lon]) with the home-base marker added."""
    base_map = folium.Map(location=center, zoom_start=9, tiles=tiles)
    folium.Marker(
        location=center,
        popup=folium.Popup("<b>Taylor's Landing</b><br>SAR Home Base", max_width=200),
        icon=folium.Icon(color='green', icon='home', prefix='fa')
    ).add_to(base_map)
    return base_map


def _popup_value(value, max_chars=None):
    """Return `value` as HTML-escaped text for a popup; missing values become 'N/A'.

    `max_chars`, when given, truncates the text before escaping (used to keep
    only the date part of a timestamp).
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return 'N/A'
    text = str(value)
    if max_chars is not None:
        text = text[:max_chars]
    # Incident fields come from the data file; escape them so stray '<' or '&'
    # cannot break (or inject markup into) the popup HTML.
    return html.escape(text)


if 'latitude' in df.columns and 'longitude' in df.columns:
    # Keep only the rows that have both coordinates
    valid_coords_df = df.dropna(subset=['latitude', 'longitude']).copy()

    print(f"Creating maps with {len(valid_coords_df)} incidents that have coordinates")

    if len(valid_coords_df) > 0:
        # Taylor's Landing coordinates (home base)
        taylors_landing_lat = 47.94964
        taylors_landing_lon = -122.077301
        home_base = [taylors_landing_lat, taylors_landing_lon]

        print(f"Map centered on Taylor's Landing: {taylors_landing_lat}, {taylors_landing_lon}")

        # === HEATMAP ===
        heatmap = _make_base_map(home_base, tiles='OpenStreetMap')

        # Prepare data for heatmap - list of [lat, lon] pairs
        heat_data = valid_coords_df[['latitude', 'longitude']].to_numpy().tolist()

        # Add heatmap layer
        HeatMap(
            heat_data,
            min_opacity=0.3,
            max_zoom=18,
            radius=17,
            blur=15,
            gradient={
                0.1: 'blue',
                0.3: 'lime',
                0.5: 'yellow',
                0.7: 'orange',
                0.8: 'red'
            }
        ).add_to(heatmap)

        # Add layer control
        folium.LayerControl().add_to(heatmap)

        print("📍 Incident Density Heatmap:")
        display(heatmap)

        # === INCIDENT POINTS MAP ===
        # Second map for individual incident markers
        incidents_map = _make_base_map(home_base, tiles='CartoDB positron')

        # Add all incident markers
        for _, incident in valid_coords_df.iterrows():
            # A missing 'night' value is NaN, which is truthy; treat it as "not night"
            # instead of letting it fall into the night branch.
            night_flag = incident.get('night', False)
            is_night = bool(pd.api.types.is_scalar(night_flag)
                            and pd.notna(night_flag)
                            and night_flag)

            # Popup text with incident details (every field escaped)
            popup_text = f"""
            <b>Incident:</b> {_popup_value(incident.get('referenceDescription'))}<br>
            <b>Date:</b> {_popup_value(incident.get('startsAt'), max_chars=10)}<br>
            <b>Town:</b> {_popup_value(incident.get('address.town'))}<br>
            <b>Attendance:</b> {_popup_value(incident.get('countAttendance'))} people<br>
            <b>Night:</b> {'Yes' if is_night else 'No'}<br>
            <b>Distance:</b> {_popup_value(incident.get('distance'))} meters
            """

            # Color code by night/day
            color = 'darkblue' if is_night else 'orange'
            fill_color = 'blue' if is_night else 'yellow'

            # folium's path options are snake_case, and the fill must be enabled
            # explicitly for the blue/yellow interior to be drawn.
            folium.CircleMarker(
                location=[incident['latitude'], incident['longitude']],
                radius=4,
                popup=folium.Popup(popup_text, max_width=300),
                color=color,
                fill=True,
                fill_color=fill_color,
                fill_opacity=0.7,
                weight=2
            ).add_to(incidents_map)

        # Add layer control
        folium.LayerControl().add_to(incidents_map)

        print(f"\n🗺️ Individual Incident Locations ({len(valid_coords_df)} incidents):")
        print("Blue markers = Night incidents, Yellow markers = Day incidents")
        display(incidents_map)

    else:
        print("No valid coordinates found for maps")
else:
    print("Latitude and longitude columns not found")

## Summary and Key Insights

Based on our analysis of the D4H incident data, here are the key findings:

In [None]:
# Generate summary insights. Every section is guarded on the columns it reads,
# so a missing column skips that line instead of aborting the whole summary.

def _percent_of(count, total):
    """Return `count` as a percentage of `total`, or 0.0 when `total` is zero."""
    return count / total * 100 if total else 0.0


print("🔍 KEY INSIGHTS FROM D4H INCIDENT DATA ANALYSIS")
print("=" * 60)

total_incidents = len(df)

# Data overview
print("📊 DATASET OVERVIEW:")
print(f"   • Total incidents analyzed: {total_incidents:,}")
# strftime is only valid on a real timestamp, so require at least one non-null start
if 'startsAt' in df.columns and df['startsAt'].notna().any():
    first_start = df['startsAt'].min()
    last_start = df['startsAt'].max()
    print(f"   • Date range: {first_start.strftime('%Y-%m-%d')} to {last_start.strftime('%Y-%m-%d')}")
if 'address.town' in df.columns and 'address.region' in df.columns:
    print(f"   • Geographic coverage: {df['address.town'].nunique()} towns across {df['address.region'].nunique()} regions")

# Temporal patterns
print("\n⏰ TEMPORAL PATTERNS:")
if 'night' in df.columns:
    night_pct = _percent_of(df['night'].sum(), total_incidents)
    print(f"   • {night_pct:.1f}% of incidents occur at night")

# 'year' and 'month' can be float columns when some start times are missing;
# drop the gaps and cast to int so the label prints as 2021 (not 2021.0) and
# the month can be used as a list index.
if 'year' in df.columns:
    known_years = df['year'].dropna().astype(int)
    if not known_years.empty:
        peak_year = known_years.mode().iloc[0]
        peak_count = int((known_years == peak_year).sum())
        print(f"   • Peak incident year: {peak_year} ({peak_count} incidents)")

if 'month' in df.columns:
    known_months = df['month'].dropna().astype(int)
    if not known_months.empty:
        peak_month = known_months.mode().iloc[0]
        month_names = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        print(f"   • Most active month: {month_names[peak_month]}")

# Geographic patterns
print("\n🗺️ GEOGRAPHIC PATTERNS:")
if 'address.town' in df.columns:
    # Filter out empty/missing town names
    town_is_named = df['address.town'].notna() & (df['address.town'].astype(str).str.strip() != "")
    towns_with_data = df[town_is_named]
    if not towns_with_data.empty:
        top_town = towns_with_data['address.town'].mode().iloc[0]
        top_count = int((towns_with_data['address.town'] == top_town).sum())
        print(f"   • Highest incident town: {top_town} ({top_count} incidents)")

if 'distance' in df.columns:
    avg_distance = df['distance'].mean()
    max_distance = df['distance'].max()
    print(f"   • Average incident distance: {avg_distance:.0f} meters")
    print(f"   • Maximum incident distance: {max_distance:.0f} meters")

# Team response patterns
print("\n👥 TEAM RESPONSE PATTERNS:")
if 'countAttendance' in df.columns:
    avg_attendance = df['countAttendance'].mean()
    max_attendance = df['countAttendance'].max()
    print(f"   • Average team size: {avg_attendance:.1f} people")
    print(f"   • Largest response: {max_attendance} people")

if 'percAttendance' in df.columns:
    avg_attendance_pct = df['percAttendance'].mean()
    print(f"   • Average attendance rate: {avg_attendance_pct:.1f}%")

# Data quality
print("\n✅ DATA QUALITY:")
if 'latitude' in df.columns and 'longitude' in df.columns:
    rows_with_coords = len(df.dropna(subset=['latitude', 'longitude']))
    coord_coverage = _percent_of(rows_with_coords, total_incidents)
    print(f"   • Geographic coordinate coverage: {coord_coverage:.1f}%")

if 'description' in df.columns:
    missing_descriptions = df['description'].isna().sum()
    described = total_incidents - missing_descriptions
    print(f"   • Incidents with descriptions: {described:,} ({_percent_of(described, total_incidents):.1f}%)")

print("\n💡 The interactive heatmap above shows the geographic distribution of incidents,")
print("   with red areas indicating higher concentrations of Search and Rescue activity.")
print("=" * 60)