# Global Disaster Analysis
**Student Name:** Alperen Sağlam - 150240715

**Student Name:** İbrahim Bancar - 150220313




### 1. Introduction
This notebook performs a comprehensive **Exploratory Data Analysis (EDA)** on the Global Disaster Dataset. The goal is to understand the underlying structure of the data, identify key patterns, clean inconsistencies, and visualize high-impact trends.

**Objectives:**
- Understand the structure and content of the dataset
- Analyze distributions of key impact and response variables
- Explore temporal and geographical patterns
- Identify correlations and potential outliers

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import geopandas as gpd
import geodatasets
from utils import *

In [None]:
# Load dataset
# The workbook path is relative to the notebook's working directory.
# NOTE(review): the file name suggests an EM-DAT export covering 2018-2024 — confirm the source.
file_path = "../data/public_emdat_custom_request_2018-2024.xlsx"
df = pd.read_excel(file_path)

# Report (rows, columns); the trailing expression renders the first three records.
print(f"Data Shape: {df.shape}")
df.head(3)


In [None]:
# General info
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# directly to stdout and returns None, so it is called on its own: wrapping it
# in print() would append a stray "None" line after the report.
print("--- Info ---")
df.info()

In [None]:
# Number of missing entries in every column
print("\n--- Missing Values ---")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# describe() output, transposed so that each described column becomes a row
print("\n--- Statistics ---")
stats_by_column = df.describe().transpose()
print(stats_by_column)

In [None]:
# Event count per disaster type; skipped when the column is not in the frame
if "Disaster Type" in df:
    print("\n--- Disaster Types ---")
    type_counts = df["Disaster Type"].value_counts()
    print(type_counts)

In [None]:
# Coerce every date component that exists in the frame to a number;
# entries that cannot be parsed become NaN.
date_part_columns = ["Start Year", "Start Month", "Start Day", "End Year", "End Month", "End Day"]
for part in (name for name in date_part_columns if name in df.columns):
    df[part] = pd.to_numeric(df[part], errors="coerce")

# A missing start month or day falls back to 1 so that a date can still be assembled.
df["start_month_f"] = df["Start Month"].fillna(1).astype(int)
df["start_day_f"] = df["Start Day"].fillna(1).astype(int)

# Assemble the start date from its components; combinations that do not form
# a valid calendar date are coerced to NaT.
# NOTE: astype(int) on "Start Year" raises if any year is NaN.
start_components = {
    "year": df["Start Year"].astype(int),
    "month": df["start_month_f"],
    "day": df["start_day_f"],
}
df["event_start_date"] = pd.to_datetime(start_components, errors="coerce")

# Temporal features derived from the assembled date
df["year"] = df["event_start_date"].dt.year
df["month"] = df["event_start_date"].dt.month

earliest_start = df["event_start_date"].min().date()
latest_start = df["event_start_date"].max().date()
print("Start date range:", earliest_start, "to", latest_start)

# Show the derived columns next to their raw inputs for a quick visual check
df[["event_start_date", "year", "month", "Start Year", "Start Month", "Start Day"]].head()


In [None]:
# How many distinct disaster types occur in the data
print("Number of unique disaster types:")
print(df["Disaster Type"].nunique())

# Ranking by event count. head(10) matches the "Top 10" heading printed just
# above it (and the country ranking, which also lists ten entries).
print("\nTop 10 most frequent disaster types:")
print(df["Disaster Type"].value_counts().head(10))


In [None]:
# Distinct countries, then the ten countries with the most recorded events
countries = df["Country"]

print("\nNumber of unique countries:")
print(countries.nunique())

print("\nTop 10 most affected countries by event count:")
events_per_country = countries.value_counts()
print(events_per_country.head(10))


In [None]:
# 2x2 grid: three people-count panels followed by one economic-loss panel
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()  # 1-D array so each panel is addressed by a single index

# Tick positions (powers of ten) passed to the plotting helper
ticks_people = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000]
ticks_usd    = [1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000]

# (source column, panel title) for the first three panels, in panel order
people_panels = [
    ("Total Deaths", "Total Deaths"),
    ("No. Injured", "Number of Injured"),
    ("Total Affected", "Total Affected"),
]
for panel_ax, (column, panel_title) in zip(axes, people_panels):
    plot_hist_log_highlight_median(
        panel_ax,
        df[column],
        panel_title,
        "People Count (log scale)",
        ticks_people
    )

# Fourth panel: the damage column is expressed in thousands of US$, so it is
# multiplied by 1000 to plot plain USD.
plot_hist_log_highlight_median(
    ax=axes[3],
    s_raw=df["Total Damage, Adjusted ('000 US$)"] * 1000,
    title="Distribution of Economic Loss",
    xlabel="Economic Damage (USD, Log Scale)",
    ticks_raw=ticks_usd,
    prefix="$",
    bins_n=30,
    # Presumably moves the helper's text label to the right so it does not
    # overlap on the log axis — confirm against utils.
    text_offset_factor=1.8
)

# Leave a small margin at the bottom and top while packing the subplots
plt.tight_layout(rect=[0, 0.03, 1, 0.98])
plt.show()


In [None]:
# Economic damage in plain USD (the source column is in thousands of US$),
# restricted to strictly positive values so the log axis is well defined.
data = df["Total Damage, Adjusted ('000 US$)"].dropna().mul(1000)
data = data.loc[data > 0]

sns.set_style("whitegrid")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
linear_ax, log_ax = axes

# ---- Left panel: linear x-axis, shown to illustrate the problem ----
linear_ax.hist(data, bins=50, color='gray', alpha=0.7, edgecolor='white')
linear_ax.set_title("Linear Scale: Why the Data Is Not Visible?", fontsize=14, fontweight='bold')
linear_ax.set_xlabel("Economic Damage (USD)")
linear_ax.set_ylabel("Number of Events")
linear_ax.xaxis.set_major_formatter(formatter)

# Annotation placed at the centre of the panel (axes coordinates)
linear_ax.text(
    0.5,
    0.5,
    "99% of the data is compressed\ninto a single bar on the left.\n(Extreme right-skewed distribution)",
    transform=linear_ax.transAxes,
    ha='center',
    color='red',
    fontsize=11,
    fontweight='bold'
)

# ---- Right panel: same data with logarithmically spaced bins ----
log_lo, log_hi = np.log10(data.min()), np.log10(data.max())
bins = np.logspace(log_lo, log_hi, 40)

n, bins, patches = log_ax.hist(data, bins=bins, color='#1f77b4', alpha=0.8, edgecolor='white')
log_ax.set_xscale("log")

# Dashed reference line at the median, labelled through the legend
median_val = data.median()
log_ax.axvline(
    median_val,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Median: {currency_format(median_val, 0)}'
)

log_ax.set_title("Logarithmic Scale: True Distribution Emerges", fontsize=14, fontweight='bold')
log_ax.set_xlabel("Economic Damage (USD, Log Scale)")
log_ax.set_ylabel("Number of Events")
log_ax.xaxis.set_major_formatter(formatter)
log_ax.legend()

# Figure-level title, then pack the panels
plt.suptitle("Economic Damage Analysis: Why Log Scale Matters", fontsize=16, y=1.05)
plt.tight_layout()
plt.show()


In [None]:
# Impact variables examined for extreme values (economic loss scaled to plain USD)
outlier_cols = {
    "Total Deaths": df["Total Deaths"],
    "No. Injured": df["No. Injured"],
    "Total Affected": df["Total Affected"],
    "Economic Loss (USD)": df["Total Damage, Adjusted ('000 US$)"] * 1000
}


def _tail_stats(label, values):
    """Return percentile-based tail statistics for one variable.

    The values are coerced to numeric, missing entries are dropped and only
    strictly positive observations are kept. Returns None when nothing is
    left; otherwise a dict with the count, the 95th/99th percentiles, the
    maximum and the share of observations above each percentile.
    """
    positive = pd.to_numeric(values, errors="coerce").dropna()
    positive = positive[positive > 0]
    if positive.empty:
        return None

    p95, p99 = np.percentile(positive, [95, 99])
    total = len(positive)
    return {
        "Variable": label,
        "Count": total,
        "P95": p95,
        "P99": p99,
        "Max": positive.max(),
        "Events > P95 (%)": 100 * (positive > p95).sum() / total,
        "Events > P99 (%)": 100 * (positive > p99).sum() / total
    }


# One record per variable that has usable data, in the order declared above
candidate_rows = (_tail_stats(label, values) for label, values in outlier_cols.items())
summary = [row for row in candidate_rows if row is not None]

# Tabulate the records; the trailing expression renders the table
outlier_summary = pd.DataFrame(summary)
outlier_summary

In [None]:
# Box plots of human and economic impact on log-scaled y-axes.
#
# The tick formatters are built through `mtick`, the alias under which the
# notebook's import cell binds matplotlib.ticker. The name `ticker` is only
# imported by a later cell, so relying on it here can fail on a fresh
# "Restart & Run All".

# Convert to long format for seaborn (one row per event/metric pair)
df_human = pd.melt(
    df,
    value_vars=["Total Deaths", "No. Injured", "Total Affected"],
    var_name="Metric",
    value_name="Value"
)
df_econ = pd.melt(
    df,
    value_vars=["Total Damage, Adjusted ('000 US$)"],
    var_name="Metric",
    value_name="Value"
)

# Economic values are stored in thousands of US$: convert to USD.
# Only strictly positive values are kept, since they are drawn on a log axis
# (the comparison also drops missing values).
df_econ["Value"] = df_econ["Value"] * 1000
df_econ = df_econ[df_econ["Value"] > 0]
df_human = df_human[df_human["Value"] > 0]

# Side-by-side panels; the human-impact panel is twice as wide (three boxes vs one)
fig, axes = plt.subplots(1, 2, figsize=(16, 7), gridspec_kw={'width_ratios': [2, 1]})

# Human impact boxplot; hue mirrors x so that `palette` is applied per metric
sns.boxplot(
    data=df_human,
    x="Metric",
    y="Value",
    hue="Metric",
    ax=axes[0],
    palette="Reds",
    showfliers=True,
    width=0.5,
    legend=False
)
# Use log scale for heavy-tailed data
axes[0].set_yscale("log")
axes[0].set_title("Human Impact Analysis (Log Scale)", fontweight="bold")
axes[0].set_ylabel("Number of People")
axes[0].set_xlabel("")
# Format y-axis ticks with the human-readable number helper
axes[0].yaxis.set_major_formatter(mtick.FuncFormatter(human_format))
axes[0].grid(True, axis="y", alpha=0.3, which="major")

# Economic impact boxplot
# NOTE(review): combining `color` with `hue` may trigger seaborn's
# gradient-palette deprecation warning on recent versions — confirm.
sns.boxplot(
    data=df_econ,
    x="Metric",
    y="Value",
    hue="Metric",
    ax=axes[1],
    color="#2ca02c",
    showfliers=True,
    width=0.4,
    legend=False
)
# Use log scale for economic loss
axes[1].set_yscale("log")
axes[1].set_title("Economic Impact Analysis (Log Scale)", fontweight="bold")
axes[1].set_ylabel("Economic Loss (USD)")
axes[1].set_xlabel("")
# Format y-axis as currency
axes[1].yaxis.set_major_formatter(mtick.FuncFormatter(currency_format))
axes[1].grid(True, axis="y", alpha=0.3, which="major")

# Overall title, then pack the panels
plt.suptitle("Outlier Analysis: Detecting Extreme Disaster Events", fontsize=16, fontweight="bold", y=0.98)
plt.tight_layout()
plt.show()


In [None]:
# =========================
# Extended Correlation Matrix (Log-Transformed Impact Variables)
# =========================

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Variables entering the correlation matrix; monetary columns are stored in
# thousands of US$ and are scaled to plain USD.
corr_cols = {
    "Deaths": df["Total Deaths"],
    "Injured": df["No. Injured"],
    "Affected": df["Total Affected"],
    "Homeless": df["No. Homeless"],
    "Economic Loss (USD)": df["Total Damage, Adjusted ('000 US$)"] * 1000,
    "Reconstruction Cost (USD)": df["Reconstruction Costs, Adjusted ('000 US$)"] * 1000,
    "Insured Damage (USD)": df["Insured Damage, Adjusted ('000 US$)"] * 1000
}

# Coerce each series to numeric, treat missing or unparseable entries as 0,
# then apply log(1 + x).
corr_df = pd.DataFrame({
    label: np.log1p(pd.to_numeric(raw, errors="coerce").fillna(0))
    for label, raw in corr_cols.items()
})

# Pairwise Pearson correlation of the transformed variables
corr_matrix = corr_df.corr(method="pearson")

# Annotated heatmap on a fixed [-1, 1] colour range centred at zero
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    center=0,
)
plt.figure(figsize=(9, 7))
sns.heatmap(corr_matrix, **heatmap_options)

plt.title(
    "Correlation Matrix of Disaster Impact Variables\n(Log-Transformed)",
    fontweight="bold"
)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Median economic loss per disaster type, in plain USD (the source column is
# in thousands of US$). Types whose damage values are all missing have a NaN
# median; they are dropped before ranking so they cannot occupy one of the
# ten slots and produce an empty bar with a NaN label.
impact_by_type = (
    df.assign(econ_loss_usd=df["Total Damage, Adjusted ('000 US$)"] * 1000)
      .groupby("Disaster Type")["econ_loss_usd"]
      .median()
      .dropna()
      .sort_values(ascending=False)
      .head(10)
      .reset_index()
)

# Create a wide figure to fit category labels
plt.figure(figsize=(14, 7))
# Enable grid background for readability
sns.set_style("whitegrid")

# Vertical bars; hue mirrors x so that `palette` is applied per category
ax = sns.barplot(
    data=impact_by_type,
    x="Disaster Type",
    y="econ_loss_usd",
    hue="Disaster Type",
    palette="Reds_r",
    edgecolor="black",
    legend=False
)

# Apply logarithmic scale on y-axis
ax.set_yscale("log")

# Format y-axis values as currency
ax.yaxis.set_major_formatter(ticker.FuncFormatter(currency_format))

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right', fontsize=11, fontweight='bold')

# Annotate each bar with its median; the label is placed 15% above the bar
# top (a multiplicative offset, since the axis is logarithmic).
for i, v in enumerate(impact_by_type["econ_loss_usd"]):
    ax.text(
        i,
        v * 1.15,
        currency_format2(v, 0),
        color='black',
        ha='center',
        va='bottom',
        fontweight='bold',
        fontsize=10
    )

# Set plot title
plt.title(
    "Top 10 Disaster Types by Typical Cost (Median - Log Scale)",
    fontsize=16,
    fontweight='bold',
    pad=20
)

plt.xlabel("")
# Label y-axis
plt.ylabel("Median Economic Loss (USD)", fontsize=12)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()
