# Exploratory Data Analysis (EDA)

This notebook covers EDA and a first round of data cleaning only.

It does **not** train models.

Goal:
- understand data quality
- check distributions and seasonality
- create a clean feature table for later modeling

## Step 1: Load data

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# One plot style for every figure in this notebook.
plt.style.use("seaborn-v0_8-whitegrid")

# Input paths are resolved relative to the current working directory.
ROOT = Path.cwd()
FEATURE_PATH = ROOT.joinpath("Data", "processed", "zh_monthly_features.parquet")
SITES_PATH = ROOT.joinpath("Data", "processed", "zh_sites.parquet")

# Both processed inputs must be present before anything is read.
if not all(path.exists() for path in (FEATURE_PATH, SITES_PATH)):
    raise FileNotFoundError("Run Data/datasets.ipynb first to create processed files.")

df = pd.read_parquet(FEATURE_PATH)
sites = pd.read_parquet(SITES_PATH)

# Later cells use the .dt accessor on this column, so make sure it is datetime-typed.
df["month"] = pd.to_datetime(df["month"])

first_month = df["month"].min().date()
last_month = df["month"].max().date()

print(f"features shape: {df.shape}")
print(f"sites shape: {sites.shape}")
print(f"date range: {first_month} -> {last_month}")
print(f"number of sites: {df['site_id'].nunique()}")

## Step 2: Data quality checks

In [None]:
# Column inventory.
print("Columns:")
print(df.columns.tolist())

# A (site_id, month) pair is expected to identify a single row; count the repeats.
is_repeat = df.duplicated(subset=["site_id", "month"])
print(f"\nDuplicate site-month rows: {int(is_repeat.sum())}")

# Percentage of missing values per column, largest first.
missing_pct = df.isna().mean().mul(100).sort_values(ascending=False)
print("\nMissing rate (%):")
print(missing_pct.round(2).astype(str) + "%")


In [None]:
# Per-site share of missing meteorological values
meteo_cols = ["tp_mm", "pet_mm", "t2m_c"]


def _meteo_missing_share(site_rows):
    """Return the mean, over the columns, of each column's missing fraction."""
    per_column = site_rows.isna().mean()
    return per_column.mean()


site_meteo_missing = df.groupby("site_id")[meteo_cols].apply(_meteo_missing_share)
site_meteo_missing = site_meteo_missing.rename("meteo_missing_ratio").reset_index()

# How many sites fall on each distinct missing ratio.
ratio_counts = site_meteo_missing["meteo_missing_ratio"].value_counts().sort_index()
print("Site-level meteo missing ratio counts:")
print(ratio_counts)

# A ratio of exactly 1.0 means every meteo value of that site is missing.
fully_missing = site_meteo_missing["meteo_missing_ratio"] == 1.0
sites_all_meteo_missing = set(site_meteo_missing.loc[fully_missing, "site_id"].tolist())

n_unusable = len(sites_all_meteo_missing)
print()
print("Sites with all meteo missing:", n_unusable)
print("Sites with usable meteo:", df["site_id"].nunique() - n_unusable)


In [None]:
# Bar chart of the per-column missing percentages, highest first.
fig, ax = plt.subplots(figsize=(7, 3.8))
ranked_missing = missing_pct.sort_values(ascending=False)
ranked_missing.plot.bar(ax=ax)
ax.set(ylabel="Missing (%)", title="Missing Rate by Column")
fig.tight_layout()
plt.show()

## Step 3: Distribution checks

In [None]:
# Numeric columns to summarise, with extra tail percentiles to expose outliers.
num_cols = ["tp_mm", "pet_mm", "t2m_c", "spi3_z", "spi6_z", "spi12_z", "pumping_m3_month"]
tail_percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]
shown_stats = ["mean", "std", "min", "1%", "5%", "50%", "95%", "99%", "max"]

# Transposed so that each feature is a row and each statistic a column.
summary = df[num_cols].describe(percentiles=tail_percentiles).transpose()
print(summary[shown_stats])

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(14, 3.8))

# The first two panels are plain histograms of the non-missing values.
for panel, column in zip(axes[:2], ("tp_mm", "t2m_c")):
    df[column].dropna().hist(ax=panel, bins=40)
    panel.set_title(column)

# Pumping is capped at its 99th percentile before plotting, so the largest
# values pile up in the last bin instead of stretching the x-axis.
clip_99 = df["pumping_m3_month"].quantile(0.99)
capped_pumping = df["pumping_m3_month"].clip(upper=clip_99)
capped_pumping.hist(ax=axes[2], bins=40)
axes[2].set_title("pumping_m3_month (clip 99%)")

fig.tight_layout()
plt.show()

## Step 4: Time patterns

In [None]:
# Mean of each variable across all sites, one row per month
trend_cols = ["tp_mm", "pet_mm", "t2m_c", "pumping_m3_month"]
monthly_avg = df.groupby("month")[trend_cols].mean().sort_index()

# One panel per variable, filled row by row; the panels share the time axis.
fig, axes = plt.subplots(2, 2, figsize=(12, 6), sharex=True)
for panel, column in zip(axes.flat, trend_cols):
    monthly_avg[column].plot(ax=panel, title=f"Mean {column}")
fig.tight_layout()
plt.show()

In [None]:
# Average each meteo variable per calendar month, pooling all years and sites
tmp = df.assign(cal_month=df["month"].dt.month)
season = tmp.groupby("cal_month")[["tp_mm", "pet_mm", "t2m_c"]].mean()

# One line per variable on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 4))
season.plot(ax=ax)
ax.set(title="Seasonality by Calendar Month", xlabel="Month")
fig.tight_layout()
plt.show()

## Step 5: Feature relationships

In [None]:
# Pairwise correlation between the numeric features.
corr_cols = ["tp_mm", "pet_mm", "t2m_c", "spi3_z", "spi6_z", "spi12_z", "pumping_m3_month"]
corr = df[corr_cols].corr(numeric_only=True)
print(corr.round(3))

# numeric_only=True silently drops any non-numeric column, so the tick labels
# are taken from the matrix that was actually computed, not from corr_cols.
# Otherwise a dropped column would shift every label off its row/column.
corr_labels = corr.columns.tolist()
tick_positions = range(len(corr_labels))

fig, ax = plt.subplots(figsize=(7, 5))
# Fixed colour limits keep the scale at the full [-1, 1] correlation range.
im = ax.imshow(corr.values, cmap="coolwarm", vmin=-1, vmax=1)
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(corr_labels, rotation=45, ha="right")
ax.set_yticklabels(corr_labels)
ax.set_title("Correlation Matrix")
plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
plt.tight_layout()
plt.show()

## Step 6: First cleaning for model input

In [None]:
# Work on a copy so the raw frame stays available to the other cells.
work = df.copy()

# 1) drop site-level ERA5 metadata (columns that are absent are skipped)
drop_cols = ["number", "latitude", "longitude", "expver"]
work = work.drop(columns=drop_cols, errors="ignore")

# 2) remove sites with all meteo missing
keep_site = ~work["site_id"].isin(sites_all_meteo_missing)
work = work.loc[keep_site].copy()

# 3) remove rows with missing hydro features
work = work.dropna(subset=["tp_mm", "pet_mm", "t2m_c"])

# 4) remove SPI warm-up rows
work = work.dropna(subset=["spi12_z"])

# 5) pumping transform: floor at zero, then add a log1p-scaled copy
work["pumping_m3_month"] = work["pumping_m3_month"].clip(lower=0)
work["pumping_m3_month_log1p"] = np.log1p(work["pumping_m3_month"])

# 6) month cyclical encoding: the calendar month as an angle on a 12-step circle
month_angle = 2 * np.pi * work["month"].dt.month / 12.0
work["month_sin"] = np.sin(month_angle)
work["month_cos"] = np.cos(month_angle)

work = work.sort_values(["site_id", "month"]).reset_index(drop=True)

# Keep only the site rows whose site_id still appears in the cleaned features.
kept_site_ids = work["site_id"].unique()
sites_clean = sites[sites["site_id"].isin(kept_site_ids)].copy()

first_clean_month = work["month"].min().date()
last_clean_month = work["month"].max().date()

print(f"clean feature shape: {work.shape}")
print(f"clean sites shape: {sites_clean.shape}")
print(f"clean date range: {first_clean_month} -> {last_clean_month}")
print(f"clean site count: {work['site_id'].nunique()}")

In [None]:
# Feature groups for your ablation design: each set extends the previous one.
hydro_features = ["tp_mm", "pet_mm", "t2m_c"]
pumping_features = ["pumping_m3_month_log1p"]
spi_features = ["spi3_z", "spi6_z", "spi12_z"]

feature_sets = {
    "hydro_only": list(hydro_features),
    "hydro_plus_pumping": hydro_features + pumping_features,
    "hydro_plus_pumping_plus_spi": hydro_features + pumping_features + spi_features,
}

# Print every group as "name -> columns".
for set_name, columns in feature_sets.items():
    print(f"{set_name} -> {columns}")

## Step 7: Save EDA outputs

In [None]:
# Write the cleaned feature and site tables next to the other processed files.
out_dir = ROOT / "Data" / "processed"
out_dir.mkdir(parents=True, exist_ok=True)

feat_out = out_dir / "zh_features_model_ready.parquet"
site_out = out_dir / "zh_sites_model_ready.parquet"

try:
    work.to_parquet(feat_out, index=False)
    sites_clean.to_parquet(site_out, index=False)
except Exception as exc:
    # Deliberate best-effort fallback: whatever made the Parquet write fail,
    # still save both tables as pickle. The reason is printed rather than
    # swallowed, so an unexpected format switch is visible to the reader.
    # NOTE: if only the second Parquet write failed, the features Parquet
    # file written just before it stays on disk beside the pickles.
    print(f"Parquet export failed ({type(exc).__name__}: {exc}); falling back to pickle.")
    feat_out = feat_out.with_suffix(".pkl")
    site_out = site_out.with_suffix(".pkl")
    work.to_pickle(feat_out)
    sites_clean.to_pickle(site_out)

# Whichever format succeeded, these hold the paths that were actually written.
saved_feat, saved_site = feat_out, site_out

print("Saved:")
print("-", saved_feat)
print("-", saved_site)
print(work.head())

## Status vs your thesis design

What is done now:
- EDA is done.
- Hydro, pumping, and SPI features are prepared.
- A clean feature table is saved for modeling.

What is not done yet:
- The target variable (`y_anomaly`, SGI-like) has not been created yet.
- The train/validation/test split (including the target) has not been built yet.
- Model training and evaluation have not been done yet.