In [7]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
NEE vs lagged predictors correlation scan (by calendar month), all sites.
Includes lag 0 (same-month) through lag 12.

Outputs (under OUT_DIR):
  - corr_matrix_<var>.csv                 (target_month x lag_months [0..12])
  - best_lag_per_month_all_variables.csv
  - overall_best_lag_summary.csv
  - corr_heatmap_<var>_annotated.png
"""

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ----------------- Config -----------------
# Input table; main() reads site_reference, year, month, NEE (or nee) and the predictors from it.
IN_CSV  = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv"
# Directory that receives every CSV and PNG produced by this cell (created if missing).
OUT_DIR = "/explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan"

# Predictor columns to scan; tmean_C is derived from tmmn/tmmx in main() when absent.
PREDICTORS = ["tmean_C", "pr", "NDVI", "snow_cover", "snow_depth"]
# Largest lag (in months) to test; lag 0 (same month) is always included.
MAX_LAG    = 12
MIN_PAIRS  = 24   # min (x,y) pairs required for a Pearson r

# ----------------- Helper functions -----------------
def add_lags_for_site(g: pd.DataFrame, cols, max_lag=12) -> pd.DataFrame:
    """
    Add calendar-month lags (1..max_lag) of the selected columns for one site.

    The lagged value for a row is the value observed exactly ``lag`` calendar
    months earlier. When that month is absent from the site's record the lag
    is NaN, so gaps in the time series never cause a value from a different
    month to be used (a plain row-wise ``shift`` would do that).

    Args:
        g: rows of a single site with integer-like ``year`` and ``month`` columns.
        cols: column names to lag; names not present in ``g`` are ignored.
        max_lag: largest lag, in months, to create.

    Returns:
        A copy of ``g`` sorted by date with a fresh 0..n-1 index, a ``date``
        column (first day of each month) and one ``<col>_lag<k>`` column per
        requested column and lag.
    """
    g = g.sort_values(["year", "month"]).copy()
    g["date"] = pd.to_datetime(dict(year=g["year"], month=g["month"], day=1))
    g = g.sort_values("date").reset_index(drop=True)

    # Month ordinal: consecutive calendar months differ by exactly 1, also across years.
    month_ord = g["date"].dt.year * 12 + g["date"].dt.month
    # The lookup below needs unique keys; keep the first row if a month is repeated.
    first_per_month = ~month_ord.duplicated()

    for col in cols:
        if col not in g.columns:
            continue
        lookup = pd.Series(
            g.loc[first_per_month, col].to_numpy(),
            index=month_ord[first_per_month].to_numpy(),
        )
        # lag 0 is just the same-month column; no need to add explicitly
        for lag in range(1, max_lag + 1):
            # Months missing from the record are absent from `lookup` and map to NaN.
            g[f"{col}_lag{lag}"] = (month_ord - lag).map(lookup)
    return g

def corr_with_min_pairs(x, y, min_pairs=MIN_PAIRS):
    """
    Pearson correlation of x and y over rows where both are present.

    Returns NaN when fewer than `min_pairs` complete (x, y) pairs remain
    after dropping missing values.
    """
    pairs = pd.DataFrame({"x": x, "y": y})
    complete = pairs.dropna()
    if len(complete) < min_pairs:
        return np.nan
    return complete["x"].corr(complete["y"])

def plot_corr_heatmap_with_values(corr_df: pd.DataFrame, var_name: str, out_file: Path):
    """
    Save an annotated heatmap of NEE correlations with lagged `var_name`.

    Rows of `corr_df` are target months and columns are lags; every non-missing
    cell is labelled with its r value (two decimals). The figure is written to
    `out_file` and closed.
    """
    grid = corr_df.values
    n_rows, n_cols = corr_df.shape

    fig, ax = plt.subplots(figsize=(12, 8))
    image = ax.imshow(grid, cmap="bwr", vmin=-1, vmax=1,
                      origin="upper", aspect="auto")

    colorbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    colorbar.set_label(f"Pearson r (NEE vs {var_name} lag)", fontsize=9)  # smaller legend text

    # One tick per cell, labelled with the lag / month values of the frame.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(corr_df.columns, fontsize=12)
    ax.set_yticklabels(corr_df.index, fontsize=12)
    ax.set_xlabel("Lag (months, 0 = same month)", fontsize=12)
    ax.set_ylabel("Target month", fontsize=12)
    ax.set_title(f"NEE correlation vs. {var_name} lags", fontsize=12, pad=16)

    # Write r into each cell; cells without a correlation stay blank.
    for (row_pos, col_pos), r_value in np.ndenumerate(grid):
        if pd.isna(r_value):
            continue
        ax.text(col_pos, row_pos, f"{r_value:.2f}", ha="center", va="center",
                color="black", fontsize=9)

    fig.tight_layout()
    fig.savefig(out_file, dpi=200)
    plt.close(fig)

# ----------------- Main pipeline -----------------
def main():
    """
    Run the NEE lag-correlation scan and write all outputs under OUT_DIR.

    Steps: load IN_CSV, average to one row per site-year-month, add lags
    1..MAX_LAG of every available predictor within each site, correlate NEE
    with each predictor for every (calendar month, lag) pair, then save the
    correlation matrices, the per-month and overall best-lag summaries and one
    annotated heatmap per predictor.

    Raises:
        ValueError: if site_reference, year, month or NEE is missing from the input.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # ----- Load & prepare -----
    # low_memory=False reads each column in a single pass so mixed-type columns
    # get one consistent dtype instead of triggering a DtypeWarning.
    df = pd.read_csv(IN_CSV, low_memory=False)

    # Normalize NEE column name and compute tmean_C if needed
    if "NEE" not in df.columns and "nee" in df.columns:
        df = df.rename(columns={"nee": "NEE"})
    if "tmean_C" not in df.columns and {"tmmn", "tmmx"}.issubset(df.columns):
        df["tmean_C"] = df[["tmmn", "tmmx"]].mean(axis=1)

    missing = sorted({"site_reference", "year", "month", "NEE"} - set(df.columns))
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Only scan predictors that are actually in the table.
    predictors = []
    for p in PREDICTORS:
        if p in df.columns:
            predictors.append(p)
        else:
            print(f"[WARN] Predictor '{p}' not found. Skipping.")
    if not predictors:
        print("[WARN] No predictors available. Nothing to do.")
        return

    # Filter and standardize types. Coerce first so unparsable year/month
    # values become NaN and are dropped instead of breaking the int cast.
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df["month"] = pd.to_numeric(df["month"], errors="coerce")
    df = df.dropna(subset=["site_reference", "year", "month"]).copy()
    df["year"] = df["year"].astype(int)
    df["month"] = df["month"].astype(int)
    df = df[df["year"] >= 2001].copy()

    # Collapse to one row per site-year-month (mean if duplicates)
    group_keys = ["site_reference", "year", "month"]
    agg_map = {c: "mean" for c in ["NEE"] + predictors}
    dfm = (
        df.groupby(group_keys, as_index=False)
          .agg(agg_map)
          .sort_values(group_keys)
          .reset_index(drop=True)
    )
    dfm["target_month"] = dfm["month"]

    # ----- Add lags within each site -----
    # Explicit loop instead of groupby.apply: same result, and it does not
    # depend on how apply treats the grouping column.
    site_frames = [
        add_lags_for_site(site_df, predictors, MAX_LAG)
        for _, site_df in dfm.groupby("site_reference")
    ]
    if not site_frames:
        print("[WARN] No rows left after filtering. Nothing to do.")
        return
    df_lagged = pd.concat(site_frames, ignore_index=True)

    # ----- Correlations (per target month × lag 0..12) -----
    heatmaps = {
        var: pd.DataFrame(index=range(1, 13), columns=range(0, MAX_LAG + 1), dtype=float)
        for var in predictors
    }

    for var in predictors:
        for m in range(1, 13):  # target month
            sub = df_lagged[df_lagged["target_month"] == m]
            for lag in range(0, MAX_LAG + 1):
                # lag 0 is the same-month predictor column itself
                series = sub.get(var) if lag == 0 else sub.get(f"{var}_lag{lag}")
                heatmaps[var].loc[m, lag] = corr_with_min_pairs(sub["NEE"], series, MIN_PAIRS)

    # ----- Save correlation matrices & summaries -----
    best_rows = []
    for var, mat in heatmaps.items():
        mat.index.name = "target_month"
        mat.columns.name = "lag_months"
        mat.to_csv(Path(OUT_DIR) / f"corr_matrix_{var}.csv", float_format="%.4f")

        for m in mat.index:
            row = mat.loc[m].dropna()
            if row.empty:
                best_rows.append({"variable": var, "target_month": m,
                                  "best_lag": np.nan, "corr": np.nan})
            else:
                k = row.abs().idxmax()
                best_rows.append({"variable": var, "target_month": m,
                                  "best_lag": int(k), "corr": float(row[k])})

    best_df = pd.DataFrame(best_rows)
    # Nullable integer keeps lags as "3" (not "3.0000") even when some months have no result.
    best_df["best_lag"] = best_df["best_lag"].astype("Int64")
    best_df.to_csv(Path(OUT_DIR) / "best_lag_per_month_all_variables.csv",
                   index=False, float_format="%.4f")

    summary = []
    for var, mat in heatmaps.items():
        # Mean |r| per lag across months; lags with no valid month are dropped
        # so idxmax never runs on an all-NaN series.
        mean_abs = mat.abs().mean(axis=0).dropna()
        if mean_abs.empty:
            summary.append({"variable": var,
                            "overall_best_lag": np.nan,
                            "overall_mean_abs_corr": np.nan})
            continue
        k = mean_abs.idxmax()
        summary.append({
            "variable": var,
            "overall_best_lag": int(k),
            "overall_mean_abs_corr": float(mean_abs[k])
        })
    summary_df = pd.DataFrame(summary)
    summary_df["overall_best_lag"] = summary_df["overall_best_lag"].astype("Int64")
    summary_df.to_csv(
        Path(OUT_DIR) / "overall_best_lag_summary.csv",
        index=False, float_format="%.4f"
    )

    # ----- Plot annotated heatmaps -----
    for var, mat in heatmaps.items():
        plot_corr_heatmap_with_values(
            mat,
            var,
            Path(OUT_DIR) / f"corr_heatmap_{var}_annotated.png"
        )

    print("Done. Results saved under:", OUT_DIR)

# Run the scan only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


  df = pd.read_csv(IN_CSV)


Done. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan


In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Lagged-predictor correlation scan (by calendar month), per target variable.
Targets: 'nee', 'gpp', 'reco', 'ch4_flux_total'
Lags: 0..12 months (0 = same-month)

For each TARGET:
  OUT_DIR/<target>/
    - corr_matrix_<predictor>.csv                 (target_month x lag_months [0..12])
    - best_lag_per_month_all_variables.csv
    - overall_best_lag_summary.csv
    - corr_heatmap_<predictor>_annotated.png
"""

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ----------------- Config -----------------
# Input table; main() lower-cases its column names before use.
IN_CSV   = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv"
# Base output directory; each target gets its own OUT_DIR/<target>/ subfolder.
OUT_DIR  = "/explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan"

# predictors must exist in the data (case-insensitive); missing ones are skipped gracefully
PREDICTORS = ["tmean_C", "pr", "NDVI", "snow_cover", "snow_depth"]
# Target columns to scan one by one; targets absent from the data are skipped with a warning.
TARGETS    = ["nee", "gpp", "reco", "ch4_flux_total"]

# Largest lag (in months) to test; lag 0 (same month) is always included.
MAX_LAG    = 12
MIN_PAIRS  = 24   # min (x,y) pairs required for a Pearson r

# ----------------- Helper functions -----------------
def add_lags_for_site(g: pd.DataFrame, cols, max_lag=12) -> pd.DataFrame:
    """
    Add calendar-month lags (1..max_lag) of the selected columns for one site.

    The lagged value for a row is the value observed exactly ``lag`` calendar
    months earlier. When that month is absent from the site's record the lag
    is NaN, so gaps in the time series never cause a value from a different
    month to be used (a plain row-wise ``shift`` would do that).

    Args:
        g: rows of a single site with integer-like ``year`` and ``month`` columns.
        cols: column names to lag; names not present in ``g`` are ignored.
        max_lag: largest lag, in months, to create.

    Returns:
        A copy of ``g`` sorted by date with a fresh 0..n-1 index, a ``date``
        column (first day of each month) and one ``<col>_lag<k>`` column per
        requested column and lag.
    """
    g = g.sort_values(["year", "month"]).copy()
    g["date"] = pd.to_datetime(dict(year=g["year"], month=g["month"], day=1))
    g = g.sort_values("date").reset_index(drop=True)

    # Month ordinal: consecutive calendar months differ by exactly 1, also across years.
    month_ord = g["date"].dt.year * 12 + g["date"].dt.month
    # The lookup below needs unique keys; keep the first row if a month is repeated.
    first_per_month = ~month_ord.duplicated()

    for col in cols:
        if col not in g.columns:
            continue
        lookup = pd.Series(
            g.loc[first_per_month, col].to_numpy(),
            index=month_ord[first_per_month].to_numpy(),
        )
        for lag in range(1, max_lag + 1):
            # Months missing from the record are absent from `lookup` and map to NaN.
            g[f"{col}_lag{lag}"] = (month_ord - lag).map(lookup)
    return g

def corr_with_min_pairs(x, y, min_pairs=MIN_PAIRS):
    """
    Pearson correlation of x and y over rows where both are present.

    Returns NaN when fewer than `min_pairs` complete (x, y) pairs remain
    after dropping missing values.
    """
    pairs = pd.DataFrame({"x": x, "y": y})
    complete = pairs.dropna()
    if len(complete) < min_pairs:
        return np.nan
    return complete["x"].corr(complete["y"])

def plot_corr_heatmap_with_values(corr_df: pd.DataFrame, target_name: str, var_name: str, out_file: Path):
    """
    Save an annotated heatmap of `target_name` correlations with lagged `var_name`.

    Rows of `corr_df` are target months and columns are lags; every non-missing
    cell is labelled with its r value (two decimals). The figure is written to
    `out_file` and closed.
    """
    grid = corr_df.values
    n_rows, n_cols = corr_df.shape

    fig, ax = plt.subplots(figsize=(12, 8))
    image = ax.imshow(grid, cmap="bwr", vmin=-1, vmax=1,
                      origin="upper", aspect="auto")

    colorbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    colorbar.set_label(f"Pearson r ({target_name} vs {var_name} lag)", fontsize=11)

    # One tick per cell, labelled with the lag / month values of the frame.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(corr_df.columns, fontsize=12)
    ax.set_yticklabels(corr_df.index, fontsize=12)
    ax.set_xlabel("Lag (months, 0 = same month)", fontsize=12)
    ax.set_ylabel("Target month", fontsize=12)
    ax.set_title(f"{target_name} correlation vs. {var_name} lags", fontsize=12, pad=16)

    # Write r into each cell; cells without a correlation stay blank.
    for (row_pos, col_pos), r_value in np.ndenumerate(grid):
        if pd.isna(r_value):
            continue
        ax.text(col_pos, row_pos, f"{r_value:.2f}", ha="center", va="center",
                color="black", fontsize=9)

    fig.tight_layout()
    fig.savefig(out_file, dpi=200)
    plt.close(fig)

def normalize_columns_case_insensitive(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` whose column names are all lower-case; the input is left untouched."""
    lowered_names = [name.lower() for name in df.columns]
    result = df.copy()
    result.columns = lowered_names
    return result

def ensure_tmean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Make sure a 'tmean_c' column is available when it can be derived.

    An existing 'tmean_c' is kept as is. Otherwise, if both 'tmmn' and 'tmmx'
    are present, 'tmean_c' is added to `df` (in place) as their row-wise mean.
    The same frame is returned in every case.
    """
    if "tmean_c" in df.columns:
        return df
    bounds = ["tmmn", "tmmx"]
    if all(name in df.columns for name in bounds):
        df["tmean_c"] = df[bounds].mean(axis=1)
    return df

# ----------------- Main pipeline -----------------
def main():
    """
    Run the lag-correlation scan for every target in TARGETS.

    The input is loaded once. For each target present in the data the rows are
    averaged to one per site-year-month, lags 1..MAX_LAG of the available
    predictors are added within each site, and the target is correlated with
    every predictor for each (calendar month, lag) pair. Matrices, best-lag
    summaries and annotated heatmaps are written to OUT_DIR/<target>/.

    Raises:
        ValueError: if site_reference, year or month is missing from the input.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # ----- Load & prepare (once) -----
    # low_memory=False reads each column in a single pass so mixed-type columns
    # get one consistent dtype instead of triggering a DtypeWarning.
    df = pd.read_csv(IN_CSV, low_memory=False)
    df = normalize_columns_case_insensitive(df)
    df = ensure_tmean(df)

    # Required indexing columns
    required_idx = {"site_reference", "year", "month"}
    missing = sorted(required_idx - set(df.columns))
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Filter and standardize types. Coerce first so unparsable year/month
    # values become NaN and are dropped together with the other missing keys.
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df["month"] = pd.to_numeric(df["month"], errors="coerce")
    df = df.dropna(subset=["site_reference", "year", "month"]).copy()
    df["year"] = df["year"].astype("Int64")
    df["month"] = df["month"].astype("Int64")
    df = df[df["year"] >= 2001].copy()

    # Available predictors/targets (lowercase)
    available_cols = set(df.columns)
    base_group_keys = ["site_reference", "year", "month"]

    for target in TARGETS:
        target_lc = target.lower()
        if target_lc not in available_cols:
            print(f"[WARN] Target '{target}' not found in data. Skipping.")
            continue

        # Per-target OUT_DIR
        target_dir = Path(OUT_DIR) / target_lc
        target_dir.mkdir(parents=True, exist_ok=True)

        # Use only predictors that exist
        preds_present = []
        for p in PREDICTORS:
            p_lc = p.lower()
            if p_lc in available_cols:
                preds_present.append(p_lc)
            else:
                print(f"[WARN] Predictor '{p}' not found. Skipping for target '{target}'.")
        if not preds_present:
            print(f"[WARN] No predictors available for target '{target}'. Skipping.")
            continue

        # Collapse to one row per site-year-month (mean handles accidental duplicates)
        agg_map = {c: "mean" for c in [target_lc] + preds_present}
        dfm = (
            df.groupby(base_group_keys, as_index=False)
              .agg(agg_map)
              .sort_values(base_group_keys)
              .reset_index(drop=True)
        )
        dfm["target_month"] = dfm["month"]

        # Add lags within each site for the predictors only. An explicit loop
        # gives the same result as groupby.apply without depending on how
        # apply treats the grouping column.
        site_frames = [
            add_lags_for_site(site_df, preds_present, MAX_LAG)
            for _, site_df in dfm.groupby("site_reference")
        ]
        if not site_frames:
            print(f"[WARN] No rows available for target '{target}'. Skipping.")
            continue
        df_lagged = pd.concat(site_frames, ignore_index=True)

        # Pre-allocate heatmaps (per predictor)
        heatmaps = {
            var: pd.DataFrame(index=range(1, 13), columns=range(0, MAX_LAG + 1), dtype=float)
            for var in preds_present
        }

        # Compute correlations: per target month x lag
        for var in preds_present:
            for m in range(1, 13):  # target month
                sub = df_lagged[df_lagged["target_month"] == m]
                for lag in range(0, MAX_LAG + 1):
                    # lag 0 is the same-month predictor column itself
                    series = sub.get(var) if lag == 0 else sub.get(f"{var}_lag{lag}")
                    heatmaps[var].loc[m, lag] = corr_with_min_pairs(sub[target_lc], series, MIN_PAIRS)

        # ----- Save correlation matrices & summaries -----
        best_rows = []
        for var, mat in heatmaps.items():
            mat.index.name = "target_month"
            mat.columns.name = "lag_months"
            mat.to_csv(target_dir / f"corr_matrix_{var}.csv", float_format="%.4f")

            for m in mat.index:
                row = mat.loc[m].dropna()
                if row.empty:
                    best_rows.append({"variable": var, "target_month": m,
                                      "best_lag": np.nan, "corr": np.nan})
                else:
                    k = row.abs().idxmax()
                    best_rows.append({"variable": var, "target_month": m,
                                      "best_lag": int(k), "corr": float(row[k])})

        best_df = pd.DataFrame(best_rows)
        # Nullable integer keeps lags as "3" (not "3.0000") even when some months have no result.
        best_df["best_lag"] = best_df["best_lag"].astype("Int64")
        best_df.to_csv(target_dir / "best_lag_per_month_all_variables.csv",
                       index=False, float_format="%.4f")

        summary = []
        for var, mat in heatmaps.items():
            # Mean |r| per lag across months; lags with no valid month are
            # dropped so idxmax never runs on an all-NaN series.
            mean_abs = mat.abs().mean(axis=0).dropna()
            if mean_abs.empty:
                summary.append({"variable": var,
                                "overall_best_lag": np.nan,
                                "overall_mean_abs_corr": np.nan})
                continue
            k = mean_abs.idxmax()
            summary.append({
                "variable": var,
                "overall_best_lag": int(k),
                "overall_mean_abs_corr": float(mean_abs[k])
            })
        summary_df = pd.DataFrame(summary)
        summary_df["overall_best_lag"] = summary_df["overall_best_lag"].astype("Int64")
        summary_df.to_csv(
            target_dir / "overall_best_lag_summary.csv",
            index=False, float_format="%.4f"
        )

        # ----- Plot annotated heatmaps -----
        for var, mat in heatmaps.items():
            plot_corr_heatmap_with_values(
                mat,
                target_name=target_lc,
                var_name=var,
                out_file=target_dir / f"corr_heatmap_{var}_annotated.png"
            )

        print(f"✓ Done target '{target}'. Results saved under: {target_dir}")

    print("All requested targets processed.")


# Run the scan only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


  df = pd.read_csv(IN_CSV)


✓ Done target 'nee'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan/nee
✓ Done target 'gpp'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan/gpp
✓ Done target 'reco'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan/reco
✓ Done target 'ch4_flux_total'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan/ch4_flux_total
All requested targets processed.


In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Lagged-predictor multiple regression scan (by calendar month), per target variable.
Targets: 'nee', 'gpp', 'reco', 'ch4_flux_total'
Lags: 0..12 months (0 = same-month)

For each TARGET:
  OUT_DIR/<target>/
    - coef_matrix_<predictor>.csv                 (target_month x lag_months [0..12])
    - pval_matrix_<predictor>.csv                 (target_month x lag_months [0..12])
    - best_lag_per_month_all_variables.csv
    - overall_best_lag_summary.csv
    - coef_heatmap_<predictor>_annotated.png

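Notes:
- Within each (target_month m, lag L) slice, we regress z(target) on z(predictor_lagL) with
  categorical controls for the predictor's month-of-year (pred_month), i.e., C(pred_month).
- z(·) denotes standardization within the slice to make coefficients comparable across predictors.
- Caveat: every row in a (m, L) slice has the same target month and therefore the same
  pred_month, so C(pred_month) adds no columns to the design matrix. As implemented, the
  reported coefficient is the simple standardized slope (equal to Pearson r), not a partial effect.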
"""

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# statsmodels for OLS with categorical controls
import statsmodels.api as sm

# NOTE(review): this silences every warning for the rest of the session,
# including pandas dtype warnings and statsmodels fit warnings — consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

# ----------------- Config -----------------
# Input table; main() lower-cases its column names before use.
IN_CSV   = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv"
# Base output directory; each target gets its own OUT_DIR/<target>/ subfolder.
OUT_DIR  = "/explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan_multiple"

# predictors must exist in the data (case-insensitive); missing ones are skipped gracefully
PREDICTORS = ["tmean_C", "pr", "NDVI", "snow_cover", "snow_depth"]
# Target columns to scan one by one; targets absent from the data are skipped with a warning.
TARGETS    = ["nee", "gpp", "reco", "ch4_flux_total"]

# Largest lag (in months) to test; lag 0 (same month) is always included.
MAX_LAG    = 12
MIN_PAIRS  = 24   # min rows required for a regression fit

# ----------------- Helper functions -----------------
def add_lags_for_site(g: pd.DataFrame, cols, max_lag=12) -> pd.DataFrame:
    """
    Add calendar-month lags (1..max_lag) of the selected columns for one site.

    The lagged value for a row is the value observed exactly ``lag`` calendar
    months earlier. When that month is absent from the site's record the lag
    is NaN, so gaps in the time series never cause a value from a different
    month to be used (a plain row-wise ``shift`` would do that).

    Args:
        g: rows of a single site with integer-like ``year`` and ``month`` columns.
        cols: column names to lag; names not present in ``g`` are ignored.
        max_lag: largest lag, in months, to create.

    Returns:
        A copy of ``g`` sorted by date with a fresh 0..n-1 index, a ``date``
        column (first day of each month) and one ``<col>_lag<k>`` column per
        requested column and lag.
    """
    g = g.sort_values(["year", "month"]).copy()
    g["date"] = pd.to_datetime(dict(year=g["year"], month=g["month"], day=1))
    g = g.sort_values("date").reset_index(drop=True)

    # Month ordinal: consecutive calendar months differ by exactly 1, also across years.
    month_ord = g["date"].dt.year * 12 + g["date"].dt.month
    # The lookup below needs unique keys; keep the first row if a month is repeated.
    first_per_month = ~month_ord.duplicated()

    for col in cols:
        if col not in g.columns:
            continue
        lookup = pd.Series(
            g.loc[first_per_month, col].to_numpy(),
            index=month_ord[first_per_month].to_numpy(),
        )
        for lag in range(1, max_lag + 1):
            # Months missing from the record are absent from `lookup` and map to NaN.
            g[f"{col}_lag{lag}"] = (month_ord - lag).map(lookup)
    return g

def month_minus_lag(month_series: pd.Series, L: int) -> pd.Series:
    """
    Month-of-year (1..12) that lies L months before each month in `month_series`.

    Works on a zero-based month (0..11) so the modulo wraps December/January
    correctly, then converts back to 1..12.
    """
    zero_based = month_series.astype(int) - 1
    wrapped = (zero_based - L) % 12
    return wrapped + 1

def standardize(s: pd.Series) -> pd.Series:
    """
    Z-score `s` using its mean and population standard deviation (ddof=0).

    Returns an all-NaN series on the same index when the standard deviation
    is zero or undefined.
    """
    center, scale = s.mean(), s.std(ddof=0)
    degenerate = scale == 0 or np.isnan(scale)
    if degenerate:
        return pd.Series(np.nan, index=s.index)
    return (s - center) / scale

def run_partial_regression(y: pd.Series, x: pd.Series, pred_month: pd.Series, min_rows: int = MIN_PAIRS):
    """
    Fit OLS: z(y) ~ z(x) + C(pred_month) and return (coef, pval) for z(x).

    Args:
        y: target values.
        x: predictor values (already lagged by the caller).
        pred_month: month-of-year of each predictor value, used as a categorical control.
        min_rows: minimum number of complete rows required to attempt a fit.

    Returns:
        (coef, pval) as floats, or (np.nan, np.nan) when there are too few
        complete rows, y or x has no variance, or the fit fails.
    """
    data = pd.DataFrame({"y": y, "x": x, "pred_month": pred_month}).dropna()
    if len(data) < min_rows:
        return np.nan, np.nan

    # Standardize within slice so the coefficient is on a comparable scale
    data["y_z"] = standardize(data["y"])
    data["x_z"] = standardize(data["x"])

    # standardize() yields all-NaN for a constant series, which empties the frame here.
    data = data.dropna(subset=["y_z", "x_z"])
    if len(data) < min_rows:
        return np.nan, np.nan

    # Categorical controls for predictor's month-of-year. dtype=float keeps the
    # design matrix numeric: get_dummies returns bool columns on pandas >= 2,
    # which would make the combined matrix object-typed and be rejected by OLS.
    # With a single pred_month level, drop_first leaves no dummy columns.
    dummies = pd.get_dummies(data["pred_month"].astype(int), prefix="pm",
                             drop_first=True, dtype=float)
    X = pd.concat([data["x_z"], dummies], axis=1)
    X = sm.add_constant(X, has_constant="add")

    # Model construction validates the inputs and can raise too, so it sits
    # inside the guarded section together with the fit.
    try:
        res = sm.OLS(data["y_z"], X, missing="drop").fit()
    except (ValueError, np.linalg.LinAlgError):
        return np.nan, np.nan

    if "x_z" not in res.params:
        return np.nan, np.nan

    return float(res.params["x_z"]), float(res.pvalues.get("x_z", np.nan))

def plot_coef_heatmap_with_values(df_coef: pd.DataFrame, target_name: str, var_name: str, out_file: Path):
    """
    Save an annotated heatmap of standardized regression coefficients.

    Rows of `df_coef` are target months and columns are lags; every non-missing
    cell is labelled with its coefficient (two decimals). The figure is written
    to `out_file` and closed.
    """
    grid = df_coef.values
    n_rows, n_cols = df_coef.shape

    fig, ax = plt.subplots(figsize=(12, 8))
    image = ax.imshow(grid, cmap="bwr", vmin=-1, vmax=1,
                      origin="upper", aspect="auto")

    colorbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    colorbar.set_label(f"Std. coef: {var_name} (lag) → {target_name}", fontsize=11)

    # One tick per cell, labelled with the lag / month values of the frame.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(df_coef.columns, fontsize=12)
    ax.set_yticklabels(df_coef.index, fontsize=12)
    ax.set_xlabel("Lag (months, 0 = same month)", fontsize=12)
    ax.set_ylabel("Target month", fontsize=12)
    ax.set_title(f"{target_name}: partial effect of lagged {var_name}\n"
                 f"(controls: predictor month-of-year)", fontsize=12, pad=16)

    # Write the coefficient into each cell; cells without a fit stay blank.
    for (row_pos, col_pos), coef_value in np.ndenumerate(grid):
        if pd.isna(coef_value):
            continue
        ax.text(col_pos, row_pos, f"{coef_value:.2f}", ha="center", va="center",
                color="black", fontsize=9)

    fig.tight_layout()
    fig.savefig(out_file, dpi=200)
    plt.close(fig)

def normalize_columns_case_insensitive(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` whose column names are all lower-case; the input is left untouched."""
    lowered_names = [name.lower() for name in df.columns]
    result = df.copy()
    result.columns = lowered_names
    return result

def ensure_tmean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add 'tmean_c' to `df` (in place) as the row-wise mean of 'tmmn' and 'tmmx'
    when it is missing and both source columns exist; return the same frame.
    """
    already_there = "tmean_c" in df.columns
    can_derive = {"tmmn", "tmmx"} <= set(df.columns)
    if can_derive and not already_there:
        df["tmean_c"] = df.loc[:, ["tmmn", "tmmx"]].mean(axis=1)
    return df

# ----------------- Main pipeline -----------------
def main():
    """
    Run the lagged regression scan for every target in TARGETS.

    The input is loaded once. For each target present in the data the rows are
    averaged to one per site-year-month, lags 1..MAX_LAG of the available
    predictors are added within each site, and run_partial_regression() is
    fitted for each predictor and (calendar month, lag) pair. Coefficient and
    p-value matrices, best-lag summaries and annotated heatmaps are written to
    OUT_DIR/<target>/.

    Raises:
        ValueError: if site_reference, year or month is missing from the input.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # ----- Load & prepare (once) -----
    # low_memory=False reads each column in a single pass so mixed-type columns
    # get one consistent dtype.
    df = pd.read_csv(IN_CSV, low_memory=False)
    df = normalize_columns_case_insensitive(df)
    df = ensure_tmean(df)

    # Required indexing columns
    required_idx = {"site_reference", "year", "month"}
    missing = sorted(required_idx - set(df.columns))
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Filter and standardize types. Coerce first so unparsable year/month
    # values become NaN and are dropped together with the other missing keys.
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df["month"] = pd.to_numeric(df["month"], errors="coerce")
    df = df.dropna(subset=["site_reference", "year", "month"]).copy()
    df["year"] = df["year"].astype("Int64")
    df["month"] = df["month"].astype("Int64")
    df = df[df["year"] >= 2001].copy()

    available_cols = set(df.columns)
    base_group_keys = ["site_reference", "year", "month"]

    for target in TARGETS:
        target_lc = target.lower()
        if target_lc not in available_cols:
            print(f"[WARN] Target '{target}' not found in data. Skipping.")
            continue

        target_dir = Path(OUT_DIR) / target_lc
        target_dir.mkdir(parents=True, exist_ok=True)

        preds_present = []
        for p in PREDICTORS:
            p_lc = p.lower()
            if p_lc in available_cols:
                preds_present.append(p_lc)
            else:
                print(f"[WARN] Predictor '{p}' not found. Skipping for target '{target}'.")
        if not preds_present:
            print(f"[WARN] No predictors available for target '{target}'. Skipping.")
            continue

        # Aggregate to one row per site-year-month
        agg_map = {c: "mean" for c in [target_lc] + preds_present}
        dfm = (
            df.groupby(base_group_keys, as_index=False)
              .agg(agg_map)
              .sort_values(base_group_keys)
              .reset_index(drop=True)
        )
        dfm["target_month"] = dfm["month"]

        # Add lags within each site for the predictors only. An explicit loop
        # gives the same result as groupby.apply without depending on how
        # apply treats the grouping column.
        site_frames = [
            add_lags_for_site(site_df, preds_present, MAX_LAG)
            for _, site_df in dfm.groupby("site_reference")
        ]
        if not site_frames:
            print(f"[WARN] No rows available for target '{target}'. Skipping.")
            continue
        df_lagged = pd.concat(site_frames, ignore_index=True)

        # Pre-allocate matrices
        coef_mats = {var: pd.DataFrame(index=range(1, 13), columns=range(0, MAX_LAG + 1), dtype=float)
                     for var in preds_present}
        pval_mats = {var: pd.DataFrame(index=range(1, 13), columns=range(0, MAX_LAG + 1), dtype=float)
                     for var in preds_present}

        # Compute regression coef per target month x lag.
        # NOTE(review): every row of `sub` shares the same target_month, so
        # pred_mo is constant within a slice and the month-of-year control in
        # run_partial_regression() contributes no dummy columns; the coefficient
        # is then the simple standardized slope (equal to Pearson r).
        for var in preds_present:
            for m in range(1, 13):  # target month
                sub = df_lagged[df_lagged["target_month"] == m]

                for lag in range(0, MAX_LAG + 1):
                    # lag 0 is the same-month predictor column itself
                    x_series = sub.get(var) if lag == 0 else sub.get(f"{var}_lag{lag}")
                    # For lag 0 this returns the target month unchanged.
                    pred_mo = month_minus_lag(sub["target_month"], lag)

                    coef, pval = run_partial_regression(
                        y=sub[target_lc],
                        x=x_series,
                        pred_month=pred_mo,
                        min_rows=MIN_PAIRS
                    )
                    coef_mats[var].loc[m, lag] = coef
                    pval_mats[var].loc[m, lag] = pval

        # ----- Save matrices & summaries -----
        best_rows = []
        for var in preds_present:
            cm = coef_mats[var]
            pm = pval_mats[var]

            cm.index.name = "target_month"
            cm.columns.name = "lag_months"
            pm.index.name = "target_month"
            pm.columns.name = "lag_months"

            cm.to_csv(target_dir / f"coef_matrix_{var}.csv", float_format="%.4f")
            pm.to_csv(target_dir / f"pval_matrix_{var}.csv", float_format="%.4g")

            # Best (by |coef|) per target month
            for m in cm.index:
                row = cm.loc[m].dropna()
                if row.empty:
                    best_rows.append({"variable": var, "target_month": m,
                                      "best_lag": np.nan, "std_coef": np.nan, "pval": np.nan})
                else:
                    k = row.abs().idxmax()
                    best_rows.append({"variable": var, "target_month": m,
                                      "best_lag": int(k),
                                      "std_coef": float(row[k]),
                                      "pval": float(pm.loc[m, k])})

        best_df = pd.DataFrame(best_rows)
        # Nullable integer keeps lags as "3" (not "3.0000") even when some months have no result.
        best_df["best_lag"] = best_df["best_lag"].astype("Int64")
        # "%.4f" would print small p-values as 0.0000; pre-format them with the
        # same "%.4g" used for the p-value matrices (missing values stay empty).
        best_df["pval"] = best_df["pval"].map(lambda p: "" if pd.isna(p) else "%.4g" % p)
        best_df.to_csv(target_dir / "best_lag_per_month_all_variables.csv",
                       index=False, float_format="%.4f")

        # Overall summary: which lag maximizes mean |coef| across months
        summary = []
        for var in preds_present:
            # Lags with no valid month are dropped so idxmax never runs on an
            # all-NaN series.
            mean_abs = coef_mats[var].abs().mean(axis=0).dropna()
            if mean_abs.empty:
                summary.append({"variable": var,
                                "overall_best_lag": np.nan,
                                "overall_mean_abs_std_coef": np.nan})
                continue
            k = mean_abs.idxmax()
            summary.append({
                "variable": var,
                "overall_best_lag": int(k),
                "overall_mean_abs_std_coef": float(mean_abs[k])
            })
        summary_df = pd.DataFrame(summary)
        summary_df["overall_best_lag"] = summary_df["overall_best_lag"].astype("Int64")
        summary_df.to_csv(
            target_dir / "overall_best_lag_summary.csv",
            index=False, float_format="%.4f"
        )

        # ----- Plot annotated heatmaps of standardized coefficients -----
        for var in preds_present:
            plot_coef_heatmap_with_values(
                coef_mats[var],
                target_name=target_lc,
                var_name=var,
                out_file=target_dir / f"coef_heatmap_{var}_annotated.png"
            )

        print(f"✓ Done target '{target}'. Results saved under: {target_dir}")

    print("All requested targets processed.")

# Run the scan only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


✓ Done target 'nee'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan_multiple/nee
✓ Done target 'gpp'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan_multiple/gpp
✓ Done target 'reco'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan_multiple/reco
✓ Done target 'ch4_flux_total'. Results saved under: /explore/nobackup/people/spotter5/anna_v/v2/lag_correlation_scan_multiple/ch4_flux_total
All requested targets processed.
