In [3]:
import pandas as pd

# Read Table_Hrly_ACRE_clean.csv (path is relative to the working directory)
# into the notebook-wide frame `df`; no index column or date parsing is requested.
df = pd.read_csv(filepath_or_buffer="Table_Hrly_ACRE_clean.csv")


In [10]:
from lingam import VARLiNGAM

# Choose variables: target + a few candidate drivers (non-lagged originals).
# VARLiNGAM is given the ORIGINAL variables (not lag-expanded columns); the lag
# order is passed to the model via `lags`.

# Define the target in this cell so it runs on a fresh kernel (Restart & Run All).
# Previously `target` was only assigned in the later forecasting cell, so this
# cell raised NameError unless that cell had been executed first.
# Keep this value in sync with the forecasting cell's `target`.
target = '20" Soil Temp (°F)'

# Predictor originals that seem relevant in a weather/soil context, if present.
likely = [
    "Precipitation (in)",
    "Solar Radiation (W / m²)",
    "Air Temp (°F)",
    "Relative Humidity (%)",
    "Wind Speed (mph)",
    '4" Soil Temp (°F)',
    '8" Soil Temp (°F)',
    '20" Soil Temp (°F)',
]

# Target first, then every candidate that exists in df (skipping the target itself
# so it is not listed twice).
core = [target] + [c for c in likely if c in df.columns and c != target]

# Keep only numeric + drop missing: coerce non-numeric cells to NaN so that
# dropna() removes them and `ts.values` is a purely numeric array.
# NOTE(review): dropping rows can leave gaps in the hourly sequence, which changes
# what "lag k" means across a gap — confirm how many rows are dropped.
ts = (
    df[core]
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
    .reset_index(drop=True)
)

# Fit VARLiNGAM (choose lag order; try 1..24 and compare later)
p = 6  # start with 6 hours; try 12/24 too
model = VARLiNGAM(lags=p, criterion=None)  # criterion None = use fixed p
model.fit(ts.values)

# Adjacency matrices:
# model.adjacency_matrices_[0] is instantaneous effects
# model.adjacency_matrices_[k] for lag k effects (k=1..p)
A0 = model.adjacency_matrices_[0]
print("Instantaneous adjacency A0 shape:", A0.shape)

# Map back to variable names (same order as the columns handed to fit()).
var_names = ts.columns.tolist()

Instantaneous adjacency A0 shape: (8, 8)


In [9]:
# Position of the target among the fitted variables; its ROW in each adjacency
# matrix is scanned for entries whose magnitude exceeds 1e-6.
target_idx = var_names.index(target)

# Instantaneous causes of target: non-negligible entries of the target row of A0.
inst_parents = []
for col_idx, col_name in enumerate(var_names):
    weight = A0[target_idx, col_idx]
    if abs(weight) > 1e-6:
        inst_parents.append((col_name, weight))
inst_parents.sort(key=lambda pair: abs(pair[1]), reverse=True)

print("Instantaneous parents of target (name, weight):")
print(inst_parents)

# Lagged causes: same scan over the lag matrices A_k for k=1..p, collected as
# (lag, name, weight) and ranked by absolute weight (largest first).
lagged = [
    (k, col_name, model.adjacency_matrices_[k][target_idx, col_idx])
    for k in range(1, p + 1)
    for col_idx, col_name in enumerate(var_names)
    if abs(model.adjacency_matrices_[k][target_idx, col_idx]) > 1e-6
]
lagged.sort(key=lambda item: abs(item[2]), reverse=True)

print("\nTop lagged causes into target (lag, name, weight):")
print(lagged[:30])

Instantaneous parents of target (name, weight):
[('Precipitation (in)', -0.1290921959845814)]

Top lagged causes into target (lag, name, weight):
[(1, '20" Soil Temp (°F)', 0.8251207850705259), (4, '20" Soil Temp (°F)', 0.1075203091701007), (5, '8" Soil Temp (°F)', 0.0805514570522858), (3, '20" Soil Temp (°F)', 0.07825303468746679), (1, '8" Soil Temp (°F)', -0.056505633625590405), (1, '4" Soil Temp (°F)', -0.025152061370113905), (3, 'Air Temp (°F)', -0.0013661129192195298), (5, 'Air Temp (°F)', -0.0012285947218860984), (5, 'Relative Humidity (%)', -0.0006684934705670051), (4, 'Relative Humidity (%)', -0.00016138351664550323)]


In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

def mape(y_true, y_pred, eps=1e-8):
    """Return the mean absolute percentage error, in percent.

    Each error is divided by ``max(|y_true|, eps)``, so a true value of zero
    (or one smaller than ``eps`` in magnitude) is scaled by ``eps`` instead of
    causing a division by zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; both are converted with ``np.asarray``.
    eps : float, default 1e-8
        Lower bound applied to the denominator.
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)
    scale = np.maximum(np.abs(actual), eps)
    ratio = (actual - forecast) / scale
    return np.mean(np.abs(ratio)) * 100

def make_lagged_selected(df, cols_needed, max_lag):
    """Build lagged copies of selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; rows are shifted positionally, so they must already be
        in the intended order.
    cols_needed : list of str
        Columns to keep. Values are coerced with ``pd.to_numeric``; anything
        that cannot be parsed becomes NaN.
    max_lag : int
        Largest lag to generate; lags ``1..max_lag`` are produced.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` holds one ``<column>_lag<k>`` column per selected column and lag,
        ordered lag by lag. ``y`` is the coerced, unshifted selection.
        Both keep every input row: NO rows are dropped here, so the first
        ``max_lag`` rows of ``X`` contain NaN and the caller must handle them.
    """
    # Restrict to the requested columns and force them numeric in one pass.
    numeric = df[cols_needed].apply(pd.to_numeric, errors="coerce")

    # One shifted, suffixed frame per lag, glued side by side.
    lag_frames = []
    for lag in range(1, max_lag + 1):
        lag_frames.append(numeric.shift(lag).add_suffix(f"_lag{lag}"))
    X = pd.concat(lag_frames, axis=1)

    y = numeric.copy()
    return X, y

# -----------------------------
# YOUR SETTINGS
# -----------------------------
target = '20" Soil Temp (°F)'

# Hand-specified (lag, column) pairs used as lagged predictors of the target.
# NOTE(review): this list is hard-coded and is not derived from the VARLiNGAM
# adjacency matrices — confirm it reflects the causal parents you intend to use.
causal_lagged = [
    (1, 'Precipitation (in)'),
    (2, 'Precipitation (in)'),
    (5, 'Precipitation (in)'),
    (1, '4" Soil Water Content (%)'),
    (5, '4" Soil Water Content (%)'),
    (4, '4" Soil Temp (°F)'),
    (6, '20" Soil Temp (°F)'),
    (1, 'Air Temp (°F)'),
]

# columns we need in df BEFORE lagging
cols_needed = sorted({name for _, name in causal_lagged} | {target})

# Fail early, naming exactly which columns are absent.
missing_cols = [c for c in cols_needed if c not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from df: {missing_cols}")

# IMPORTANT: ensure time order
# NOTE(review): sort_index() orders by the frame's index. If df still carries the
# default integer index from read_csv, this only preserves file order — sort by
# the timestamp column instead if the file is not already chronological.
df = df.sort_index()

print("Rows in raw df:", len(df))
print("Rows after keeping only needed cols:", len(df[cols_needed]))

# Auto choose max_lag safely
requested_max_lag = 6
max_lag = min(requested_max_lag, max(1, len(df) - 5))  # keep some room
print("Using max_lag =", max_lag)

X_all, Y_all = make_lagged_selected(df, cols_needed, max_lag)

# Build predictor column list: the requested causal lags first, then the
# target's own lags 1..max_lag. dict.fromkeys() removes repeats while keeping
# first-seen order, so no feature column can be selected twice.
candidate_cols = [f"{name}_lag{lag}" for lag, name in causal_lagged if lag <= max_lag]
candidate_cols += [f"{target}_lag{lag}" for lag in range(1, max_lag + 1)]

# Keep only predictors that exist
predictor_cols = [c for c in dict.fromkeys(candidate_cols) if c in X_all.columns]

# Now drop NA only on needed things
needed_for_model = predictor_cols + [target]
tmp = pd.concat([X_all[predictor_cols], Y_all[[target]]], axis=1)

print("Rows before dropna (lagged table):", len(tmp))
tmp = tmp.dropna(subset=needed_for_model)
print("Rows after dropna:", len(tmp))

if len(tmp) == 0:
    print("\n❌ Still 0 rows after dropna.")
    print("Most likely: your target/predictor columns are non-numeric or mostly missing.")
    print("Try: print(df[cols_needed].head(20)) and df[cols_needed].isna().mean()")
    # A regular exception stops the cell with a readable traceback; SystemExit
    # is meant for terminating an interpreter, not for aborting a notebook cell.
    raise ValueError("No complete rows left after dropna; cannot fit the forecast model.")

X_all = tmp[predictor_cols]
y_all = tmp[target]

# Train/test split: chronological 80/20, no shuffling, so the test set is the
# most recent part of the series.
split = int(len(X_all) * 0.8)
X_train, X_test = X_all.iloc[:split], X_all.iloc[split:]
y_train, y_test = y_all.iloc[:split], y_all.iloc[split:]

print("Train rows:", len(X_train), "Test rows:", len(X_test))

if split == 0:
    raise ValueError("Too few complete rows to form a training set.")

# NOTE(review): `model` is also the name of the fitted VARLiNGAM object in the
# causal-discovery cell; after this line it refers to the Ridge regressor, so
# re-running the adjacency cells afterwards requires re-fitting VARLiNGAM first.
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape_val = mape(y_test, y_pred)

print("\n✅ Forecast metrics (1-step ahead):")
print(f"MAE : {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape_val:.2f}%")

Rows in raw df: 1363
Rows after keeping only needed cols: 1363
Using max_lag = 6
Rows before dropna (lagged table): 1363
Rows after dropna: 1357
Train rows: 1085 Test rows: 272

✅ Forecast metrics (1-step ahead):
MAE : 0.0348
RMSE: 0.0565
MAPE: 0.05%
