# In [None]:
# ==============================================================
# CSIRO Biomass – Fine-Tuned ViT + Metadata Hybrid Ensemble (Final)
# Jacob M. Ramey
# ==============================================================

import os, warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import lightgbm as lgb
import torch, torch.nn as nn
from PIL import Image
import torchvision.models as models
import torchvision.transforms as T

# Suppress the "X does not have valid feature names" warning and any
# UserWarning raised from the lightgbm module; other warnings stay visible.
warnings.filterwarnings("ignore", message="X does not have valid feature names")
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

# --------------------------------------------------------------
# Paths / setup
# --------------------------------------------------------------
# The dataset root can be overridden with the CSIRO_BASE_DIR environment
# variable so the script is not tied to a single machine. The original
# hard-coded location is kept as the default, so existing runs are unchanged.
base_dir = os.environ.get("CSIRO_BASE_DIR", "/home/rameyjm7/workspace/datasets/CSIRO")
train_csv = os.path.join(base_dir, "train.csv")  # training table
img_dir   = os.path.join(base_dir, "train")      # directory holding <sample_id>.jpg files
# Prefer the GPU when one is visible to torch, otherwise run on CPU.
device    = "cuda" if torch.cuda.is_available() else "cpu"

# --------------------------------------------------------------
# Load and preprocess
# --------------------------------------------------------------
df = pd.read_csv(train_csv)
targets = ["Dry_Green_g","Dry_Dead_g","Dry_Clover_g","GDM_g","Dry_Total_g"]

# Keep only the part of sample_id before "__" so that all rows belonging to
# the same image share one key (the image file is "<sample_id>.jpg").
df["sample_id"] = df["sample_id"].astype(str).apply(lambda x: x.split("__")[0])

# Drop every row whose image file is not present on disk.
missing = [sid for sid in df["sample_id"].unique() if not os.path.exists(os.path.join(img_dir, f"{sid}.jpg"))]
if missing:
    df = df[~df["sample_id"].isin(missing)]

# Numeric metadata: unparseable or absent values become 0.
df["Pre_GSHH_NDVI"] = pd.to_numeric(df["Pre_GSHH_NDVI"], errors="coerce").fillna(0)
df["Height_Ave_cm"] = pd.to_numeric(df["Height_Ave_cm"], errors="coerce").fillna(0)

# Categorical metadata: fill missing values BEFORE converting to str.
# Converting first turns NaN into the literal string "nan", after which
# fillna() has nothing left to fill and "Unknown" is never used.
df["State"]   = df["State"].fillna("Unknown").astype(str)
df["Species"] = df["Species"].fillna("Unknown").astype(str)

# Integer-encode the categorical columns.
le_state, le_species = LabelEncoder(), LabelEncoder()
df["State_enc"]   = le_state.fit_transform(df["State"])
df["Species_enc"] = le_species.fit_transform(df["Species"])

# Cyclical month encoding; unparseable dates get month 0.
df["Sampling_Date"] = pd.to_datetime(df["Sampling_Date"], errors="coerce")
df["month"] = df["Sampling_Date"].dt.month.fillna(0).astype(int)
df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)

# Derived numeric features (log, square and interaction terms).
df["height_log"] = np.log1p(df["Height_Ave_cm"])
df["ndvi_sq"] = df["Pre_GSHH_NDVI"]**2
df["ndvi_x_height"] = df["Pre_GSHH_NDVI"] * df["Height_Ave_cm"]
df["height_sq"] = df["Height_Ave_cm"]**2
# Clip at a small positive value before log1p so the argument is never <= 0.
df["ndvi_log"] = np.log1p(df["Pre_GSHH_NDVI"].clip(lower=1e-6))

meta_feats = [
    "Pre_GSHH_NDVI","Height_Ave_cm","height_log","ndvi_sq",
    "State_enc","Species_enc","month_sin","month_cos",
    "ndvi_x_height","height_sq","ndvi_log"
]

# Long -> wide: one row per sample_id, one column per target_name.
pivot = df.pivot(index="sample_id", columns="target_name", values="target").reset_index()
# Guarantee every expected target column exists, even if absent from the CSV.
for col in targets:
    if col not in pivot.columns:
        pivot[col] = 0.0
# Attach the per-sample metadata features; remaining gaps are filled with 0.
df_merged = pivot.merge(df[["sample_id"] + meta_feats].drop_duplicates(), on="sample_id", how="left")
df_merged = df_merged.fillna(0.0)
print("Merged dataset:", df_merged.shape)

# --------------------------------------------------------------
# ViT fine-tuning setup
# --------------------------------------------------------------
# Image preprocessing: resize to 224x224, convert to a tensor and normalise
# each channel with the mean/std constants below.
transform = T.Compose([
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

cache_path = os.path.join(base_dir, "img_feats_vit_finetuned.npy")

# Reuse cached embeddings only when they have one row per current sample.
# A cache written for a different set of samples would otherwise either break
# the later np.hstack or be paired with the wrong metadata rows.
# NOTE(review): only the row count is checked; a cache with the same number
# of rows but a different sample order would still be accepted.
vit_feats = None
if os.path.exists(cache_path):
    cached_feats = np.load(cache_path)
    if cached_feats.shape[0] == len(df_merged):
        vit_feats = cached_feats
        print("Loaded cached fine-tuned ViT embeddings:", vit_feats.shape)
    else:
        print(
            f"Ignoring stale embedding cache ({cached_feats.shape[0]} rows, "
            f"expected {len(df_merged)}); recomputing."
        )

if vit_feats is None:
    # ImageNet-pretrained ViT-B/16 with the classification head replaced by an
    # identity, so the forward pass returns the embedding directly.
    vit = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)
    vit.heads = nn.Identity()
    vit.to(device)

    # NOTE: the previous version ran three "fine-tuning" steps with
    # loss = out.mean() * 0. That loss is identically zero, so every gradient
    # is zero and the optimizer steps left the weights unchanged, while the
    # pass pushed the whole dataset through the model in a single batch with
    # autograd enabled. It has been removed; the embeddings below are those of
    # the pretrained backbone. The cache file name and log messages are kept
    # so existing caches stay usable.
    vit.eval()

    sample_ids = df_merged["sample_id"].values
    imgs = []
    for sid in tqdm(sample_ids, desc="Loading images"):
        path = os.path.join(img_dir, f"{sid}.jpg")
        # Context manager closes the underlying file once the image is decoded.
        with Image.open(path) as img:
            imgs.append(transform(img.convert("RGB")))
    # Keep the full image tensor on the CPU; only one mini-batch at a time is
    # moved to the device to bound accelerator memory use.
    imgs = torch.stack(imgs)

    feats = []
    with torch.no_grad():
        for i in tqdm(range(0, len(imgs), 8), desc="Extracting embeddings"):
            out = vit(imgs[i:i+8].to(device))
            feats.append(out.cpu().numpy())
    vit_feats = np.vstack(feats)
    np.save(cache_path, vit_feats)
    print("Saved fine-tuned ViT embeddings:", vit_feats.shape)

# --------------------------------------------------------------
# Build dataset
# --------------------------------------------------------------
# Feature matrix: metadata columns first, image embeddings appended to the right.
meta_block = df_merged[meta_feats].to_numpy()
X_full = np.concatenate([meta_block, vit_feats], axis=1)
# Target matrix: one column per entry of `targets`, in that order.
y_full = df_merged[targets].to_numpy()

# Standardised copy of the features, used as the tensor input built below;
# X_full itself stays unscaled.
scaler = StandardScaler()
scaler.fit(X_full)
X_full_s = scaler.transform(X_full)

# float32 tensors placed on the selected device.
X_full_t = torch.tensor(X_full_s, dtype=torch.float32, device=device)
y_full_t = torch.tensor(y_full, dtype=torch.float32, device=device)

# --------------------------------------------------------------
# Best hyperparams
# --------------------------------------------------------------
# Selected hyperparameters. Keys prefixed "lgb_" configure LightGBM, keys
# prefixed "mlp_" configure the MLP.
best = dict(
    lgb_learning_rate=0.0495190485001054,
    lgb_num_leaves=157,
    lgb_max_depth=12,
    lgb_min_child_samples=27,
    lgb_subsample=0.8828879553409027,
    lgb_colsample_bytree=0.8757493539677738,
    lgb_reg_lambda=0.32522698226290236,
    mlp_layers=3,
    mlp_layer_size=475,
    mlp_dropout=0.35127319316446987,
    mlp_epochs=300,
)

# LightGBM keyword arguments: every "lgb_*" entry of `best` with the prefix
# stripped, framed by the fixed settings.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0, which is not set here — confirm whether row
# subsampling was intended.
_LGB_PREFIX = "lgb_"
best_lgb_params = {
    "objective": "regression",
    **{
        key[len(_LGB_PREFIX):]: value
        for key, value in best.items()
        if key.startswith(_LGB_PREFIX)
    },
    "n_estimators": 1000,
    "random_state": 42,
    "verbosity": -1,
}

# --------------------------------------------------------------
# LGBM + MLP
# --------------------------------------------------------------
# One independent LightGBM regressor per target column.
# NOTE(review): each model predicts the same rows it was fitted on, so
# lgb_preds are in-sample predictions.
lgb_preds = np.empty((X_full.shape[0], len(targets)), dtype=np.float64)
for i, t in enumerate(targets):
    model = lgb.LGBMRegressor(**best_lgb_params).fit(X_full, y_full[:, i])
    lgb_preds[:, i] = model.predict(X_full)

class MLPRegressor(nn.Module):
    """Fully connected regression network.

    Each entry of ``hidden_layers`` adds a ``Linear -> ReLU -> Dropout``
    stage of that width; a final ``Linear`` layer maps to ``out_dim``.

    Args:
        in_dim: Number of input features.
        out_dim: Number of regression outputs.
        hidden_layers: Iterable of hidden-layer widths, in order.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(self, in_dim, out_dim, hidden_layers, dropout):
        super().__init__()
        # widths[k] -> widths[k + 1] describes the k-th hidden Linear layer.
        widths = [in_dim, *hidden_layers]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)))
        # Output projection from the last hidden width (or in_dim if there are none).
        stages.append(nn.Linear(widths[-1], out_dim))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.net(x)

# Network shape from the hyperparameters: `mlp_layers` hidden layers, each
# `mlp_layer_size` units wide.
hidden_layers = [best["mlp_layer_size"] for _ in range(best["mlp_layers"])]
mlp = MLPRegressor(
    in_dim=X_full_t.shape[1],
    out_dim=y_full_t.shape[1],
    hidden_layers=hidden_layers,
    dropout=best["mlp_dropout"],
).to(device)
opt = torch.optim.AdamW(mlp.parameters(), lr=1e-4, weight_decay=1e-4)
loss_fn = nn.MSELoss()

# Full-batch training: every epoch is a single optimisation step over all rows.
mlp.train()
for _epoch in range(best["mlp_epochs"]):
    opt.zero_grad(set_to_none=True)
    loss = loss_fn(mlp(X_full_t), y_full_t)
    loss.backward()
    opt.step()

# Predictions for the same rows, with dropout disabled and no autograd graph.
mlp.eval()
with torch.no_grad():
    mlp_preds = mlp(X_full_t).cpu().numpy()

# --------------------------------------------------------------
# Meta-ensemble per target
# --------------------------------------------------------------
# Settings shared by every per-target meta-learner.
meta_params = dict(
    objective="regression", learning_rate=0.01, num_leaves=16,
    max_depth=5, n_estimators=800, subsample=0.9, colsample_bytree=0.9,
    reg_lambda=0.3, random_state=42,
)

# For each target, a second-stage LightGBM model is fitted on four stacked
# features derived from the two base models: both predictions, their absolute
# disagreement and their mean.
# NOTE(review): the meta-learner is fitted and evaluated on the same rows the
# base models were trained on.
meta_preds = np.zeros_like(y_full)
for i, t in enumerate(targets):
    from_lgb = lgb_preds[:, i]
    from_mlp = mlp_preds[:, i]
    stack_X = np.column_stack([
        from_lgb,
        from_mlp,
        np.abs(from_lgb - from_mlp),
        (from_lgb + from_mlp) / 2,
    ])
    meta = lgb.LGBMRegressor(**meta_params)
    meta.fit(stack_X, y_full[:, i])
    meta_preds[:, i] = meta.predict(stack_X)

# --------------------------------------------------------------
# Evaluation
# --------------------------------------------------------------
# "Overall" scores compare the per-sample mean across the five targets.
true_mean = y_full.mean(axis=1)
pred_mean = meta_preds.mean(axis=1)
overall_R2 = r2_score(true_mean, pred_mean)
overall_MAE = mean_absolute_error(true_mean, pred_mean)
overall_RMSE = np.sqrt(mean_squared_error(true_mean, pred_mean))

print("\n===== Fine-Tuned ViT + Metadata Hybrid Ensemble =====")
print(f"Overall R2   = {overall_R2:.3f}")
print(f"Overall MAE  = {overall_MAE:.3f}")
print(f"Overall RMSE = {overall_RMSE:.3f}\n")

# Per-target metrics: one [name, R2, MAE, RMSE] row per target column.
target_metrics = [
    [
        name,
        r2_score(y_full[:, col], meta_preds[:, col]),
        mean_absolute_error(y_full[:, col], meta_preds[:, col]),
        np.sqrt(mean_squared_error(y_full[:, col], meta_preds[:, col])),
    ]
    for col, name in enumerate(targets)
]

df_metrics = pd.DataFrame(target_metrics, columns=["Target", "R2", "MAE", "RMSE"])
print("Per-Target Performance:")
print(df_metrics.to_string(index=False, float_format="{:.3f}".format))

# Scatter of true vs. predicted per-sample mean.
fig, ax = plt.subplots(figsize=(6,6))
ax.scatter(true_mean, pred_mean, alpha=0.6, edgecolor='white', s=40)
ax.set_xlabel("True Mean Biomass")
ax.set_ylabel("Predicted")
ax.set_title("Fine-Tuned ViT + Metadata Hybrid Ensemble")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# --------------------------------------------------------------
# Save Kaggle Submission (corrected for all 357 samples × 5 targets)
# --------------------------------------------------------------
test_csv = os.path.join(base_dir, "test.csv")
test_df = pd.read_csv(test_csv)

# Wide lookup table: one row per sample in df_merged, one column per target
# holding the meta-ensemble prediction for that sample.
# NOTE(review): the keys are the sample ids of df_merged; test ids that do not
# appear there cannot be matched by the lookup.
lookup_targets = ("Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "Dry_Total_g", "GDM_g")
pred_lookup = pd.DataFrame({
    "sample_id_base": df_merged["sample_id"],
    **{name: meta_preds[:, targets.index(name)] for name in lookup_targets},
})

# Base id of each test row: the part of sample_id before the first "__".
test_df["sample_id_base"] = test_df["sample_id"].apply(lambda sid: sid.split("__", 1)[0])

# Map predicted target based on both base ID and target_name
def map_pred(row):
    """Return the prediction in ``pred_lookup`` for one submission row.

    ``row`` must provide ``"sample_id_base"`` and ``"target_name"``. The value
    comes from the first ``pred_lookup`` row whose ``sample_id_base`` matches,
    in the column named by ``target_name``. ``np.nan`` is returned when no row
    matches or the column does not exist.
    """
    matches = pred_lookup.loc[pred_lookup["sample_id_base"] == row["sample_id_base"]]
    column = row["target_name"]
    # Guard clauses: unknown sample id or unknown target column -> no prediction.
    if matches.empty:
        return np.nan
    if column not in matches.columns:
        return np.nan
    return matches.iloc[0][column]

# Look up one prediction per test row.
test_df["target"] = test_df.apply(map_pred, axis=1)

# Fill any missing values (safety)
if test_df["target"].isnull().any():
    n_missing = test_df["target"].isnull().sum()
    print(f"Warning: {n_missing} missing predictions filled with 0.")
    # Assign the filled column back explicitly. The previous
    # `test_df["target"].fillna(0.0, inplace=True)` is a chained in-place call
    # on a column selection, which pandas deprecates and which does not modify
    # test_df when Copy-on-Write is enabled.
    test_df["target"] = test_df["target"].fillna(0.0)

# Construct final submission DataFrame
submission_df = test_df[["sample_id", "target"]].copy()

# Save file
submission_path = os.path.join(base_dir, "submission_vit_finetuned_final.csv")
submission_df.to_csv(submission_path, index=False)

# Diagnostics
print(f"\nSubmission saved to: {submission_path}")
print("Shape:", submission_df.shape)
print(submission_df.head())

# Final validation checks
# 1785 rows = 357 samples x 5 targets (hard-coded expectation).
assert submission_df.shape == (1785, 2), f"Expected 1785 rows, got {submission_df.shape[0]}"
assert list(submission_df.columns) == ["sample_id", "target"], "Invalid column names"
# Every id must look like "ID<digits>__Dry_<Green|Dead|Clover|Total|GDM>_g".
assert submission_df["sample_id"].str.match(
    r"^ID\d+__Dry_(Green|Dead|Clover|Total|GDM)_g$"
).all(), "Invalid sample_id format detected"

print("\n✅ Final submission verified. Ready for Kaggle upload.")

