# XG Boost Model

# (1) Load + merge ALL training datasets (Water Quality + TerraClimate + Landsat)

- This cell:
- loads the 3 training CSVs
- creates stable merge keys (rounding lat/lon + normalized date)
- merges features onto the water quality targets

In [5]:
import os
import pandas as pd
import numpy as np

BASE_DIR = "./" 

WQ_TRAIN_PATH = os.path.join(BASE_DIR, "water_quality_training_dataset.csv")
TC_TRAIN_PATH = os.path.join(BASE_DIR, "terraclimate_features_training.csv")
LS_TRAIN_PATH = os.path.join(BASE_DIR, "landsat_features_training.csv")

def _prep_keys(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "Sample Date" in df.columns:
        dt = pd.to_datetime(df["Sample Date"], errors="coerce")
    elif "Sample_Date" in df.columns:
        dt = pd.to_datetime(df["Sample_Date"], errors="coerce")
        df["Sample Date"] = df["Sample_Date"]
    else:
        raise ValueError("Could not find 'Sample Date' column.")
    df["Sample Date"] = dt.dt.strftime("%Y-%m-%d")

    df["Latitude"]  = pd.to_numeric(df["Latitude"], errors="coerce")
    df["Longitude"] = pd.to_numeric(df["Longitude"], errors="coerce")

    df["lat_k"] = df["Latitude"].round(5)
    df["lon_k"] = df["Longitude"].round(5)
    df["date_k"] = df["Sample Date"]
    return df

wq_train = _prep_keys(pd.read_csv(WQ_TRAIN_PATH))
tc_train = _prep_keys(pd.read_csv(TC_TRAIN_PATH))
ls_train = _prep_keys(pd.read_csv(LS_TRAIN_PATH))

merge_keys = ["lat_k", "lon_k", "date_k"]

train_df = wq_train.merge(tc_train.drop(columns=["Latitude","Longitude","Sample Date"], errors="ignore"),
                          on=merge_keys, how="left", suffixes=("", "_tc"))
train_df = train_df.merge(ls_train.drop(columns=["Latitude","Longitude","Sample Date"], errors="ignore"),
                          on=merge_keys, how="left", suffixes=("", "_ls"))

print("train_df shape:", train_df.shape)
print("train_df columns (head):", list(train_df.columns)[:30])
train_df.head()

Training files:
WQ: water_quality_training_dataset.csv
TC: terraclimate_features_training.csv
LS: landsat_features_training.csv
Merged train_df shape: (19278121, 22)


Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus,lat_k,lon_k,date_k,Latitude_tc,...,pet,Latitude_ls,Longitude_ls,Sample Date_ls,nir,green,swir16,swir22,NDMI,MNDWI
0,-28.760833,17.730278,2011-02-01,128.912,555.0,10.0,-28.76083,17.73028,2011-02-01,-28.760833,...,174.2,-28.760833,17.730278,2011-02-01,11190.0,11426.0,7687.5,7645.0,0.185538,0.195595
1,-26.861111,28.884722,2011-03-01,74.72,162.9,163.0,-26.86111,28.88472,2011-03-01,-26.861111,...,124.1,-26.861111,28.884722,2011-03-01,17658.5,9550.0,13746.5,10574.0,0.124566,-0.180134
2,-26.45,28.085833,2011-03-01,89.254,573.0,80.0,-26.45,28.08583,2011-03-01,-26.45,...,127.5,-26.45,28.085833,2011-03-01,15210.0,10720.0,17974.0,14201.0,-0.083293,-0.252805
3,-27.671111,27.236944,2011-03-01,82.0,203.6,101.0,-27.67111,27.23694,2011-03-01,-27.671111,...,129.7,-27.671111,27.236944,2011-03-01,14887.0,10943.0,13522.0,11403.0,0.048048,-0.105416
4,-27.356667,27.286389,2011-03-01,56.1,145.1,151.0,-27.35667,27.28639,2011-03-01,-27.356667,...,129.2,-27.356667,27.286389,2011-03-01,16828.5,9502.5,12665.5,9643.0,0.141147,-0.142683


In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

BASE_DIR = Path(".")  # change if needed, e.g. Path("/path/to/Models/BaseData")

# ---- auto-find files (so you don't get FileNotFoundError) ----
def find_one(patterns):
    for pat in patterns:
        hits = list(BASE_DIR.glob(pat))
        if hits:
            return hits[0]
    raise FileNotFoundError(f"Could not find file for patterns: {patterns} in {BASE_DIR.resolve()}")

WQ_TRAIN_PATH = find_one(["*water_quality*training*.csv", "*water_quality_training_dataset*.csv"])
TC_TRAIN_PATH = find_one(["*terraclimate*training*.csv", "*terraclimate_features_training*.csv"])
LS_TRAIN_PATH = find_one(["*landsat*training*.csv", "*landsat_features_training*.csv"])

print("Training files:")
print("WQ:", WQ_TRAIN_PATH.name)
print("TC:", TC_TRAIN_PATH.name)
print("LS:", LS_TRAIN_PATH.name)

# ---- key prep (robust merge keys) ----
def _clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

def _prepare_keys(df: pd.DataFrame) -> pd.DataFrame:
    df = _clean_cols(df)
    # standard expected columns
    if "Sample Date" in df.columns:
        df["Sample Date"] = pd.to_datetime(df["Sample Date"], errors="coerce")
    # numeric safety
    if "Latitude" in df.columns:
        df["Latitude"] = pd.to_numeric(df["Latitude"], errors="coerce")
    if "Longitude" in df.columns:
        df["Longitude"] = pd.to_numeric(df["Longitude"], errors="coerce")

    # stable merge keys (rounding helps when joins fail due to float precision)
    df["lat_k"] = df["Latitude"].round(5)
    df["lon_k"] = df["Longitude"].round(5)
    df["date_k"] = df["Sample Date"].dt.strftime("%Y-%m-%d")
    return df

merge_keys = ["lat_k", "lon_k", "date_k"]

# ---- load ----
wq_train = _prepare_keys(pd.read_csv(WQ_TRAIN_PATH))
tc_train = _prepare_keys(pd.read_csv(TC_TRAIN_PATH))
ls_train = _prepare_keys(pd.read_csv(LS_TRAIN_PATH))

# ---- merge ----
train_df = wq_train.merge(tc_train, on=merge_keys, how="left", suffixes=("", "_tc"))
train_df = train_df.merge(ls_train, on=merge_keys, how="left", suffixes=("", "_ls"))

print("Merged train_df shape:", train_df.shape)
train_df.head()

Training files:
WQ: water_quality_training_dataset.csv
TC: terraclimate_features_training.csv
LS: landsat_features_training.csv
Merged train_df shape: (19278121, 22)


Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus,lat_k,lon_k,date_k,Latitude_tc,...,pet,Latitude_ls,Longitude_ls,Sample Date_ls,nir,green,swir16,swir22,NDMI,MNDWI
0,-28.760833,17.730278,2011-02-01,128.912,555.0,10.0,-28.76083,17.73028,2011-02-01,-28.760833,...,174.2,-28.760833,17.730278,2011-02-01,11190.0,11426.0,7687.5,7645.0,0.185538,0.195595
1,-26.861111,28.884722,2011-03-01,74.72,162.9,163.0,-26.86111,28.88472,2011-03-01,-26.861111,...,124.1,-26.861111,28.884722,2011-03-01,17658.5,9550.0,13746.5,10574.0,0.124566,-0.180134
2,-26.45,28.085833,2011-03-01,89.254,573.0,80.0,-26.45,28.08583,2011-03-01,-26.45,...,127.5,-26.45,28.085833,2011-03-01,15210.0,10720.0,17974.0,14201.0,-0.083293,-0.252805
3,-27.671111,27.236944,2011-03-01,82.0,203.6,101.0,-27.67111,27.23694,2011-03-01,-27.671111,...,129.7,-27.671111,27.236944,2011-03-01,14887.0,10943.0,13522.0,11403.0,0.048048,-0.105416
4,-27.356667,27.286389,2011-03-01,56.1,145.1,151.0,-27.35667,27.28639,2011-03-01,-27.356667,...,129.2,-27.356667,27.286389,2011-03-01,16828.5,9502.5,12665.5,9643.0,0.141147,-0.142683


# (2) Train XGBoost “epoch-like” (boosting rounds), multi-run, tunable, with early stopping

- This cell:
- adds safe engineered features (time + coarse spatial bins)
- defines XGBoost params you can tune
- trains one model per target
- repeats training for multiple seeds (runs) and stores learning curves

In [3]:
import xgboost as xgb
print("xgboost version:", xgb.__version__)

xgboost version: 3.1.3


In [7]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

TARGETS = ["Total Alkalinity", "Electrical Conductance", "Dissolved Reactive Phosphorus"]

def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    dt = pd.to_datetime(df["Sample Date"], errors="coerce")
    df["year"] = dt.dt.year
    df["month"] = dt.dt.month
    df["dayofyear"] = dt.dt.dayofyear
    df["weekofyear"] = dt.dt.isocalendar().week.astype("int64")
    # cyclic month
    df["month_sin"] = np.sin(2*np.pi*df["month"]/12.0)
    df["month_cos"] = np.cos(2*np.pi*df["month"]/12.0)
    return df

train_df = add_time_features(train_df)

drop_cols = set(TARGETS + ["Sample Date", "Latitude", "Longitude", "lat_k", "lon_k", "date_k"])
numeric_features = []
for c in train_df.columns:
    if c in drop_cols:
        continue
    if pd.api.types.is_numeric_dtype(train_df[c]):
        numeric_features.append(c)

FEATURES = numeric_features
print("Num FEATURES:", len(FEATURES))

df_non_null = train_df.dropna(subset=TARGETS).copy()

train_idx, valid_idx = train_test_split(
    df_non_null.index,
    test_size=0.2,
    random_state=42
)

train_part = df_non_null.loc[train_idx].copy()
valid_part = df_non_null.loc[valid_idx].copy()

xgb_params = dict(
    n_estimators=8000,          
    learning_rate=0.02,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.0,
    reg_lambda=6.0,
    min_child_weight=3,
    objective="reg:squarederror",
    tree_method="hist",        
    eval_metric="rmse",
    early_stopping_rounds=250,
)

SEEDS = [42, 99, 202, 777]     

models_by_target = {t: [] for t in TARGETS}
hist_by_target   = {t: [] for t in TARGETS}
scores_by_target = {t: [] for t in TARGETS}

for seed in SEEDS:
    for tgt in TARGETS:
        X_tr = train_part[FEATURES]
        y_tr = train_part[tgt].values
        X_va = valid_part[FEATURES]
        y_va = valid_part[tgt].values

        model = XGBRegressor(**xgb_params, random_state=seed)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_tr, y_tr), (X_va, y_va)],
            verbose=False
        )

        pred_tr = model.predict(X_tr)
        pred_va = model.predict(X_va)

        models_by_target[tgt].append(model)
        hist_by_target[tgt].append(model.evals_result())
        scores_by_target[tgt].append({
            "seed": seed,
            "best_iteration": int(model.best_iteration) if model.best_iteration is not None else None,
            "train_r2": r2_score(y_tr, pred_tr),
            "train_rmse": rmse(y_tr, pred_tr),
            "valid_r2": r2_score(y_va, pred_va),
            "valid_rmse": rmse(y_va, pred_va),
        })

print("Done training multiple runs.")

ValueError: cannot convert NA to integer