In [None]:
import os, glob, re
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
from sklearn.model_selection import KFold, GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from cuml.ensemble import RandomForestRegressor as cuRF
import cudf

# Mount Google Drive (Colab only) so that the /content/drive/MyDrive/... paths
# used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def load_features(features_path):
    """
    Read the feature table from CSV and turn it into a GeoDataFrame.

    The "geometry" column is parsed from WKT and the frame is tagged as
    EPSG:4326. Every other column is classified by name: columns ending in
    `_s01` ... `_s12` are monthly, the rest are static.

    Returns
    -------
    (gdf, static_cols, monthly_cols)
        The GeoDataFrame plus the two lists of column names, each in the
        order the columns appear in the file.
    """
    table = pd.read_csv(features_path)
    table["geometry"] = table["geometry"].apply(wkt.loads)
    gdf = gpd.GeoDataFrame(table, geometry="geometry", crs="EPSG:4326")

    month_suffix = re.compile(r'_s(0[1-9]|1[0-2])$')
    static_cols = []
    monthly_cols = []
    for name in gdf.columns:
        if month_suffix.search(name):
            monthly_cols.append(name)
        elif name != "geometry":
            static_cols.append(name)

    return gdf, static_cols, monthly_cols


def build_joined_features(obs_path, gdf_feats, static_cols, monthly_cols):
    """
    Attach static and month-specific predictor values to each observation.

    Observations are read from `obs_path`, turned into EPSG:4326 points from
    their Longitude/Latitude columns, and spatially joined (point "within"
    polygon, left join) against the feature polygons. For an observation in
    month m, the monthly columns ending in `_s{m:02d}` are used and that
    suffix is stripped, so all months share the same column names.

    Static and monthly columns are joined in a single spatial join per month.
    Joining the static columns separately first and then again together with
    the monthly ones does the spatial work twice and, for a point that lies in
    more than one polygon, produces a cross-product of mismatched
    static/monthly rows.

    Parameters
    ----------
    obs_path : str
        CSV with at least Longitude, Latitude, obs, country_name, site_id,
        pollutant, month and hydbas columns.
    gdf_feats : geopandas.GeoDataFrame
        Feature polygons (must have a CRS).
    static_cols, monthly_cols : list of str
        Column names in `gdf_feats`, as returned by `load_features`.

    Returns
    -------
    geopandas.GeoDataFrame or None
        One row per (observation, matching polygon); observations without a
        matching polygon are kept with NaN features. None if no month 1..12
        has any observation.
    """
    df_obs = pd.read_csv(obs_path)
    gdf_obs = gpd.GeoDataFrame(
        df_obs,
        geometry=gpd.points_from_xy(df_obs.Longitude, df_obs.Latitude),
        crs="EPSG:4326",
    )
    gdf_obs_sel = gdf_obs[["geometry", "obs", "country_name", "site_id", "pollutant", "month", "hydbas"]]

    # Bring the features into the observations' CRS once, and only if needed,
    # so every monthly join below compares geometries in the same CRS.
    if gdf_feats.crs == gdf_obs_sel.crs:
        feats = gdf_feats
    else:
        feats = gdf_feats.to_crs(gdf_obs_sel.crs)

    rows = []
    for m in range(1, 13):
        obs_m = gdf_obs_sel[gdf_obs_sel["month"] == m]
        if obs_m.empty:
            continue
        suffix = f"_s{m:02d}"
        month_cols = [c for c in monthly_cols if c.endswith(suffix)]
        feats_m = feats[static_cols + ["geometry"] + month_cols]
        # Strip the month suffix so the columns line up across months.
        feats_m = feats_m.rename(
            columns={c: c.rsplit("_", 1)[0] for c in feats_m.columns if c.endswith(suffix)}
        )
        join_m = gpd.sjoin(obs_m, feats_m, how="left", predicate="within").drop(columns="index_right")
        rows.append(join_m)

    if not rows:
        return None
    gdf_full = pd.concat(rows, ignore_index=True)

    # If a feature column shares its name with an observation column, sjoin
    # adds _left/_right suffixes; strip them and keep the first (observation)
    # occurrence.
    gdf_full.columns = [re.sub(r'_(?:left|right)$', '', col) for col in gdf_full.columns]
    gdf_full = gdf_full.loc[:, ~gdf_full.columns.duplicated()]

    # For these columns a missing value is replaced by 0 instead of causing
    # the row to be dropped later.
    fill_cols = ['tmp_dc','pre_mm','pet_mm','aet_mm','cmi_ix','snw_pc',
                 'glc_pc','pnv_pc','wet_pc','swc_pc','hft_ix']
    for col in fill_cols:
        if col in gdf_full.columns:
            gdf_full[col] = gdf_full[col].fillna(0)
    return gdf_full


def prepare_data(gdf_full, log_transform=False):
    """
    Turn the joined table into model-ready arrays.

    Every column other than geometry, obs, country_name, site_id, pollutant
    and hydbas is treated as a feature. Rows with a missing feature or a
    missing `obs` are dropped and the index is reset. Features are cast to
    float32 and standardized with a StandardScaler fitted on all remaining
    rows; the target is `obs` as float32, optionally passed through log1p.

    Returns
    -------
    (clean, X_df, y, X_scaled, feature_cols)
        The filtered frame, the unscaled float32 feature frame, the target
        array, the standardized feature array, and the feature column names.
    """
    non_features = {"geometry", "obs", "country_name", "site_id", "pollutant", "hydbas"}
    feature_cols = [name for name in gdf_full.columns if name not in non_features]

    missing_feature = gdf_full[feature_cols].isna().any(axis=1)
    missing_target = gdf_full["obs"].isna()
    keep = ~(missing_feature | missing_target)
    clean = gdf_full[keep].reset_index(drop=True)

    X_df = clean[feature_cols].astype(np.float32)
    target = clean["obs"].values.astype(np.float32)
    y = np.log1p(target) if log_transform else target

    X_scaled = StandardScaler().fit_transform(X_df)
    return clean, X_df, y, X_scaled, feature_cols


def _holdout_group_splits(groups, n_splits, min_obs, seed=42):
    """
    Pick up to `n_splits` groups with at least `min_obs` rows and build one
    leave-that-group-out split per picked group.

    Sampling uses a private RandomState(seed), so the selection is
    reproducible and the global NumPy RNG is left untouched. Returned indices
    are positional, so they are valid for NumPy arrays whatever index
    `groups` carries.
    """
    counts = groups.value_counts()
    eligible = counts[counts >= min_obs].index
    rng = np.random.RandomState(seed)
    chosen = rng.choice(eligible, size=min(n_splits, len(eligible)), replace=False)
    splits = [
        (np.flatnonzero((groups != g).to_numpy()), np.flatnonzero((groups == g).to_numpy()))
        for g in chosen
    ]
    return chosen, splits


def run_cv(X, y, feature_cols, clean, cv_type="random", group_col=None, n_splits=10):
    """
    Cross-validate RF (cuML, GPU), XGBoost and LightGBM regressors.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, row-aligned with `clean`.
    y : numpy.ndarray
        Target vector, row-aligned with `clean`.
    feature_cols : list of str
        Column names for `X` (used for the GPU frame).
    clean : pandas.DataFrame
        Metadata per row; must contain "pollutant" and, depending on
        `cv_type`, "site_id", "hydbas" or the country column.
    cv_type : str
        "Random CV" (shuffled KFold), "Station CV" (GroupKFold on site_id),
        "Country LOOCV" or "Basin LOOCV" (hold out one randomly selected
        group per fold). The default "random" is accepted as an alias for
        "Random CV".
    group_col : str, optional
        Country column for "Country LOOCV"; defaults to "country_name".
    n_splits : int
        Number of folds, or maximum number of held-out groups.

    Returns
    -------
    pandas.DataFrame
        One row per (fold, model) with R2, RMSE and fold metadata. Folds with
        fewer than 10 test rows are skipped. Empty if `clean` has no rows.

    Raises
    ------
    ValueError
        If `cv_type` is not one of the supported strategies.
    """
    min_obs = 10  # minimum rows for an eligible group and for a scored test fold

    # The signature default is "random"; map it onto the canonical label so a
    # call with defaults runs instead of raising.
    if cv_type == "random":
        cv_type = "Random CV"
    if cv_type not in ("Random CV", "Country LOOCV", "Station CV", "Basin LOOCV"):
        raise ValueError("Unknown cv_type")

    results = []
    if len(clean) == 0:
        return pd.DataFrame(results)

    models = {
        "RF":  lambda: cuRF(n_estimators=200, random_state=42, n_streams=1),
        "XGB": lambda: XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42, n_jobs=-1),
        "LGBM":lambda: LGBMRegressor(n_estimators=200, learning_rate=0.05, random_state=42, n_jobs=-1, verbose=-1),
    }

    pollutant = clean["pollutant"].iat[0]
    X_gpu = cudf.from_pandas(pd.DataFrame(X, columns=feature_cols))
    y_gpu = cudf.Series(y)

    if cv_type == "Random CV":
        splits = list(KFold(n_splits=n_splits, shuffle=True, random_state=42).split(X))
        fold_keys = range(1, len(splits) + 1)

    elif cv_type == "Country LOOCV":
        groups = clean[group_col if group_col is not None else "country_name"]
        fold_keys, splits = _holdout_group_splits(groups, n_splits, min_obs)

    elif cv_type == "Station CV":
        groups = clean["site_id"]
        splits = list(GroupKFold(n_splits=n_splits).split(X, y, groups=groups))
        # test_idx is positional, so look the sites up with iloc.
        fold_keys = [groups.iloc[test_idx].unique().tolist() for _, test_idx in splits]

    else:  # "Basin LOOCV"
        fold_keys, splits = _holdout_group_splits(clean["hydbas"], n_splits, min_obs)

    fold = 1
    for fold_label, (train_idx, test_idx) in zip(fold_keys, splits):
        if len(test_idx) < min_obs:
            continue

        Xtr, Xte = X[train_idx], X[test_idx]
        ytr, yte = y[train_idx], y[test_idx]

        for name, build in models.items():
            model = build()
            if name == "RF":
                # cuML's random forest is fitted on the GPU copies of the data.
                model.fit(X_gpu.iloc[train_idx], y_gpu.iloc[train_idx])
                pred = model.predict(X_gpu.iloc[test_idx]).to_pandas().values.astype(np.float32)
            else:
                model.fit(Xtr, ytr)
                pred = model.predict(Xte).astype(np.float32)

            r2 = r2_score(yte, pred)
            results.append({
                "Fold": fold,
                "Model": name,
                "Pollutant": pollutant,
                "CV_type": cv_type,
                "Obs.": len(yte),
                "R2": r2,
                "R2 (%)": r2 * 100,
                "RMSE": np.sqrt(mean_squared_error(yte, pred)),
                "Country": fold_label if cv_type == "Country LOOCV" else None,
                "Basin": fold_label if cv_type == "Basin LOOCV" else None,
                "Site_ID": fold_label if cv_type == "Station CV" else None
            })

        fold += 1

    return pd.DataFrame(results)


def main(pollutant_dir="/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/",
         features_path="/content/drive/MyDrive/data_scriptie/Output/features_final.csv"):
    """
    Run every cross-validation strategy for every pollutant file.

    Loads the feature polygons once, then for each `*.csv` directly inside
    `pollutant_dir` joins observations to features, prepares the model
    inputs, and runs Random, Station, Country and Basin cross-validation.

    Parameters
    ----------
    pollutant_dir : str
        Directory holding one observations CSV per pollutant.
    features_path : str
        CSV with the spatial features and a WKT "geometry" column.

    Returns
    -------
    pandas.DataFrame
        All per-fold results concatenated; an empty DataFrame if no file
        produced any result.
    """
    # Sort so the processing order (and therefore the row order of the
    # result) does not depend on filesystem listing order.
    pollutant_files = sorted(glob.glob(os.path.join(pollutant_dir, "*.csv")))
    print(pollutant_files)
    gdf_feats, static_cols, monthly_cols = load_features(features_path)

    # (cv_type, group_col) pairs, in the order they are run for each pollutant.
    cv_runs = (
        ("Random CV", None),
        ("Station CV", None),
        ("Country LOOCV", "country_name"),
        ("Basin LOOCV", None),
    )

    all_results = []
    for path in pollutant_files:
        print(f"\n>>> Processing {os.path.basename(path)}")
        gdf_full = build_joined_features(path, gdf_feats, static_cols, monthly_cols)
        if gdf_full is None:
            continue
        clean, X_df, y, X_scaled, feature_cols = prepare_data(gdf_full, log_transform=False)
        for cv_type, group_col in cv_runs:
            all_results.append(
                run_cv(X_scaled, y, feature_cols, clean, cv_type=cv_type, group_col=group_col)
            )

    # pd.concat raises on an empty list (no CSVs found, or every file skipped).
    if not all_results:
        return pd.DataFrame()
    return pd.concat(all_results, ignore_index=True)


# Run the whole pipeline; results_df holds the per-fold metrics that the final
# cell writes to CSV.
results_df = main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/Diclofenac_SW.csv', '/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/Carbamazepine_SW.csv', '/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/DO_SW.csv', '/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/PFOS_SW.csv', '/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/TSS_SW.csv', '/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/TN_SW.csv']

>>> Processing Diclofenac_SW.csv





>>> Processing Carbamazepine_SW.csv





>>> Processing DO_SW.csv





>>> Processing PFOS_SW.csv





>>> Processing TSS_SW.csv





>>> Processing TN_SW.csv




In [None]:
# Persist the metrics. The results/ subdirectory is created first because
# to_csv does not create missing parent directories.
results_path = "/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/results/all_results.csv"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path, index=False)