In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files stored under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
"""
R² comparison table: read the collected model results, parse and average the
R² (%) scores, pivot them so each CV strategy is a column, and print the result.
"""

# Show floats with two decimals everywhere pandas renders them
pd.options.display.float_format = '{:.2f}'.format

# Raw per-run results exported by the modelling pipeline
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/results/all_results.csv"
)

# Clean and convert R2 (%) strings to floats
def clean_percent(s):
    """Convert a percentage-like value to a float.

    The value is stringified and stripped. An optional trailing ``%`` and an
    optional leading ``+``/``-`` sign are accepted. Only the LAST dot is kept
    as the decimal point; any earlier dots are treated as thousands
    separators and dropped (``"1.234.56"`` -> ``1234.56``).

    Parameters
    ----------
    s : Any
        Raw cell value (typically a string read from the results CSV).

    Returns
    -------
    float
        The parsed number, or NaN when ``s`` is ``None`` or blank.

    Raises
    ------
    ValueError
        If the remaining text is not a valid float literal.
    """
    if s is None:
        return float('nan')

    s = str(s).strip()
    if s.endswith('%'):
        s = s[:-1].strip()
    if not s:
        # Blank cell: report as missing instead of crashing in float('')
        return float('nan')

    sign = -1 if s.startswith('-') else 1
    if s[0] in '+-':
        s = s[1:]

    # Split on the last dot only; dots left in the integer part are separators
    intp, sep, decp = s.rpartition('.')
    num_str = intp.replace('.', '') + sep + decp if sep else s
    return sign * float(num_str)

# Replace the raw R² strings with parsed floats
df['R2 (%)'] = df['R2 (%)'].map(clean_percent)

# Average R² for every (pollutant, model, CV strategy) combination, rounded to 2 decimals
mean_scores = df.groupby(by=['Pollutant','Model','CV_type'])['R2 (%)'].agg('mean').round(2).reset_index()

# Wide layout: one row per (pollutant, model), one column per CV strategy
pivot = (
    mean_scores
    .set_index(['Pollutant','Model','CV_type'])['R2 (%)']
    .unstack('CV_type')
    .reset_index()
    .rename_axis(None, axis=1)  # drop the leftover "CV_type" label on the column axis
)

# Relabel the CV strategy columns, then fix the left-to-right order of the table
pivot = pivot.rename(
    columns={'Station CV': 'GroupKFold(site_id)', 'Country LOOCV': 'LOCO', 'Basin LOOCV': 'LOBO'}
)[['Pollutant', 'Model', 'Random CV', 'GroupKFold(site_id)', 'LOCO', 'LOBO']]

# Format values and mask negative R²
cv_cols = ['Random CV','GroupKFold(site_id)','LOCO','LOBO']
def fmt(x):
    """Render one R² (%) value as table text.

    Parameters
    ----------
    x : float
        Mean R² in percent; may be missing.

    Returns
    -------
    str
        ``"NA"`` for a missing value, ``"< 0"`` for any negative value,
        otherwise the number with two decimals.
    """
    # Check missing values first: NaN fails `x < 0` and would print as "nan",
    # and pd.NA cannot be used in a boolean test at all.
    if pd.isna(x):
        return "NA"
    if x < 0:
        return "< 0"
    return f"{x:.2f}"

# Format every CV cell. Series.map is applied column by column because
# DataFrame.applymap is deprecated and emits a FutureWarning on current pandas.
pivot[cv_cols] = pivot[cv_cols].apply(lambda col: col.map(fmt))

# Output as Markdown table.
# disable_numparse stops tabulate from re-parsing the formatted strings as
# numbers, which would otherwise print "45.00" as "45".
print(pivot.to_markdown(index=False, disable_numparse=True))


| Pollutant        | Model   |   Random CV | GroupKFold(site_id)   | LOCO   | LOBO   |
|:-----------------|:--------|------------:|:----------------------|:-------|:-------|
| Carbamazepine_SW | LGBM    |       45    | < 0                   | < 0    | < 0    |
| Carbamazepine_SW | RF      |       48.75 | < 0                   | < 0    | < 0    |
| Carbamazepine_SW | XGB     |       42.21 | < 0                   | < 0    | < 0    |
| DO_SW            | LGBM    |       63.03 | 57.82                 | 31.12  | 8.42   |
| DO_SW            | RF      |       66.61 | 57.62                 | 27.33  | < 0    |
| DO_SW            | XGB     |       63.64 | 58.04                 | 32.06  | 11.08  |
| Diclofenac_SW    | LGBM    |       44.35 | < 0                   | < 0    | < 0    |
| Diclofenac_SW    | RF      |       45.43 | < 0                   | < 0    | < 0    |
| Diclofenac_SW    | XGB     |       42.34 | < 0                   | < 0    | < 0    |
| PFOS_SW          | LGBM    |       60.23 

  pivot[cv_cols] = pivot[cv_cols].applymap(fmt)


In [None]:
"""
RMSE comparison table: read the collected model results, average RMSE per
pollutant, model and cross-validation strategy, and print a wide table.
"""

# Show floats with two decimals everywhere pandas renders them
pd.options.display.float_format = '{:.2f}'.format

# Raw per-run results exported by the modelling pipeline
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/data_scriptie/Output/pollutants_monthly/results/all_results.csv"
)

# Coerce RMSE to numbers; anything unparseable becomes NaN
df['RMSE'] = df['RMSE'].pipe(pd.to_numeric, errors='coerce')

# Average RMSE for every (pollutant, model, CV strategy) combination, rounded to 2 decimals
mean_rmse = df.groupby(by=['Pollutant','Model','CV_type'])['RMSE'].agg('mean').round(2).reset_index()

# Wide layout: one row per (pollutant, model), one column per CV strategy
pivot_rmse = (
    mean_rmse
    .set_index(['Pollutant','Model','CV_type'])['RMSE']
    .unstack('CV_type')
    .reset_index()
    .rename_axis(None, axis=1)  # drop the leftover "CV_type" label on the column axis
)

# Relabel the CV strategy columns, then fix the left-to-right order of the table
pivot_rmse = pivot_rmse.rename(
    columns={'Station CV': 'GroupKFold(site_id)', 'Country LOOCV': 'LOCO', 'Basin LOOCV': 'LOBO'}
)[['Pollutant', 'Model', 'Random CV', 'GroupKFold(site_id)', 'LOCO', 'LOBO']]

# Format as string with 2 decimals (keep NA for missing).
# Series.map is applied column by column because DataFrame.applymap is
# deprecated and emits a FutureWarning on current pandas.
cv_cols = ['Random CV','GroupKFold(site_id)','LOCO','LOBO']
pivot_rmse[cv_cols] = pivot_rmse[cv_cols].apply(
    lambda col: col.map(lambda x: f"{x:.2f}" if pd.notnull(x) else "NA")
)

# Output as Markdown table.
# disable_numparse stops tabulate from re-parsing the formatted strings as
# numbers, which would otherwise print "0.10" as "0.1".
print(pivot_rmse.to_markdown(index=False, disable_numparse=True))


| Pollutant        | Model   |   Random CV |   GroupKFold(site_id) |   LOCO |   LOBO |
|:-----------------|:--------|------------:|----------------------:|-------:|-------:|
| Carbamazepine_SW | LGBM    |        0.07 |                  0.1  |   0.15 |   0.09 |
| Carbamazepine_SW | RF      |        0.07 |                  0.1  |   0.12 |   0.09 |
| Carbamazepine_SW | XGB     |        0.08 |                  0.1  |   0.14 |   0.09 |
| DO_SW            | LGBM    |        1.72 |                  1.84 |   1.75 |   1.45 |
| DO_SW            | RF      |        1.64 |                  1.84 |   1.78 |   1.47 |
| DO_SW            | XGB     |        1.71 |                  1.83 |   1.73 |   1.43 |
| Diclofenac_SW    | LGBM    |        0.11 |                  0.16 |   0.15 |   0.02 |
| Diclofenac_SW    | RF      |        0.11 |                  0.14 |   0.14 |   0.03 |
| Diclofenac_SW    | XGB     |        0.11 |                  0.15 |   0.19 |   0.03 |
| PFOS_SW          | LGBM    |        0.01 

  pivot_rmse[cv_cols] = pivot_rmse[cv_cols].applymap(lambda x: f"{x:.2f}" if pd.notnull(x) else "NA")
