In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from tqdm import tqdm
import pystac_client
import planetary_computer as pc
import odc.stac

In [2]:
print("=== Loading training data ===")

wq = pd.read_csv('water_quality_training_dataset.csv')
tc = pd.read_csv('terraclimate_features_training.csv')
ls = pd.read_csv('landsat_features_training.csv')

df = pd.merge(wq, tc, on=['Latitude', 'Longitude', 'Sample Date'], how='inner')
df = pd.merge(df, ls, on=['Latitude', 'Longitude', 'Sample Date'], how='inner')

print(f"Training rows: {len(df):,}")

=== Loading training data ===
Training rows: 9,319


In [3]:
df['Sample Date'] = pd.to_datetime(df['Sample Date'], format='%d-%m-%Y', errors='coerce')
df = df.dropna(subset=['Sample Date'])

df['year'] = df['Sample Date'].dt.year.astype(float)
df['month'] = df['Sample Date'].dt.month.astype(float)
df['doy'] = df['Sample Date'].dt.dayofyear.astype(float)

df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['doy_sin'] = np.sin(2 * np.pi * df['doy'] / 365.25)
df['doy_cos'] = np.cos(2 * np.pi * df['doy'] / 365.25)

In [None]:
features = ['pet', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI', 
            'year', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos']

X = df[features].fillna(df[features].mean())  
y_TA = df['Total Alkalinity']
y_EC = df['Electrical Conductance']
y_DRP = df['Dissolved Reactive Phosphorus']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
def train_tuned_blend(X, y, name):
    print(f"\nTuning & training {name} ...")
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(y))
    models_xgb = []
    models_lgb = []

    xgb_grid = {
        'learning_rate': [0.02, 0.05],
        'max_depth': [4, 6],
        'subsample': [0.8],
        'colsample_bytree': [0.7],
        'n_estimators': [800]
    }

    lgb_grid = {
        'learning_rate': [0.02, 0.05],
        'max_depth': [4, 6],
        'num_leaves': [31, 63],
        'n_estimators': [800]
    }

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        xgb_tuner = GridSearchCV(
            xgb.XGBRegressor(random_state=42, n_jobs=-1),
            xgb_grid,
            cv=3,
            scoring='neg_mean_squared_error',
            verbose=0
        )
        xgb_tuner.fit(X_tr, y_tr)
        xgb_model = xgb_tuner.best_estimator_
        
        xgb_model.fit(X_tr, y_tr)
        models_xgb.append(xgb_model)

        lgb_tuner = GridSearchCV(
            lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
            lgb_grid,
            cv=3,
            scoring='neg_mean_squared_error',
            verbose=0
        )
        lgb_tuner.fit(X_tr, y_tr)
        lgb_model = lgb_tuner.best_estimator_
        lgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
        models_lgb.append(lgb_model)
        oof[val_idx] = 0.55 * xgb_model.predict(X_val) + 0.45 * lgb_model.predict(X_val)

    r2 = r2_score(y, oof)
    rmse = np.sqrt(mean_squared_error(y, oof))
    print(f"{name:20}  CV R² = {r2:.4f}   RMSE = {rmse:.2f}")

    return models_xgb, models_lgb

In [16]:
models = {}
for tgt, y in {'TA': y_TA, 'EC': y_EC, 'DRP': y_DRP}.items():
    models[tgt] = train_tuned_blend(X_scaled, y, tgt)


Tuning & training TA ...




TA                    CV R² = 0.4708   RMSE = 54.33

Tuning & training EC ...




EC                    CV R² = 0.5604   RMSE = 226.70

Tuning & training DRP ...




DRP                   CV R² = 0.3939   RMSE = 39.69




In [17]:
def extract_terraclimate_pet(lat, lon, sample_date, max_days=15):
    try:
        catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
        bbox = [lon - 0.0001, lat - 0.0001, lon + 0.0001, lat + 0.0001]
        start = (datetime.strptime(sample_date, '%Y-%m-%d') - timedelta(days=max_days)).strftime('%Y-%m-%d')
        end = (datetime.strptime(sample_date, '%Y-%m-%d') + timedelta(days=max_days)).strftime('%Y-%m-%d')
        date_range = f"{start}/{end}"
        search = catalog.search(collections=["terraclimate"], bbox=bbox, datetime=date_range)
        items = list(search.items())
        if items:
            signed_item = pc.sign(items[0])
            data = odc.stac.stac_load([signed_item], bands=["pet"], bbox=bbox, crs="EPSG:4326", resolution=0.0001)
            pet_value = data["pet"].mean().values.item()
            return pet_value
        return np.nan
    except:
        return np.nan

def extract_landsat_features(lat, lon, sample_date, max_days=15):
    try:
        catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
        bbox = [lon - 0.05, lat - 0.05, lon + 0.05, lat + 0.05]
        start = (datetime.strptime(sample_date, '%Y-%m-%d') - timedelta(days=max_days)).strftime('%Y-%m-%d')
        end = (datetime.strptime(sample_date, '%Y-%m-%d') + timedelta(days=max_days)).strftime('%Y-%m-%d')
        date_range = f"{start}/{end}"
        search = catalog.search(collections=["landsat-c2-l2"], bbox=bbox, datetime=date_range, sortby="-datetime")
        items = list(search.items())
        if not items:
            return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
        
        signed_item = pc.sign(items[0])
        data = odc.stac.stac_load([signed_item], bands=["nir08", "green", "swir16", "swir22"], bbox=bbox, crs="EPSG:4326", resolution=30)
        
        nir    = data["nir08"].mean().values.item()  if "nir08" in data else np.nan
        green  = data["green"].mean().values.item()  if "green" in data else np.nan
        swir16 = data["swir16"].mean().values.item() if "swir16" in data else np.nan
        swir22 = data["swir22"].mean().values.item() if "swir22" in data else np.nan
        
        ndmi  = (nir - swir16) / (nir + swir16) if pd.notna(nir) and pd.notna(swir16) else np.nan
        mndwi = (green - swir22) / (green + swir22) if pd.notna(green) and pd.notna(swir22) else np.nan
        
        return nir, green, swir16, swir22, ndmi, mndwi
    except:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

In [18]:
print("\n=== Processing submission with tuned extraction ===")
sub = pd.read_csv('submission_template.csv')
sub['Sample Date'] = pd.to_datetime(sub['Sample Date'], format='%d-%m-%Y', errors='coerce').dt.strftime('%Y-%m-%d') 

tqdm.pandas()
sub['pet'] = sub.progress_apply(lambda row: extract_terraclimate_pet(row['Latitude'], row['Longitude'], row['Sample Date']), axis=1)

sub[['nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI']] = sub.progress_apply(
    lambda row: extract_landsat_features(row['Latitude'], row['Longitude'], row['Sample Date']),
    axis=1,
    result_type='expand'
)


=== Processing submission with tuned extraction ===


100%|██████████| 200/200 [03:35<00:00,  1.08s/it]
100%|██████████| 200/200 [11:09<00:00,  3.35s/it]


In [22]:
# Add time features to submission
sub['Sample Date'] = pd.to_datetime(sub['Sample Date'], format='%Y-%m-%d')
sub['year'] = sub['Sample Date'].dt.year.astype(float)
sub['month'] = sub['Sample Date'].dt.month.astype(float)
sub['doy'] = sub['Sample Date'].dt.dayofyear.astype(float)

sub['month_sin'] = np.sin(2 * np.pi * sub['month'] / 12)
sub['month_cos'] = np.cos(2 * np.pi * sub['month'] / 12)
sub['doy_sin'] = np.sin(2 * np.pi * sub['doy'] / 365.25)
sub['doy_cos'] = np.cos(2 * np.pi * sub['doy'] / 365.25)

# Fill any failed extractions with training means
for col in features:
    sub[col] = sub[col].fillna(df[col].mean())

sub_X = sub[features]
sub_X_scaled = scaler.transform(sub_X)

In [23]:
def blend_predict(models_xgb, models_lgb, X):
    preds_xgb = np.mean([m.predict(X) for m in models_xgb], axis=0)
    preds_lgb = np.mean([m.predict(X) for m in models_lgb], axis=0)
    return 0.6 * preds_xgb + 0.4 * preds_lgb

sub['Total Alkalinity']              = blend_predict(*models['TA'], sub_X_scaled)
sub['Electrical Conductance']        = blend_predict(*models['EC'], sub_X_scaled)
sub['Dissolved Reactive Phosphorus'] = blend_predict(*models['DRP'], sub_X_scaled)

sub['Total Alkalinity']              = sub['Total Alkalinity'].clip(df['Total Alkalinity'].min(), df['Total Alkalinity'].max())
sub['Electrical Conductance']        = sub['Electrical Conductance'].clip(df['Electrical Conductance'].min(), df['Electrical Conductance'].max())
sub['Dissolved Reactive Phosphorus'] = sub['Dissolved Reactive Phosphorus'].clip(df['Dissolved Reactive Phosphorus'].min(), df['Dissolved Reactive Phosphorus'].max())

final_sub = sub[[
    'Latitude', 'Longitude', 'Sample Date',
    'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'
]]

print("\nFirst 15 submission rows:")
print(final_sub.head(15).to_string(index=False))

final_sub.to_csv('submission_tuned_benchmark_v1.csv', index=False)
print("\nSaved → submission_tuned_benchmark_v1.csv")
print("Upload this — it should give 0.3–0.4+ if extraction succeeds for 60%+ points.")


First 15 submission rows:
  Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  Dissolved Reactive Phosphorus
-32.043333  27.822778  2014-09-01        161.480416              722.688074                      45.665392
-33.329167  26.077500  2015-09-16        139.453031              628.160591                      26.453544
-32.991639  27.640028  2015-05-07        136.852799              654.184970                      29.307177
-34.096389  24.439167  2012-02-07        160.561721              917.866924                      54.324965
-32.000556  28.581667  2014-10-01        164.854650              725.441086                      57.965334
-32.086390  25.575560  2013-07-19        167.907872              833.007106                      40.899760
-32.000556  28.581667  2014-09-03        161.074559              724.311780                      46.180448
-32.991639  27.640028  2014-10-02        166.820283              719.201538                      56.991604
-32.000556

