In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb

# 1. Read the three training tables and join them into a single frame.
# Every join uses the same sample key, so it is spelled out once.
merge_keys = ['Latitude', 'Longitude', 'Sample Date']

wq = pd.read_csv('water_quality_training_dataset.csv')
tc = pd.read_csv('terraclimate_features_training.csv')
ls = pd.read_csv('landsat_features_training.csv')

# Inner joins: only samples present in all three tables survive.
df = wq.merge(tc, on=merge_keys, how='inner').merge(ls, on=merge_keys, how='inner')

# Convert the day-month-year strings to datetimes. Values that do not match
# the format become NaT (errors='coerce') and those rows are then removed.
parsed_dates = pd.to_datetime(df['Sample Date'], format='%d-%m-%Y', errors='coerce')
df['Sample Date'] = parsed_dates
df = df.dropna(subset=['Sample Date'])

# Calendar features derived from the sample date, so that predictions can
# differ from one date to another.
sample_dt = df['Sample Date'].dt
for part in ('year', 'month', 'day'):
    df[part] = getattr(sample_dt, part)

# Month encoded as a point on the unit circle (period 12), which places
# December next to January instead of eleven steps away.
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Model inputs: climate / Landsat columns followed by the calendar features.
features = [
    'pet', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI',
    'year', 'month_sin', 'month_cos', 'day',
]

# Missing feature values are replaced by the mean of their own column.
feature_frame = df[features]
X = feature_frame.fillna(feature_frame.mean())

# One target series per water-quality parameter.
y_TA = df['Total Alkalinity']
y_EC = df['Electrical Conductance']
y_DRP = df['Dissolved Reactive Phosphorus']

# Train / validation split (80 / 20).
# A single call splits the features and all three targets together, so every
# returned array is guaranteed to share the same row partition (the original
# relied on repeating the same random_state across three separate calls).
X_tr_raw, X_te_raw, ya_tr, ya_te, ye_tr, ye_te, yd_tr, yd_te = train_test_split(
    X, y_TA, y_EC, y_DRP, test_size=0.2, random_state=42
)

# Scale features.
# The scaler is fitted on the training rows only; fitting it on the full
# matrix before splitting would let validation rows influence the scaling
# statistics (data leakage) and make the validation scores less trustworthy.
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr_raw)
X_te = scaler.transform(X_te_raw)

# Scaled copy of the complete feature matrix, kept under its original name
# for any later code that still refers to it.
X_scaled = scaler.transform(X)

# Hyperparameters shared by all three XGBoost regressors.
params = dict(
    objective='reg:squarederror',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.85,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# One independent regressor per target, each trained on the same feature rows.
model_TA = xgb.XGBRegressor(**params)
model_TA.fit(X_tr, ya_tr)

model_EC = xgb.XGBRegressor(**params)
model_EC.fit(X_tr, ye_tr)

model_DRP = xgb.XGBRegressor(**params)
model_DRP.fit(X_tr, yd_tr)

# Validation report helper
def score(y_true, y_pred, name):
    """Print the R² and RMSE of ``y_pred`` against ``y_true`` on one line.

    ``name`` is the label printed at the start of the line; it is padded to
    a minimum width of 22 characters.
    """
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name:22}  R² = {r2:.4f}   RMSE = {rmse:.2f}")

# Report held-out metrics for each target, in the same order as training.
for label, y_holdout, fitted_model in (
    ("Total Alkalinity", ya_te, model_TA),
    ("Electrical Conductance", ye_te, model_EC),
    ("Dissolved Reactive Phosphorus", yd_te, model_DRP),
):
    score(y_holdout, fitted_model.predict(X_te), label)

# ────────────────────────────────────────────────
# Build the feature matrix for the submission rows
sub = pd.read_csv('submission_template.csv')
sub['Sample Date'] = pd.to_datetime(sub['Sample Date'], format='%d-%m-%Y', errors='coerce')

# Same calendar features as the training frame.
sub_dt = sub['Sample Date'].dt
for part in ('year', 'month', 'day'):
    sub[part] = getattr(sub_dt, part)

sub_month_angle = 2 * np.pi * sub['month'] / 12
sub['month_sin'] = np.sin(sub_month_angle)
sub['month_cos'] = np.cos(sub_month_angle)

# The climate / Landsat columns are not read from the template; every
# submission row receives the training mean of each of those columns.
# NOTE(review): these seven inputs are therefore constant across rows, so the
# predictions can only vary with the date features. If per-sample feature
# files exist for the submission locations, merging them here would likely
# help; confirm what data is available.
date_cols = ['year', 'month_sin', 'month_cos', 'day']
remote_cols = ['pet', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI']
sub_X = sub[date_cols].assign(**{col: df[col].mean() for col in remote_cols})

# Put the columns in training order and reuse the already-fitted scaler.
sub_X_scaled = scaler.transform(sub_X[features])

# Predict the three target columns for every submission row.
sub['Total Alkalinity']             = model_TA.predict(sub_X_scaled)
sub['Electrical Conductance']       = model_EC.predict(sub_X_scaled)
sub['Dissolved Reactive Phosphorus'] = model_DRP.predict(sub_X_scaled)

# Final output format: the sample key followed by the three predictions.
# The helper date columns added for feature building are left out.
submission_final = sub[['Latitude', 'Longitude', 'Sample Date',
                        'Total Alkalinity', 'Electrical Conductance',
                        'Dissolved Reactive Phosphorus']]

# Show preview
print("\nFirst 15 predictions (should vary):")
print(submission_final.head(15))

# Save.
# 'Sample Date' was converted to datetime for feature extraction, so without
# an explicit date_format it would be written as YYYY-MM-DD. Writing it back
# as day-month-year keeps the key column in the same format the template was
# read with.
submission_final.to_csv('submission_xgboost_ready.csv', index=False,
                        date_format='%d-%m-%Y')
print("\n→ Saved to: submission_xgboost_ready.csv")
print("Upload this file directly to the platform.")

Total Alkalinity        R² = 0.5837   RMSE = 48.95
Electrical Conductance  R² = 0.6084   RMSE = 216.21
Dissolved Reactive Phosphorus  R² = 0.5627   RMSE = 34.26

First 15 predictions (should vary):
     Latitude  Longitude Sample Date  Total Alkalinity  \
0  -32.043333  27.822778  2014-09-01        157.367447   
1  -33.329167  26.077500  2015-09-16        147.106842   
2  -32.991639  27.640028  2015-05-07        144.510223   
3  -34.096389  24.439167  2012-02-07        164.432724   
4  -32.000556  28.581667  2014-10-01        155.262817   
5  -32.086390  25.575560  2013-07-19        177.705231   
6  -32.000556  28.581667  2014-09-03        165.000168   
7  -32.991639  27.640028  2014-10-02        161.269867   
8  -32.000556  28.581667  2014-08-06        157.877182   
9  -33.185361  27.390750  2011-09-22        168.091064   
10 -33.731111  24.618333  2013-03-28        174.193405   
11 -33.731111  24.618333  2013-04-18        171.436493   
12 -32.000556  28.581667  2012-11-21        177.