In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb

print("=== Loading & preparing data ===")

# Load only water quality (targets + lat/lon/date)
df = pd.read_csv('water_quality_training_dataset.csv')

# Parse date
df['Sample Date'] = pd.to_datetime(df['Sample Date'], format='%d-%m-%Y', errors='coerce')
df = df.dropna(subset=['Sample Date'])

# Strong spatial + temporal features
df['lat_rad'] = np.radians(df['Latitude'])
df['lon_rad'] = np.radians(df['Longitude'])
df['lat_sin'] = np.sin(df['lat_rad'])
df['lat_cos'] = np.cos(df['lat_rad'])
df['lon_sin'] = np.sin(df['lon_rad'])
df['lon_cos'] = np.cos(df['lon_rad'])

df['year']       = df['Sample Date'].dt.year.astype(float)
df['month']      = df['Sample Date'].dt.month.astype(float)
df['day_of_year'] = df['Sample Date'].dt.dayofyear.astype(float)
df['month_sin']  = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos']  = np.cos(2 * np.pi * df['month'] / 12)
df['doy_sin']    = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
df['doy_cos']    = np.cos(2 * np.pi * df['day_of_year'] / 365.25)

# Features: pure geo + time (no remote sensing!)
features = [
    'lat_sin', 'lat_cos', 'lon_sin', 'lon_cos',
    'year', 'month_sin', 'month_cos', 'doy_sin', 'doy_cos'
]

X = df[features]
y_TA  = df['Total Alkalinity']
y_EC  = df['Electrical Conductance']
y_DRP = df['Dissolved Reactive Phosphorus']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split
X_tr, X_te = train_test_split(X_scaled, test_size=0.2, random_state=42)
ya_tr, ya_te = train_test_split(y_TA,  test_size=0.2, random_state=42)
ye_tr, ye_te = train_test_split(y_EC,  test_size=0.2, random_state=42)
yd_tr, yd_te = train_test_split(y_DRP, test_size=0.2, random_state=42)

# Simpler, regularized XGBoost
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 600,
    'learning_rate': 0.03,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_lambda': 3.0,
    'reg_alpha': 1.0,
    'random_state': 42,
    'early_stopping_rounds': 40,
    'n_jobs': -1,
    'verbosity': 0
}

print("\n=== Training ===")

model_TA = xgb.XGBRegressor(**xgb_params).fit(
    X_tr, ya_tr, eval_set=[(X_te, ya_te)], verbose=False
)
model_EC = xgb.XGBRegressor(**xgb_params).fit(
    X_tr, ye_tr, eval_set=[(X_te, ye_te)], verbose=False
)
model_DRP = xgb.XGBRegressor(**xgb_params).fit(
    X_tr, yd_tr, eval_set=[(X_te, yd_te)], verbose=False
)

# Validation scores
# Validation scores — corrected function name
def print_score(y_true, y_pred, name):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name:25}  R² = {r2:.4f}   RMSE = {rmse:.2f}")

print("\nValidation performance on hold-out set:")
print_score(ya_te, model_TA.predict(X_te),  "Total Alkalinity")
print_score(ye_te, model_EC.predict(X_te),  "Electrical Conductance")
print_score(yd_te, model_DRP.predict(X_te), "Dissolved Reactive Phosphorus")

# =============================================================================
# SUBMISSION
# =============================================================================

print("\n=== Generating submission ===")

sub = pd.read_csv('submission_template.csv')
sub['Sample Date'] = pd.to_datetime(sub['Sample Date'], format='%d-%m-%Y', errors='coerce')
sub = sub.dropna(subset=['Sample Date'])

# Same features as training
sub['lat_rad'] = np.radians(sub['Latitude'])
sub['lon_rad'] = np.radians(sub['Longitude'])
sub['lat_sin'] = np.sin(sub['lat_rad'])
sub['lat_cos'] = np.cos(sub['lat_rad'])
sub['lon_sin'] = np.sin(sub['lon_rad'])
sub['lon_cos'] = np.cos(sub['lon_rad'])

sub['year']       = sub['Sample Date'].dt.year.astype(float)
sub['month']      = sub['Sample Date'].dt.month.astype(float)
sub['day_of_year'] = sub['Sample Date'].dt.dayofyear.astype(float)
sub['month_sin']  = np.sin(2 * np.pi * sub['month'] / 12)
sub['month_cos']  = np.cos(2 * np.pi * sub['month'] / 12)
sub['doy_sin']    = np.sin(2 * np.pi * sub['day_of_year'] / 365.25)
sub['doy_cos']    = np.cos(2 * np.pi * sub['day_of_year'] / 365.25)

sub_X = sub[features]
sub_X_scaled = scaler.transform(sub_X)

sub['Total Alkalinity']              = model_TA.predict(sub_X_scaled)
sub['Electrical Conductance']        = model_EC.predict(sub_X_scaled)
sub['Dissolved Reactive Phosphorus'] = model_DRP.predict(sub_X_scaled)

# Final output
submission_final = sub[[
    'Latitude', 'Longitude', 'Sample Date',
    'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'
]]

print("\nFirst 15 rows of submission (should vary):")
print(submission_final.head(15).to_string(index=False))

submission_final.to_csv('submission_geo_time_xgb.csv', index=False)
print("\nSaved → 'submission_geo_time_xgb.csv'")
print("Upload this now — it should score better than -0.115 (likely close to 0 or slightly positive)")

=== Loading & preparing data ===

=== Training ===

Validation performance on hold-out set:
Total Alkalinity           R² = 0.8294   RMSE = 31.34
Electrical Conductance     R² = 0.8536   RMSE = 132.18
Dissolved Reactive Phosphorus  R² = 0.6458   RMSE = 30.83

=== Generating submission ===

First 15 rows of submission (should vary):
  Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  Dissolved Reactive Phosphorus
-32.043333  27.822778  2014-09-01         29.724779              202.875641                      27.592695
-33.329167  26.077500  2015-09-16        152.536316              744.836487                      67.560921
-32.991639  27.640028  2015-05-07         35.455940              230.749847                      41.325493
-34.096389  24.439167  2012-02-07         34.364712              697.068298                      15.105787
-32.000556  28.581667  2014-10-01         25.959690              188.449051                      30.359249
-32.086390  25.575560  2