In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
from scipy.stats import hmean

print("=== Loading data ===")
wq = pd.read_csv('water_quality_training_dataset.csv')

# Parse dates as day-month-year on a copy of the raw frame; anything that does
# not parse becomes NaT and the row is discarded.
df = wq.assign(
    **{'Sample Date': pd.to_datetime(wq['Sample Date'], format='%d-%m-%Y', errors='coerce')}
).dropna(subset=['Sample Date'])

# (label, function) pairs reused for every sin/cos expansion below.
trig_fns = (('sin', np.sin), ('cos', np.cos))

# --- Spatial features ---------------------------------------------------
# Coordinates in radians first (both columns are created before any trig
# column so the resulting column order, and therefore `features`, is stable).
for axis, source_col in (('lat', 'Latitude'), ('lon', 'Longitude')):
    df[f'{axis}_rad'] = np.radians(df[source_col])

# First and second harmonic of each coordinate: <axis>_sin, _cos, _sin2, _cos2.
for axis in ('lat', 'lon'):
    angle = df[f'{axis}_rad']
    for suffix, harmonic in (('', angle), ('2', 2 * angle)):
        for fn_label, fn in trig_fns:
            df[f'{axis}_{fn_label}{suffix}'] = fn(harmonic)

# --- Temporal features --------------------------------------------------
when = df['Sample Date'].dt
df['year'] = when.year.astype(float)
df['year_norm'] = (df['year'] - 2011) / 4.0   # offset/scale are hard-coded
df['month'] = when.month.astype(float)
df['doy'] = when.dayofyear.astype(float)

month_phase = 2 * np.pi * df['month']
doy_phase = 2 * np.pi * df['doy'] / 365.25
for period in (12, 6, 4, 3):
    # NOTE(review): the month terms DIVIDE the phase by `period`, while the
    # day-of-year terms MULTIPLY by it, so e.g. doy_sin_12 completes 12 cycles
    # per year whereas month_sin_12 completes one. Confirm this asymmetry is
    # intended; the submission feature code uses the same formulas.
    for stem, phase in (('month', month_phase / period), ('doy', doy_phase * period)):
        for fn_label, fn in trig_fns:
            df[f'{stem}_{fn_label}_{period}'] = fn(phase)

# --- Location x season interactions ---------------------------------------
for axis in ('lat', 'lon'):
    df[f'{axis}_month_sin'] = df[f'{axis}_sin'] * df['month_sin_12']

# Select model inputs by column-name prefix. The raw `lat_rad` / `lon_rad`
# columns and the two interaction columns match these prefixes as well; the
# plain `year`, `month` and `doy` columns do not.
feature_prefixes = ('lat_', 'lon_', 'month_', 'doy_', 'year_norm')
features = [col for col in df.columns if col.startswith(feature_prefixes)]

print(f"Using {len(features)} engineered features")

# --- Model inputs and targets ---------------------------------------------
# The scaler is fitted once on every training row and reused unchanged for the
# submission rows further down.
X = df[features].copy()
scaler = RobustScaler().fit(X)
X_scaled = scaler.transform(X)

# Short code -> target series, taken from the date-filtered frame.
target_columns = {
    'TA':  'Total Alkalinity',
    'EC':  'Electrical Conductance',
    'DRP': 'Dissolved Reactive Phosphorus',
}
targets = {code: df[column] for code, column in target_columns.items()}

# ─── Train function with cross-validation blending ─────────────────────
def train_and_blend(X, y, name, n_folds=5):
    """Fit one XGBoost regressor per K-fold split and report out-of-fold scores.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature matrix. Converted with ``np.asarray`` so that fold indices are
        always applied positionally (ndarray and DataFrame inputs both work).
    y : array-like, same length as ``X``
        Regression target. Also converted with ``np.asarray``.
    name : str
        Label used in the progress and score printouts.
    n_folds : int, default 5
        Number of shuffled K-fold splits (fixed ``random_state=42``).

    Returns
    -------
    list
        The fitted ``xgb.XGBRegressor`` instances, one per fold, in fold order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.

    Notes
    -----
    Each fold's validation split is used both as the early-stopping
    ``eval_set`` and as the data the out-of-fold predictions are scored on, so
    the printed CV R² / RMSE are likely optimistic.
    """
    print(f"\nTraining {name} ...")

    X_arr = np.asarray(X)
    y_arr = np.asarray(y)
    if len(X_arr) != len(y_arr):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X_arr)} and {len(y_arr)}"
        )

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(y_arr))

    params = {
        'objective': 'reg:squarederror',
        'n_estimators': 1200,
        'learning_rate': 0.018,
        'max_depth': 6,
        'subsample': 0.78,
        'colsample_bytree': 0.65,
        'reg_lambda': 4.5,
        'reg_alpha': 1.8,
        'min_child_weight': 5,
        'random_state': 42,
        'early_stopping_rounds': 60,
        'n_jobs': -1,
        'verbosity': 0
    }

    models = []
    for tr_idx, val_idx in kf.split(X_arr):
        X_tr, X_val = X_arr[tr_idx], X_arr[val_idx]
        y_tr, y_val = y_arr[tr_idx], y_arr[val_idx]

        model = xgb.XGBRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],   # early-stopping monitor (see Notes)
            verbose=False
        )
        models.append(model)

        # Every row is predicted exactly once, by the model that did not train on it.
        oof_preds[val_idx] = model.predict(X_val)

    r2 = r2_score(y_arr, oof_preds)
    rmse = np.sqrt(mean_squared_error(y_arr, oof_preds))
    print(f"{name:20}  CV R² = {r2:.4f}   RMSE = {rmse:.2f}")

    return models

# Fit an independent set of fold models for every target, keyed by the
# target's short code (same keys and order as `targets`).
models = {
    code: train_and_blend(X_scaled, series, code)
    for code, series in targets.items()
}

# ─── Submission ───────────────────────────────────────────────────────
print("\n=== Generating submission ===")
sub = pd.read_csv('submission_template.csv')
sub['Sample Date'] = pd.to_datetime(sub['Sample Date'], format='%d-%m-%Y', errors='coerce')

# Unlike the training frame, rows with an unparseable date are NOT dropped
# here, so they would flow through as NaT and give NaN time features without
# any indication. Report them so the problem is visible.
n_bad_dates = int(sub['Sample Date'].isna().sum())
if n_bad_dates:
    print(
        f"WARNING: {n_bad_dates} submission row(s) have a missing or unparseable "
        "'Sample Date'; their time features will be NaN"
    )

# Feature engineering: must mirror the training-frame formulas exactly, since
# the fitted scaler and models are applied to these columns by name.
sub['lat_rad'] = np.radians(sub['Latitude'])
sub['lon_rad'] = np.radians(sub['Longitude'])
for c, rad in [('lat', 'lat_rad'), ('lon', 'lon_rad')]:
    sub[f'{c}_sin'] = np.sin(sub[rad])
    sub[f'{c}_cos'] = np.cos(sub[rad])
    sub[f'{c}_sin2'] = np.sin(2 * sub[rad])
    sub[f'{c}_cos2'] = np.cos(2 * sub[rad])

sub['year'] = sub['Sample Date'].dt.year.astype(float)
sub['year_norm'] = (sub['year'] - 2011) / 4.0
sub['month'] = sub['Sample Date'].dt.month.astype(float)
sub['doy']   = sub['Sample Date'].dt.dayofyear.astype(float)

for period in [12, 6, 4, 3]:
    # NOTE(review): month terms divide by `period`, day-of-year terms multiply
    # by it. Kept as-is because it has to match what the models were trained on.
    sub[f'month_sin_{period}'] = np.sin(2 * np.pi * sub['month'] / period)
    sub[f'month_cos_{period}'] = np.cos(2 * np.pi * sub['month'] / period)
    sub[f'doy_sin_{period}']   = np.sin(2 * np.pi * sub['doy']   / 365.25 * period)
    sub[f'doy_cos_{period}']   = np.cos(2 * np.pi * sub['doy']   / 365.25 * period)

sub['lat_month_sin'] = sub['lat_sin'] * sub['month_sin_12']
sub['lon_month_sin'] = sub['lon_sin'] * sub['month_sin_12']

# Select the training feature columns (in training order) and apply the scaler
# that was fitted on the training data; it is not re-fitted here.
sub_X = sub[features]
sub_X_scaled = scaler.transform(sub_X)

# Blend predictions from all folds
def blend_predict(models_list, X):
    """Return the element-wise mean of every model's prediction for ``X``.

    Parameters
    ----------
    models_list : iterable
        Fitted models; each must expose ``predict(X)``.
    X : array-like
        Inputs, passed unchanged to each model's ``predict``.

    Returns
    -------
    numpy.ndarray
        Predictions averaged across models (axis 0 is the model axis).

    Raises
    ------
    ValueError
        If ``models_list`` is empty. Averaging nothing would otherwise yield a
        NaN that could be written into the submission unnoticed.
    """
    per_model = [m.predict(X) for m in models_list]
    if not per_model:
        raise ValueError("blend_predict needs at least one model")
    return np.mean(per_model, axis=0)

# Per target: output column, key into `models`, and the (low, high) range the
# blended prediction is clipped to. The bounds are hard-coded.
prediction_specs = [
    ('Total Alkalinity',              'TA',  (10, 350)),
    ('Electrical Conductance',        'EC',  (50, 1800)),
    ('Dissolved Reactive Phosphorus', 'DRP', (1, 200)),
]
for column, code, (low, high) in prediction_specs:
    # Average the fold models, then clamp to the allowed range.
    sub[column] = blend_predict(models[code], sub_X_scaled)
    sub[column] = sub[column].clip(low, high)

# Final submission file: identifying columns plus the three predictions.
submission_final = sub[[
    'Latitude', 'Longitude', 'Sample Date',
    'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'
]]

print("\nFirst 15 submission rows:")
print(submission_final.head(15).to_string(index=False))

# 'Sample Date' was parsed to datetime for feature engineering. Write it back
# in the template's day-month-year format instead of pandas' default ISO
# rendering, so the file's dates match the template they were read from.
# (The preview printed above still shows the datetime rendering.)
submission_final.to_csv('submission_xgb_enhanced.csv', index=False, date_format='%d-%m-%Y')
print("\nSaved → submission_xgb_enhanced.csv")
print("Upload this — it should push you well above 0.028 (likely 0.15–0.45+ range)")

=== Loading data ===
Using 29 engineered features

Training TA ...
TA                    CV R² = 0.8381   RMSE = 30.05

Training EC ...
EC                    CV R² = 0.8450   RMSE = 134.61

Training DRP ...
DRP                   CV R² = 0.6167   RMSE = 31.56

=== Generating submission ===

First 15 submission rows:
  Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  Dissolved Reactive Phosphorus
-32.043333  27.822778  2014-09-01         47.739815              233.499344                      22.863617
-33.329167  26.077500  2015-09-16        180.494843              754.415039                      78.401772
-32.991639  27.640028  2015-05-07         36.164478              213.880219                      29.823120
-34.096389  24.439167  2012-02-07         39.550884              622.862976                      16.513981
-32.000556  28.581667  2014-10-01         43.588364              236.271286                      27.909363
-32.086390  25.575560  2013-07-19        