In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from scipy import stats

print("=== Loading data ===")
df = pd.read_csv('water_quality_training_dataset.csv')

df['Sample Date'] = pd.to_datetime(df['Sample Date'], format='%d-%m-%Y', errors='coerce')
df = df.dropna(subset=['Sample Date'])

# ─── Very strong feature set ──────────────────────────────────────────
df['lat'] = df['Latitude']
df['lon'] = df['Longitude']

# Circular encodings (multiple harmonics)
for angle, name in [(df['lat'], 'lat'), (df['lon'], 'lon')]:
    rad = np.radians(angle)
    for k in [1, 2, 3, 4]:
        df[f'{name}_sin_{k}'] = np.sin(k * rad)
        df[f'{name}_cos_{k}'] = np.cos(k * rad)

# Time features — full Fourier series + trend
df['year'] = df['Sample Date'].dt.year.astype(float)
df['doy']  = df['Sample Date'].dt.dayofyear.astype(float)
df['year_trend'] = (df['year'] - 2011) / 4.0

for period in [365.25, 182.625, 91.3125, 30.4375]:
    df[f'doy_sin_{period:.1f}'] = np.sin(2 * np.pi * df['doy'] / period)
    df[f'doy_cos_{period:.1f}'] = np.cos(2 * np.pi * df['doy'] / period)

df['month'] = df['Sample Date'].dt.month
df['season'] = df['month'] % 12 // 3   # 0=summer,1=autumn,2=winter,3=spring (southern hemisphere)

# Interaction terms
df['lat_doy_sin'] = df['lat_sin_1'] * df['doy_sin_365.3']
df['lon_doy_sin'] = df['lon_sin_1'] * df['doy_sin_365.3']
df['lat_year']    = df['lat'] * df['year_trend']
df['lon_year']    = df['lon'] * df['year_trend']

features = [c for c in df.columns if any(p in c for p in ['sin','cos','year','doy','season','lat_','lon_'])]

print(f"Using {len(features)} strong features")

# ─── Data prep ────────────────────────────────────────────────────────
X = df[features].copy()
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

targets = {
    'TA':  df['Total Alkalinity'],
    'EC':  df['Electrical Conductance'],
    'DRP': df['Dissolved Reactive Phosphorus']
}

# ─── Train multiple models & blend ────────────────────────────────────
def train_blend(X, y, name):
    print(f"\nTraining {name} ...")
    kf = KFold(n_splits=6, shuffle=True, random_state=42)
    oof = np.zeros(len(y))
    models_xgb = []
    models_lgb = []

    xgb_params = {
        'n_estimators': 1800,
        'learning_rate': 0.014,
        'max_depth': 7,
        'subsample': 0.77,
        'colsample_bytree': 0.62,
        'reg_lambda': 5.5,
        'reg_alpha': 2.2,
        'min_child_weight': 6,
        'early_stopping_rounds': 80,
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }

    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 1800,
        'learning_rate': 0.016,
        'max_depth': 8,
        'num_leaves': 45,
        'subsample': 0.78,
        'colsample_bytree': 0.65,
        'reg_lambda': 6.0,
        'reg_alpha': 2.5,
        'min_child_samples': 25,
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1
    }

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        # XGBoost
        xgb_model = xgb.XGBRegressor(**xgb_params)
        xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
        models_xgb.append(xgb_model)

        # LightGBM
        lgb_model = lgb.LGBMRegressor(**lgb_params)
        lgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
        models_lgb.append(lgb_model)

        # OOF blend (simple average)
        oof[val_idx] = 0.55 * xgb_model.predict(X_val) + 0.45 * lgb_model.predict(X_val)

    r2 = r2_score(y, oof)
    rmse = np.sqrt(mean_squared_error(y, oof))
    print(f"{name:20}  CV R² = {r2:.4f}   RMSE = {rmse:.2f}")

    return models_xgb, models_lgb

# Train all
models = {}
for tgt, y in targets.items():
    models[tgt] = train_blend(X_scaled, y, tgt)

# ─── Submission ───────────────────────────────────────────────────────
print("\n=== Submission ===")
sub = pd.read_csv('submission_template.csv')
sub['Sample Date'] = pd.to_datetime(sub['Sample Date'], format='%d-%m-%Y', errors='coerce')

# Same feature engineering
sub['lat'] = sub['Latitude']
sub['lon'] = sub['Longitude']
sub['lat_rad'] = np.radians(sub['lat'])
sub['lon_rad'] = np.radians(sub['lon'])

for c, rad in [('lat', 'lat_rad'), ('lon', 'lon_rad')]:
    for k in [1,2,3,4]:
        sub[f'{c}_sin_{k}'] = np.sin(k * sub[rad])
        sub[f'{c}_cos_{k}'] = np.cos(k * sub[rad])

sub['year'] = sub['Sample Date'].dt.year.astype(float)
sub['year_norm'] = (sub['year'] - 2011) / 4.0
sub['month'] = sub['Sample Date'].dt.month.astype(float)
sub['doy']   = sub['Sample Date'].dt.dayofyear.astype(float)

for period in [365.25, 182.625, 91.3125, 30.4375]:
    sub[f'doy_sin_{period:.1f}'] = np.sin(2 * np.pi * sub['doy'] / period)
    sub[f'doy_cos_{period:.1f}'] = np.cos(2 * np.pi * sub['doy'] / period)

for period in [12, 6, 4, 3]:
    sub[f'month_sin_{period}'] = np.sin(2 * np.pi * sub['month'] / period)
    sub[f'month_cos_{period}'] = np.cos(2 * np.pi * sub['month'] / period)

sub['lat_doy_sin'] = sub['lat_sin_1'] * sub['doy_sin_365.3']
sub['lon_doy_sin'] = sub['lon_sin_1'] * sub['doy_sin_365.3']
sub['lat_year']    = sub['lat'] * sub['year_norm']
sub['lon_year']    = sub['lon'] * sub['year_norm']

sub_X = sub[features]
sub_X_scaled = scaler.transform(sub_X)

# Blend predictions from all models
def blend_predict(models_xgb, models_lgb, X):
    preds_xgb = np.mean([m.predict(X) for m in models_xgb], axis=0)
    preds_lgb = np.mean([m.predict(X) for m in models_lgb], axis=0)
    return 0.55 * preds_xgb + 0.45 * preds_lgb

sub['Total Alkalinity']              = blend_predict(*models['TA'],  sub_X_scaled)
sub['Electrical Conductance']        = blend_predict(*models['EC'],  sub_X_scaled)
sub['Dissolved Reactive Phosphorus'] = blend_predict(*models['DRP'], sub_X_scaled)

# Clip to realistic training ranges
sub['Total Alkalinity']              = sub['Total Alkalinity'].clip(5, 400)
sub['Electrical Conductance']        = sub['Electrical Conductance'].clip(30, 2200)
sub['Dissolved Reactive Phosphorus'] = sub['Dissolved Reactive Phosphorus'].clip(0.5, 250)

# Final file
final_sub = sub[[
    'Latitude', 'Longitude', 'Sample Date',
    'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'
]]

print("\nFirst 15 rows (should vary significantly):")
print(final_sub.head(15).to_string(index=False))

final_sub.to_csv('submission_strong_v3.csv', index=False)
print("\nSaved → submission_strong_v3.csv")
print("Upload immediately — this should give 0.35–0.55+ if test is not too shifted")

=== Loading data ===


KeyError: 'doy_sin_365.3'