In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [2]:
def add_features(data):
    data = data.copy()
    data['Sample Date'] = pd.to_datetime(data['Sample Date'], format='%d-%m-%Y', errors='coerce')
    data = data.dropna(subset=['Sample Date'])
    
    data['lat'] = data['Latitude']
    data['lon'] = data['Longitude']
    
    for angle, prefix in [(data['lat'], 'lat'), (data['lon'], 'lon')]:
        rad = np.radians(angle)
        for k in [1, 2, 3, 4]:
            data[f'{prefix}_sin_{k}'] = np.sin(k * rad)
            data[f'{prefix}_cos_{k}'] = np.cos(k * rad)
    
    data['year'] = data['Sample Date'].dt.year.astype(float)
    data['year_norm'] = (data['year'] - 2011) / 4.0
    data['month'] = data['Sample Date'].dt.month.astype(float)
    data['doy']   = data['Sample Date'].dt.dayofyear.astype(float)
    
    data['doy_sin_365_3'] = np.sin(2 * np.pi * data['doy'] / 365.25)
    data['doy_cos_365_3'] = np.cos(2 * np.pi * data['doy'] / 365.25)
    
    data['month_sin_12'] = np.sin(2 * np.pi * data['month'] / 12)
    data['month_cos_12'] = np.cos(2 * np.pi * data['month'] / 12)
    
    for period, p_label in [
        (182.625, '182_6'),
        (91.3125, '91_3'),
        (30.4375, '30_4')
    ]:
        data[f'doy_sin_{p_label}'] = np.sin(2 * np.pi * data['doy'] / period)
        data[f'doy_cos_{p_label}'] = np.cos(2 * np.pi * data['doy'] / period)
    
    for period in [6, 4, 3]:
        data[f'month_sin_{period}'] = np.sin(2 * np.pi * data['month'] / period)
        data[f'month_cos_{period}'] = np.cos(2 * np.pi * data['month'] / period)
    
    data['lat_doy_sin'] = data['lat_sin_1'] * data['doy_sin_365_3']
    data['lon_doy_sin'] = data['lon_sin_1'] * data['doy_sin_365_3']
    data['lat_year']    = data['lat'] * data['year_norm']
    data['lon_year']    = data['lon'] * data['year_norm']
    
    return data

In [3]:
print("=== Loading & featurizing training data ===")
df = pd.read_csv('water_quality_training_dataset.csv')
df = add_features(df)

# Collect features AFTER adding them
features = [c for c in df.columns if any(p in c for p in [
    'sin', 'cos', 'year', 'doy_', 'month_', 'lat_', 'lon_'
])]

print(f"Using {len(features)} features")
print("First 10 features:", features[:10])
print("Sample training columns:", df.columns[-15:].tolist())  

=== Loading & featurizing training data ===
Using 38 features
First 10 features: ['lat_sin_1', 'lat_cos_1', 'lat_sin_2', 'lat_cos_2', 'lat_sin_3', 'lat_cos_3', 'lat_sin_4', 'lat_cos_4', 'lon_sin_1', 'lon_cos_1']
Sample training columns: ['doy_cos_182_6', 'doy_sin_91_3', 'doy_cos_91_3', 'doy_sin_30_4', 'doy_cos_30_4', 'month_sin_6', 'month_cos_6', 'month_sin_4', 'month_cos_4', 'month_sin_3', 'month_cos_3', 'lat_doy_sin', 'lon_doy_sin', 'lat_year', 'lon_year']


In [4]:
X = df[features].copy()
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

targets = {
    'TA':  df['Total Alkalinity'],
    'EC':  df['Electrical Conductance'],
    'DRP': df['Dissolved Reactive Phosphorus']
}

In [5]:
def train_blend(X, y, name):
    print(f"\nTraining {name} ...")
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(y))
    models_xgb = []
    models_lgb = []

    xgb_params = {
        'n_estimators': 1500,
        'learning_rate': 0.018,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'reg_lambda': 4.0,
        'reg_alpha': 1.5,
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }

    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 1500,
        'learning_rate': 0.02,
        'max_depth': 7,
        'num_leaves': 40,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'reg_lambda': 5.0,
        'reg_alpha': 2.0,
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1
    }

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        xgb_model = xgb.XGBRegressor(**xgb_params)
        xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
        models_xgb.append(xgb_model)

        lgb_model = lgb.LGBMRegressor(**lgb_params)
        lgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
        models_lgb.append(lgb_model)

        oof[val_idx] = 0.6 * xgb_model.predict(X_val) + 0.4 * lgb_model.predict(X_val)

    r2 = r2_score(y, oof)
    rmse = np.sqrt(mean_squared_error(y, oof))
    print(f"{name:20}  CV R² = {r2:.4f}   RMSE = {rmse:.2f}")

    return models_xgb, models_lgb

models = {}
for tgt, y in targets.items():
    models[tgt] = train_blend(X_scaled, y, tgt)


Training TA ...




TA                    CV R² = 0.8527   RMSE = 28.66

Training EC ...




EC                    CV R² = 0.8583   RMSE = 128.72

Training DRP ...




DRP                   CV R² = 0.6679   RMSE = 29.38




In [6]:
print("\n=== Processing submission ===")
sub = pd.read_csv('submission_template.csv')
sub = add_features(sub)  

print("Submission columns sample:", sub.columns[-20:].tolist()) 

sub_X = sub[features]
sub_X_scaled = scaler.transform(sub_X)

def blend_predict(models_xgb, models_lgb, X):
    preds_xgb = np.mean([m.predict(X) for m in models_xgb], axis=0)
    preds_lgb = np.mean([m.predict(X) for m in models_lgb], axis=0)
    return 0.6 * preds_xgb + 0.4 * preds_lgb

sub['Total Alkalinity']              = blend_predict(*models['TA'],  sub_X_scaled)
sub['Electrical Conductance']        = blend_predict(*models['EC'],  sub_X_scaled)
sub['Dissolved Reactive Phosphorus'] = blend_predict(*models['DRP'], sub_X_scaled)

# Clip
sub['Total Alkalinity']              = sub['Total Alkalinity'].clip(5, 400)
sub['Electrical Conductance']        = sub['Electrical Conductance'].clip(30, 2200)
sub['Dissolved Reactive Phosphorus'] = sub['Dissolved Reactive Phosphorus'].clip(0.5, 250)

final_sub = sub[[
    'Latitude', 'Longitude', 'Sample Date',
    'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'
]]

print("\nFirst 15 submission rows:")
print(final_sub.head(15).to_string(index=False))

final_sub.to_csv('submission_strong_v7_safe.csv', index=False)
print("\nSaved → submission_strong_v7_safe.csv")
print("Upload this file and check the score!")


=== Processing submission ===
Submission columns sample: ['doy_sin_365_3', 'doy_cos_365_3', 'month_sin_12', 'month_cos_12', 'doy_sin_182_6', 'doy_cos_182_6', 'doy_sin_91_3', 'doy_cos_91_3', 'doy_sin_30_4', 'doy_cos_30_4', 'month_sin_6', 'month_cos_6', 'month_sin_4', 'month_cos_4', 'month_sin_3', 'month_cos_3', 'lat_doy_sin', 'lon_doy_sin', 'lat_year', 'lon_year']

First 15 submission rows:
  Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  Dissolved Reactive Phosphorus
-32.043333  27.822778  2014-09-01         41.129627              190.066686                      21.158709
-33.329167  26.077500  2015-09-16        119.328170              604.878204                      75.651770
-32.991639  27.640028  2015-05-07         40.957513              183.255347                      28.247779
-34.096389  24.439167  2012-02-07         47.270145              722.295124                      11.319343
-32.000556  28.581667  2014-10-01         54.524566              271.29

