In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Loading The Datasets

In [3]:
water_quality_data = pd.read_csv("water_quality_training_dataset.csv")
terraclimate_data = pd.read_csv("terraclimate_features_training.csv")
landsat_data = pd.read_csv("landsat_features_training.csv")

In [4]:
print("Water Quality columns:", water_quality_data.columns.tolist())
print("Terraclimate columns:", terraclimate_data.columns.tolist())
print("Landsat columns:", landsat_data.columns.tolist())

Water Quality columns: ['Latitude', 'Longitude', 'Sample Date', 'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']
Terraclimate columns: ['Latitude', 'Longitude', 'Sample Date', 'pet']
Landsat columns: ['Latitude', 'Longitude', 'Sample Date', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI']


In [5]:
merged_data = water_quality_data.merge(terraclimate_data, on=['Latitude', 'Longitude', 'Sample Date'], how='inner')
merged_data = merged_data.merge(landsat_data, on=['Latitude', 'Longitude', 'Sample Date'], how='inner')


print("Merged data shape:", merged_data.shape)
print(merged_data.head())

Merged data shape: (9319, 13)
    Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  \
0 -28.760833  17.730278  02-01-2011           128.912                   555.0   
1 -26.861111  28.884722  03-01-2011            74.720                   162.9   
2 -26.450000  28.085833  03-01-2011            89.254                   573.0   
3 -27.671111  27.236944  03-01-2011            82.000                   203.6   
4 -27.356667  27.286389  03-01-2011            56.100                   145.1   

   Dissolved Reactive Phosphorus    pet      nir    green   swir16   swir22  \
0                           10.0  174.2  11190.0  11426.0   7687.5   7645.0   
1                          163.0  124.1  17658.5   9550.0  13746.5  10574.0   
2                           80.0  127.5  15210.0  10720.0  17974.0  14201.0   
3                          101.0  129.7  14887.0  10943.0  13522.0  11403.0   
4                          151.0  129.2  16828.5   9502.5  12665.5   9643.0   

       N

In [6]:
X = merged_data.drop(columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus', 'Sample Date'])
y = merged_data['Total Alkalinity']

In [7]:
X.fillna(X.mean(), inplace=True) 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGbooot

In [16]:
# xg_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, random_state=42)

# params = {
#     'objective': 'reg:squarederror',
#     'eval_metric': 'rmse', 
#     'n_estimators': 1000
# }

In [20]:
xg_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    random_state=42,
    early_stopping_rounds=50,
    eval_metric='rmse' 
)

xg_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True
)

[0]	validation_0-rmse:61.34768
[1]	validation_0-rmse:52.97551
[2]	validation_0-rmse:47.07859
[3]	validation_0-rmse:43.94676
[4]	validation_0-rmse:41.20891
[5]	validation_0-rmse:39.70954
[6]	validation_0-rmse:38.47421
[7]	validation_0-rmse:37.68090
[8]	validation_0-rmse:37.24265
[9]	validation_0-rmse:36.93553
[10]	validation_0-rmse:36.57902
[11]	validation_0-rmse:36.07987
[12]	validation_0-rmse:35.96517
[13]	validation_0-rmse:35.79497
[14]	validation_0-rmse:35.72211
[15]	validation_0-rmse:35.59285
[16]	validation_0-rmse:35.38240
[17]	validation_0-rmse:35.21096
[18]	validation_0-rmse:35.02945
[19]	validation_0-rmse:35.00324
[20]	validation_0-rmse:34.96405
[21]	validation_0-rmse:34.91078
[22]	validation_0-rmse:34.77350
[23]	validation_0-rmse:34.55995
[24]	validation_0-rmse:34.30351
[25]	validation_0-rmse:34.28683
[26]	validation_0-rmse:34.29952
[27]	validation_0-rmse:34.33279
[28]	validation_0-rmse:34.28355
[29]	validation_0-rmse:34.24632
[30]	validation_0-rmse:34.16858
[31]	validation_0-

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [22]:
print(f"Best number of boosting rounds: {xg_model.best_iteration}")

Best number of boosting rounds: 124


In [42]:
trained_models = {}
feature_order = []

targets = {
    'Total Alkalinity': 'Total Alkalinity',
    'Electrical Conductance': 'Electrical Conductance',
    'Dissolved Reactive Phosphorus': 'Dissolved Reactive Phosphorus'
}

for target_name, target_col in targets.items():
    print("=" * 60)
    print(f"Training Model for {target_name}")
    print("=" * 60)
    print()
    
    # Prepare X and y
    X = merged_data.drop(columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus', 'Sample Date'])
    y = merged_data[target_col]
    
    if not feature_order:
        feature_order = X.columns.tolist()

    X.fillna(X.mean(), inplace=True)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    xg_model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        random_state=42,
        early_stopping_rounds=50,
        eval_metric='rmse'
    )
    
    xg_model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )
    
    trained_models[target_name] = xg_model
    
    y_train_pred = xg_model.predict(X_train)
    y_test_pred = xg_model.predict(X_test)
    
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train, y_train_pred)
    
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_test_pred)
    
    print("Train Evaluation:")
    print(f"R²: {train_r2:.3f}")
    print(f"RMSE: {train_rmse:.3f}")
    print()
    print("Test Evaluation:")
    print(f"R²: {test_r2:.3f}")
    print(f"RMSE: {test_rmse:.3f}")
    print()

Training Model for Total Alkalinity

Train Evaluation:
R²: 0.947
RMSE: 17.177

Test Evaluation:
R²: 0.822
RMSE: 31.984

Training Model for Electrical Conductance

Train Evaluation:
R²: 0.974
RMSE: 54.811

Test Evaluation:
R²: 0.851
RMSE: 133.574

Training Model for Dissolved Reactive Phosphorus

Train Evaluation:
R²: 0.905
RMSE: 15.653

Test Evaluation:
R²: 0.678
RMSE: 29.384



# Submission

In [32]:
#Reading the coordinates for the submission
test_file = pd.read_csv('submission_template.csv')
test_file.head()

Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,-32.043333,27.822778,01-09-2014,,,
1,-33.329167,26.0775,16-09-2015,,,
2,-32.991639,27.640028,07-05-2015,,,
3,-34.096389,24.439167,07-02-2012,,,
4,-32.000556,28.581667,01-10-2014,,,


In [34]:
landsat_val_features = pd.read_csv('landsat_features_validation.csv')
landsat_val_features.head()

Unnamed: 0,Latitude,Longitude,Sample Date,nir,green,swir16,swir22,NDMI,MNDWI
0,-32.043333,27.822778,01-09-2014,15229.0,12868.0,14797.0,12421.0,0.014388,-0.069727
1,-33.329167,26.0775,16-09-2015,,,,,,
2,-32.991639,27.640028,07-05-2015,16221.0,9304.5,12536.5,9958.0,0.128123,-0.147979
3,-34.096389,24.439167,07-02-2012,,,,,,
4,-32.000556,28.581667,01-10-2014,9125.0,11100.5,9455.0,8711.0,-0.017761,0.080052


In [35]:
Terraclimate_val_df = pd.read_csv('terraclimate_features_validation.csv')
Terraclimate_val_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,pet
0,-32.043333,27.822778,01-09-2014,161.90001
1,-33.329167,26.0775,16-09-2015,177.6
2,-32.991639,27.640028,07-05-2015,158.40001
3,-34.096389,24.439167,07-02-2012,130.0
4,-32.000556,28.581667,01-10-2014,152.5


In [36]:
#Consolidate all the extracted bands and features in a single dataframe
val_data = pd.DataFrame({
    'Longitude': landsat_val_features['Longitude'].values,
    'Latitude': landsat_val_features['Latitude'].values,
    'Sample Date': landsat_val_features['Sample Date'].values,
    'nir': landsat_val_features['nir'].values,
    'green': landsat_val_features['green'].values,
    'swir16': landsat_val_features['swir16'].values,
    'swir22': landsat_val_features['swir22'].values,
    'NDMI': landsat_val_features['NDMI'].values,
    'MNDWI': landsat_val_features['MNDWI'].values,
    'pet': Terraclimate_val_df['pet'].values,
})

In [37]:
# Impute the missing values
val_data = val_data.fillna(val_data.median(numeric_only=True))

In [43]:
submission_val_data = val_data[feature_order]

print("Validation Data Shape:", submission_val_data.shape)
submission_val_data.head()

Validation Data Shape: (200, 9)


Unnamed: 0,Latitude,Longitude,pet,nir,green,swir16,swir22,NDMI,MNDWI
0,-32.043333,27.822778,161.90001,15229.0,12868.0,14797.0,12421.0,0.014388,-0.069727
1,-33.329167,26.0775,177.6,14525.5,9493.5,12425.5,9973.0,0.081427,-0.130571
2,-32.991639,27.640028,158.40001,16221.0,9304.5,12536.5,9958.0,0.128123,-0.147979
3,-34.096389,24.439167,130.0,14525.5,9493.5,12425.5,9973.0,0.081427,-0.130571
4,-32.000556,28.581667,152.5,9125.0,11100.5,9455.0,8711.0,-0.017761,0.080052


In [39]:
submission_val_data.shape

(200, 4)

In [45]:
pred_TA_submission = trained_models['Total Alkalinity'].predict(submission_val_data)
pred_EC_submission = trained_models['Electrical Conductance'].predict(submission_val_data)
pred_DRP_submission = trained_models['Dissolved Reactive Phosphorus'].predict(submission_val_data)

print("Predictions generated successfully.")
print("TA Sample:", pred_TA_submission[:5])

Predictions generated successfully.
TA Sample: [106.10725  230.19731   57.871025 102.92508   90.22716 ]


In [47]:
submission_df = pd.DataFrame({
    'Longitude': test_file['Longitude'].values,
    'Latitude': test_file['Latitude'].values,
    'Sample Date': test_file['Sample Date'].values,
    'Total Alkalinity': pred_TA_submission,
    'Electrical Conductance': pred_EC_submission,
    'Dissolved Reactive Phosphorus': pred_DRP_submission
})

In [48]:
submission_df.head()

Unnamed: 0,Longitude,Latitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,27.822778,-32.043333,01-09-2014,106.107246,265.00885,18.603912
1,26.0775,-33.329167,16-09-2015,230.197311,666.573914,75.053574
2,27.640028,-32.991639,07-05-2015,57.871025,247.605347,17.52623
3,24.439167,-34.096389,07-02-2012,102.925079,1306.21106,29.25564
4,28.581667,-32.000556,01-10-2014,90.227158,224.327652,14.044097
