In [11]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Multi-dimensional arrays and datasets (e.g., NetCDF, Zarr)
import xarray as xr

# Geospatial raster data handling with CRS support
import rioxarray as rxr

# Raster operations and spatial windowing
import rasterio
from rasterio.windows import Window

# Feature preprocessing and data splitting
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.spatial import cKDTree

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Planetary Computer tools for STAC API access and authentication
# import pystac_client
# import planetary_computer as pc
# from odc.stac import stac_load
# from pystac.extensions.eo import EOExtension as eo

# from datetime import date
# from tqdm import tqdm
import os 

In [3]:
# Load the ground-truth water quality samples: location, sample date and the three
# measured parameters that serve as prediction targets later in the notebook.
Water_Quality_df=pd.read_csv('water_quality_training_dataset.csv')
Water_Quality_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,-28.760833,17.730278,02-01-2011,128.912,555.0,10.0
1,-26.861111,28.884722,03-01-2011,74.72,162.9,163.0
2,-26.45,28.085833,03-01-2011,89.254,573.0,80.0
3,-27.671111,27.236944,03-01-2011,82.0,203.6,101.0
4,-27.356667,27.286389,03-01-2011,56.1,145.1,151.0


In [5]:
# Load the pre-extracted Landsat features for the training samples
# (nir, green, swir16, swir22 band values plus the NDMI and MNDWI indices).
landsat_train_features = pd.read_csv('landsat_features_training.csv')
landsat_train_features.head()

Unnamed: 0,Latitude,Longitude,Sample Date,nir,green,swir16,swir22,NDMI,MNDWI
0,-28.760833,17.730278,02-01-2011,11190.0,11426.0,7687.5,7645.0,0.185538,0.195595
1,-26.861111,28.884722,03-01-2011,17658.5,9550.0,13746.5,10574.0,0.124566,-0.180134
2,-26.45,28.085833,03-01-2011,15210.0,10720.0,17974.0,14201.0,-0.083293,-0.252805
3,-27.671111,27.236944,03-01-2011,14887.0,10943.0,13522.0,11403.0,0.048048,-0.105416
4,-27.356667,27.286389,03-01-2011,16828.5,9502.5,12665.5,9643.0,0.141147,-0.142683


In [6]:
# Load the pre-extracted TerraClimate feature for the training samples.
# The only feature column is 'pet' (presumably potential evapotranspiration — TODO confirm).
Terraclimate_df = pd.read_csv('terraclimate_features_training.csv')
Terraclimate_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,pet
0,-28.760833,17.730278,02-01-2011,174.2
1,-26.861111,28.884722,03-01-2011,124.1
2,-26.45,28.085833,03-01-2011,127.5
3,-27.671111,27.236944,03-01-2011,129.7
4,-27.356667,27.286389,03-01-2011,129.2


In [7]:
# Combine datasets horizontally (side by side, along the column axis) using pandas concat.
def combine_two_datasets(dataset1,dataset2,dataset3=None):
    '''
    Returns the datasets concatenated side by side (column-wise).

    Rows are matched on their index labels, not on any key column, so the
    inputs are expected to describe the same samples under the same index.
    When a column name occurs in more than one dataset, only its first
    occurrence (from the earliest dataset) is kept.

    Attributes:
    dataset1 - Dataset 1 to be combined
    dataset2 - Dataset 2 to be combined
    dataset3 - Optional dataset 3 to be combined; skipped when None
    '''

    # Drop the optional third dataset explicitly when it is not supplied.
    frames = [frame for frame in (dataset1, dataset2, dataset3) if frame is not None]
    data = pd.concat(frames, axis=1)
    # Shared columns (e.g. the location/date keys) appear once per input; keep the first copy.
    data = data.loc[:, ~data.columns.duplicated()]
    return data

In [14]:
# Combine the ground-truth measurements with the Landsat and TerraClimate features into a single dataset.
# The frames are joined side by side on their row index, so this relies on all three files
# listing the same samples in the same order.
wq_data = combine_two_datasets(Water_Quality_df, landsat_train_features, Terraclimate_df)
wq_data.head()

Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus,nir,green,swir16,swir22,NDMI,MNDWI,pet
0,-28.760833,17.730278,02-01-2011,128.912,555.0,10.0,11190.0,11426.0,7687.5,7645.0,0.185538,0.195595,174.2
1,-26.861111,28.884722,03-01-2011,74.72,162.9,163.0,17658.5,9550.0,13746.5,10574.0,0.124566,-0.180134,124.1
2,-26.45,28.085833,03-01-2011,89.254,573.0,80.0,15210.0,10720.0,17974.0,14201.0,-0.083293,-0.252805,127.5
3,-27.671111,27.236944,03-01-2011,82.0,203.6,101.0,14887.0,10943.0,13522.0,11403.0,0.048048,-0.105416,129.7
4,-27.356667,27.286389,03-01-2011,56.1,145.1,151.0,16828.5,9502.5,12665.5,9643.0,0.141147,-0.142683,129.2


In [15]:
# Fill missing numeric values with the median of their column. The medians are taken over
# the whole table (features and target columns alike), before any train/test split.
wq_data = wq_data.fillna(wq_data.median(numeric_only=True))
# Count the remaining missing values per column.
wq_data.isna().sum()

Latitude                         0
Longitude                        0
Sample Date                      0
Total Alkalinity                 0
Electrical Conductance           0
Dissolved Reactive Phosphorus    0
nir                              0
green                            0
swir16                           0
swir22                           0
NDMI                             0
MNDWI                            0
pet                              0
dtype: int64

In [16]:
# Keep only the four predictor columns (swir22, NDMI, MNDWI, pet) and the three target columns.
wq_data = wq_data[['swir22','NDMI','MNDWI','pet', 'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']]

In [17]:
def split_data(X, y, test_size=0.3, random_state=42):
    """Split features and target into train and test partitions.

    Args:
        X: Feature table.
        y: Target values aligned with the rows of X.
        test_size: Share of the samples held out for testing.
        random_state: Seed forwarded to the splitter so the split is repeatable.

    Returns:
        The result of train_test_split, unpackable as
        X_train, X_test, y_train, y_test.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    partitions = train_test_split(X, y, **split_options)
    return partitions

def scale_data(X_train, X_test):
    """Standardize the features using statistics learned from the training set only.

    Args:
        X_train: Training features; the scaler is fitted on these.
        X_test: Test features; transformed with the already fitted scaler.

    Returns:
        Tuple of (scaled training features, scaled test features, fitted scaler).
    """
    # Fit once on the training data, then apply the same transform to both partitions.
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), scaler

def train_model(X_train_scaled, y_train):
    """Fit a random forest regressor on the scaled training data.

    Args:
        X_train_scaled: Scaled training features.
        y_train: Training target values.

    Returns:
        The fitted RandomForestRegressor (100 trees, fixed seed of 42).
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return forest.fit(X_train_scaled, y_train)

def evaluate_model(model, X_scaled, y_true, dataset_name="Test"):
    """Predict with the model, print R² and RMSE, and return them.

    Args:
        model: Fitted estimator exposing predict().
        X_scaled: Scaled features to predict on.
        y_true: Observed target values for those features.
        dataset_name: Label used in the printed report header.

    Returns:
        Tuple of (predictions, R² score, RMSE).
    """
    predictions = model.predict(X_scaled)

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, predictions)

    print(f"\n{dataset_name} Evaluation:")
    print(f"R²: {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    return predictions, r2, rmse

In [18]:
def run_pipeline(X, y, param_name="Parameter"):
    """Split, scale, train and evaluate a model for one target parameter.

    Args:
        X: Feature table.
        y: Target values for the parameter being modelled.
        param_name: Name of the parameter, used in the printed banner and the summary.

    Returns:
        Tuple of (fitted model, fitted scaler, one-row DataFrame holding the
        train and test R² / RMSE for this parameter).
    """
    # Banner announcing which parameter is being modelled.
    print(f"\n{'='*60}")
    print(f"Training Model for {param_name}")
    print(f"{'='*60}")

    # Split -> scale -> fit.
    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)
    model = train_model(X_train_scaled, y_train)

    # Score the training partition (in-sample) and the held-out partition (out-of-sample).
    # The predictions themselves are not needed here, only the metrics.
    _, r2_train, rmse_train = evaluate_model(model, X_train_scaled, y_train, "Train")
    _, r2_test, rmse_test = evaluate_model(model, X_test_scaled, y_test, "Test")

    # Single summary row, keyed by metric name.
    summary_row = dict(
        zip(
            ("Parameter", "R2_Train", "RMSE_Train", "R2_Test", "RMSE_Test"),
            (param_name, r2_train, rmse_train, r2_test, rmse_test),
        )
    )
    return model, scaler, pd.DataFrame([summary_row])

In [19]:
# Predictors: every column that is not one of the three measured targets.
X = wq_data.drop(columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'])

# One target series per water quality parameter.
y_TA, y_EC, y_DRP = (
    wq_data[target]
    for target in ('Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus')
)

# Fit and evaluate a separate model (with its own scaler) for each parameter.
model_TA, scaler_TA, results_TA = run_pipeline(X, y=y_TA, param_name="Total Alkalinity")
model_EC, scaler_EC, results_EC = run_pipeline(X, y=y_EC, param_name="Electrical Conductance")
model_DRP, scaler_DRP, results_DRP = run_pipeline(X, y=y_DRP, param_name="Dissolved Reactive Phosphorus")



Training Model for Total Alkalinity

Train Evaluation:
R²: 0.903
RMSE: 23.132

Test Evaluation:
R²: 0.546
RMSE: 50.870

Training Model for Electrical Conductance

Train Evaluation:
R²: 0.918
RMSE: 98.007

Test Evaluation:
R²: 0.585
RMSE: 219.999

Training Model for Dissolved Reactive Phosphorus

Train Evaluation:
R²: 0.882
RMSE: 17.455

Test Evaluation:
R²: 0.529
RMSE: 35.182


In [20]:
# Stack the one-row result tables of the three models into a single comparison table.
results_summary = pd.concat([results_TA, results_EC, results_DRP], ignore_index=True)
results_summary

Unnamed: 0,Parameter,R2_Train,RMSE_Train,R2_Test,RMSE_Test
0,Total Alkalinity,0.903199,23.132468,0.545661,50.870362
1,Electrical Conductance,0.917884,98.007101,0.585458,219.998675
2,Dissolved Reactive Phosphorus,0.882169,17.454757,0.529145,35.181776


In [21]:
# Read the submission template, which supplies the locations and sample dates to predict for.
# Only its Latitude, Longitude and Sample Date columns are used when the submission is built.
test_file = pd.read_csv('submission_template.csv')
test_file.head()

Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,-32.043333,27.822778,01-09-2014,98.304748,305.251717,31.386501
1,-33.329167,26.0775,16-09-2015,118.196182,435.515555,46.431552
2,-32.991639,27.640028,07-05-2015,56.872074,382.454005,24.997714
3,-34.096389,24.439167,07-02-2012,51.892464,162.478964,13.929241
4,-32.000556,28.581667,01-10-2014,84.423265,132.292569,-7.407065


In [22]:
# Load the pre-extracted Landsat features for the validation (submission) samples.
landsat_val_features = pd.read_csv('landsat_features_validation.csv')
landsat_val_features.head()

Unnamed: 0,Latitude,Longitude,Sample Date,nir,green,swir16,swir22,NDMI,MNDWI
0,-32.043333,27.822778,01-09-2014,15229.0,12868.0,14797.0,12421.0,0.014388,-0.069727
1,-33.329167,26.0775,16-09-2015,,,,,,
2,-32.991639,27.640028,07-05-2015,16221.0,9304.5,12536.5,9958.0,0.128123,-0.147979
3,-34.096389,24.439167,07-02-2012,,,,,,
4,-32.000556,28.581667,01-10-2014,9125.0,11100.5,9455.0,8711.0,-0.017761,0.080052


In [23]:
# Load the pre-extracted TerraClimate 'pet' feature for the validation (submission) samples.
Terraclimate_val_df = pd.read_csv('terraclimate_features_validation.csv')
Terraclimate_val_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,pet
0,-32.043333,27.822778,01-09-2014,161.90001
1,-33.329167,26.0775,16-09-2015,177.6
2,-32.991639,27.640028,07-05-2015,158.40001
3,-34.096389,24.439167,07-02-2012,130.0
4,-32.000556,28.581667,01-10-2014,152.5


In [24]:
# Consolidate all the extracted bands and features in a single dataframe.
# Columns are copied as raw arrays, so the Landsat and TerraClimate rows are paired by position.
val_data = pd.DataFrame({
    **{
        column: landsat_val_features[column].values
        for column in (
            'Longitude', 'Latitude', 'Sample Date',
            'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI',
        )
    },
    'pet': Terraclimate_val_df['pet'].values,
})

In [25]:
# Impute the missing values with the median of each numeric column.
# NOTE(review): these medians come from the validation features themselves, whereas the
# training features were imputed with medians of the training table — confirm this is intended.
val_data = val_data.fillna(val_data.median(numeric_only=True))

In [26]:
# Select the same four predictor columns, in the same order, that the models were trained on
# (swir22, NDMI, MNDWI, pet).
submission_val_data=val_data.loc[:,['swir22','NDMI','MNDWI','pet']]
submission_val_data.head()

Unnamed: 0,swir22,NDMI,MNDWI,pet
0,12421.0,0.014388,-0.069727,161.90001
1,9973.0,0.081427,-0.130571,177.6
2,9958.0,0.128123,-0.147979,158.40001
3,9973.0,0.081427,-0.130571,130.0
4,8711.0,-0.017761,0.080052,152.5


In [27]:
# Sanity check: (number of submission samples, number of predictor columns).
submission_val_data.shape

(200, 4)

In [28]:
# Scale the submission features with each parameter's own fitted scaler.
X_sub_scaled_TA, X_sub_scaled_EC, X_sub_scaled_DRP = (
    fitted_scaler.transform(submission_val_data)
    for fitted_scaler in (scaler_TA, scaler_EC, scaler_DRP)
)

# --- Predicting for Total Alkalinity ---
pred_TA_submission = model_TA.predict(X_sub_scaled_TA)

# --- Predicting for Electrical Conductance ---
pred_EC_submission = model_EC.predict(X_sub_scaled_EC)

# --- Predicting for Dissolved Reactive Phosphorus ---
pred_DRP_submission = model_DRP.predict(X_sub_scaled_DRP)

In [29]:
# Assemble the submission: sample identifiers from the template plus the three predictions.
# Predictions are attached by position, so they must be in the template's row order.
submission_df = pd.DataFrame({
    **{
        key: test_file[key].values
        for key in ('Longitude', 'Latitude', 'Sample Date')
    },
    'Total Alkalinity': pred_TA_submission,
    'Electrical Conductance': pred_EC_submission,
    'Dissolved Reactive Phosphorus': pred_DRP_submission,
})

In [30]:
# Preview the first rows of the submission dataframe.
submission_df.head()

Unnamed: 0,Longitude,Latitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,27.822778,-32.043333,01-09-2014,114.833126,314.267271,25.235333
1,26.0775,-33.329167,16-09-2015,156.957291,648.500033,60.455833
2,27.640028,-32.991639,07-05-2015,62.87798,724.577283,29.445333
3,24.439167,-34.096389,07-02-2012,72.334447,234.950986,13.726667
4,28.581667,-32.000556,01-10-2014,109.078753,304.0102,30.21


In [31]:
# Write the predictions to submission.csv, leaving out the dataframe index.
submission_df.to_csv("submission.csv",index = False)