In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import time

In [5]:
water_quality_data = pd.read_csv("water_quality_training_dataset.csv")
terraclimate_data = pd.read_csv("terraclimate_features_training.csv")
landsat_data = pd.read_csv("landsat_features_training.csv")

print("Water quality shape:", water_quality_data.shape)
print("TerraClimate shape:", terraclimate_data.shape)
print("Landsat shape:", landsat_data.shape)

Water quality shape: (9319, 6)
TerraClimate shape: (9319, 4)
Landsat shape: (9319, 9)


In [9]:
merged_data = water_quality_data.merge(
    terraclimate_data,
    on=['Latitude', 'Longitude', 'Sample Date'],
    how='left'
).merge(
    landsat_data,
    on=['Latitude', 'Longitude', 'Sample Date'],
    how='left'
)

print("Merged shape:", merged_data.shape)

Merged shape: (9319, 13)


In [10]:
targets = ['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']

X = merged_data.drop(columns=targets + ['Sample Date'])
y = merged_data[targets]

# Fill missing values
X = X.fillna(X.mean())

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (9319, 9)
y shape: (9319, 3)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

Training set: (7455, 9)
Test set: (1864, 9)


# Models

- Using the random forest model

In [12]:
print("="*50)
print("Training Random Forest for: Total Alkalinity")
print("="*50)

rf_model_alkalinity = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
rf_model_alkalinity.fit(X_train, y_train['Total Alkalinity'])
training_time = time.time() - start_time

y_pred_alkalinity = rf_model_alkalinity.predict(X_test)

mse = mean_squared_error(y_test['Total Alkalinity'], y_pred_alkalinity)
r2 = r2_score(y_test['Total Alkalinity'], y_pred_alkalinity)

print(f"Training time: {training_time:.2f} seconds")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Training Random Forest for: Total Alkalinity
Training time: 0.34 seconds
MSE: 1009.1841
R² Score: 0.8247


In [13]:
print("="*50)
print("Training Random Forest for: Electrical Conductance")
print("="*50)

rf_model_conductance = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
rf_model_conductance.fit(X_train, y_train['Electrical Conductance'])
training_time = time.time() - start_time

y_pred_conductance = rf_model_conductance.predict(X_test)

mse = mean_squared_error(y_test['Electrical Conductance'], y_pred_conductance)
r2 = r2_score(y_test['Electrical Conductance'], y_pred_conductance)

print(f"Training time: {training_time:.2f} seconds")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Training Random Forest for: Electrical Conductance
Training time: 0.35 seconds
MSE: 16820.7502
R² Score: 0.8591


In [14]:
print("="*50)
print("Training Random Forest for: Dissolved Reactive Phosphorus")
print("="*50)

rf_model_phosphorus = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
rf_model_phosphorus.fit(X_train, y_train['Dissolved Reactive Phosphorus'])
training_time = time.time() - start_time

y_pred_phosphorus = rf_model_phosphorus.predict(X_test)

mse = mean_squared_error(y_test['Dissolved Reactive Phosphorus'], y_pred_phosphorus)
r2 = r2_score(y_test['Dissolved Reactive Phosphorus'], y_pred_phosphorus)

print(f"Training time: {training_time:.2f} seconds")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Training Random Forest for: Dissolved Reactive Phosphorus
Training time: 0.36 seconds
MSE: 843.3557
R² Score: 0.6858


# Models

- Using the random forest model