In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [2]:
landsat = pd.read_csv('landsat_features_training.csv')
terra = pd.read_csv('terraclimate_features_training.csv')

df = pd.merge(landsat, terra, on=['Latitude', 'Longitude', 'Sample Date'], how='inner')
print(f"Rows after merge: {len(df):,}")

core_cols = ['nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI', 'pet']
df = df.dropna(subset=core_cols).copy()
print(f"Rows after dropping missing values: {len(df):,}")

Rows after merge: 9,319
Rows after dropping missing values: 8,234


In [3]:
df['Sample Date'] = pd.to_datetime(df['Sample Date'], format='%d-%m-%Y', errors='coerce')
df = df.dropna(subset=['Sample Date'])
df['month'] = df['Sample Date'].dt.month
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

In [4]:
features_for_corr = ['nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI', 'pet', 'month_sin', 'month_cos']
corr_before = df[features_for_corr].corr().round(4)
print("\nCorrelation matrix BEFORE feature selection:\n")
print(corr_before)


Correlation matrix BEFORE feature selection:

              nir   green  swir16  swir22    NDMI   MNDWI     pet  month_sin  \
nir        1.0000  0.5624  0.7227  0.6311  0.1416 -0.4329 -0.1585     0.1191   
green      0.5624  1.0000  0.5935  0.5535 -0.1295  0.1283  0.0445    -0.0504   
swir16     0.7227  0.5935  1.0000  0.9532 -0.5550 -0.6774 -0.0184    -0.1617   
swir22     0.6311  0.5535  0.9532  1.0000 -0.6338 -0.6249  0.0508    -0.2416   
NDMI       0.1416 -0.1295 -0.5550 -0.6338  1.0000  0.5044 -0.1713     0.3978   
MNDWI     -0.4329  0.1283 -0.6774 -0.6249  0.5044  1.0000  0.1192     0.1205   
pet       -0.1585  0.0445 -0.0184  0.0508 -0.1713  0.1192  1.0000    -0.1350   
month_sin  0.1191 -0.0504 -0.1617 -0.2416  0.3978  0.1205 -0.1350     1.0000   
month_cos  0.2067  0.0462  0.0482  0.0299  0.2009 -0.0016 -0.0275    -0.0136   

           month_cos  
nir           0.2067  
green         0.0462  
swir16        0.0482  
swir22        0.0299  
NDMI          0.2009  
MNDWI        -

In [6]:
drop_cols = ['NDMI', 'MNDWI', 'swir22', 'Sample Date', 'month']
df_clean = df.drop(columns=drop_cols)

keep_features = ['nir', 'green', 'swir16', 'pet', 'month_sin', 'month_cos']

In [8]:
corr_after = df_clean[keep_features].corr().round(4)
print("\nCorrelation matrix AFTER feature selection:\n")
print(corr_after)

predictors = ['nir', 'green', 'swir16', 'month_sin', 'month_cos']
max_offdiag = corr_after.loc[predictors, predictors].abs() \
    .where(~np.eye(len(predictors), dtype=bool)).max().max()


Correlation matrix AFTER feature selection:

              nir   green  swir16     pet  month_sin  month_cos
nir        1.0000  0.5624  0.7227 -0.1585     0.1191     0.2067
green      0.5624  1.0000  0.5935  0.0445    -0.0504     0.0462
swir16     0.7227  0.5935  1.0000 -0.0184    -0.1617     0.0482
pet       -0.1585  0.0445 -0.0184  1.0000    -0.1350    -0.0275
month_sin  0.1191 -0.0504 -0.1617 -0.1350     1.0000    -0.0136
month_cos  0.2067  0.0462  0.0482 -0.0275    -0.0136     1.0000


In [9]:
output_file = 'cleaned_landsat_terraclimate_data.csv'
df_clean[['Latitude', 'Longitude'] + keep_features].to_csv(output_file, index=False)
print(f"\nCleaned dataset saved → {output_file} (Shape: {df_clean.shape})")


Cleaned dataset saved → cleaned_landsat_terraclimate_data.csv (Shape: (8234, 8))


In [10]:
X = df_clean[['nir', 'green', 'swir16', 'month_sin', 'month_cos']]
y = df_clean['pet']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model_nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1) 
])
model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
model_nn.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
y_pred_nn = model_nn.predict(X_test)
r2_nn = r2_score(y_test, y_pred_nn)
print(f"\nNeural Network R² on test set: {r2_nn:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Neural Network R² on test set: -1.3060


In [12]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest R² on test set: {r2_rf:.4f}")

Random Forest R² on test set: 0.3319
