# Day 4, Session 2 · Instructor Notebook
### LSTM drought forecasting with Google Earth Engine NDVI

## 0. Session guide

- Run this notebook before class to ensure Earth Engine access is configured.
- Keep the student notebook side-by-side; mirror the same sequence but provide full solutions here.
- If Earth Engine is unavailable during delivery, use the cached sample CSV to proceed with the modelling part.

## 1. Environment check

Install the Earth Engine Python API (`earthengine-api`) if it is missing (one-time per environment). Uncomment the line in the cell below when provisioning new machines.

In [None]:
# !pip install earthengine-api --quiet

Authenticate once per user (opens a browser window). Skip if already authenticated on this machine.

In [None]:
import os
import ee

# Earth Engine Cloud project ID. Resolution order: EE_PROJECT env var, then
# GEE_PROJECT env var, then the placeholder below (which is rejected).
# Update this to your Cloud project that has the Earth Engine API enabled.
GEE_PROJECT = os.environ.get('EE_PROJECT') or os.environ.get('GEE_PROJECT') or 'your-ee-project-id'

# Fail fast instead of sending requests against a project that cannot exist.
if GEE_PROJECT == 'your-ee-project-id':
    raise ValueError('Set GEE_PROJECT to your Earth Engine project ID (or export EE_PROJECT env var).')

try:
    ee.Initialize(project=GEE_PROJECT)
except Exception as exc:
    # Report why initialization failed so that problems other than missing
    # credentials (wrong project ID, API not enabled, no network) are visible
    # before the browser-based login starts.
    print(f'Earth Engine initialization failed ({exc}), attempting authentication...')
    # NOTE(review): the project is deliberately passed to ee.Initialize only.
    # ee.Authenticate's documented parameters (authorization_code, quiet,
    # code_verifier, auth_mode, scopes, force) do not appear to include
    # `project` -- confirm against the installed earthengine-api version.
    ee.Authenticate(auth_mode='notebook')
    ee.Initialize(project=GEE_PROJECT)
    print(f'Authenticated and connected to project: {GEE_PROJECT}')
else:
    print(f'Connected to Earth Engine project: {GEE_PROJECT}')


## 2. Imports & configuration

In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Apply the 'seaborn-v0_8-whitegrid' plotting style to every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # bare expression so the notebook displays the selected device

## 3. Define Mindanao regions (Earth Engine geometries)

In [None]:
# Closed rings of [x, y] vertices (presumably [longitude, latitude] in degrees)
# outlining a bounding box per study region. The first vertex is repeated at the
# end to close the ring.
_REGION_RINGS = {
    "Bukidnon": [
        [124.36, 8.84],
        [124.36, 7.05],
        [125.63, 7.05],
        [125.63, 8.84],
        [124.36, 8.84],
    ],
    "South Cotabato": [
        [124.28, 6.88],
        [124.28, 5.68],
        [125.30, 5.68],
        [125.30, 6.88],
        [124.28, 6.88],
    ],
}

# Region name -> Earth Engine polygon built from the ring above.
REGION_GEOMETRIES = {
    region_name: ee.Geometry.Polygon(ring)
    for region_name, ring in _REGION_RINGS.items()
}

REGION_GEOMETRIES

## 4. Helper to fetch Sentinel-2 NDVI from Google Earth Engine

Adjust the keyword arguments as needed (date range, cloud-cover filter, reduction scale). To use a different reducer, edit `ee.Reducer.mean()` inside the function.

In [None]:
def fetch_monthly_ndvi(regions, start_date='2018-01-01', end_date='2023-12-31', cloud_pct=35, scale=20):
    """Return a wide DataFrame with monthly NDVI for each region.

    For every region and every calendar month in the requested range, the
    Sentinel-2 surface-reflectance scenes that intersect the region are
    SCL-masked, median-composited, and the composite's NDVI is averaged over
    the region geometry.

    Args:
        regions: Mapping of region name -> ``ee.Geometry``.
        start_date: First day of the period (anything ``pd.to_datetime`` accepts).
        end_date: Last day of the period, inclusive.
        cloud_pct: Scenes whose ``CLOUDY_PIXEL_PERCENTAGE`` exceeds this are dropped.
        scale: Pixel scale passed to ``reduceRegion``.

    Returns:
        DataFrame with a ``month`` column (month-start timestamps) and one
        ``NDVI_<RegionName>`` column per region (spaces removed from the name).
        Months without a value are filled by interpolation in both directions.

    Note:
        Each (region, month) pair costs up to two blocking ``getInfo()`` round
        trips to Earth Engine, so long periods take a while to fetch.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    months = pd.date_range(start=start_date, end=end_date, freq='MS')
    # filterDate treats its end bound as exclusive, so step one day past end_date.
    range_end = end_date + pd.offsets.Day(1)

    def mask_s2_sr(image):
        # Drop pixels whose Scene Classification (SCL) value is 0, 1, 3, 7, 8, 9,
        # 10 or 11. NOTE(review): per the Sentinel-2 L2A legend these should be
        # no-data, saturated/defective, cloud shadow, unclassified, the cloud
        # classes, cirrus and snow/ice -- confirm against the product docs.
        scl = image.select('SCL')
        mask = (
            scl.neq(0)
            .And(scl.neq(1))
            .And(scl.neq(3))
            .And(scl.neq(7))
            .And(scl.neq(8))
            .And(scl.neq(9))
            .And(scl.neq(10))
            .And(scl.neq(11))
        )
        return image.updateMask(mask).copyProperties(image, image.propertyNames())

    def add_ndvi(image):
        # NDVI = (B8 - B4) / (B8 + B4), appended as an extra band.
        ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')
        return image.addBands(ndvi)

    collection = (
        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
        .filterDate(start_date.strftime('%Y-%m-%d'), range_end.strftime('%Y-%m-%d'))
        .filter(ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', cloud_pct))
        .map(mask_s2_sr)
        .map(add_ndvi)
    )

    records = []
    for location, geometry in regions.items():
        # Keep only scenes that intersect this region. Without this, the
        # emptiness check below counts scenes from anywhere in the world and
        # the median composite is built from the global collection.
        regional = collection.filterBounds(geometry)
        for month_start in months:
            month_end = month_start + pd.offsets.MonthBegin(1)
            # Clip the final month so it never extends past the requested range.
            exclusive_end = min(month_end, range_end)
            monthly = regional.filterDate(
                month_start.strftime('%Y-%m-%d'),
                exclusive_end.strftime('%Y-%m-%d')
            )
            # An empty collection yields a band-less median image, on which
            # select('NDVI') would fail; record a gap instead.
            if monthly.size().getInfo() == 0:
                ndvi_value = None
            else:
                composite = monthly.median()
                stats = composite.select('NDVI').reduceRegion(
                    reducer=ee.Reducer.mean(),
                    geometry=geometry,
                    scale=scale,
                    # bestEffort lets Earth Engine fall back to a coarser scale
                    # when the region exceeds maxPixels at the requested one.
                    bestEffort=True,
                    maxPixels=1_000_000
                )
                # stats.get() returns a server-side object (never Python None);
                # a region with no unmasked pixels is expected to come back as
                # None from getInfo().
                ndvi_value = ee.Number(stats.get('NDVI')).getInfo()
            records.append({
                'month': month_start,
                'location': location,
                'ndvi': ndvi_value
            })

    df = pd.DataFrame(records)
    df['month'] = pd.to_datetime(df['month'])
    df.sort_values(['location', 'month'], inplace=True)
    pivot = df.pivot(index='month', columns='location', values='ndvi')
    # A column made up entirely of None arrives as object dtype; coerce to
    # numeric so interpolation always operates on float columns.
    pivot = pivot.apply(pd.to_numeric, errors='coerce')
    pivot = pivot.interpolate(limit_direction='both')
    pivot = pivot.reset_index()
    pivot.columns = ['month'] + [f'NDVI_{col.replace(" ", "")}' for col in pivot.columns[1:]]
    return pivot


## 5. Load data (choose Earth Engine or cached sample)

In [None]:
# Folder holding both the bundled sample export and the most recent live export.
CACHE_DIR = Path('day4/data')
# parents=True: the path is nested, so a plain mkdir raises FileNotFoundError
# when the notebook runs from a directory that has no 'day4/' folder yet.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_PATH = CACHE_DIR / 'mindanao_ndvi_sample.csv'  # provided sample export
LATEST_EXPORT_PATH = CACHE_DIR / 'mindanao_ndvi_gee.csv'

USE_GEE = True  # set to False only if Earth Engine is unavailable

if USE_GEE:
    try:
        ndvi_df = fetch_monthly_ndvi(
            REGION_GEOMETRIES,
            start_date='2018-01-01',
            end_date='2023-12-31',
            cloud_pct=35,
            scale=20
        )
        # Keep a copy of the live pull next to the sample file.
        ndvi_df.to_csv(LATEST_EXPORT_PATH, index=False)
        print(f'Fetched {len(ndvi_df)} monthly observations from Earth Engine (project: {GEE_PROJECT}).')
    except Exception as exc:
        # Deliberate best-effort: any failure during the live fetch (quota,
        # network, auth) falls back to the sample so the class can continue.
        print(f'Earth Engine fetch failed ({exc}). Falling back to cached sample CSV...')
        ndvi_df = pd.read_csv(CACHE_PATH)
else:
    print('USE_GEE is False – reading cached sample export. Switch to True for live Sentinel-2 NDVI.')
    ndvi_df = pd.read_csv(CACHE_PATH)

# read_csv yields strings for 'month'; normalise to timestamps on both paths.
ndvi_df['month'] = pd.to_datetime(ndvi_df['month'])
ndvi_df.head()


### Notes for instructors

- The sample CSV is a lightweight export from this pipeline—use only when Earth Engine access is unavailable.
- Keep both files in `day4/data/` so the class can fall back gracefully if GEE is down.

## 6. Explore NDVI trends

In [None]:
# Every 'NDVI_<Region>' column in the loaded table; later cells reuse this list.
locations = [col for col in ndvi_df.columns if col.startswith('NDVI_')]

# One line per region on a shared axis.
fig, ax = plt.subplots(figsize=(10, 5))
for column in locations:
    label = column.replace('NDVI_', '')
    ax.plot(ndvi_df['month'], ndvi_df[column], marker='o', label=label)

# Reference line at the NDVI level this notebook later uses as its drought threshold.
ax.axhline(0.4, color='red', linestyle='--', linewidth=1, label='NDVI 0.40 threshold')
# Two-line title: the break must be an escaped \n inside the literal. A raw
# newline inside a single-quoted string is a syntax error.
ax.set_title('Monthly NDVI (GEE export / fallback)\nBukidnon vs South Cotabato')
ax.set_ylabel('NDVI')
ax.set_xlabel('Month')
ax.legend(loc='lower right')
plt.tight_layout()

## 7. Seasonal statistics

In [None]:
# Calendar months assigned to each season.
# NOTE(review): confirm this dry/wet grouping against local climatology.
dry_months = [5, 6, 7, 8, 9, 10]
wet_months = [11, 12, 1, 2, 3, 4]

# Row masks are the same for every region, so build them once.
month_numbers = ndvi_df['month'].dt.month
in_dry_season = month_numbers.isin(dry_months)
in_wet_season = month_numbers.isin(wet_months)

summary_rows = []
for column in locations:
    ndvi_series = ndvi_df[column]
    # Index label of the row holding this region's lowest NDVI.
    lowest_idx = ndvi_series.idxmin()
    summary_rows.append({
        'location': column.replace('NDVI_', ''),
        'dry_mean': round(ndvi_series[in_dry_season].mean(), 3),
        'wet_mean': round(ndvi_series[in_wet_season].mean(), 3),
        'lowest_ndvi': round(ndvi_series.loc[lowest_idx], 3),
        'lowest_month': ndvi_df.loc[lowest_idx, 'month'].strftime('%Y-%m'),
    })

summary_df = pd.DataFrame(summary_rows)
summary_df

## 8. Build sliding-window sequences

In [None]:
LOOKBACK = 12  # months
HORIZON = 1    # predict 1 month ahead

# The month axis is shared by every region column.
month_axis = ndvi_df['month'].values
# Position of the first target that has a full LOOKBACK window before it.
first_target_pos = LOOKBACK + HORIZON - 1

records = []
for column in locations:
    series = ndvi_df[column].values.astype(np.float32)
    for target_pos in range(first_target_pos, len(series)):
        # The window covers the LOOKBACK values ending HORIZON steps before the target.
        window_stop = target_pos - HORIZON + 1
        records.append({
            'location': column,
            'target_month': month_axis[target_pos],
            'sequence': series[window_stop - LOOKBACK:window_stop],
            'target': series[target_pos],
        })

sequence_df = pd.DataFrame(records)
sequence_df.head()

## 9. Temporal train/validation/test split

In [None]:
# Chronological cut-offs: targets up to train_end train the model, the next
# span up to val_end validates it, and everything later is held out for testing.
train_end = pd.Timestamp('2021-12-31')
val_end = pd.Timestamp('2022-12-31')

target_months = sequence_df['target_month']
train_mask = target_months.le(train_end)
val_mask = target_months.gt(train_end) & target_months.le(val_end)
test_mask = target_months.gt(val_end)

for split_name, split_mask in (('Train', train_mask), ('Validation', val_mask), ('Test', test_mask)):
    print(f'{split_name} samples: {split_mask.sum()}')

## 10. Scaling using training range

In [None]:
# Derive the min/max range from training-period months only, so validation and
# test values do not influence the scaling.
train_month_mask = ndvi_df['month'] <= train_end
train_values = ndvi_df.loc[train_month_mask, locations].values.flatten()
ndvi_min = train_values.min()
ndvi_max = train_values.max()
print(f'Training NDVI range: {ndvi_min:.3f} → {ndvi_max:.3f}')

# A (near-)constant series would make the division below blow up.
if ndvi_max - ndvi_min < 1e-6:
    raise ValueError('Training NDVI range is too narrow for scaling.')


def scale(arr):
    """Min-max scale NDVI so the training minimum maps to 0 and the maximum to 1."""
    return (arr - ndvi_min) / (ndvi_max - ndvi_min)


def invert(arr):
    """Undo ``scale``: map scaled values back to NDVI units."""
    return arr * (ndvi_max - ndvi_min) + ndvi_min

In [None]:
def stack_split(mask):
    """Assemble model-ready arrays for the rows of ``sequence_df`` selected by ``mask``.

    Returns:
        Tuple ``(X_scaled, y_scaled, dates, locs)`` where ``X_scaled`` is float32
        with shape (n_samples, window_length, 1), ``y_scaled`` is float32 with
        shape (n_samples, 1), and ``dates`` / ``locs`` are the matching target
        months and location labels.
    """
    subset = sequence_df.loc[mask]
    windows = np.stack(subset['sequence'].values)
    targets = subset['target'].values.astype(np.float32)

    # Append a trailing feature axis so each time step carries one feature.
    X_scaled = np.expand_dims(scale(windows), axis=-1).astype(np.float32)
    y_scaled = np.expand_dims(scale(targets), axis=-1).astype(np.float32)
    return X_scaled, y_scaled, subset['target_month'].values, subset['location'].values


(
    (X_train, y_train, dates_train, locs_train),
    (X_val, y_val, dates_val, locs_val),
    (X_test, y_test, dates_test, locs_test),
) = (stack_split(split_mask) for split_mask in (train_mask, val_mask, test_mask))

X_train.shape, X_val.shape, X_test.shape

## 11. PyTorch datasets & data loaders

In [None]:
class SequenceDataset(Dataset):
    """In-memory dataset pairing each feature window with its target.

    Args:
        features: Array-like of input windows; copied into a float32 tensor.
        targets: Array-like of targets with the same leading length as
            ``features``; copied into a float32 tensor.

    Raises:
        ValueError: If ``features`` and ``targets`` differ in their leading
            dimension.
    """

    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)
        # Catch misaligned inputs here rather than as a late IndexError (or a
        # silently truncated dataset) once a DataLoader starts indexing.
        if self.features.shape[:1] != self.targets.shape[:1]:
            raise ValueError(
                f'features and targets must have the same number of samples '
                f'(got {tuple(self.features.shape)} and {tuple(self.targets.shape)}).'
            )

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Returns the (features, target) pair at position idx.
        return self.features[idx], self.targets[idx]

# Number of samples per mini-batch for both loaders.
BATCH_SIZE = 32

# Wrap the scaled train/validation arrays as tensor datasets.
train_dataset = SequenceDataset(X_train, y_train)
val_dataset = SequenceDataset(X_val, y_val)

# Shuffle only the training batches; validation keeps its stored order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

len(train_dataset), len(val_dataset)  # bare expression so the notebook displays the sizes

## 12. Define LSTM forecaster

In [None]:
class NDVIForecaster(nn.Module):
    """Two-layer LSTM followed by a small dense head that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_sizes: ``hidden_sizes[0]`` is the hidden width of the LSTM (both
            stacked layers); ``hidden_sizes[1]`` is the width of the dense
            projection before the output layer.
        dropout: Dropout value passed to ``nn.LSTM``.
    """

    def __init__(self, input_size=1, hidden_sizes=(64, 32), dropout=0.2):
        super().__init__()
        lstm_width = hidden_sizes[0]
        dense_width = hidden_sizes[1]
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lstm_width,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )
        self.projection = nn.Linear(lstm_width, dense_width)
        self.output_layer = nn.Linear(dense_width, 1)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Map a batch of sequences to one prediction each (shape ``(batch, 1)``)."""
        sequence_states, _ = self.lstm(x)
        # Only the top layer's output at the final time step feeds the head.
        final_state = sequence_states[:, -1, :]
        hidden = self.projection(final_state)
        hidden = self.activation(hidden)
        return self.output_layer(hidden)

# Build the forecaster with its default sizes and move it to the selected device.
model = NDVIForecaster().to(device)
# Mean-squared error between predictions and the scaled NDVI targets.
criterion = nn.MSELoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model  # bare expression so the notebook displays the module layout

## 13. Training loop

In [None]:
def run_epoch(data_loader, model, criterion, optimizer=None):
    """Run one pass over ``data_loader`` and return ``(mean_loss, mean_mae)``.

    Args:
        data_loader: Iterable of ``(features, targets)`` batches that also
            exposes ``.dataset`` (used for the sample count).
        model: Module to train or evaluate.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: When given, the model is put in train mode and updated after
            every batch. When ``None``, the model is put in eval mode and no
            gradients are tracked.

    Returns:
        Tuple of sample-weighted mean loss and mean absolute error for the pass.
    """
    training = optimizer is not None
    model.train(training)

    # Move batches to wherever the model's parameters live instead of relying
    # on a module-level `device` variable. A parameter-less model leaves the
    # batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss, total_mae = 0.0, 0.0
    for features, targets in data_loader:
        if target_device is not None:
            features = features.to(target_device)
            targets = targets.to(target_device)

        if training:
            optimizer.zero_grad()

        # Track gradients only when they will be used, so evaluation stays
        # cheap even if the caller forgets to wrap the call in torch.no_grad().
        with torch.set_grad_enabled(training):
            outputs = model(features)
            loss = criterion(outputs, targets)

        if training:
            loss.backward()
            optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        batch_size = features.size(0)
        total_loss += loss.item() * batch_size
        total_mae += torch.mean(torch.abs(outputs - targets)).item() * batch_size

    dataset_size = len(data_loader.dataset)
    return total_loss / dataset_size, total_mae / dataset_size

EPOCHS = 75
# One list per tracked metric, appended to once per epoch.
history = {key: [] for key in ('epoch', 'train_loss', 'val_loss', 'train_mae', 'val_mae')}

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae = run_epoch(train_loader, model, criterion, optimizer)
    # Validation pass: no optimizer, and gradient tracking switched off.
    with torch.no_grad():
        val_loss, val_mae = run_epoch(val_loader, model, criterion)

    epoch_metrics = {
        'epoch': epoch,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'train_mae': train_mae,
        'val_mae': val_mae,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

    # Log the first epoch and then every tenth.
    should_log = epoch == 1 or epoch % 10 == 0
    if should_log:
        print(f'Epoch {epoch:03d} | train_loss={train_loss:.4f} val_loss={val_loss:.4f} val_mae={val_mae:.4f}')

## 14. Learning curves

In [None]:
history_df = pd.DataFrame(history)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (history column suffix, metric name for legend and title, y-axis label) per panel.
panel_specs = (
    ('loss', 'MSE', 'Loss'),
    ('mae', 'MAE', 'MAE'),
)
for panel_ax, (suffix, metric_name, y_label) in zip(axes, panel_specs):
    panel_ax.plot(history_df['epoch'], history_df[f'train_{suffix}'], label=f'Train {metric_name}')
    panel_ax.plot(history_df['epoch'], history_df[f'val_{suffix}'], label=f'Val {metric_name}')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(metric_name)
    panel_ax.legend()

plt.tight_layout()

## 15. Evaluation on held-out months

In [None]:
# Inference on the held-out split: eval mode, no gradient tracking.
model.eval()
with torch.no_grad():
    test_predictions = model(torch.tensor(X_test, dtype=torch.float32, device=device))

# Flatten the (n, 1) arrays to (n,). reshape(-1) is used instead of squeeze()
# because squeeze() would also collapse a single-sample split to a 0-d array,
# which no longer lines up with the 1-d date and location columns below.
y_test_pred = test_predictions.cpu().numpy().reshape(-1)
y_test_true = y_test.reshape(-1)

# Convert from the scaled range back to NDVI units.
pred_ndvi = invert(y_test_pred)
true_ndvi = invert(y_test_true)

results_df = pd.DataFrame({
    'target_month': pd.to_datetime(dates_test),
    'location': [loc.replace('NDVI_', '') for loc in locs_test],
    'actual_ndvi': true_ndvi,
    'predicted_ndvi': pred_ndvi
}).sort_values('target_month')

results_df.head()

In [None]:
# Error metrics in NDVI units over all held-out samples (both regions pooled).
errors = pred_ndvi - true_ndvi
mae = np.abs(errors).mean()
rmse = np.sqrt(np.mean(errors ** 2))
for metric_label, metric_value in (('MAE', mae), ('RMSE', rmse)):
    print(f'Test {metric_label}: {metric_value:.3f}')

In [None]:
# One figure per region: actual vs predicted NDVI over the held-out months.
line_specs = (
    ('actual_ndvi', {'marker': 'o', 'label': 'Actual'}),
    ('predicted_ndvi', {'marker': 'x', 'linestyle': '--', 'label': 'Predicted'}),
)
for location, group in results_df.groupby('location'):
    fig, ax = plt.subplots(figsize=(8, 4))
    for value_column, line_style in line_specs:
        ax.plot(group['target_month'], group[value_column], **line_style)
    # Same reference line as the exploratory plot.
    ax.axhline(0.4, color='red', linestyle='--', linewidth=1, label='NDVI 0.40 threshold')
    ax.set_title(f'{location} · held-out NDVI forecast')
    ax.set_xlabel('Month')
    ax.set_ylabel('NDVI')
    ax.legend(loc='best')
    plt.tight_layout()
    plt.show()

## 16. Drought alert metrics

In [None]:
# NDVI strictly below this value is flagged as drought.
DROUGHT_THRESHOLD = 0.40

for source in ('actual', 'predicted'):
    results_df[f'{source}_drought'] = results_df[f'{source}_ndvi'] < DROUGHT_THRESHOLD

actual_flags = results_df['actual_drought']
predicted_flags = results_df['predicted_drought']

confusion = pd.crosstab(
    actual_flags,
    predicted_flags,
    rownames=['actual'],
    colnames=['predicted'],
    dropna=False,
)
print(confusion)

true_positives = int((actual_flags & predicted_flags).sum())
false_positives = int((~actual_flags & predicted_flags).sum())
false_negatives = int((actual_flags & ~predicted_flags).sum())
# The small epsilon keeps the ratios defined (as 0) when a denominator is zero.
precision = true_positives / (true_positives + false_positives + 1e-6)
recall = true_positives / (true_positives + false_negatives + 1e-6)

for metric_label, metric_value in (('Precision', precision), ('Recall', recall)):
    print(f'{metric_label}: {metric_value:.3f}')

## 17. Teaching notes & extensions

- Demonstrate toggling `USE_GEE` to show real-time access vs cached workflow.
- Encourage participants to export their own geometries (barangay, irrigation system) for custom runs.
- Extensions: merge rainfall/SPEI, try multi-output LSTMs, or run explainability (SHAP, integrated gradients).