<a href="https://colab.research.google.com/github/sisn749/Landslide_Susceptibility_GEOG761/blob/main/Ensemble_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract patches

NO NEED TO RUN THIS IF YOU ALREADY HAVE THE DATASET;  JUMP TO "ENSEMBLE MODEL ..." IF YOU NEED TO RERUN THE MODEL

-------


How it works:

- Sentinel-2 SR provides L2A reflectance at 10 m.

- The .median() composite takes the per-pixel median of all 2022 scenes with less than 10% cloudy pixels.

- Each point.buffer(464 m) yields a ~928 m window around the coordinate.

- Patches are exported to Google Drive → “GEE_Landslide_Patches_2” folder.

In [None]:
import ee
# Cloud project that Earth Engine requests are issued against.
GEE_PROJECT_ID = 'clara-geog761-tryout-1'

# Authenticate first, then initialise the Earth Engine client for that project.
ee.Authenticate()
ee.Initialize(project=GEE_PROJECT_ID)

Now we will transform the numerical longitude/latitude pairs into georeferenced GEE points. The cells below load the CSV, build the points, and then visualize them on a map.

In [None]:
import pandas as pd
# Landslide inventory with the per-point variables; adjust the path to your own Drive layout.
landslides_csv_path = "/content/drive/MyDrive/GEOG761 Machine Learning for Remote Sensing/Group project landslide susceptibility/landslides_with_variables_fixed1.csv"
df = pd.read_csv(landslides_csv_path)

In [None]:
# Convert the landslide table into an ee.FeatureCollection of labelled points
def row_to_feature(row):
    """Turn one CSV row into a labelled, georeferenced ``ee.Feature``.

    Parameters
    ----------
    row : pandas.Series
        One row of ``df``. Reads 'Longitude', 'Latitude', 'Valid Landslide'
        and, when present, 'ID'.

    Returns
    -------
    ee.Feature or None
        A point feature with integer properties ``id`` and ``label``, or
        ``None`` when 'Valid Landslide' is missing (the row is reported and
        skipped so that label filters never see nulls).
    """
    label = row['Valid Landslide']
    if pd.isna(label):
        print(f"Skipping row {row.name}: 'Valid Landslide' is missing.")
        return None

    # The patch id ends up in the exported file name, and the loading cell
    # matches that id against the CSV's 'ID' column. Tag the point with that
    # same column so both sides use one key; fall back to the DataFrame index
    # only when no usable 'ID' value exists.
    if 'ID' in row.index and pd.notna(row['ID']):
        point_id = int(row['ID'])
    else:
        point_id = int(row.name)

    geom = ee.Geometry.Point(float(row['Longitude']), float(row['Latitude']))
    return ee.Feature(geom, {'id': point_id, 'label': int(label)})


# Build every feature once and drop the rows that were skipped (None).
features = [
    feature
    for feature in (row_to_feature(r) for _, r in df.iterrows())
    if feature is not None
]
fc = ee.FeatureCollection(features)

print("Feature collection created with", fc.size().getInfo(), "points.")

In [None]:
# Visualizing all of our datapoint on a GEE map

import geemap

# Interactive map with every landslide point drawn in red.
Map = geemap.Map()
vis_params = dict(color='red')
Map.addLayer(fc, vis_params, 'Landslide Points')

# Centre the view on the points at zoom level 8; the bare `Map` expression
# on the last line renders the widget as the cell output.
Map.centerObject(fc, 8)
Map

In [None]:
# Sentinel-2 Level-2A (surface reflectance): scenes from calendar year 2022
# whose CLOUDY_PIXEL_PERCENTAGE is below 10.
# filterDate() treats the end date as exclusive, so the window has to end on
# 2023-01-01 for scenes acquired on 2022-12-31 to be included.
collection = (ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
              .filterDate('2022-01-01', '2023-01-01')
              .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 10)))

# Per-pixel median over the filtered scenes, restricted to the six bands
# that are exported (in this order) into every patch.
median_image = collection.median().select(['B2','B3','B4','B8','B11','B12'])

# Replace masked pixels with 0 so exported patches contain no masked values.
median_image = median_image.unmask(0)

Calculate the area needed around each data point to create the patches: the patch width is the square root of the mean maximum area of all landslides, and each point is then buffered by half of that width.

In [None]:
import numpy as np

# Derive the patch width from the landslide inventory.
# Non-numeric 'Area Maximum' entries become NaN and are ignored by the mean.
area_maximum = pd.to_numeric(df['Area Maximum'], errors='coerce')
df['Area Maximum'] = area_maximum

# Side of a square whose area equals the mean 'Area Maximum' times 10_000.
# NOTE(review): the factor 10_000 presumably converts hectares to square
# metres - confirm the units of 'Area Maximum'.
mean_area_maximum = area_maximum.mean(skipna=True)
patch_size_m = np.sqrt(mean_area_maximum * 10_000)

# Buffer radius applied around each point when the patch footprint is built.
half_width = patch_size_m / 2
print(f"Suggested patch width: {patch_size_m:.0f} meters")

In [None]:
# Number of points kept per class
sample_size = 1250

# Keep at most `sample_size` points of each class (label 1 first, then label 0).
positives, negatives = (
    fc.filter(ee.Filter.eq('label', class_label)).limit(sample_size)
    for class_label in (1, 0)
)

# Merge both classes, attach a 'random' column and sort by it so positive
# and negative samples are interleaved.
fc_limited = positives.merge(negatives).randomColumn().sort('random')
print(f"Limited feature collection to {fc_limited.size().getInfo()} points.")

# Attach each point's id, label and patch footprint to the median composite
def create_and_tag_patch(feature):
    """Return ``median_image`` tagged with one point's metadata.

    The patch footprint is the bounding box of the point buffered by
    ``half_width``. The image itself is not clipped here; the footprint is
    only stored as the ``patch_geometry`` property, next to the point's
    ``id`` and ``label``.
    """
    footprint = feature.geometry().buffer(half_width).bounds()
    patch_properties = {
        'id': feature.get('id'),
        'label': feature.get('label'),
        'patch_geometry': footprint,
    }
    return median_image.set(patch_properties)


tagged_image_patches = fc_limited.map(create_and_tag_patch)

tagged_patch_count = tagged_image_patches.size().getInfo()
print(f"Created a collection of {tagged_patch_count} tagged patches.")

In [None]:
# Sanity check: every tagged patch should carry its label

# Labels of the first five patches
first_five_labels = tagged_image_patches.limit(5).aggregate_array('label').getInfo()
print("Labels of first 5 patches:", first_five_labels)

# Count of patches per label value
label_histogram = tagged_image_patches.aggregate_histogram('label').getInfo()
print("Nr of negative and positive samples:", label_histogram)

Now we will create these patches in GEE.


In [None]:
# Number of tagged patches, fetched to the client
num_patches = tagged_image_patches.size().getInfo()

# Pull each metadata property to the client with one request per property
# (instead of one request per patch). The three lists are index-aligned.
all_ids_client, all_labels_client, all_geoms_client = (
    tagged_image_patches.aggregate_array(property_name).getInfo()
    for property_name in ('id', 'label', 'patch_geometry')
)

In [None]:
# Submit one Drive export task per patch

for i in range(num_patches):
    patch_id, patch_label, patch_geom = (
        all_ids_client[i],
        all_labels_client[i],
        all_geoms_client[i],
    )

    # Rebuild a server-side polygon from the client-side coordinates.
    clean_geom = ee.Geometry.Polygon(patch_geom['coordinates'])

    # The file name carries the id and the label; the loading code parses both back out.
    filename = f"patch_id_{patch_id}_label_{patch_label}"

    export_options = {
        'image': median_image,
        'description': f'Export_Patch_id_{patch_id}_index_{i}',
        'folder': 'GEE_Landslide_Patches_2',
        'fileNamePrefix': filename,
        'region': clean_geom,
        'dimensions': '100x100',  # every patch is resampled to 100 x 100 pixels
        'fileFormat': 'GeoTIFF',
    }
    task = ee.batch.Export.image.toDrive(**export_options)

    # Progress message on every 25th patch, printed just before submission.
    if not i % 25:
      print(f"Exporting patch {i+1}/{num_patches}...")

    task.start()

print(f"All {num_patches} tasks have been submitted.")
print("Monitor their progress in the 'Tasks' tab of the GEE Code Editor.")

In [None]:
# Visualizing one patch

import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): the export cell writes to 'GEE_Landslide_Patches_2', while this
# path points at 'GEE_Landslide_Patches' - confirm which folder holds this file.
filepath = '/content/drive/MyDrive/GEE_Landslide_Patches/patch_id_1171_label_1.tif'


def normalize(array):
    """Min-max scale one band to the 0-1 range for display.

    A constant band (max == min) has no range to stretch; it is returned as
    all zeros instead of dividing by zero and producing NaNs.
    """
    array_min, array_max = array.min(), array.max()
    value_range = array_max - array_min
    if value_range == 0:
        return np.zeros_like(array, dtype=float)
    return (array - array_min) / value_range


with rasterio.open(filepath) as src:
    # Patches are exported with bands in the order B2, B3, B4, B8, B11, B12,
    # so file bands 1, 2, 3 are blue, green, red. Read them as 3, 2, 1 to get
    # red, green, blue; reading 1, 2, 3 would swap red and blue in the image.
    # (Assumes this file was exported with that band order.)
    rgb = src.read([3, 2, 1])

# Normalize each band independently, then stack to (height, width, 3).
rgb_normalized = np.dstack([normalize(band) for band in rgb])

# Display the true-color image
plt.imshow(rgb_normalized)
plt.title("True-color composite (B4, B3, B2)")
plt.axis('off')
plt.show()

# Ensemble model: Random forest - CNN
This part of the code creates a multi-modal model that is able to take in patches and numerical data like slope, landcover etc.

1. Loading the data

This code loads two types of data: .tif files, which are the patches created around the longitude/latitude points in our numerical dataset, and geological and topographical data for each point. In the end, we align these two types of data so that we can feed them into the model. We create the following data:
- X_images_list: All the pixel data from the patches.

- X_numerical_list: All the corresponding numerical features (slope, curvature, etc.) from the CSV.

- y_labels_list: The labels (0 or 1) that align with both.

Then we convert these lists into numpy arrays and normalize them for the machine learning model by transposing them to (samples, h, w, bands) and dividing them by MAX_PIXEL_VALUE.

In [None]:
# Run this if you need to install rasterio
!pip install rasterio

In [None]:
import pandas as pd
import os
import glob
import rasterio
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the numerical data
# Make sure to change the path to where the dataset is stored on your own device
csv_path = '/content/drive/MyDrive/GEOG761 Machine Learning for Remote Sensing/Group project landslide susceptibility/landslides_with_variables_fixed1.csv'
df = pd.read_csv(csv_path)

# Load image patches and align
# Make sure to change the path to where the dataset is stored on your own device
folder_path = '/content/drive/MyDrive/GEE_Landslide_Patches_2'
# glob returns files in an arbitrary, filesystem-dependent order. Sort them so
# the array order - and therefore every seeded split downstream - is the same
# on each run.
patch_files = sorted(glob.glob(folder_path + "/*.tif"))

# Numerical predictors taken from the CSV for every patch.
# NOTE(review): LANDCOVER_CODE looks like a categorical code but is treated as
# a continuous feature here - confirm this is intended.
NUMERICAL_COLUMNS = ['CURVATURE', 'TWI', 'SLOPE_deg', 'DEM', 'LANDCOVER_CODE', 'ASPECT_sin', 'ASPECT_cos']

# Index the table by ID once instead of scanning the whole DataFrame for every
# file. If an ID occurs more than once, the first row wins.
numerical_by_id = (
    df.drop_duplicates(subset='ID', keep='first')
      .set_index('ID')[NUMERICAL_COLUMNS]
)


def parse_patch_filename(filename):
    """Extract ``(patch_id, label)`` from a name like 'patch_id_89_label_1.tif'.

    Returns ``None`` for any name that does not follow that pattern (for
    example a stray or duplicated file), so the caller can skip it instead of
    crashing on ``int()``.
    """
    stem, _ = os.path.splitext(filename)
    parts = stem.split('_')
    if len(parts) != 5 or parts[:2] != ['patch', 'id'] or parts[3] != 'label':
        return None
    try:
        return int(parts[2]), int(parts[4])
    except ValueError:
        return None


X_images_list = []
X_numerical_list = []
y_labels_list = []
skipped_files = 0

print(f"Found {len(patch_files)} files. Aligning with CSV...")

for file_path in patch_files:
    parsed = parse_patch_filename(os.path.basename(file_path))

    # Skip files with an unexpected name or without a matching CSV row.
    if parsed is None or parsed[0] not in numerical_by_id.index:
        skipped_files += 1
        continue
    patch_id, patch_label = parsed

    # Read the image first: if the read fails, nothing has been appended yet,
    # so the three lists can never get out of step.
    with rasterio.open(file_path) as src:
        patch_pixels = src.read()  # (bands, height, width)

    X_images_list.append(patch_pixels)
    X_numerical_list.append(numerical_by_id.loc[patch_id].to_numpy())
    y_labels_list.append(patch_label)

if skipped_files:
    print(f"Skipped {skipped_files} files (unexpected file name or ID not found in the CSV).")

# Fail early with a clear message instead of a cryptic NumPy error further down.
if not X_images_list:
    raise ValueError(f"No usable patches found in {folder_path!r}; check the folder path and the CSV 'ID' column.")
patch_shapes = {patch.shape for patch in X_images_list}
if len(patch_shapes) > 1:
    raise ValueError(f"Patches have inconsistent shapes: {sorted(patch_shapes)}")

# Convert to NumPy Arrays
X_images = np.array(X_images_list)
X_numerical = np.array(X_numerical_list)
y = np.array(y_labels_list)
MAX_PIXEL_VALUE = np.max(X_images)
print(MAX_PIXEL_VALUE)

# -- Pre-process --
# Images: move bands last -> (samples, height, width, bands) and scale by the
# largest pixel value in the data set.
X_images_norm = np.transpose(X_images, (0, 2, 3, 1)) / MAX_PIXEL_VALUE

# Numerical: Scale the data
# This transforms the numerical data so it has a mean of 0 and a standard deviation of 1
# NOTE: the scaler is fitted on all samples here, i.e. before any train/test split.
scaler = StandardScaler()
X_numerical_norm = scaler.fit_transform(X_numerical)

print(f"Image data shape: {X_images_norm.shape}")
print(f"Numerical data shape: {X_numerical_norm.shape}")
print(f"Label data shape: {y.shape}")

In [None]:
# Dataset split

from sklearn.model_selection import train_test_split

# Split images, numerical features and labels together in a single call per
# split, so the three arrays share exactly the same row assignment by
# construction (rather than relying on separate calls reusing the same seed).
# We need to split twice to explicitly define the validation set, because a
# random forest model, as opposed to a CNN, does not have a built-in
# validation split.
#
# The numerical features are split in their raw form (X_numerical) and scaled
# afterwards with statistics from the training rows only, so no information
# from the validation or test rows leaks into the scaler.

# First split: (Train + Val) and Test (80% / 20%)
(X_img_train_val, X_img_test,
 X_num_train_val_raw, X_num_test_raw,
 y_train_val, y_test) = train_test_split(
    X_images_norm, X_numerical, y,
    test_size=0.2, random_state=42, stratify=y
)

# Second split: (Train) and (Val) (75% of 80% = 60% / 25% of 80% = 20%)
(X_img_train, X_img_val,
 X_num_train_raw, X_num_val_raw,
 y_train, y_val) = train_test_split(
    X_img_train_val, X_num_train_val_raw, y_train_val,
    test_size=0.25, random_state=42, stratify=y_train_val
)

# Fit the scaler on the training rows only, then apply it to every subset.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_train_raw)
X_num_val = scaler.transform(X_num_val_raw)
X_num_test = scaler.transform(X_num_test_raw)
X_num_train_val = scaler.transform(X_num_train_val_raw)

print(f"Train shapes: {X_img_train.shape}, {X_num_train.shape}, {y_train.shape}")
print(f"Val shapes:   {X_img_val.shape}, {X_num_val.shape}, {y_val.shape}")
print(f"Test shapes:  {X_img_test.shape}, {X_num_test.shape}, {y_test.shape}")

Now we build and train the CNN first.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable, like the seeded splits
# and the seeded random forest elsewhere in this notebook.
keras.utils.set_random_seed(42)

# Input shape of a single patch, taken from the training array
# (everything after the sample axis).
image_input_shape = X_img_train.shape[1:]
input_image = keras.Input(shape=image_input_shape)

# Two convolution + max-pooling stages, then a small dense head.
cnn_branch = layers.Conv2D(32, (3, 3), activation='relu')(input_image)
cnn_branch = layers.MaxPooling2D((2, 2))(cnn_branch)
cnn_branch = layers.Conv2D(64, (3, 3), activation='relu')(cnn_branch)
cnn_branch = layers.MaxPooling2D((2, 2))(cnn_branch)
cnn_branch = layers.Flatten()(cnn_branch)
cnn_branch = layers.Dense(64, activation='relu')(cnn_branch)

# Single sigmoid unit: one probability per patch for the positive class.
cnn_output = layers.Dense(1, activation='sigmoid')(cnn_branch)

cnn_model = keras.Model(inputs=input_image, outputs=cnn_output)

cnn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\n--- Training CNN Model ---")
# The validation set is only monitored during training; it is not used to
# update the weights.
cnn_model.fit(
    X_img_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_img_val, y_val)
)

Now the simple random forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier

print("\n--- Training Random Forest Model ---")
# 100 trees, fixed seed for repeatability, all CPU cores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_num_train, y_train)

# Check validation accuracy on the held-out validation split. Scoring the
# training data instead would report the (near-perfect) training fit and say
# nothing about generalisation.
val_accuracy = rf_model.score(X_num_val, y_val)
print(f"Random Forest Validation Accuracy: {val_accuracy:.4f}")

## Ensemble model combined by logistic regression

In the next step, we will stack both of the models. We will use the validation set to train a *new* very simple model (I chose logistic regression, it just needs to learn how to best combine the predictions from the two base models). The logistic regression model learns how much to trust each base model's prediction.

It looks like this:

It gets the inputs from both models:

- The CNN: "Based on the image, I'm 90% sure this is a landslide."

- The Random Forest: "Based on the numbers, I'm only 60% sure."

The logistic regression model's job is to take those two inputs (90% and 60%) and make the final decision. By training it on the validation data, it learns which model is more reliable in which situations.

For example, it might learn: "The CNN is usually overconfident, so I'll trust the Random Forest more." Or it might learn: "When both models agree, they are almost always right, but when they disagree, I should trust the CNN."

It learns the optimal weights or rules to combine the two predictions into a single, more accurate final prediction.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Base-model probabilities on the VALIDATION set become the meta-model's inputs
print("Training meta-model")
# One sigmoid output per sample from the CNN
cnn_val_probs = cnn_model.predict(X_img_val)
# Second predict_proba column of the random forest (probability of class '1')
rf_val_probs = rf_model.predict_proba(X_num_val)[:, 1]

# Two columns per validation sample: [CNN probability, RF probability]
X_val_stacked = np.column_stack((cnn_val_probs, rf_val_probs))

# Logistic regression learns how much weight to give each base model's probability
meta_model = LogisticRegression().fit(X_val_stacked, y_val)

print("Meta-model trained.")

And finally, we will evaluate the final hybrid model on the test set.

In [None]:
# Base-model probabilities on the TEST set
print("\n--- Final Evaluation on Test Set ---")
cnn_test_probs = cnn_model.predict(X_img_test)
rf_test_probs = rf_model.predict_proba(X_num_test)[:, 1]

# Same two-column layout the meta-model was trained on: [CNN, RF]
X_test_stacked = np.column_stack((cnn_test_probs, rf_test_probs))

# Final class decision of the stacked (hybrid) model
hybrid_predictions = meta_model.predict(X_test_stacked)

# Overall accuracy, followed by the per-class precision / recall / F1 report
final_accuracy = accuracy_score(y_test, hybrid_predictions)
print(f"\nFinal HYBRID Model Accuracy: {final_accuracy * 100:.2f}%")
print("\nHybrid Model Classification Report:")

hybrid_report = classification_report(y_test, hybrid_predictions)
print(hybrid_report)

In [None]:
# Hybrid-model probability for each patch in the test set

# Second predict_proba column of the meta-model
# (presumably the probability of label 1 - assumes the labels are 0 and 1).
test_class_probabilities = meta_model.predict_proba(X_test_stacked)
hybrid_probabilities = test_class_probabilities[:, 1]
print(hybrid_probabilities)