In [None]:
!uv pip install -r requirements.txt

### --- Import Libraries ---


In [1]:
import io
from datetime import datetime, timedelta

import ee
from google.api_core import exceptions, retry
import google.auth
import numpy as np
from numpy.lib.recfunctions import structured_to_unstructured
import requests



### --- Constants ---


In [2]:
# Resolution requested for downloaded patches (passed to get_patch by the
# *_patch helpers).
SCALE = 5000  # meters per pixel
# Resolution used by sample_points when running stratifiedSample over
# WORLD_POLYGONS.
WORLD_SCALE = 10_000
LAND_COVER_DATASET = "GOOGLE/DYNAMICWORLD/V1"  # Dynamic World Land Cover dataset
# NOTE(review): not referenced by the visible code -- get_land_cover selects
# the "label" band directly. Confirm whether this constant is still needed.
LAND_COVER_BAND = "Map"  # Land cover classification band
# Coarse polygons used as the sampling region in sample_points. Vertices look
# like (longitude, latitude) pairs (first values exceed +/-90) -- TODO confirm.
WORLD_POLYGONS = [
    # Americas
    [(-33.0, -7.0), (-55.0, 53.0), (-166.0, 65.0), (-68.0, -56.0)],
    # Africa, Asia, Europe
    [
        (74.0, 71.0),
        (166.0, 55.0),
        (115.0, -11.0),
        (74.0, -4.0),
        (20.0, -38.0),
        (-29.0, 25.0),
    ],
    # Australia
    [(170.0, -47.0), (179.0, -37.0), (167.0, -12.0), (128.0, 17.0), (106.0, -29.0)],
]
# NOTE(review): not referenced by the visible code; sampling uses WORLD_POLYGONS.
POLYGON = [(-140.0, 60.0), (-140.0, -60.0), (-10.0, -60.0), (-10.0, 60.0)]

### --- Earth Engine Initialization ---


In [3]:
# Cloud project handed to ee.Initialize.
project = "ee-anirudhananth"

# Authentication can be done once from the command line instead:
#   !earthengine authenticate
# or programmatically, which is what initialize_ee() does.


def initialize_ee():
    """Authenticates and initializes the Earth Engine client.

    Initialization uses the module-level ``project`` and points the client at
    the high-volume Earth Engine endpoint.
    """
    high_volume_url = "https://earthengine-highvolume.googleapis.com"
    ee.Authenticate()
    ee.Initialize(project=project, opt_url=high_volume_url)

In [4]:
# Authenticate and initialize Earth Engine in the notebook kernel before any
# ee.* objects are built.
initialize_ee()

### --- Data Retrieval Functions ---


In [5]:
def get_modis_ndvi(date: datetime) -> ee.Image:
    """Gets the MODIS NDVI composite covering the given date.

    MOD13A2 is published as 16-day composites, so an image's start time falls
    inside a one-day window only on the day a composite begins. Filtering on
    ``[date, date + 1 day)`` alone therefore matches nothing for most dates and
    ``first()`` yields no image. The window is extended 16 days into the past
    and the latest image in it is returned. Whenever the one-day window did
    contain an image, that same image is still the one selected.

    Args:
        date: Date of interest.

    Returns:
        The "NDVI" band of the latest composite whose start time lies in
        ``[date - 16 days, date + 1 day)``.
    """
    # NOTE(review): "MODIS/006" is an older collection version; consider moving
    # to the current version listed in the Earth Engine catalog -- TODO confirm.
    return (
        ee.ImageCollection("MODIS/006/MOD13A2")
        .filterDate(date - timedelta(days=16), date + timedelta(days=1))
        .select("NDVI")
        # Newest first, so first() picks the composite closest to `date`.
        .sort("system:time_start", False)
        .first()
    )

In [6]:
def get_landsat_image(date: datetime) -> ee.Image:
    """Gets a Landsat 8 mosaic for the selected date.

    Reads the Collection 2, Tier 1, Level-2 collection. This is the product
    whose "ST_B10" band and scaling get_landsat_lst relies on; the previous
    ID, "LANDSAT/8/C01/T1_SR", is not a catalog collection.

    Collection 2 names its surface reflectance bands "SR_B*". The red and
    near-infrared bands are additionally exposed as scaled reflectance under
    the names "B4" and "B5", which is what get_landsat_ndvi reads.

    Args:
        date: Start of the one-day acquisition window.

    Returns:
        Mosaic of all scenes acquired in ``[date, date + 1 day)`` with the
        original Level-2 bands plus the "B4" and "B5" reflectance bands.
    """
    mosaic = (
        ee.ImageCollection("LANDSAT/LC08/C02/T1_L2")
        .filterDate(date, date + timedelta(days=1))
        .mosaic()
    )
    # Level-2 reflectance is stored as scaled integers:
    # reflectance = DN * 0.0000275 - 0.2 (per the catalog page).
    reflectance = (
        mosaic.select(["SR_B4", "SR_B5"], ["B4", "B5"])
        .multiply(0.0000275)
        .add(-0.2)
    )
    return mosaic.addBands(reflectance)

In [7]:
def get_landsat_ndvi(image: ee.Image) -> ee.Image:
    """Computes NDVI as the normalized difference of bands "B5" and "B4".

    The resulting single band is named "NDVI".
    """
    band_pair = ["B5", "B4"]
    ndvi = image.normalizedDifference(band_pair)
    return ndvi.rename("NDVI")

In [8]:
def get_landsat_lst(image: ee.Image) -> ee.Image:
    """
    Derives a Land Surface Temperature band ("LST") from a Landsat 8 image.

    The "ST_B10" band is rescaled with the multiplier and offset given on
    https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LC08_C02_T1_L2
    """
    st_multiplier = 0.00341802
    st_offset = 149.0
    surface_temperature = image.select("ST_B10")
    rescaled = surface_temperature.multiply(st_multiplier).add(st_offset)
    return rescaled.rename("LST")

In [9]:
def get_land_cover(date: datetime) -> ee.Image:
    """Fetches the first land cover image in the 30 days starting at `date`.

    Only the "label" band is kept; masked pixels are filled with 0 and the
    result is cast to byte.
    """
    window_end = date + timedelta(days=30)
    in_window = ee.ImageCollection(LAND_COVER_DATASET).filterDate(date, window_end)
    first_label = in_window.select("label").first()
    # Fill missing values with 0 (water), then store labels as bytes.
    filled = first_label.unmask(0)
    return filled.byte()

### --- Input and Label Image Composition ---


In [10]:
def get_inputs_image(date: datetime) -> ee.Image:
    """Builds the multi-band Earth Engine image used as model input.

    Bands are stacked in this order: MODIS NDVI, Landsat NDVI, Landsat LST.
    """
    modis_band = get_modis_ndvi(date)

    # Both Landsat-derived bands come from the same mosaic.
    landsat = get_landsat_image(date)
    stacked_bands = [
        modis_band,
        get_landsat_ndvi(landsat),
        get_landsat_lst(landsat),
    ]
    return ee.Image(stacked_bands)

In [11]:
def get_labels_image(year: datetime) -> ee.Image:
    """Gets the Land Cover image used as labels for the given date.

    Args:
        year: Despite its name, a ``datetime``. It is forwarded unchanged to
            get_land_cover, which adds a ``timedelta`` to it, and
            get_labels_patch passes a datetime here. An ``int`` year would fail
            in that arithmetic. The parameter name is kept so existing keyword
            callers keep working.

    Returns:
        The land cover image returned by get_land_cover.
    """
    # Add preprocessing steps here if needed (e.g., remapping land cover classes).
    return get_land_cover(year)

### --- Get input and labels for a given latitude and longitude ---


In [12]:
@retry.Retry(deadline=10 * 60)  # seconds
def get_patch(
    image: ee.Image, lonlat: tuple[float, float], patch_size: int, scale: int
) -> np.ndarray:
    """Fetches a patch of pixels from Earth Engine.

    Args:
        image: Image to download pixels from.
        lonlat: Center of the patch, passed to ``ee.Geometry.Point``.
        patch_size: Patch width and height in pixels.
        scale: Resolution in meters per pixel. Used both to size the requested
            region and as the download scale.

    Returns:
        The array loaded from the NPY payload returned by Earth Engine.

    Raises:
        exceptions.TooManyRequests: On HTTP 429, so the retry decorator can
            retry the call.
        requests.HTTPError: On any other unsuccessful HTTP status.
    """
    point = ee.Geometry.Point(lonlat)
    url = image.getDownloadURL(
        {
            "region": point.buffer(scale * patch_size / 2, 1).bounds(1),
            "dimensions": [patch_size, patch_size],
            # Honor the caller's scale; previously the module-level SCALE was
            # used here, silently ignoring the `scale` argument.
            "scale": scale,
            "format": "NPY",
        }
    )

    # Retry on "Too Many Requests" errors
    response = requests.get(url)
    if response.status_code == 429:
        raise exceptions.TooManyRequests(response.text)

    # Raise other exceptions
    response.raise_for_status()
    # NOTE(review): allow_pickle=True lets np.load unpickle objects from
    # downloaded bytes. Confirm whether the NPY payload actually needs it and
    # disable it if not.
    return np.load(io.BytesIO(response.content), allow_pickle=True)

In [13]:
def get_inputs_patch(
    date: datetime, lonlat: tuple[float, float], patch_size: int
) -> np.ndarray:
    """Downloads the model-input pixels around `lonlat` for `date`.

    The array returned by get_patch is converted with
    structured_to_unstructured before being returned.
    """
    structured = get_patch(get_inputs_image(date), lonlat, patch_size, SCALE)
    return structured_to_unstructured(structured)


def get_labels_patch(
    date: datetime, lonlat: tuple[float, float], patch_size: int
) -> np.ndarray:
    """Downloads the label pixels around `lonlat` for `date`.

    The array returned by get_patch is converted with
    structured_to_unstructured before being returned.
    """
    structured = get_patch(get_labels_image(date), lonlat, patch_size, SCALE)
    return structured_to_unstructured(structured)

## Creating the dataset


### --- Imports ---


In [14]:
import logging
import random
from datetime import datetime, timedelta

import dask.bag as db
import dask
from dask.bag.core import Bag
from dask.distributed import Client
import numpy as np
import os
import pandas as pd
import uuid

### --- Configs ---


In [15]:
# Default number of random dates drawn by run().
NUM_SAMPLES = 1000
# Patch width/height in pixels requested for inputs and labels.
PATCH_SIZE = 128
# NOTE: despite the name, this is passed as `npartitions` to
# dask.bag.from_sequence, i.e. it is the number of partitions.
PARTITION_SIZE = 10
# Bounds for random_date; parsed with the "%Y-%m-%d" format.
START_DATE = "2015-07-01"
END_DATE = "2021-12-01"

### --- Sample Points ---


In [64]:
def random_date(start: datetime, end: datetime):
    """Return a random datetime between `start` and `end` (both inclusive).

    The offset from `start` is a whole number of seconds.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

Getting land cover for 2019-09-04 02:53:34
Land cover snapshot date: 2019-09-04 02:56:00
Getting land cover for 2016-05-26 11:28:45
Land cover snapshot date: 2016-05-26 11:36:51


In [71]:
def sample_points(date: datetime) -> tuple:
    """Samples one land-cover-stratified point for the given date.

    The land cover image for `date` is sampled with ``stratifiedSample`` inside
    WORLD_POLYGONS. If the sample comes back empty, a new random date between
    START_DATE and END_DATE is drawn and the attempt is repeated.

    Args:
        date: Date for which to look up the land cover image.

    Returns:
        ``(snapshot_date, coordinates)``: the land cover image's own date as a
        ``datetime`` and the coordinates of the first sampled feature's
        geometry.
    """
    # Initialize once per call instead of on every retry iteration.
    initialize_ee()
    region = ee.Geometry.MultiPolygon(WORLD_POLYGONS)
    while True:
        dask.distributed.print(f"Getting land cover for {date}")
        land_cover = get_land_cover(date)
        snapshot_date = datetime.strptime(
            land_cover.date().format().getInfo(), "%Y-%m-%dT%H:%M:%S"
        )
        dask.distributed.print(f"Land cover snapshot date: {snapshot_date}")
        points = land_cover.stratifiedSample(
            numPoints=1,
            region=region,
            scale=WORLD_SCALE,
            geometries=True,
        )
        # Fetch the sampled features in a single round trip; the count comes
        # from the fetched list rather than from extra size().getInfo() calls.
        sampled = points.toList(points.size()).getInfo()
        dask.distributed.print(f"Found for date {date} {len(sampled)} points")
        if sampled:
            return (snapshot_date, sampled[0]["geometry"]["coordinates"])
        # Nothing sampled for this date: retry with a fresh random date.
        date = random_date(
            datetime.strptime(START_DATE, "%Y-%m-%d"),
            datetime.strptime(END_DATE, "%Y-%m-%d"),
        )

Getting land cover for 2021-05-07 18:44:44
Land cover snapshot date: 2021-05-07 19:06:52


### --- Prepare Training Data ---


In [17]:
def get_training_example(date: datetime, point: tuple) -> tuple:
    """Builds one (inputs, labels) example for land cover change prediction.

    Inputs are fetched for `date`; labels are the land cover patch for the
    following day. Both patches are PATCH_SIZE pixels on a side.
    """
    label_date = date + timedelta(days=1)
    return (
        get_inputs_patch(date, point, PATCH_SIZE),
        get_labels_patch(label_date, point, PATCH_SIZE),
    )

In [24]:
import dask.distributed


def try_get_example(date: datetime, point: tuple) -> "tuple | None":
    """Wrapper to handle errors during training data generation.

    Args:
        date: Date of the example's inputs.
        point: Location of the example, forwarded to get_training_example.

    Returns:
        The (inputs, labels) pair, or ``None`` when fetching it raised. The
        failure is best-effort by design: it is printed and the example is
        skipped.
    """
    # Use the same initialization as the other worker-side functions, so patch
    # downloads go through the high-volume endpoint configured there.
    initialize_ee()
    dask.distributed.print(f"Generating training data for {date} at {point}")
    try:
        return get_training_example(date, point)
    except Exception as e:
        dask.distributed.print(f"Error occurred: {e}")
        return None

Found for date 2019-09-15 22:53:55 1 points
Getting land cover for 2020-01-10 23:05:55
Land cover snapshot date: 2020-01-10 23:11:54
Found for date 2020-01-10 23:05:55 2 points
Getting land cover for 2020-02-09 10:58:46
Land cover snapshot date: 2020-02-09 11:01:30
Found for date 2020-02-09 10:58:46 2 points
Getting land cover for 2015-07-12 18:50:28
Land cover snapshot date: 2015-07-15 09:43:08
Found for date 2015-07-12 18:50:28 2 points
Getting land cover for 2020-01-24 12:19:31
Land cover snapshot date: 2020-01-24 12:34:56
Found for date 2020-01-24 12:19:31 2 points


### --- Dask Workflow for Dataset Creation ---

In [20]:
def write_npz(data: Bag, data_path: str) -> str:
    """Writes a set of (inputs, labels) pairs into a compressed NumPy file.

    Args:
        data: (inputs, labels) pairs of NumPy arrays. Either a Dask Bag, which
            is computed first, or an already-materialized iterable such as the
            partition that ``map_partitions`` passes in.
        data_path: Directory path to save files to. Created if it is missing.

    Returns: The filename of the data file.
    """
    # Only a Bag has compute(); a partition arrives as a plain list/iterator,
    # on which calling compute() would raise AttributeError.
    examples = data.compute() if hasattr(data, "compute") else data
    # Materialize once so the pairs can be counted and iterated twice.
    examples = list(examples)
    dask.distributed.print(f"Writing {len(examples)} data points to {data_path}")
    if data_path:
        os.makedirs(data_path, exist_ok=True)
    filename = os.path.join(data_path, f"{uuid.uuid4()}.npz")
    # "x" mode refuses to overwrite an existing file.
    with open(filename, "xb") as f:
        inputs = [x for (x, _) in examples]
        labels = [y for (_, y) in examples]
        np.savez_compressed(f, inputs=inputs, labels=labels)
    logging.info(filename)
    return filename

In [21]:
# Create a Dask distributed client with default settings; run() prints this
# module-level `client`.
client = Client()
# Bare expression so the notebook renders the client summary as cell output.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 52792 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:52792/status,

0,1
Dashboard: http://127.0.0.1:52792/status,Workers: 4
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:52793,Workers: 4
Dashboard: http://127.0.0.1:52792/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:52806,Total threads: 2
Dashboard: http://127.0.0.1:52811/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:52796,
Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-mgphyn9j,Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-mgphyn9j

0,1
Comm: tcp://127.0.0.1:52804,Total threads: 2
Dashboard: http://127.0.0.1:52808/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:52797,
Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-ky8u2kz5,Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-ky8u2kz5

0,1
Comm: tcp://127.0.0.1:52805,Total threads: 2
Dashboard: http://127.0.0.1:52810/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:52798,
Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-8_b7wtyt,Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-8_b7wtyt

0,1
Comm: tcp://127.0.0.1:52807,Total threads: 2
Dashboard: http://127.0.0.1:52809/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:52799,
Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-qgzb708d,Local directory: /var/folders/cx/87dk66wd4x75lqp06c180d5m0000gn/T/dask-scratch-space/worker-qgzb708d


In [65]:
def run(data_path: str, samples: int = NUM_SAMPLES) -> None:
    """Runs the Dask workflow to generate the dataset.

    Steps:
      1. Draw `samples` random dates between START_DATE and END_DATE.
      2. Map each date to a ``(snapshot_date, coordinates)`` pair with
         sample_points.
      3. Fetch an (inputs, labels) example per pair, dropping failed ones.
      4. Write each non-empty partition of examples to an ``.npz`` file.

    Args:
        data_path: Directory the ``.npz`` files are written to.
        samples: Number of random dates to draw.
    """
    # Generate dates from the start date to the end date
    start_date = datetime.strptime(START_DATE, "%Y-%m-%d")
    end_date = datetime.strptime(END_DATE, "%Y-%m-%d")
    random_dates = [random_date(start_date, end_date) for _ in range(samples)]

    # Authenticate and initialize Earth Engine.
    initialize_ee()
    print(client)

    def write_partition(partition):
        # map_partitions concatenates whatever iterable is returned, so the
        # filename is wrapped in a list; a bare str would be split into
        # characters. Partitions left empty by the filter write nothing.
        examples = list(partition)
        if not examples:
            return []
        return [write_npz(examples, data_path)]

    points = (
        db.from_sequence(random_dates, npartitions=PARTITION_SIZE)
        .map(sample_points)
        .compute()
    )
    dask.distributed.print(f"Sampled {len(points)} points")
    if not points:
        return

    filenames = (
        db.from_sequence(points, npartitions=PARTITION_SIZE)
        # Each element is a (date, point) tuple; starmap unpacks it into the
        # two positional arguments try_get_example expects.
        .starmap(try_get_example)
        .filter(lambda example: example is not None)
        .map_partitions(write_partition)
        .compute()
    )
    logging.info("Wrote %d files to %s", len(filenames), data_path)

### --- Perform Dataset Creation ---

In [72]:
# Show INFO-level records from the root logger (write_npz logs each written
# filename with logging.info).
logging.getLogger().setLevel(logging.INFO)
# Generate the dataset with the default number of samples.
run("data/climate_change/")

<Client: 'tcp://127.0.0.1:52793' processes=4 threads=8, memory=16.00 GiB>
Found for date 2016-08-16 19:57:10 2 points
Getting land cover for 2016-08-11 01:16:50
Land cover snapshot date: 2016-08-11 01:26:58
Getting land cover for 2021-04-10 12:21:28
Land cover snapshot date: 2021-04-10 12:31:54
Getting land cover for 2020-08-30 19:05:38
Land cover snapshot date: 2020-08-30 19:14:16


Key:       ('sample_points-749fed88dfe4ed3d89f6e74302a7e814', 3)
Function:  execute_task
args:      ((<function reify at 0x114da1820>, (<function map_chunk at 0x114da1c10>, <function sample_points at 0x1255bc820>, [[datetime.datetime(2016, 12, 8, 19, 21, 19), datetime.datetime(2018, 7, 30, 20, 48, 2), datetime.datetime(2020, 11, 5, 21, 43, 54), datetime.datetime(2017, 12, 13, 5, 51, 42), datetime.datetime(2020, 11, 2, 17, 23, 27), datetime.datetime(2018, 6, 16, 22, 10, 11), datetime.datetime(2019, 12, 20, 2, 21, 36), datetime.datetime(2016, 9, 6, 23, 48, 46), datetime.datetime(2020, 8, 24, 15, 9, 57), datetime.datetime(2019, 11, 24, 10, 0, 43), datetime.datetime(2021, 1, 29, 13, 17, 56), datetime.datetime(2017, 8, 16, 7, 16, 13), datetime.datetime(2017, 9, 27, 9, 41, 28), datetime.datetime(2020, 2, 24, 12, 36, 32), datetime.datetime(2017, 7, 3, 19, 32, 48), datetime.datetime(2018, 4, 24, 21, 14, 10), datetime.datetime(2020, 6, 11, 17, 38, 52), datetime.datetime(2018, 12, 26, 5, 13, 3),

Getting land cover for 2017-12-07 11:19:10
Found for date 2020-08-30 19:05:38 2 points
Land cover snapshot date: 2017-12-07 11:44:48
Getting land cover for 2018-05-29 10:18:58
Land cover snapshot date: 2018-05-29 10:42:17
Getting land cover for 2017-11-16 19:46:40
Land cover snapshot date: 2017-11-16 19:46:54
Found for date 2017-04-03 01:58:36 2 points
Getting land cover for 2019-02-20 09:54:42
Land cover snapshot date: 2019-02-20 10:13:00
Getting land cover for 2020-02-21 20:25:14
Land cover snapshot date: 2020-02-21 20:29:23
Getting land cover for 2018-09-13 13:46:50
Land cover snapshot date: 2018-09-13 13:52:20
Found for date 2016-08-11 01:16:50 4 points
Getting land cover for 2020-09-15 16:12:48
Land cover snapshot date: 2020-09-15 16:20:53
Found for date 2018-05-29 10:18:58 1 points
Getting land cover for 2018-02-05 11:39:46
Land cover snapshot date: 2018-02-05 11:43:45
Getting land cover for 2018-04-28 05:49:34
Land cover snapshot date: 2018-04-28 06:04:35
Found for date 2019-04-

KeyboardInterrupt: 

Found for date 2018-02-05 11:39:46 1 points
Getting land cover for 2018-06-07 15:51:18
Land cover snapshot date: 2018-06-07 15:55:25
Getting land cover for 2018-02-03 06:01:32
Land cover snapshot date: 2018-02-03 06:06:32
Found for date 2015-12-08 22:59:38 2 points
Getting land cover for 2021-11-21 23:49:33
Land cover snapshot date: 2021-11-21 23:49:48
Found for date 2018-10-16 15:08:24 1 points
Getting land cover for 2017-12-11 12:57:19
Land cover snapshot date: 2017-12-11 13:10:22
Found for date 2020-09-01 02:29:20 0 points
Getting land cover for 2020-10-01 20:13:59
Land cover snapshot date: 2020-10-01 20:34:57
Getting land cover for 2017-05-05 12:38:50
Land cover snapshot date: 2017-05-05 12:57:07
Getting land cover for 2021-08-23 13:33:21
Land cover snapshot date: 2021-08-23 13:50:15
Getting land cover for 2018-04-08 13:39:16
Land cover snapshot date: 2018-04-08 13:46:36
Getting land cover for 2018-03-06 05:03:17
Land cover snapshot date: 2018-03-06 05:12:26
