In [1]:
import ee
import geemap
import os
import calendar

import rasterio
import numpy as np
from PIL import Image

# Authenticate against Earth Engine and initialise the client; every ee.* call
# in the cells below depends on this having run first.
# NOTE(review): newer earthengine-api releases may expect an explicit Cloud
# project, e.g. ee.Initialize(project=...) — confirm for the installed version.
ee.Authenticate()
ee.Initialize()

In [2]:
# ==========================================
# 1. LOAD DHS CLUSTERS FROM ASSET
# ==========================================

# Earth Engine asset holding the DHS cluster points
ASSET_ID = 'projects/integrated-hawk-485001-k3/assets/ph_dhs_sample_50'

# The asset is read as a FeatureCollection (one feature per cluster point)
dhs_points = ee.FeatureCollection(ASSET_ID)

# Sanity check: fetch the collection size to the client and report it
point_count = dhs_points.size().getInfo()
print(f"Loaded {point_count} points from asset.")

Loaded 50 points from asset.


In [3]:
# ==========================================
# 2. DEFINE BUFFER FUNCTION
# ==========================================
# We need to turn each single point into a square box for the CNN.
# 2km buffer for urban, 5km for rural

def adaptive_buffer(feature):
    """Swap a cluster point for the bounding box of a status-dependent buffer.

    The feature's 'URBAN_RURA' property is compared with the string 'U'.
    A match selects a radius of 2000, anything else 5000 (metres, per the
    notes above this cell). The feature is buffered by that radius and the
    bounds of the buffered feature are returned.

    Args:
        feature: the feature handed in by ``FeatureCollection.map``.

    Returns:
        The buffered feature reduced to its bounds.
    """
    # compareTo(...) == 0 is the server-side way of testing string equality
    status = ee.String(feature.get('URBAN_RURA'))
    matches_urban = status.compareTo('U').eq(0)

    # Urban -> 2000, otherwise -> 5000; evaluated lazily on the server
    buffer_radius = ee.Number(ee.Algorithms.If(matches_urban, 2000, 5000))

    buffered = feature.buffer(buffer_radius)
    return buffered.bounds()

# Buffer every DHS point, then pull the cluster IDs back to the client
dhs_squares = dhs_points.map(adaptive_buffer)
cluster_ids = dhs_squares.aggregate_array('DHSCLUST').getInfo()
# NOTE(review): tif_filenames is not referenced by any later cell visible here,
# and its pattern differs from the per-quarter names used by the download loop.
tif_filenames = [f"dhs_{str(c_id)}.tif" for c_id in cluster_ids]

# Visual check: original points in red, download boxes in blue
Map = geemap.Map()
Map.centerObject(dhs_points, 6)
for layer, colour, label in (
    (dhs_points, 'red', 'Original DHS Points'),
    (dhs_squares, 'blue', 'Buffered Squares (Download Area)'),
):
    Map.addLayer(layer, {'color': colour}, label)
Map

Map(center=[12.15445433292109, 122.42655412583971], controls=(WidgetControl(options=['position', 'transparent_…

In [4]:
# ==========================================
# SETUP
# ==========================================
# Root folder that receives one sub-folder per DHS cluster.
# Written relative to the home directory so that os.path.expanduser actually
# does something (it is a no-op on an absolute path) and the notebook is not
# tied to a single user account. Change this to your desired output folder.
base_output_dir = os.path.expanduser('~/Desktop/Thesis/TrainingData/Sentinel2/sample50-quarterly-2022')

# Calendar year whose four quarters are downloaded.
YEAR = 2022

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(base_output_dir, exist_ok=True)

In [5]:
# Cloud masking helper driven by the Sentinel-2 'QA60' band
def mask_s2_clouds(image):
    """Mask pixels flagged in QA60 bits 10 or 11 and rescale by 1/10000.

    A pixel is kept only when both bit 10 and bit 11 of 'QA60' are zero
    (presumably the opaque-cloud and cirrus flags — confirm against the
    dataset's band table). The masked image is then divided by 10000.

    NOTE(review): QA60 may be masked out in the catalog for a large part of
    2022; if so, every pixel of those scenes is dropped here and callers fall
    back to whatever they unmask with — confirm for the dates being used.

    Args:
        image: image exposing a 'QA60' band.

    Returns:
        The input image with the mask applied and values divided by 10000.
    """
    qa_band = image.select('QA60')

    # Each test is true where the corresponding flag bit is NOT set
    bit10_clear = qa_band.bitwiseAnd(1 << 10).eq(0)
    bit11_clear = qa_band.bitwiseAnd(1 << 11).eq(0)
    keep = bit10_clear.And(bit11_clear)

    masked = image.updateMask(keep)
    return masked.divide(10000)

In [None]:
# ==========================================
# NESTED LOOP: CLUSTERS -> QUARTERS
# ==========================================

def _quarter_date_range(year, quarter):
    """Return ``(start, end_exclusive)`` ISO date strings for quarter 1-4.

    ``filterDate`` treats its end argument as exclusive, so the end returned
    here is the first day of the FOLLOWING quarter. Passing the last day of
    the quarter (e.g. '03-31') would silently drop every scene acquired on
    that day.
    """
    first_month = 3 * (quarter - 1) + 1
    start = f"{year}-{first_month:02d}-01"
    if quarter == 4:
        end_exclusive = f"{year + 1}-01-01"
    else:
        end_exclusive = f"{year}-{first_month + 3:02d}-01"
    return start, end_exclusive


# Bands written to every exported GeoTIFF (same selection for both layers)
S2_BANDS = ['B4', 'B3', 'B2', 'B8', 'B11']

# Pull the buffered cluster features to the client so we can loop over them
features_list = dhs_squares.getInfo()['features']
print(f"Found {len(features_list)} clusters. Starting quarterly download...")

for i, feature in enumerate(features_list):
    cluster_id = str(feature['properties']['DHSCLUST'])

    # 1. One dedicated folder per cluster
    cluster_dir = os.path.join(base_output_dir, cluster_id)
    os.makedirs(cluster_dir, exist_ok=True)

    # Rebuild the server-side geometry from the client-side GeoJSON coordinates
    roi_geometry = ee.Geometry.Polygon(feature['geometry']['coordinates'])

    print(f"[{i+1}/{len(features_list)}] Processing Cluster {cluster_id}...")

    # 2. Loop through the four calendar quarters (Jan-Mar, Apr-Jun, Jul-Sep, Oct-Dec)
    for q_num in range(1, 5):
        start_date, end_date = _quarter_date_range(YEAR, q_num)

        # Naming convention: dhs_<cluster id>_<year>_Q<quarter>.tif, e.g. dhs_441_2022_Q1.tif
        filename = f"dhs_{cluster_id}_{YEAR}_Q{q_num}.tif"
        out_path = os.path.join(cluster_dir, filename)

        # Already downloaded on a previous run -> skip
        if os.path.exists(out_path):
            continue

        try:
            # 1. LOAD COLLECTION for the full quarter (end date is exclusive)
            quarterly_col = (ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
                .filterBounds(roi_geometry)
                .filterDate(start_date, end_date)
                .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 100)))

            if quarterly_col.size().getInfo() > 0:
                # 2. CREATE LAYERS
                # Preferred layer: per-pixel median of the cloud-masked scenes.
                # NOTE(review): mask_s2_clouds relies on QA60; if that band is
                # masked for these dates the median is empty and every pixel
                # comes from the backup layer below — confirm.
                best_layer = (quarterly_col
                              .map(mask_s2_clouds)
                              .select(S2_BANDS)
                              .median())

                # Fallback layer: unmasked mosaic, rescaled like the masked scenes,
                # used only where the median has no valid pixel.
                backup_layer = (quarterly_col
                                .select(S2_BANDS)
                                .mosaic()
                                .divide(10000))

                final_img = best_layer.unmask(backup_layer).clip(roi_geometry)

                # 3. DOWNLOAD
                geemap.download_ee_image(
                    image=final_img,
                    filename=out_path,
                    region=roi_geometry,
                    scale=10,
                    crs='EPSG:3857',
                    overwrite=True
                )
            else:
                print(f"  - Warning: Zero images for Q{q_num}")

        except Exception as e:
            # Best effort: report and carry on with the next quarter/cluster.
            print(f"  - Error downloading Q{q_num}: {e}")
            # A file present here can only come from this failed attempt (we
            # skip existing files above). Remove it so the next run retries
            # instead of treating a partial GeoTIFF as complete.
            if os.path.exists(out_path):
                try:
                    os.remove(out_path)
                except OSError as cleanup_error:
                    print(f"  - Could not remove partial file {out_path}: {cleanup_error}")

print("Quarterly download complete.")

Found 50 clusters. Starting quarterly download...
[1/50] Processing Cluster 441...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[2/50] Processing Cluster 450...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[3/50] Processing Cluster 442...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[4/50] Processing Cluster 491...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[5/50] Processing Cluster 512...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[6/50] Processing Cluster 558...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[7/50] Processing Cluster 563...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[8/50] Processing Cluster 600...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[9/50] Processing Cluster 532...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[10/50] Processing Cluster 651...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[11/50] Processing Cluster 638...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[12/50] Processing Cluster 667...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[13/50] Processing Cluster 699...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[14/50] Processing Cluster 727...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[15/50] Processing Cluster 705...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[16/50] Processing Cluster 843...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[17/50] Processing Cluster 784...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[18/50] Processing Cluster 832...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[19/50] Processing Cluster 876...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[20/50] Processing Cluster 895...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[21/50] Processing Cluster 909...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[22/50] Processing Cluster 948...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[23/50] Processing Cluster 955...


  0%|          |0/10 tiles [00:00<?]

  0%|          |0/10 tiles [00:00<?]

  0%|          |0/10 tiles [00:00<?]

  0%|          |0/10 tiles [00:00<?]

[24/50] Processing Cluster 1087...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[25/50] Processing Cluster 1056...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[26/50] Processing Cluster 1086...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[27/50] Processing Cluster 1055...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[28/50] Processing Cluster 1171...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[29/50] Processing Cluster 1182...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[30/50] Processing Cluster 1180...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[31/50] Processing Cluster 1228...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[32/50] Processing Cluster 355...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[33/50] Processing Cluster 393...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[34/50] Processing Cluster 346...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[35/50] Processing Cluster 405...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[36/50] Processing Cluster 361...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[37/50] Processing Cluster 349...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[38/50] Processing Cluster 76...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[39/50] Processing Cluster 138...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[40/50] Processing Cluster 79...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[41/50] Processing Cluster 74...


  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

  0%|          |0/15 tiles [00:00<?]

[42/50] Processing Cluster 56...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[43/50] Processing Cluster 67...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[44/50] Processing Cluster 31...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[45/50] Processing Cluster 63...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[46/50] Processing Cluster 216...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[47/50] Processing Cluster 164...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

[48/50] Processing Cluster 249...


  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

  0%|          |0/1 tiles [00:00<?]

[49/50] Processing Cluster 242...


  0%|          |0/20 tiles [00:00<?]

  0%|          |0/20 tiles [00:00<?]

  0%|          |0/20 tiles [00:00<?]

  0%|          |0/20 tiles [00:00<?]

[50/50] Processing Cluster 263...


  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

  0%|          |0/5 tiles [00:00<?]

Quarterly download complete.


In [None]:
# ==========================================
# 3. CHECK FOR MISSING QUARTERS
# ==========================================

def check_missing_quarters(base_dir, year=2022):
    """
    Scan the output directory and report clusters with missing quarterly images.

    Every non-hidden sub-directory of ``base_dir`` is treated as a cluster
    folder named after its cluster id. For each one, the files
    ``dhs_<cluster>_<year>_Q1.tif`` ... ``_Q4.tif`` are expected, matching the
    names written by the download loop. File names are compared
    case-insensitively so the result is the same on case-sensitive and
    case-insensitive file systems (and older lower-case ``_q`` files still count).

    Args:
        base_dir (str): Path to the folder containing cluster subfolders.
        year (int): The year to check.

    Returns:
        None. The per-cluster findings and a summary are printed.
    """
    print(f"Checking dataset in: {base_dir} for Year {year} (Quarterly)...\n")

    # Cluster folders only: skip plain files and hidden directories
    # (e.g. .ipynb_checkpoints), which would otherwise be counted as clusters.
    cluster_folders = [
        f for f in os.listdir(base_dir)
        if not f.startswith('.') and os.path.isdir(os.path.join(base_dir, f))
    ]

    # Numeric order when every folder name is an integer, else plain string order
    try:
        cluster_folders.sort(key=int)
    except ValueError:
        cluster_folders.sort()

    missing_log = {}
    total_clusters = len(cluster_folders)
    complete_count = 0

    for cluster_id in cluster_folders:
        cluster_path = os.path.join(base_dir, cluster_id)

        # One directory listing per cluster, lower-cased for case-insensitive lookup
        present = {name.lower() for name in os.listdir(cluster_path)}

        # Check quarters 1 to 4
        missing_quarters = []
        for quarter in range(1, 5):
            expected_filename = f"dhs_{cluster_id}_{year}_Q{quarter}.tif"
            if expected_filename.lower() not in present:
                missing_quarters.append(quarter)

        # Log results
        if missing_quarters:
            missing_log[cluster_id] = missing_quarters
            print(f"Cluster {cluster_id}: Missing {len(missing_quarters)} images -> Quarters {missing_quarters}")
        else:
            complete_count += 1

    # Final Summary
    print("-" * 40)
    print("SUMMARY REPORT")
    print("-" * 40)
    print(f"Total Clusters Scanned: {total_clusters}")
    print(f"Complete Clusters (4/4): {complete_count}")
    print(f"Incomplete Clusters: {len(missing_log)}")

    if len(missing_log) == 0:
        print("\nAll clusters have complete quarterly data!")
    else:
        print("\nRe-run the download script for the missing quarters.")

# ==========================================
# RUN THE CHECK
# ==========================================
# Reuse the folder and year configured in the SETUP cell so the check always
# inspects exactly what the download loop wrote, instead of a second
# hard-coded copy of the path and year that can drift out of sync.
output_folder = base_output_dir

check_missing_quarters(output_folder, year=YEAR)

Checking dataset in: /Users/ruben/Desktop/Thesis/TrainingData/Sentinel2/sample50-quarterly-2022 for Year 2022 (Quarterly)...

----------------------------------------
SUMMARY REPORT
----------------------------------------
Total Clusters Scanned: 50
Complete Clusters (4/4): 50
Incomplete Clusters: 0

✅ SUCCESS: All clusters have complete quarterly data!
