<a href="https://colab.research.google.com/github/sruehr/croppingintensity/blob/main/1_export_Brazil_soybean_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Export data for analysis on multicropping, irrigation, and WUE in Brazil
### Functions of this code
   * Ingests relevant annual data (irrigation, cropping intensity, meteorology, underlying edaphic and climate conditions)
   * Crops to Brazil and masks to soybean fields that are either rainfed or pivot-irrigated
   * Resamples and reprojects to a set scale (1000 m) using bilinear interpolation for continuous variables and mode for categorical variables
   * Creates smaller subgrids for export
   * Exports each subgrid and each year as a .csv file over relevant pixels
   * Makes some plots so you know what's going on and can check it works


# 1. Set up: load packages & set variables

In [None]:
## Load packages
import ee
import pandas as pd
import ast
import geemap
import folium

# Trigger the GEE authentication flow.
ee.Authenticate()

# Initialize the library.
# NOTE: the Cloud project ID is hard-coded; replace it with a project you can use.
ee.Initialize(project='ee-sophieruehr')

In [None]:
## Input variables
# Define export region: the Brazil country geometry from the LSIB boundaries
# (the full polygon, not a bounding box; make_grid takes its bounds later)
region = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017") \
            .filter(ee.Filter.eq("country_na", "Brazil")) \
            .geometry()

# Define range of years to consider (2017 through 2023 inclusive)
years = list(range(2017, 2024))

# Define output scale (passed as `scale` to reproject/sample, i.e. metres)
scale_out = 1000

# Set export folder name (Google Drive folder used by the export tasks)
export_folder = 'BrazilCsv'

# Define a smaller region for testing purposes
test_region = ee.Geometry.Rectangle([-55, -16, -54, -15])  # lon_min, lat_min, lon_max, lat_max

# Size of export boxes in degrees of longitude/latitude
# (need to export over smaller region to reduce data size)
lon_step = 5
lat_step = 5

# 2. Define datasets of interest

In [None]:
# Input GEE collection names and relevant bands for individual variables
# Land use (band names below are prefixes; process_year appends the year)
irr_coll = 'projects/mapbiomas-public/assets/brazil/lulc/collection9/mapbiomas_collection90_irrigated_agriculture_v1'
irr_band = 'irrigated_agriculture_'
multi_coll = 'projects/mapbiomas-public/assets/brazil/lulc_10m/collection2/mapbiomas_10m_collection2_agriculture_number_cycles_v1'
multi_band = 'cycles_'
crop_coll = 'projects/mapbiomas-public/assets/brazil/lulc/collection10/mapbiomas_brazil_collection10_coverage_v2'
crop_band = 'classification_'
soy_value = 39 # Soy is designated as '39' in the dataset

# Covariates
precip_coll = 'UCSB-CHG/CHIRPS/PENTAD'
precip_band = 'precipitation' # mm/pentad, total Precip
et_coll = 'MODIS/061/MOD16A2GF'
et_band = 'ET' # kg/m^2/8day, scale = 0.1, total ET
et_qc = 'ET_QC'# Bit 0: MODLAND_QC bits; 0: Good quality; 1: Other quality (not used by the code shown here)
climate_coll = 'ECMWF/ERA5_LAND/MONTHLY_AGGR' # Mean monthly ERA climate
ta_band = 'temperature_2m'  # Mean 2m Air temp in K
dewpt_band = 'dewpoint_temperature_2m' # Mean 2m dewpoint in K

# Additional covariates (GPP, soil texture, aridity, elevation)
gpp_coll = 'MODIS/061/MYD17A2H' # Aqua GPP, 2021-2025
gpp_band = 'Gpp' # kg*C/m^2. NOTE(review): process_year multiplies the annual sum by 0.1; the catalog scale is presumably 0.0001, which would make 0.1 a conversion to g*C/m^2 — confirm
gpp_qc = 'Psn_QC' # Bit 0: 0=good quality, 1=bad quality (not used by the code shown here)
soil_coll = 'OpenLandMap/SOL/SOL_TEXTURE-CLASS_USDA-TT_M/v02' # Open Land Map
soil_band = 'b10' #  Soil texture category class (USDA system) at 10 cm depth (around median soy rooting depth) https://developers.google.com/earth-engine/datasets/catalog/OpenLandMap_SOL_SOL_TEXTURE-CLASS_USDA-TT_M_v02#bands
aridity_coll = 'projects/sat-io/open-datasets/global_ai/global_ai_yearly' # Annual aridity maps, see https://gee-community-catalog.org/projects/ai0/#paper-citation
aridity_band = 'b1' # aridity index stored as integers; multiply by 0.0001 to get the index
elevation_coll = 'CGIAR/SRTM90_V4'
elevation_band = 'elevation'

# Load collections
# Land-use assets are single multi-band images (one band per year)
irr = ee.Image(irr_coll)
multi = ee.Image(multi_coll)
crop = ee.Image(crop_coll)
# Time series that process_year filters by date and reduces per year
precip = ee.ImageCollection(precip_coll)
et = ee.ImageCollection(et_coll)
climate = ee.ImageCollection(climate_coll)
gpp = ee.ImageCollection(gpp_coll)

# Static layers (the same image is used for every year)
soil = ee.Image(soil_coll)
elevation = ee.Image(elevation_coll)
aridity = ee.Image(aridity_coll)


In [None]:
# Inspect the aridity asset's metadata (band id, data type, CRS) to confirm the band name
print(aridity.getInfo())

{'type': 'Image', 'bands': [{'id': 'b1', 'data_type': {'type': 'PixelType', 'precision': 'int', 'min': 0, 'max': 65535}, 'dimensions': [43200, 21600], 'crs': 'EPSG:4326', 'crs_transform': [0.008333333333, 0, -180, 0, -0.008333333333, 90.0000000576]}], 'version': 1662154398217605, 'id': 'projects/sat-io/open-datasets/global_ai/global_ai_yearly', 'properties': {'system:footprint': {'type': 'LinearRing', 'coordinates': [[-180, -90], [180, -90], [180, 90], [-180, 90], [-180, -90]]}, 'system:asset_size': 534568039}}


# 3. Create sub-regions so each export stays small enough to succeed

In [None]:
# OLD: ALL gridcells
# Make a grid to export across all of Brazil, but in smaller regions
def _axis_intervals(lower, upper, step):
    """Split [lower, upper) into consecutive (start, stop) pairs of width ``step``.

    The last pair is truncated so that it never extends past ``upper``.
    """
    intervals = []
    start = lower
    while start < upper:
        intervals.append((start, min(start + step, upper)))
        start += step
    return intervals


def make_grid(region, lon_step, lat_step):
    """Generate a list of rectangles covering the bounding box of ``region``.

    Args:
        region: Earth Engine geometry; only its ``bounds()`` are used, so cells
            may fall partly or wholly outside the geometry itself.
        lon_step: Cell width in degrees of longitude; must be positive.
        lat_step: Cell height in degrees of latitude; must be positive.

    Returns:
        A list of ``ee.Geometry.Rectangle`` cells ordered west to east and,
        within each column, south to north. Cells on the eastern and northern
        edges are truncated at the bounding box.

    Raises:
        ValueError: If either step is not positive (the grid loops would
            otherwise never terminate).
    """
    # Validate before the blocking getInfo() round trip to the server
    if lon_step <= 0 or lat_step <= 0:
        raise ValueError(
            f"lon_step and lat_step must be positive, got {lon_step} and {lat_step}"
        )

    bounds = region.bounds().getInfo()['coordinates'][0]  # outer ring coords

    # Get min/max lon/lat
    lons = [pt[0] for pt in bounds]
    lats = [pt[1] for pt in bounds]

    lon_intervals = _axis_intervals(min(lons), max(lons), lon_step)
    lat_intervals = _axis_intervals(min(lats), max(lats), lat_step)

    # Longitude is the outer loop: the export labels depend on this order
    return [
        ee.Geometry.Rectangle([west, south, east, north])
        for west, east in lon_intervals
        for south, north in lat_intervals
    ]

# 4. Functions to process and export data

## 4a. Function to process data

In [None]:

# Functions to grab one year of data, clip to a region, and mask to soy fields
def _resample_each_bilinear(collection):
    """Flag every image in ``collection`` for bilinear resampling.

    ee.Image.resample() relies on the image having a meaningful default
    projection, so it cannot be used on a composite; the source images have to
    be flagged before they are summed or averaged.
    """
    return collection.map(lambda img: img.resample('bilinear'))


def _mode_at_scale(image, projection, scale, region):
    """Aggregate a categorical image to ``scale`` with a mode reducer and clip it."""
    return image.reduceResolution(
        reducer=ee.Reducer.mode(),
        maxPixels=1024,
        bestEffort=True
    ).reproject(crs=projection, scale=scale).clip(region)


def process_year(year, region, scale_out=500):
    """Build the masked, resampled layers for one year over one region.

    Masks: a high-resolution pixel is kept when the crop map equals
    ``soy_value`` AND the irrigation map equals 137 or 139 (unmapped irrigation
    pixels are filled with 137 first so that rainfed fields are kept).

    Args:
        year: Calendar year as a Python int; selects the per-year land-use
            bands and the date window for the time series.
        region: Earth Engine geometry that every layer is clipped to.
        scale_out: Output pixel size passed to ``reproject``. The default is
            500; callers in this notebook pass the global ``scale_out``.

    Returns:
        A dict of ee.Image objects keyed by ``precip``, ``et``, ``climate``,
        ``irr``, ``multi``, ``mask``, ``gpp``, ``aridity``, ``elevation`` and
        ``soil``. For years before 2021 ``gpp`` is a constant-0 placeholder
        (not a measurement) so that the band always exists.
    """
    # Dates: filterDate() excludes the end date, so the window must end on
    # Jan 1 of the following year to cover all of December.
    start = ee.Date.fromYMD(year, 1, 1)
    end = start.advance(1, 'year')

    # Irrigation codes used for the mask
    rainfed_fill = 137  # value written into unmapped pixels (treated as rainfed)
    pivot_code = 139    # presumably the pivot-irrigation class — confirm against the legend

    # Land cover classes (~10m res)
    irr_i = irr.select(f"{irr_band}{year}").toInt().unmask(rainfed_fill)  # fill NA to include rainfed
    multi_i = multi.select(f"{multi_band}{year}").toInt()
    crop_i = crop.select(f"{crop_band}{year}")

    # Climate/weather variables (~500-1000m res); bilinear resampling is
    # requested per image, before the annual reduction.
    precip_i = _resample_each_bilinear(
        precip.select(precip_band).filterDate(start, end)).sum()
    et_i = _resample_each_bilinear(
        et.select(et_band).filterDate(start, end)).sum().multiply(0.1)
    climate_i = _resample_each_bilinear(
        climate.select([ta_band, dewpt_band]).filterDate(start, end)
    ).mean().subtract(273.15)  # To C

    # Static variables
    soil_i = soil.select(soil_band)
    elevation_i = elevation.select(elevation_band)
    aridity_i = aridity.select(aridity_band).multiply(0.0001)

    # Clip global vars to region
    precip_i = precip_i.clip(region)
    et_i = et_i.clip(region)
    climate_i = climate_i.clip(region)
    aridity_i = aridity_i.clip(region)
    soil_i = soil_i.clip(region)
    elevation_i = elevation_i.clip(region)

    # Common output grid for every layer.
    # NOTE(review): et_i is a composite, so this is presumably Earth Engine's
    # default geographic projection rather than the MODIS grid — confirm.
    target_proj = et_i.projection()

    # Rescale continuous vars to new resolution
    precip_res = precip_i.reproject(crs=target_proj, scale=scale_out)
    climate_res = climate_i.reproject(crs=target_proj, scale=scale_out)
    et_res = et_i.reproject(crs=target_proj, scale=scale_out)
    # Single-image sources keep a native projection, so resample() applies directly
    aridity_res = aridity_i.resample('bilinear').reproject(crs=target_proj, scale=scale_out)
    elevation_res = elevation_i.resample('bilinear').reproject(crs=target_proj, scale=scale_out)

    # Create high-res masks (~10m)
    soy_mask = crop_i.eq(soy_value)  # Must be soy fields
    irr_mask = irr_i.eq(rainfed_fill).Or(irr_i.eq(pivot_code))  # Pivot irrigation or rainfed
    mask = soy_mask.And(irr_mask)  # Both masks combined

    # Upscale categorical variables to new resolution with mode
    multi_res = _mode_at_scale(multi_i.updateMask(mask), target_proj, scale_out, region)
    irr_res = _mode_at_scale(irr_i.updateMask(mask), target_proj, scale_out, region)
    mask_res = _mode_at_scale(mask, target_proj, scale_out, region)
    soil_res = _mode_at_scale(soil_i, target_proj, scale_out, region)

    # Apply soy mask to the remaining variables
    precip_res = precip_res.updateMask(mask_res)
    et_res = et_res.updateMask(mask_res)
    climate_res = climate_res.updateMask(mask_res)
    mask_res = mask_res.updateMask(mask_res)  # drop the 0 (non-soy) pixels from the mask itself
    aridity_res = aridity_res.updateMask(mask_res)
    elevation_res = elevation_res.updateMask(mask_res)
    soil_res = soil_res.updateMask(mask_res)  # mask by soy, not by the soil class values

    # Add GPP, but only if year >= 2021
    if year >= 2021:
        # NOTE(review): 0.1 is kept from the original; it presumably converts
        # the scaled kg C/m^2 values to g C/m^2 — confirm against the catalog.
        gpp_i = _resample_each_bilinear(
            gpp.select(gpp_band).filterDate(start, end)).sum().multiply(0.1)
        gpp_res = gpp_i.reproject(crs=target_proj, scale=scale_out).clip(region)
        gpp_res = gpp_res.updateMask(mask_res)
    else:
        # For earlier years, return a dummy image of 0 under the soy mask
        gpp_res = ee.Image(0).updateMask(mask_res).rename("GPP").reproject(crs=target_proj, scale=scale_out)

    return {
      "precip": precip_res,
      "et": et_res,
      "climate": climate_res,
      "irr": irr_res,
      "multi": multi_res,
      "mask": mask_res,
      "gpp": gpp_res,
      'aridity': aridity_res,
      'elevation': elevation_res,
      'soil': soil_res
    }

## 4b. Function to export data for each year and subregion

In [None]:
# Code to export data
def export_soy_climate(year, region, subregion = '',
                       scale_out=scale_out, folder=export_folder):
    """Start a Drive export of the sampled soy/climate table for one year and region.

    Args:
        year: Calendar year to process.
        region: Earth Engine geometry to process and sample over.
        subregion: Label appended to the task description and file name.
        scale_out: Pixel size used for processing and sampling. The default is
            the value of the global ``scale_out`` when this function was defined.
        folder: Drive folder name. The default is the value of the global
            ``export_folder`` when this function was defined.

    Returns:
        The started ``ee.batch.Task``, so callers can poll its status.
        Callers that ignore the return value behave as before.
    """
    # --- process inputs ---
    layers = process_year(year, region, scale_out=scale_out)

    # --- combine into single multiband image ---
    combined = ee.Image.cat([
        layers["et"].rename('ET'),
        layers["multi"].rename('Multi'),
        layers["irr"].rename('Irr'),
        layers["precip"].rename('Precip'),
        layers["climate"].select(ta_band).rename('Tair'),
        layers["climate"].select(dewpt_band).rename('DewPoint'),
        layers["gpp"].rename('GPP'),
        layers['aridity'].rename('Aridity'),
        layers['elevation'].rename('Elevation'),
        layers['soil'].rename('Soil')
    ]).set('year', year)

    # --- sample over region ---
    sampled = combined.sample(
        region=region,
        scale=scale_out,
        geometries=True   # keep lat/lon
    )

    # --- export to Drive ---
    # Task description and file prefix share one name, so build it once
    export_name = f'soy_climate_{year}_{scale_out}_{subregion}'
    task = ee.batch.Export.table.toDrive(
        collection=sampled,
        description=export_name,
        folder=folder,
        fileNamePrefix=export_name,
        fileFormat='CSV'
    )
    task.start()
    print(f"Export started for year {year}")
    return task


# 5. Export data

In [None]:
# region = test_region  # uncomment to grid and export only the small test area

# Create subregions (rectangles of lon_step x lat_step degrees over the region's bounds)
subregions = make_grid(region, lon_step=lon_step, lat_step=lat_step)

# Check how many were created
n_subregions = len(subregions)
print(f"Number of subregions: {n_subregions}")

Number of subregions: 80


In [None]:
# Loop through subregions and start one export task per (subregion, year).
# To resume an interrupted run, set start_index to the 0-based index of the
# first subregion still to be exported; labels in file names stay 1-based (i+1).
n_subregions = len(subregions)
start_index = 0
for i, subregion in enumerate(subregions[start_index:], start=start_index):
    sub_region_str = f"{i+1:03d}"  # zero-padded label, e.g. "038"
    for y in years:
        print(f"Exporting year {y}, subregion {i+1}/{n_subregions}")
        export_soy_climate(y, region=subregion, scale_out=scale_out, subregion=sub_region_str)



Exporting year 2017, subregion 38/80
Export started for year 2017
Exporting year 2018, subregion 38/80
Export started for year 2018
Exporting year 2019, subregion 38/80
Export started for year 2019
Exporting year 2020, subregion 38/80
Export started for year 2020
Exporting year 2021, subregion 38/80
Export started for year 2021
Exporting year 2022, subregion 38/80
Export started for year 2022
Exporting year 2023, subregion 38/80
Export started for year 2023
Exporting year 2017, subregion 39/80
Export started for year 2017
Exporting year 2018, subregion 39/80
Export started for year 2018
Exporting year 2019, subregion 39/80
Export started for year 2019
Exporting year 2020, subregion 39/80
Export started for year 2020
Exporting year 2021, subregion 39/80
Export started for year 2021
Exporting year 2022, subregion 39/80
Export started for year 2022
Exporting year 2023, subregion 39/80
Export started for year 2023
Exporting year 2017, subregion 40/80
Export started for year 2017
Exporting 

# 6. Make some plots so you know what's happening

## Map to show processing of data over test area

In [None]:
# Create an interactive map centered on your test region and check that the
# combined image and its samples carry the expected bands.
year = 2023
data = process_year(year, region=test_region, scale_out=scale_out)

Map = geemap.Map(center=[-15, -54], zoom=8, basemap='SATELLITE')
# Map.addLayer(data['mask'].clip(region), {'min':0, 'max':1, 'palette':['white','blue']}, 'Soy')
Map.addLayer(data['multi'], {'min':1, 'max':3, 'palette':['white','blue']}, 'Multi output')
# Map.addLayer(multi.select(f"{multi_band}2017").toInt().clip(test_region),  {'min':1, 'max':3, 'palette':['white','blue']}, 'Multi')
# Map.addLayer(test_region, {'color': 'red'}, 'Test Region')
# Map.addLayer(data['irr'], {'min':137, 'max':141, 'palette':['white','blue']}, 'Irr')
# Map.addLayer(data['gpp'], {'min':1250, 'max':1400, 'palette':['white', 'green', 'blue']}, 'GPP')
# Map.addLayer(data['soil'], {'min':1, 'max':12, 'palette':['white','blue']}, 'Soils')
Map.addLayer(data['elevation'], {'min':400, 'max':1000, 'palette':['white','blue']}, 'Elevation')
# Map.addLayer(data['aridity'], {'min':0, 'max':2, 'palette':['blue','red']}, 'Aridity')
# Map.addLayer(data['et'], {'min':800, 'max':1200, 'palette':['lightgreen','green','darkgreen']}, 'ET')
# Map.addLayer(data['precip'], {'min':700, 'max':1800, 'palette':['lightblue','blue','darkblue']}, 'Precipitation')

combined = ee.Image.cat([
      data["et"].rename('ET'),
      data["multi"].rename('Multi'),
      data["irr"].rename('Irr'),
      data["precip"].rename('Precip'),
      data["climate"].select(ta_band).rename('Tair'),
      data["climate"].select(dewpt_band).rename('DewPoint'),
      data["gpp"].rename('GPP'),
      data['aridity'].rename('Aridity'),
      data['elevation'].rename('Elevation'),
      data['soil'].rename('Soil')
  ]).set('year', year)

info = combined.getInfo()
print([b['id'] for b in info['bands']])

# Sample over the same area the data were processed for; sampling the global
# `region` would scan all of Brazil for pixels that are masked anyway.
sampled = combined.sample(
      region=test_region,
      scale=scale_out,
      geometries=True   # keep lat/lon
  )

# Fetch only the first feature instead of pulling the whole sample client-side
first_feat = sampled.first().getInfo()
if first_feat is None:
    print("No unmasked pixels were sampled in the test region")
else:
    print(first_feat['properties'].keys())

# Must be the last expression in the cell for the notebook to render the map
Map

['ET', 'Multi', 'Irr', 'Precip', 'Tair', 'DewPoint', 'GPP', 'Aridity', 'Elevation', 'Soil']
dict_keys(['Aridity', 'DewPoint', 'ET', 'Elevation', 'GPP', 'Irr', 'Multi', 'Precip', 'Soil', 'Tair'])


## Map to show subregions

In [None]:
# Map setup
Map = geemap.Map(center=[-15, -54], zoom=4)

# Build a labelled FeatureCollection from the grid cells; labels use the same
# 1-based, zero-padded numbering as the export file names.
subregions_fc = ee.FeatureCollection([
    ee.Feature(cell, {'label': f"{idx:03d}"})
    for idx, cell in enumerate(subregions, start=1)
])

# Add subregions polygons
Map.addLayer(subregions_fc, {"color": "blue"}, "Subregions Grid")

# Add labels (one point per cell centroid carrying the cell's label property)
Map.addLayer(
    subregions_fc.map(lambda f: ee.Feature(f.geometry().centroid()).set({'label': f.get('label')})),
    {"color": "black", "pointSize": 5, "pointShape": "circle"},
    "Subregion Labels"
)

# Add Brazil boundary
Map.addLayer(region, {"color": "red"}, "Brazil Boundary")

Map

Map(center=[-15, -54], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(ch…