# Expected Output:

In [6]:
import sys

from shapely.geometry import box
import geopandas as gpd
import ee
import os
from osgeo import gdal

# initialize the Earth Engine client library before any ee.* call in this notebook
# NOTE(review): presumably needs credentials set up beforehand (e.g. ee.Authenticate()) - confirm
ee.Initialize()

# Constants

In [7]:
# conversion factor from meters to decimal degrees: 1/3600 of a degree (one arc-second)
# is treated as ~30 meters, so this value is approximately ONE meter expressed in degrees.
# get_bands_from_region multiplies a resolution given in meters by it.
METERS_TO_DECIMAL_DEGREES_CONST = 1/30/3600

#the value we use to signify no data at a pixel
NO_DATA_VALUE = 65535

#this is the biggest region we allow to avoid data overflow errors and keep files manageable
#(same units as the extents given to generate_region_folders, i.e. decimal degrees)
MAX_REGION_SIZE = 0.5

#the base region of interest folder
BASE_ROI_FOLDER = 'regions'
#create it when missing; exist_ok keeps this cell safe to re-run
os.makedirs(BASE_ROI_FOLDER, exist_ok=True)

In [8]:
#location of the baseline wetlands raster that gets clipped for every sub-region.
#the default is a machine-specific absolute path, so it can be overridden by setting the
#BASELINE_WETLAND_PATH environment variable instead of editing the notebook
BASELINE_WETLAND_PATH = os.environ.get(
    'BASELINE_WETLAND_PATH',
    'C://Users/ritvik/Desktop/JPLProject/data/CIFORWetlands/cifor_wetlands_colombia.tif',
)

# Data Collection Functions

In [9]:
def clip_raster_by_shapefile(source_raster, shapefile_path, save_path):
    """
    Clip a raster to the outline stored in a shapefile and write the result as a GeoTIFF.
    
    source_raster: path of the TIFF that you wish to crop
    shapefile_path: a SHP whose geometry is used as the cutline to crop the source_raster
    save_path: the eventual place to save the cropped TIFF
    
    raises RuntimeError: if the source raster cannot be opened or the clipped raster
                         cannot be produced
    """
    source_ds = gdal.Open(source_raster, gdal.GA_ReadOnly)
    
    #without gdal.UseExceptions(), a failed open is reported by returning None
    if source_ds is None:
        raise RuntimeError('could not open source raster: %s'%source_raster)
    
    options = gdal.WarpOptions(format='GTiff', cutlineDSName=shapefile_path, cropToCutline=True)
    clipped_ds = gdal.Warp(save_path, source_ds, options=options)
    warp_failed = clipped_ds is None
    
    #drop the dataset references so the GDAL bindings can flush and close them
    clipped_ds = None
    source_ds = None
    
    if warp_failed:
        raise RuntimeError('could not clip %s with %s into %s'%(source_raster, shapefile_path, save_path))

In [10]:
def generate_region_folders(minx, miny, maxx, maxy, path, max_region_size=None):
    """
    Split the given extent into a grid of equally sized sub-regions and write each
    sub-region's bounding box with GeoDataFrame.to_file to '<path>_<i>_<j>', where i
    and j are the column and row index of the sub-region.
    
    minx, miny, maxx, maxy: the extent of the region we wish to analyze
    path: the prefix (directory plus base name) where to store all the sub-region subdirectories
    max_region_size: upper bound on the side of a sub-region, in the same units as the
                     extent; defaults to MAX_REGION_SIZE
    
    raises ValueError: if the extent is empty or inverted, or max_region_size is not positive
    """
    if max_region_size is None:
        max_region_size = MAX_REGION_SIZE
    if not max_region_size > 0:
        raise ValueError('max_region_size must be positive, got %s'%max_region_size)
    
    diff_x = (maxx - minx)
    diff_y = (maxy - miny)
    
    #an inverted extent used to end in a ZeroDivisionError or in silently creating nothing
    if not (diff_x > 0 and diff_y > 0):
        raise ValueError('extent must satisfy minx < maxx and miny < maxy, got (%s, %s, %s, %s)'%(minx, miny, maxx, maxy))
    
    #number of cells per axis so that each cell is strictly smaller than max_region_size
    num_x = int(diff_x // max_region_size + 1)
    num_y = int(diff_y // max_region_size + 1)
    
    size_x = diff_x / num_x
    size_y = diff_y / num_y
    
    print(num_x, num_y, size_x, size_y)
    
    for i in range(num_x):
        for j in range(num_y):
            geo_box = box(minx+i*size_x, miny+j*size_y, minx+(i+1)*size_x, miny+(j+1)*size_y)
            #NOTE(review): the {'init': ...} CRS form is presumably deprecated in newer
            #pyproj/geopandas in favour of 'EPSG:4326' - confirm against the installed versions
            df = gpd.GeoDataFrame(geometry=[geo_box], crs={'init':'epsg:4326'})
            df.to_file('%s_%s_%s'%(path, i, j))

In [11]:
def get_bands_from_region(folders_to_process, features, gdrive_folder, date_range, aux_data, primary_dataset):
    """
    This function accepts the below parameters and queries Google Earth Engine for data. For every
    region folder it clips the baseline wetlands raster, builds one median mosaic per data source,
    stacks the mosaics into a single image and starts an export of that image to Google Drive.
    
    folders_to_process: the folders where to find the regions of interest
    features: a dictionary mapping each data source (image collection id) to the list of bands to
              include in the resulting data cubes
    gdrive_folder: the name of the folder on Google Drive to store the results
    date_range: the (start, end) date range for this data
    aux_data: dictionary of source-specific filters such as cloudy pixel percentage for Sentinel;
              maps a metadata property name to a dict with 'relation' and 'value' keys.
              An iterable of (name, dict) pairs and None are accepted as well.
    primary_dataset: the dataset to use for the eventual image resolution; must be a key of features
    
    returns: a dictionary mapping the export file name to the started Earth Engine task. If some
             data source has no image for a region, processing stops and the tasks started so far
             are returned.
    raises ValueError: if primary_dataset is not one of the data sources in features
    """
    
    #without this check the export below would fail on an unbound resolution after work was done
    if primary_dataset not in features:
        raise ValueError('primary_dataset %r is not one of the requested data sources: %s'%(primary_dataset, list(features.keys())))
    
    #iterating a dict directly yields only its keys, so take the (title, info) pairs explicitly
    if hasattr(aux_data, 'items'):
        aux_filters = list(aux_data.items())
    else:
        aux_filters = list(aux_data or [])
    
    #this will store all started tasks
    tasks = {}
    
    #work through each sub-region 
    for region_folder in folders_to_process:
        
        filtered_imgs = []
        #last path component, tolerant of a trailing separator
        region_name = os.path.basename(os.path.normpath(region_folder))
        
        print('Working on region folder: %s...'%region_name)
        
        #clip the baseline map of wetlands and store in sub-directory
        clip_raster_by_shapefile(BASELINE_WETLAND_PATH, '%s/%s.shp'%(region_folder, region_name), '%s/baseline_%s.tiff'%(region_folder, region_name))
        print('Created Baseline Wetlands Raster...')
    
        #read the area of interest
        df = gpd.read_file(region_folder)

        #get the coordinates of that area
        area_coords = [list(pair) for pair in df.geometry[0].exterior.coords[:]]

        #store the reference coordinates: (min x, max y) is the origin of the export grid
        ref_coords = (min(pair[0] for pair in area_coords), max(pair[1] for pair in area_coords))

        #create an area of interest from Earth Engine Geometry
        area_of_interest = ee.Geometry.Polygon(coords=area_coords)

        #iterate over each data source
        for data_source, bands in features.items():
                
            print('Working on data source: %s...'%data_source)
            
            #access the Earth Engine image collection with the specified bands
            data = ee.ImageCollection(data_source).select(bands)

            #filter on region and date range
            data_filtered = data.filterBounds(area_of_interest).filterDate(date_range[0], date_range[1])

            #filter on auxiliary data, if any
            for title, info in aux_filters:
                data_filtered = data_filtered.filterMetadata(title, info['relation'], info['value'])

            #ensure there is at least 1 image
            num_items = data_filtered.size().getInfo()
            if num_items == 0:
                print('no items found, returning started tasks.')
                return tasks

            band_info = data_filtered.first().getInfo()['bands'][0]

            #if crs is already EPSG 4326, get resolution directly, otherwise need to transform from meters
            if band_info['crs'] == 'EPSG:4326':
                res = band_info['crs_transform'][0]
            else:
                res = band_info['crs_transform'][0] * METERS_TO_DECIMAL_DEGREES_CONST

            #if this is the eventual primary dataset, store its resolution
            if data_source == primary_dataset:
                eventual_res = res

            #get a mosaic as median of all returned images and add it to the list
            filtered_imgs.append(ee.Image(data_filtered.median()))
        
        #generate file name
        fname = '%s-%s'%(region_name, '_'.join(list(features.keys())).replace('/','_'))
        print(fname)
        
        #add the various layers on top of each other to create a data cube with all features
        final_img = ee.Image()
        for img in filtered_imgs:
            final_img = final_img.addBands(img)
        
        #use the ALOS qa band to filter out invalid pixels (only when ALOS was requested)
        if 'qa' in features.get('JAXA/ALOS/PALSAR/YEARLY/SAR', []):
            qa_mask = final_img.select('qa').lt(51)
            final_img = final_img.where(qa_mask, NO_DATA_VALUE)
        
        #use the Sentinel-2 SCL band to filter out invalid pixels (only when Sentinel-2 was requested)
        if 'SCL' in features.get('COPERNICUS/S2_SR', []):
            scl_band = final_img.select('SCL')
            scl_nodata_vals = [0,3,6,8,9,10]
            #a pixel is invalid when its SCL class equals any of the listed values
            scl_mask = scl_band.eq(scl_nodata_vals[0])
            for v in scl_nodata_vals[1:]:
                scl_mask = scl_mask.Or(scl_band.eq(v))
            final_img = final_img.where(scl_mask, NO_DATA_VALUE)
         
        #store the result with just the needed bands (the quality bands were only used for masking)
        selected_bands = sorted([b for bands in features.values() for b in bands if b != 'qa' and b != 'SCL'])
        result = final_img.select(*selected_bands)
          
        #define the task to gather the data
        task = ee.batch.Export.image.toDrive(image=result,
                                             region=area_of_interest.getInfo()['coordinates'],
                                             description=region_name,
                                             folder=gdrive_folder,
                                             fileNamePrefix=fname,
                                             crs_transform=[eventual_res, 0.0, ref_coords[0], 0.0, -eventual_res, ref_coords[1]],
                                             crs='EPSG:4326')
        
        #start up the task
        task.start()
        
        #store the task
        tasks[fname] = task
        
        print('==================================')
    
    return tasks

# Driver Code

In [12]:
#tile the full study-area extent into sub-region folders under the base ROI folder
generate_region_folders(
    minx=-68.894,
    miny=2.296,
    maxx=-67.43,
    maxy=3.599,
    path='%s/region'%BASE_ROI_FOLDER,
)

3 3 0.48799999999999955 0.43433333333333346


In [13]:
#only sub-region folders whose name contains this fragment are processed in this run
search_area_name = 'region_2'
#os.listdir returns entries in arbitrary order; sort so tasks are always submitted (and logged)
#in the same order on every run
folders_to_process = sorted('%s/%s'%(BASE_ROI_FOLDER, item) for item in os.listdir(BASE_ROI_FOLDER) if search_area_name in item)
#data source -> bands to pull; 'qa' and 'SCL' are only used for masking invalid pixels
features = {'JAXA/ALOS/PALSAR/YEARLY/SAR': ['HH', 'HV', 'qa'], 'COPERNICUS/S2_SR': ['B2', 'B3', 'B4', 'B8', 'SCL']}
date_range = ['2017-01-01', '2019-01-01']
gdrive_folder = 'GoogleEarthEngine'
#no extra metadata filters for this run
aux_data = {}

In [14]:
#start one export task per selected sub-region; Sentinel-2 sets the output resolution
tasks = get_bands_from_region(
    folders_to_process=folders_to_process,
    features=features,
    gdrive_folder=gdrive_folder,
    date_range=date_range,
    aux_data=aux_data,
    primary_dataset='COPERNICUS/S2_SR',
)

Working on region folder: region_2_0...
Created Baseline Wetlands Raster...
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: COPERNICUS/S2_SR...
region_2_0-JAXA_ALOS_PALSAR_YEARLY_SAR_COPERNICUS_S2_SR
Working on region folder: region_2_1...
Created Baseline Wetlands Raster...
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: COPERNICUS/S2_SR...
region_2_1-JAXA_ALOS_PALSAR_YEARLY_SAR_COPERNICUS_S2_SR
Working on region folder: region_2_2...
Created Baseline Wetlands Raster...
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: COPERNICUS/S2_SR...
region_2_2-JAXA_ALOS_PALSAR_YEARLY_SAR_COPERNICUS_S2_SR


In [17]:
#report the current Earth Engine state of every export task that was started
for task_name in tasks:
    task_state = tasks[task_name].status()['state']
    print(task_name, task_state)

region_2_0-JAXA_ALOS_PALSAR_YEARLY_SAR_COPERNICUS_S2_SR COMPLETED
region_2_1-JAXA_ALOS_PALSAR_YEARLY_SAR_COPERNICUS_S2_SR COMPLETED
region_2_2-JAXA_ALOS_PALSAR_YEARLY_SAR_COPERNICUS_S2_SR COMPLETED
