In [77]:
import sys

import ee
import requests
import os
from osgeo import gdal, ogr, osr
import shutil
from time import sleep
from skimage.restoration import denoise_tv_bregman
from math import ceil
from googleapiclient.discovery import build
from sklearn.ensemble import RandomForestClassifier
from scipy.signal import convolve
from sklearn.cluster import KMeans
import pickle
import matplotlib.pyplot as plt
import numpy as np

ee.Initialize()

# Constants

In [2]:
# this is about 30 meters
RESOLUTION = 1/3600

#the value we use to signify no data at a pixel
NO_DATA_VALUE = 65535

#bands to despeckle
BANDS_TO_DESPECKLE = ['HH', 'HV']

#store the training data here
TRAINING_DATA_FOLDER = 'training_data'

#store the prediction data here
PREDICTION_DATA_FOLDER = 'prediction_data'

#create training data folder if not exists
if TRAINING_DATA_FOLDER not in os.listdir():
    os.mkdir(TRAINING_DATA_FOLDER)

#create prediction data folder if not exists
if PREDICTION_DATA_FOLDER not in os.listdir():
    os.mkdir(PREDICTION_DATA_FOLDER)
    
#features to extract from GEE in the training process
FEATURES = {
            ('collection','JAXA/ALOS/PALSAR/YEARLY/SAR'): ['HH', 'HV', 'qa'], 
            ('collection', 'LANDSAT/LC08/C01/T1_8DAY_NDVI'): ['NDVI'], 
            ('collection', 'LANDSAT/LC08/C01/T1_8DAY_NDWI'): ['NDWI'], 
            ('image','CGIAR/SRTM90_V4'): ['elevation']
           }

# User Input Area

In [3]:
#the shapefile storing training polygons
TRAINING_POLYGONS_FILE = 'C:/Users/ritvik/Desktop/JPLProject/mapping-colombia-wetlands/training_polygons/training_polygons.shp'

In [4]:
#the shapefile storing prediction polygons
PREDICTION_POLYGONS_FILE = 'C:/Users/ritvik/Desktop/JPLProject/mapping-colombia-wetlands/prediction_polygons/prediction_polygons.shp'

In [5]:
#the list of bands to use for training. Choose from:
#Landsat: ['NDVI', 'NDWI']
#ALOS-2: ['HH', 'HV']
#CGIAR: ['elevation']

SELECTED_BANDS = ['NDVI', 'NDWI', 'HH', 'HV', 'elevation']
NUM_FEATURES = len(SELECTED_BANDS)

In [6]:
#any pixels above this elevation (in meters) will be disregarded from training 
MAX_CONSIDERED_ELEVATION = 50

In [7]:
#the folder id in Google Drive where to temporarily store the GEE data before locally downloading
GOOGLE_EARTH_ENGINE_GDRIVE_FOLDER_ID = '1KvlrUHs_rN7xPlw53qtd9pweeLwmrJSP'

#the name of that same Google Drive folder
GDRIVE_FOLDER_NAME = 'GoogleEarthEngine'

In [8]:
#data date range
DATE_RANGE = ['2017-01-01', '2020-01-01']

In [170]:
#method to use for prediction. Choices are 'histogram' or 'random_forest'
METHOD = 'histogram'

# Functions to Manipulate data from Google Drive

In [10]:
def download_file_from_google_drive(file_id, destination):
    URL = "https://docs.google.com/uc?export=download"
    
    max_tries = 10
    curr_try = 0
    status_code = -1
    
    while status_code != 200 and curr_try < max_tries:
        if curr_try > 0:
            sleep(30)
        session = requests.Session()
        response = session.get(URL, params = { 'id' : file_id }, stream = True)
        status_code = response.status_code
        curr_try += 1
        
    if status_code != 200:
        return
    
    token = get_confirm_token(response)

    if token:
        params = { 'id' : file_id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

In [11]:
def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

In [12]:
def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

In [13]:
def get_file_ids_from_google_drive():
    creds = None
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    service = build('drive', 'v3', credentials=creds)

    result = service.files().list(q="parents in '%s'"%GOOGLE_EARTH_ENGINE_GDRIVE_FOLDER_ID).execute()

    file_name_to_file_id = {info['name'].split('-')[0]: info['id'] for info in result['files'] if len(info['name'].split('-')) == 2}
    
    return file_name_to_file_id

In [14]:
def delete_file_from_google_drive_by_file_id(fid):
    creds = None
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    service = build('drive', 'v3', credentials=creds)
    
    service.files().delete(fileId=fid).execute()

# Functions to Download Data From Google Earth Engine

In [15]:
def get_training_data(polygon_features, features, gdrive_folder, date_range, selected_bands):
    """
    This function accepts the below parameters and querys Google Earth Engine for data. The data is stored in 
    Google Drive.
    
    Inputs:
        polygon_features: a list of pairs like (index, polygon feature) indicating which polygons of data to download
        features: the features to extract from GEE
        gdrive_folder: the name of the folder in Google Drive where the downloaded data will live
        date_range: the start and end date to gather data
        selected_bands: list of bands to subset
        
    Output:
        list of tasks which are ready to be started
    """
    
    #this will store all started tasks
    tasks = {}
    
    #work through each sub-region 
    for curr_idx, polygon_feature in polygon_features:
        
        skip_polygon = False
        
        filtered_imgs = []

        #store the reference coordinates
        x1 = polygon_feature.GetGeometryRef().GetEnvelope()[0]
        y1 = polygon_feature.GetGeometryRef().GetEnvelope()[2]
        ref_coords = (x1,y1)
        
        #get polygon area coordinates
        area_coords = [list(pair) for pair in polygon_feature.GetGeometryRef().GetBoundary().GetPoints()]

        #create an area of interest from Earth Engine Geometry
        area_of_interest = ee.Geometry.Polygon(coords=area_coords)

        #iterate over each data source
        for data_type_source, bands in features.items():
            data_type = data_type_source[0]
            data_source = data_type_source[1]
                
            print('Working on data source: %s...'%data_source)
            
            if data_type == 'collection':
                #access the Earth Engine image collection with the specified bands
                data = ee.ImageCollection(data_source).select(bands)

                #filter on date range and area of interest
                data_filtered = data.filterBounds(area_of_interest).filterDate(date_range[0], date_range[1])
                
                #limit on cloud cover if LANDSAT
                if data_source == 'LANDSAT/LC08/C01/T1_SR':
                    data_filtered = data_filtered.filterMetadata('CLOUD_COVER', 'less_than', 20)
                    
                #ensure there is at least 1 image
                num_items = data_filtered.size().getInfo()
                if num_items == 0:
                    skip_polygon = True
                    break

                #if LANDSAT NDVI band, get quality mosaic by that band
                if 'LANSAT' in data_source:
                    mosaic = data_filtered.qualityMosaic(data_source.split('_')[-1])
                #otherwise just do a simple median
                else:
                    mosaic = data_filtered.median()
                   
            elif data_type == 'image':
                mosaic = ee.Image(data_source).select(bands)

            #add this mosaic to the list
            filtered_imgs.append(mosaic)
        
        if skip_polygon:
            print('Skipping %s'%fname)
            tasks[fname] = None
            print('==================================')
            continue
            
        #generate file name
        features_str = '_'.join([item[1] for item in features.keys()]).replace('/','_')
        fname = '%s-%s'%(curr_idx, features_str)
        print(fname)
        
        #add the various layers on top of each other to create a data cube with all features
        final_img = ee.Image()
        
        for img in filtered_imgs:
            final_img = ee.Image.addBands(final_img,img)
        
        #use the ALOS qa band to filter out invalid pixels
        if 'qa' in features[('collection','JAXA/ALOS/PALSAR/YEARLY/SAR')]:
            qa_band = final_img.select('qa')
            qa_mask = qa_band.eq(0)
            final_img = final_img.where(qa_mask, NO_DATA_VALUE)
            
        #use the SRTM elevation band to filter out invaild pixels
        if 'elevation' in features['image','CGIAR/SRTM90_V4']:
            elevation_band = final_img.select('elevation')
            elevation_mask = elevation_band.gt(MAX_CONSIDERED_ELEVATION)
            final_img = final_img.where(elevation_mask, NO_DATA_VALUE)
            
        #if any of the selected bands has NO_DATA_VALUE, mark that whole pixel as NO_DATA_VALUE
        for b in selected_bands:
            b_values = final_img.select(b)
            b_mask = b_values.eq(NO_DATA_VALUE)
            final_img = final_img.where(b_mask, NO_DATA_VALUE)
         
        #store the result with just the needed bands
        selected_bands = sorted(selected_bands)
        result = final_img.select(*selected_bands).float()
          
        #define the task to gather the data
        task = ee.batch.Export.image.toDrive(image=result,
                                             region=area_of_interest.getInfo()['coordinates'],
                                             description=str(curr_idx),
                                             folder=gdrive_folder,
                                             fileNamePrefix=fname,
                                             crs_transform=[RESOLUTION, 0.0, ref_coords[0], 0.0, -RESOLUTION, ref_coords[1]],
                                             crs='EPSG:4326')
        
        #store the task
        tasks[fname] = task
        
        print('==================================')
        
    return list(tasks.items())

In [16]:
def execute_tasks_in_batches(tasks, batch_size, FOLDER):
    """
    Executes a list of tasks in batches
    
    Inputs:
        tasks: list of tasks
        batch_size: number of tasks to execute per batch
        FOLDER: the folder to store the downloaded rasters
    """
    
    #create mapping of polygon to fname
    polygon_to_fname = {}

    #process the tasks in small batches to avoid memory running out
    for batch_idx in range(ceil(len(tasks) / batch_size)):

        #get the current batch of tasks
        curr_tasks = tasks[batch_size*batch_idx:batch_size*(batch_idx+1)]
        print('Processing Batch %s'%(batch_idx+1))

        #start all tasks in that batch
        for name,task in curr_tasks:
            if task != None:
                task.start()

        print('Started all tasks in batch')

        #wait until all tasks in that batch are done
        curr_states = [task.status()['state'] for name,task in curr_tasks if task != None]
        while 'RUNNING' in set(curr_states) or 'READY' in set(curr_states):
            print('Current states: %s'%curr_states)
            sleep(30)
            curr_states = [task.status()['state'] for name,task in curr_tasks if task != None]

        #once all tasks done, get their file ids on google drive
        file_name_to_file_id = get_file_ids_from_google_drive()

        #for each file...
        for fname, fid in file_name_to_file_id.items():

            #get feature file name
            features_file_name = '%s/features_%s.tiff'%(FOLDER, fname)

            #check if data already downloaded
            print('Downloading %s from Drive'%fname)
            download_file_from_google_drive(fid, features_file_name)

            print('Deleting %s from Drive'%fname)
            delete_file_from_google_drive_by_file_id(fid)

        print('================================')

# Despeckling and Clustering Functions

In [17]:
def img_to_db(img):
    return 10 * np.log10(img)

def db_to_img(img):
    return 10**(img / 10)

def tv_denoise(arr, idxs_to_despeckle, weight):
    copy_arr = arr.copy()
    for idx in idxs_to_despeckle:
        #get the layer
        layer = copy_arr[:,:,idx]
        
        orig_valid_mask = ~np.isnan(layer)
        
        #denoise
        img_db = img_to_db(layer)
        img_db_tv = denoise_tv_bregman(img_db, weight)
        img_tv = db_to_img(img_db_tv)
        img_tv[orig_valid_mask & np.isnan(img_tv)] = layer[orig_valid_mask & np.isnan(img_tv)]
        
        #set denoised into copy of array
        copy_arr[:,:,idx] = img_tv
        
    return copy_arr

In [18]:
def get_sep_metric(feat_file_name, num_clusters_options, band_names_options):
    """
    Get the separability metric for the given array and given possible clusters
    
    Inputs:
        feat_file_name: path to file to analyze
        num_clusters_options: list of number of clusters to try
        band_names_options: name of the bands to try for clustering
        
    """
    
    #read the array
    ds = gdal.Open(feat_file_name, gdal.GA_ReadOnly)

    #get the band names
    band_names = [ds.GetRasterBand(idx+1).GetDescription() for idx in range(ds.RasterCount)]

    #get gt and read array
    gt = ds.GetGeoTransform()
    arr = ds.ReadAsArray()
    arr = np.stack([arr[i] for i in range(arr.shape[0])], axis=-1)
    ds = None

    #transform array to 2d
    arr_2d = arr.reshape(-1,arr.shape[-1])
    arr_2d[arr_2d == NO_DATA_VALUE] = np.nan
    valid_indices = np.where(np.all(~np.isnan(arr_2d), axis=-1))[0]
    arr_2d_valid = arr_2d[valid_indices]
    
    sep_metric_dict = {}
    
    for k in num_clusters_options:
        for b in band_names_options:
            
            band_idx = band_names.index(b)
        
            #define model
            model = KMeans(n_clusters=k)

            #fit model and predict clusters
            cluster_preds = model.fit_predict(arr_2d_valid[:,[band_idx]])

            mu_vals = np.zeros(k)
            dev_vals = np.zeros(k)

            for cid in range(k):
                cluster_data = arr_2d_valid[cluster_preds == cid]
                mu, dev = cluster_data.mean(), cluster_data.std()
                mu_vals[cid] = mu
                dev_vals[cid] = dev

            sep_metric_vals = []
            for c1 in range(k):
                for c2 in range(c1+1,k):
                    sep_metric_vals.append(abs(mu_vals[c1] - mu_vals[c2]) / (dev_vals[c1] + dev_vals[c2]))
                    
            sep_metric_dict[(b,k)] = {'sep_metric_vals': sep_metric_vals, 'cluster_preds': cluster_preds}
    
    best_sep_entry = sorted(sep_metric_dict.items(), key=lambda info: -np.min(info[1]['sep_metric_vals']))[0]
    print('Best Separating Params: %s'%str(best_sep_entry[0]))
    
    result = np.ones(arr_2d.shape[0])*-1
    result[valid_indices] = best_sep_entry[1]['cluster_preds']
    result = result.reshape(arr.shape[:2])

    ds = np_array_to_raster('%s_suggested.tiff'%feat_file_name.split('.')[-2], result, gt, no_data=-1, nband=1, gdal_data_type=gdal.GDT_Float64)
    ds = None

# Functions to Process Feature Files

In [19]:
def process_feature_files(feat_file_names, confidence_levels=None, preprocess=True):
    """
    This function accepts a list of file names and processes those rasters. 
    
    Inputs:
        feat_file_names: a list of names of the downloaded training data files
        confidence_levels: a list of confidence levels associated with each file in feat_file_names
        preprocess: whether to remove some pixels that likely do not belong
        
    Outputs:
        the processed training data and auxilary data such as geotransforms
    """
    
    #this will store the numpy array of training data for each file
    data = {}
    
    #this will store auxilary data for each file
    feat_file_data = {}
    
    #iterate over each file
    for feat_file_name in feat_file_names:

        #open file and get geotransform
        ds = gdal.Open(feat_file_name, gdal.GA_ReadOnly)
        try:
            gt = ds.GetGeoTransform()
        except AttributeError:
            print('Could not process %s'%feat_file_name)
            continue

        #despeckle any bands which need to be despeckled
        idx_to_despeckle = [idx for idx in range(ds.RasterCount) if ds.GetRasterBand(idx+1).GetDescription() in BANDS_TO_DESPECKLE]
        arr = ds.ReadAsArray()
        arr = np.stack([arr[i] for i in range(arr.shape[0])], axis=-1)
        data_mask = (arr == NO_DATA_VALUE) | np.isnan(arr)
        arr[data_mask] = NO_DATA_VALUE

        arr = tv_denoise(arr, idx_to_despeckle, 1)
        arr[data_mask] = np.nan

        ds = None
        
        arr_2d = arr.reshape(-1,arr.shape[-1])
        valid_indices = np.where(np.all(~np.isnan(arr_2d), axis=-1))[0]
    
        if preprocess:
            #define model
            model = KMeans(n_clusters=2)

            #fit model and predict clusters
            try:
                cluster_preds = model.fit_predict(arr_2d[valid_indices])
            except ValueError:
                print('Could not process %s'%feat_file_name)
                continue

            #get main cluster indices
            main_cluster = np.median(cluster_preds)
            main_cluster_indices = np.where(cluster_preds == main_cluster)[0]

            #refine chosen indices
            valid_indices = valid_indices[main_cluster_indices]

        #add this training data to the list
        data[feat_file_name] = arr_2d[valid_indices]
        
        #add auxilary information
        feat_file_data[feat_file_name] = {'chosen_indices': valid_indices, 'shape': arr.shape, 'gt': gt}
        
    #if confidence levels supplied, then sample according to those levels
    if confidence_levels != None:
        num_pixels_to_sample = np.median([item.shape[0] for item in data.values()])
        data = {fname: ds[np.random.choice(ds.shape[0], int(num_pixels_to_sample*confidence_levels[fname]))] for fname,ds in data.items()}

    return data, feat_file_data

In [20]:
def create_histograms(training_datasets):
    """
    Creates a set of histograms, one for each class
    
    Inputs:
        training_histograms: a dictionary mapping class_id to a dataset
        
    Outputs:
        a dictionary mapping class_id to a histogram
        a list of histogram bin cuttoffs for each feature
    """
    
    min_feature_values = [min([min(dataset[:,idx]) for dataset in training_datasets.values()]) for idx in range(NUM_FEATURES)]
    max_feature_values = [max([max(dataset[:,idx]) for dataset in training_datasets.values()]) for idx in range(NUM_FEATURES)]
    
    histogram_ranges = []
    training_histograms = {}
    num_bins = 5

    for idx in range(NUM_FEATURES):
        width = (max_feature_values[idx] - min_feature_values[idx]) / num_bins
        histogram_ranges.append(np.arange(min_feature_values[idx], max_feature_values[idx]+width*.99, width))
    histogram_ranges = np.array(histogram_ranges)

    for class_id in class_ids:
        training_histograms[class_id] = np.histogramdd(training_datasets[class_id], bins=histogram_ranges, density=True)[0] 
        
    return training_histograms, histogram_ranges

In [21]:
def get_classes(test_set, training_histograms, histogram_ranges, class_ids, frac=0.25):
    """
    This function accepts a data set and histograms and classifies each pixel and assigns a score
    
    Inputs:
        test_set: the test set of features which we would like to classify
        training_histograms: a dictionary of histograms, one for each class
        histogram_ranges: the bin cuttoffs for the histograms
        class_ids: a set of class ids
        frac: between 0 and 1. Higher values mean we require more confidence to classify a pixel as non-NaN
        
    Output:
        an array of predicted classes and corresponding scores
    """
    
    #this will store the probabilities for each class
    class_id_to_probs = {}
    
    #any probability density below this is considered as 0
    min_allowable_density = min([frac*np.max(training_histograms[cid]) for cid in class_ids])
    
    #iterate over each histogram
    for class_id, histogram in training_histograms.items():
        
        #get the position of each test pixel in the context of this histogram
        transposed_ranges = np.transpose(histogram_ranges)
        expanded_dims_ranges = np.expand_dims(transposed_ranges, axis=1)
        extended_ranges = np.concatenate([expanded_dims_ranges for _ in range(test_set.shape[0])], axis=1)
        diffs = extended_ranges - test_set
    
        #get the probability density at each position
        indices = np.argmax(diffs > 0, axis=0) - 1
        indices[indices < 0] = 0
        indices = indices - np.all((diffs>0)==False, axis=0)
        indices = tuple(np.transpose(indices))
        probs = histogram[indices]
        
        #store this in the dictionary
        class_id_to_probs[class_id] = probs
      
    #create matrix of probabilities for each class
    prob_mtx = np.stack([class_id_to_probs[cid] for cid in class_ids], axis=-1)
    
    #sort matrix of probs
    sorted_probs = np.sort(prob_mtx, axis=1)
    
    #any pixel where all classes have 0 probability is NaN
    nan_indices = np.where(sorted_probs[:,-1] < min_allowable_density)[0]
    
    #compute scores based on ratio of most likely class to second most likely class
    scores = 1/(1+np.exp(-(sorted_probs[:,-1] / sorted_probs[:,-2])))
    
    #apply NaN pixels
    scores[nan_indices] = np.nan

    #get the predicted class
    pred_class = np.argmax(prob_mtx, axis=1).astype(float)
    
    #apply NaN pixels
    pred_class[nan_indices] = np.nan
    
    return pred_class, scores

# Numpy Array to TIFF Functions

In [22]:
def create_raster(output_path, columns, rows, nband=1, gdal_data_type=gdal.GDT_Int32, driver=r'GTiff'):
    ''' 
    returns gdal data source raster object 
    '''
    
    # create driver
    driver = gdal.GetDriverByName(driver)

    output_raster = driver.Create(output_path, columns, rows, nband, eType = gdal_data_type)    
    
    return output_raster

def np_array_to_raster(output_path, arr, geotransform, no_data=None, nband=1, gdal_data_type=gdal.GDT_Int32, spatial_reference_system_wkid=4326, driver=r'GTiff'):
    ''' 
    returns a gdal raster data source

    keyword arguments:

    output_path -- full path to the raster to be written to disk
    numpy_array -- numpy array containing data to write to raster
    upper_left_tuple -- the upper left point of the numpy array (should be a tuple structured as (x, y))
    cell_resolution -- the cell resolution of the output raster
    no_data -- value in numpy array that should be treated as no data
    nband -- the band to write to in the output raster
    gdal_data_type -- gdal data type of raster (see gdal documentation for list of values)
    spatial_reference_system_wkid -- well known id (wkid) of the spatial reference of the data
    driver -- string value of the gdal driver to use
    '''

    rows, columns = arr.shape[0], arr.shape[1]

    # create output raster
    output_raster = create_raster(output_path, columns, rows, nband, gdal_data_type) 

    spatial_reference = osr.SpatialReference()
    spatial_reference.ImportFromEPSG(spatial_reference_system_wkid)
    output_raster.SetProjection(spatial_reference.ExportToWkt())
    output_raster.SetGeoTransform(geotransform)
    
    for band_idx in range(1,nband+1):
        output_band = output_raster.GetRasterBand(band_idx)
        if no_data != None:
            output_band.SetNoDataValue(no_data)
        if nband > 1:
            output_band.WriteArray(arr[:,:,band_idx-1])
        else:
            output_band.WriteArray(arr)
        output_band.FlushCache() 
        output_band.ComputeStatistics(False)

    if os.path.exists(output_path) == False:
        raise Exception('Failed to create raster: %s' % output_path)

    return output_raster

# Driver Code : Download Prediction Regions

In [23]:
driver = ogr.GetDriverByName('ESRI Shapefile')
dataSource = driver.Open(PREDICTION_POLYGONS_FILE, gdal.GA_ReadOnly)
layer = dataSource.GetLayer()

prediction_polygon_features = [layer.GetNextFeature() for _ in range(layer.GetFeatureCount())]
prediction_polygon_features_to_process = [(idx,f) for idx,f in enumerate(prediction_polygon_features) if 'features_%s.tiff'%(idx) not in os.listdir(PREDICTION_DATA_FOLDER)]

In [24]:
tasks = get_training_data(prediction_polygon_features_to_process, FEATURES, GDRIVE_FOLDER_NAME, DATE_RANGE, SELECTED_BANDS)

Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
0-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
1-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
2-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARL

In [25]:
execute_tasks_in_batches(tasks, 3, PREDICTION_DATA_FOLDER)

Processing Batch 1
Started all tasks in batch
Current states: ['READY', 'READY', 'READY']
Current states: ['RUNNING', 'RUNNING', 'READY']
Current states: ['RUNNING', 'RUNNING', 'READY']
Current states: ['RUNNING', 'RUNNING', 'RUNNING']
Current states: ['RUNNING', 'RUNNING', 'RUNNING']
Current states: ['RUNNING', 'RUNNING', 'RUNNING']
Current states: ['RUNNING', 'RUNNING', 'RUNNING']
Current states: ['COMPLETED', 'RUNNING', 'RUNNING']
Current states: ['COMPLETED', 'RUNNING', 'RUNNING']
Current states: ['COMPLETED', 'RUNNING', 'RUNNING']
Current states: ['COMPLETED', 'RUNNING', 'RUNNING']
Current states: ['COMPLETED', 'RUNNING', 'COMPLETED']
Current states: ['COMPLETED', 'RUNNING', 'COMPLETED']
Current states: ['COMPLETED', 'RUNNING', 'COMPLETED']
Current states: ['COMPLETED', 'RUNNING', 'COMPLETED']
Downloading 1 from Drive
Deleting 1 from Drive
Downloading 2 from Drive
Deleting 2 from Drive
Downloading 0 from Drive
Deleting 0 from Drive
Processing Batch 2
Started all tasks in batch
Cur

In [26]:
prediction_feat_file_names = ['%s/%s'%(PREDICTION_DATA_FOLDER, fname) for fname in os.listdir(PREDICTION_DATA_FOLDER) if 'suggested' not in fname and 'predicted' not in fname]
prediction_data, prediction_feat_file_data = process_feature_files(prediction_feat_file_names, confidence_levels=None, preprocess=False)
print('Processed Prediction Regions')

  


Processed Prediction Regions


# Driver Code : Get Suggested Number of Classes

In [27]:
num_clusters_options = [3,4]

sep_bands = [b for b in SELECTED_BANDS if b not in BANDS_TO_DESPECKLE]

for feat_file_name in prediction_feat_file_names:
    print('Processing %s'%feat_file_name)
    get_sep_metric(feat_file_name, num_clusters_options, sep_bands)
    print('----------------------')

Processing prediction_data/features_0.tiff
Best Separating Params: ('NDWI', 3)
----------------------
Processing prediction_data/features_1.tiff
Best Separating Params: ('elevation', 3)
----------------------
Processing prediction_data/features_2.tiff
Best Separating Params: ('elevation', 3)
----------------------
Processing prediction_data/features_3.tiff
Best Separating Params: ('elevation', 3)
----------------------
Processing prediction_data/features_4.tiff
Best Separating Params: ('elevation', 4)
----------------------
Processing prediction_data/features_5.tiff
Best Separating Params: ('NDVI', 3)
----------------------
Processing prediction_data/features_6.tiff
Best Separating Params: ('NDVI', 3)
----------------------
Processing prediction_data/features_7.tiff
Best Separating Params: ('elevation', 3)
----------------------
Processing prediction_data/features_8.tiff
Best Separating Params: ('NDWI', 4)
----------------------


# Driver Code : Download Training Data

In [28]:
driver = ogr.GetDriverByName('ESRI Shapefile')

dataSource = driver.Open(TRAINING_POLYGONS_FILE, gdal.GA_ReadOnly)

layer = dataSource.GetLayer()

training_polygon_features = [layer.GetNextFeature() for _ in range(layer.GetFeatureCount())]
training_polygon_features_to_process = [(idx,f) for idx,f in enumerate(training_polygon_features) if 'features_%s.tiff'%idx not in os.listdir(TRAINING_DATA_FOLDER)]

In [29]:
tasks = get_training_data(training_polygon_features_to_process, FEATURES, GDRIVE_FOLDER_NAME, DATE_RANGE, SELECTED_BANDS)

Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
0-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
1-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
2-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARL

Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
23-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
24-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
25-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1

Working on data source: CGIAR/SRTM90_V4...
46-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
47-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
48-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...


Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
70-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
71-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDVI...
Working on data source: LANDSAT/LC08/C01/T1_8DAY_NDWI...
Working on data source: CGIAR/SRTM90_V4...
72-JAXA_ALOS_PALSAR_YEARLY_SAR_LANDSAT_LC08_C01_T1_8DAY_NDVI_LANDSAT_LC08_C01_T1_8DAY_NDWI_CGIAR_SRTM90_V4
Working on data source: JAXA/ALOS/PALSAR/YEARLY/SAR...
Working on data source: LANDSAT/LC08/C01/T1

In [31]:
execute_tasks_in_batches(tasks, 5, TRAINING_DATA_FOLDER)

Processing Batch 1
Started all tasks in batch
Current states: ['READY', 'READY', 'READY', 'READY', 'READY']
Current states: ['COMPLETED', 'COMPLETED', 'READY', 'READY', 'READY']
Current states: ['COMPLETED', 'COMPLETED', 'COMPLETED', 'RUNNING', 'RUNNING']
Downloading 4 from Drive
Deleting 4 from Drive
Downloading 3 from Drive
Deleting 3 from Drive
Downloading 2 from Drive
Deleting 2 from Drive
Downloading 1 from Drive
Deleting 1 from Drive
Downloading 0 from Drive
Deleting 0 from Drive
Processing Batch 2
Started all tasks in batch
Current states: ['READY', 'READY', 'READY', 'READY', 'READY']
Current states: ['COMPLETED', 'COMPLETED', 'RUNNING', 'READY', 'READY']
Current states: ['COMPLETED', 'COMPLETED', 'COMPLETED', 'RUNNING', 'COMPLETED']
Downloading 8 from Drive
Deleting 8 from Drive
Downloading 9 from Drive
Deleting 9 from Drive
Downloading 7 from Drive
Deleting 7 from Drive
Downloading 5 from Drive
Deleting 5 from Drive
Downloading 6 from Drive
Deleting 6 from Drive
Processing Bat

Deleting 82 from Drive
Downloading 81 from Drive
Deleting 81 from Drive
Downloading 80 from Drive
Deleting 80 from Drive
Processing Batch 18
Started all tasks in batch
Current states: ['READY']
Downloading 85 from Drive
Deleting 85 from Drive


In [56]:
training_feat_file_names = ['%s/%s'%(TRAINING_DATA_FOLDER, fname) for fname in os.listdir(TRAINING_DATA_FOLDER) if 'suggested' not in fname and 'predicted' not in fname]
confidence_levels = {name: training_polygon_features[int(''.join([i for i in name if i.isdigit()]))].GetField('confidence') for name in training_feat_file_names}

#this creates a dictionary mapping training data file name to the corresponding processed data set
input_to_dataset, _ = process_feature_files(training_feat_file_names, confidence_levels, preprocess=True)

Could not process training_data/features_52.tiff
Could not process training_data/features_55.tiff


In [57]:
#this dictionary maps prediction data file name to class id to input data file name
pred_to_class_to_input = {'%s/%s'%(PREDICTION_DATA_FOLDER, pred_fname): {} for pred_fname in os.listdir(PREDICTION_DATA_FOLDER) if 'suggested' not in pred_fname and 'predicted' not in pred_fname}

#for each training polygon...
for idx_input, input_poly in enumerate(training_polygon_features):
    #check if its data downloaded successfully
    if 'features_%s.tiff'%idx_input not in os.listdir(TRAINING_DATA_FOLDER):
        continue
        
    #get file name
    input_fname = '%s/features_%s.tiff'%(TRAINING_DATA_FOLDER, idx_input)
  
    #for each prediction polygon
    for idx_pred, pred_poly in enumerate(prediction_polygon_features):
        #check if its data downloaded successfully
        if 'features_%s.tiff'%idx_pred not in os.listdir(PREDICTION_DATA_FOLDER):
            continue
        
        #get file name
        pred_fname = '%s/features_%s.tiff'%(PREDICTION_DATA_FOLDER, idx_pred)
        
        #check if this training polygon inside the prediction polygon
        if input_poly.GetGeometryRef().Centroid().Within(pred_poly.GetGeometryRef()):
            
            #get class id
            class_id = input_poly.GetField('class_id')
            
            #add this training file to files belonging to this prediction file
            if class_id in pred_to_class_to_input[pred_fname]:
                pred_to_class_to_input[pred_fname][class_id].append(input_fname)
            else:
                pred_to_class_to_input[pred_fname][class_id] = [input_fname]

In [58]:
#for each prediction file, construct full dataset
for pred_fname, mapping_data in pred_to_class_to_input.items():
    for class_id in pred_to_class_to_input[pred_fname]:
        pred_to_class_to_input[pred_fname][class_id] = \
        np.concatenate([input_to_dataset[fname] for fname in pred_to_class_to_input[pred_fname][class_id] if fname in input_to_dataset], axis=0)

# Driver Code : Classify Prediction Regions

In [64]:
#create empty histograms and histogram ranges dictionaries
histograms_dict = {}
histogram_ranges_dict = {}

random_forest_clf_dict = {}


for pred_fname in pred_to_class_to_input:
    
    class_ids = sorted(list(pred_to_class_to_input[pred_fname].keys()))
    
    #create histograms and histogram ranges for this prediction polygon
    histograms, histogram_ranges = create_histograms(pred_to_class_to_input[pred_fname])
    histograms_dict[pred_fname] = histograms
    histogram_ranges_dict[pred_fname] = histogram_ranges
    
    #store constructed datasets
    features = np.concatenate([pred_to_class_to_input[pred_fname][cid] for cid in class_ids], axis=0)
    labels = np.array([single for item in [[cid]*len(pred_to_class_to_input[pred_fname][idx]) for idx,cid in enumerate(class_ids)] for single in item])
    
    #fit a random forest classifier
    clf = RandomForestClassifier()
    clf.fit(features, labels)
    random_forest_clf_dict[pred_fname] = clf

In [171]:
if METHOD == 'histogram':
    #get the predicted classes and scores for histogram method
    pred_class_scores = [get_classes(d, histograms_dict[fname], histogram_ranges_dict[fname], sorted(list(histograms_dict[fname].keys())), 0) for fname,d in prediction_data.items()]
elif METHOD == 'random_forest':
    #get the predicted classes and scores for random forest method
    pred_class_scores = [[random_forest_clf_dict[fname].predict(d), np.sort(random_forest_clf_dict[fname].predict_proba(d), axis=1)[:,-2:]] for fname,d in prediction_data.items()]
    scores = [2/(1+np.exp(1-item[1][:,-1] / item[1][:,-2]))-1 for item in pred_class_scores]
    pred_class_scores = [[pred_class_scores[idx][0], scores[idx]] for idx in range(len(scores))]
    
print('Got Predicted Classes and Scores')



Got Predicted Classes and Scores


In [172]:
#this kernel will be used for the low pass filter
ksize = 5
kernel = np.ones((ksize,ksize)) / ksize**2

#for each prediction file...
for idx, feat_file_name in enumerate(prediction_feat_file_names):
    
    #get the shape of the file
    feat_file_shape = prediction_feat_file_data[feat_file_name]['shape'][:2]
    shape = feat_file_shape + (2,)
    
    #create empty matrix to store result
    result = np.empty(shape)
    result = result.reshape(-1, 2)
    result[:] = np.nan
    
    #these are the valid indices 
    indices = prediction_feat_file_data[feat_file_name]['chosen_indices']

    #load the predicted classes and scores
    result[indices, 0] = pred_class_scores[idx][0]
    result[indices, 1] = pred_class_scores[idx][1]

    #shape back into a 3d matrix
    result = result.reshape(shape)
    mask = np.isnan(result[:,:,0])
    
    #get unique class ids
    class_ids = np.sort(np.unique(result[:,:,0][result[:,:,0] >= 0]).astype(int))
    class_scores = np.zeros(feat_file_shape + (len(class_ids),))
    
    #get the matrix of T/F wheter each pixel is predicted as each class
    class_mtxs = {cid: (result[:,:,0] == cid) for cid in class_ids}
    
    #for each class id...
    for cid in class_ids:
        #convolve with the low pass filter
        conv_mtx = convolve(class_mtxs[cid], kernel, mode='same')
        conv_mtx[np.isnan(result[:,:,0])] = np.nan
        class_scores[:,:,cid] = conv_mtx
    
    result[:,:,0] = np.argmax(class_scores, axis=-1)
    result[:,:,0][mask] = np.nan

    ds = np_array_to_raster('%s/%s_predicted_%s.tiff'%(PREDICTION_DATA_FOLDER, METHOD, idx), result, prediction_feat_file_data[feat_file_name]['gt'], no_data=-1, nband=2, gdal_data_type=gdal.GDT_Float64)
    ds = None

