In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# geospatial frameworks
import rasterio as rs
from rasterio.plot import show  # imshow for raster
import geopandas as gpd
from shapely.geometry import Point, Polygon  # for geometry processing

# Strategy
- I'm not sure if this is the best approach, but it worked for me
- Tiles have lots of overlap between each other, making it difficult to split data for training and validation
- My strategy is to create 10 splitting areas over the region, and grab the image_ids that have a minimum of 50.1% intersection within each area
- This way, no tile will be present in multiple splits. 10 splits can also be used for cross-validation training

In [None]:
def get_filepath(image_id, mode='PS-RGB'):
    """Build the path of the training GeoTIFF for one tile.

    Args:
        image_id: Tile identifier; becomes the last component of the filename.
        mode: Imagery sub-folder name; it is also embedded in the filename.

    Returns:
        The path as a string, prefixed with the notebook-level ``ROOT_DIR``
        (looked up when the function is called, not when it is defined).
    """
    tif_path = f'{ROOT_DIR}{mode}/SN6_Train_AOI_11_Rotterdam_{mode}_{image_id}.tif'
    return tif_path

def get_raster(image_id, mode='PS-RGB'):
    """Open the GeoTIFF of one tile and return the opened dataset.

    Args:
        image_id: Tile identifier, forwarded to ``get_filepath``.
        mode: Imagery sub-folder name, forwarded to ``get_filepath``.

    Returns:
        Whatever ``rs.open`` returns for the resolved path. The dataset is not
        closed here, so the caller owns it.
    """
    tif_path = get_filepath(image_id, mode)
    return rs.open(tif_path)

# Load all tiles
- Total of 3401 image tiles
- Tile boundaries can be read using .geojson file for each image_id
- we'll grab the tile's boundaries then create a POLYGON object containing coordinates for each corner
- loop through all tiles and save into a geodataframe as .geojson file
- when creating geodataframe, make sure we use the coordinate system (crs) used in this dataset

In [None]:
# Root of the dataset; get_filepath() prepends this to every raster path.
ROOT_DIR = '../input/spacenet-6-multisensor-allweather-mapping/AOI_11_Rotterdam/'

# Grab the unique image_ids from the annotation CSV.
# (presumably the CSV holds several rows per tile, hence .unique() — TODO confirm)
df = pd.read_csv(ROOT_DIR+'SummaryData/SN6_Train_AOI_11_Rotterdam_Buildings.csv')
image_ids = df.ImageId.unique()

# Use the first tile as a worked example of reading tile boundaries.
# NOTE(review): ex_raster is opened here and never closed in this cell.
ex_image_id = image_ids[0]
ex_raster = get_raster(ex_image_id)
show(ex_raster)

print(f'Coordinate system: {ex_raster.crs}')
print(f'Tile boundaries: {ex_raster.bounds}')

# Convert the bounds into corner coordinates.
# Unpacked as left, bottom, right, top (per the variable names); the polygon
# is listed top-left -> top-right -> bottom-right -> bottom-left.
lbox, bbox, rbox, tbox = ex_raster.bounds
geometry = Polygon([(lbox,tbox),
                  (rbox,tbox),
                  (rbox,bbox),
                  (lbox,bbox)])

In [None]:
# Create a one-row geodataframe for the example tile; the values are wrapped in
# lists because each dict entry is a column.
d = {'tile': [ex_image_id], 'geometry': [geometry]}
# The CRS is hard-coded to match the dataset's coordinate system.
gdf = gpd.GeoDataFrame(d, crs='epsg:32631')

display(gdf)

# A geodataframe can be plotted directly with its .plot() method.
gdf.plot(); plt.show()

### Creating geojson of all boundaries
- I anticipated adding mode options, but I think the tile boundaries are the same for each mode
- This takes a while to process since it basically loads all 3401 rasters only to read their boundary information

In [None]:
%%time
# I anticipated adding mode options, but I think the tile boundaries are the same for each mode
def create_geometry(mode, image_ids):
    """Write the rectangular footprint of every tile to ``{mode}.geojson``.

    Each tile's raster is opened only long enough to read its bounds, which
    are converted into a four-corner polygon. All polygons are collected into
    one geodataframe (CRS ``epsg:32631``) and saved with the GeoJSON driver
    in the current working directory.

    Args:
        mode: Imagery sub-folder / filename component passed to
            ``get_filepath``; also used as the output file stem.
        image_ids: Sequence of tile identifiers; stored as the ``image_id``
            column of the output.

    Returns:
        None. The result is the file written to disk plus a confirmation print.
    """
    geometry = []

    for image_id in image_ids:
        # Open inside a context manager so the file handle is released as soon
        # as the bounds are read, instead of leaving one dataset open per tile.
        with rs.open(get_filepath(image_id, mode)) as raster:
            lbox, bbox, rbox, tbox = raster.bounds

        # Corners: top-left -> top-right -> bottom-right -> bottom-left.
        geometry.append(Polygon([
            (lbox, tbox),
            (rbox, tbox),
            (rbox, bbox),
            (lbox, bbox)]))

    # create geodataframe
    d = {'image_id': image_ids, 'geometry': geometry}
    gdf = gpd.GeoDataFrame(d, crs='epsg:32631')

    # saving to geojson file
    gdf.to_file(f'{mode}.geojson', driver='GeoJSON')
    print(f'{mode}.geojson saved successfully!')
    
# Build PS-RGB.geojson from every image_id collected from the annotation CSV.
create_geometry('PS-RGB', image_ids)

# Previewing tiles

In [None]:
# Reload the footprints written by create_geometry(), then show the first rows
# followed by the CRS that was read back from the file.
gdf = gpd.read_file('PS-RGB.geojson')
display(gdf.head(), gdf.crs)

As shown in the figure below, odd-numbered tiles are adjacent to each other, and so are even-numbered tiles

In [None]:
# Select the rows whose image_id contains any of the listed tile numbers
# (the pattern is an alternation, so one match is enough).
odd_gdf = gdf[gdf['image_id'].str.contains('3721|3723|3725')]
even_gdf = gdf[gdf['image_id'].str.contains('3720|3722|3724')]

# Plot all tile boundaries in grey and highlight the odd (green) and even (red)
# tiles on the same axes.
fig, ax1 = plt.subplots(figsize=(10,10))
gdf.plot(color='grey', alpha=0.3, edgecolor='black', ax=ax1)
odd_gdf.plot(color='green', alpha=0.7, edgecolor='white', ax=ax1)
even_gdf.plot(color='red', alpha=0.7, edgecolor='white', ax=ax1)

plt.title('All tile boundaries');plt.show()

# Selecting tiles

## Create AOI
the `.total_bounds` attribute gives the bounding box of the whole gdf. We can create an area of interest (AOI) from, for example, the 3 combined odd tiles above (colored green), extended by 100 on all sides (colored blue)

In [None]:
print(odd_gdf.total_bounds)
# Make a polygon as our AOI: the bounding box of the odd tiles, grown by `ext`
# on every side (in the units of the CRS).
lbox, bbox, rbox, tbox = odd_gdf.total_bounds
ext = 100
geometry = Polygon([
            (lbox-ext,tbox+ext),
            (rbox+ext,tbox+ext),
            (rbox+ext,bbox-ext),
            (lbox-ext,bbox-ext)])

# Scalar values are used here, so an explicit one-element index is supplied.
d = {'tile': 1, 'geometry': geometry}
aoi_df = gpd.GeoDataFrame(d, crs='epsg:32631', index=[0])

# AOI in blue, with the odd tiles it was derived from drawn on top in green.
f,ax = plt.subplots()
aoi_df.plot(color='blue', alpha=0.7, edgecolor='white', ax=ax)
odd_gdf.plot(color='green', edgecolor='white', ax=ax)
plt.show()

## Clipping method
- overlay clips the tiles to the AOI, while cx returns every tile that touches the given area [[source](https://gis.stackexchange.com/questions/266730/filter-by-bounding-box-in-geopandas)]
- both return the same number of tiles
- but how do we select only the tiles whose intersection with a given area exceeds a certain amount of overlap?

In [None]:
# Use overlay: intersect every tile footprint with the AOI polygon, so the
# resulting geometries are the parts of the tiles that fall inside the AOI.
aoi_overlay = gpd.overlay(gdf, aoi_df, how='intersection')
aoi_overlay.plot(color='grey', alpha=0.3, edgecolor='black')
plt.show()
# One row per tile that intersects the AOI.
print(f'{aoi_overlay.shape[0]} filtered tiles')

In [None]:
# Using the geopandas .cx coordinate indexer over the same extended bounding
# box as the AOI polygon (x-range first, then y-range).
lbox, bbox, rbox, tbox = odd_gdf.total_bounds
ext = 100
aoi_df_cx = gdf.cx[lbox-ext:rbox+ext, bbox-ext:tbox+ext]

# Selected tiles (unclipped) in grey with the AOI drawn on top in blue.
fig, ax1 = plt.subplots(figsize=(10,10))
aoi_df_cx.plot(color='grey', alpha=0.3, edgecolor='black', ax=ax1)
aoi_df.plot(color='blue', alpha=0.7, edgecolor='white', ax=ax1)
plt.show()
print(f'{aoi_df_cx.shape[0]} filtered tiles')

## Using percentage of remaining area
- overlay clips the tiles, so the ones that are only partly inside remain with a smaller area.
- assuming all tiles have the same area, the percentage of remaining area can be calculated by comparing each clipped area with the max area
- filtering for tiles with >99% remaining area does guarantee that no tile overlaps another AOI stripe, but it removes most of the tiles that are valuable as training data
- filtering with >50.1% remaining area means a tile must have more than half of its area inside one AOI stripe, so it is excluded from the next stripe

In [None]:
# Percentage of remaining area: each clipped polygon's area relative to the
# largest clipped area (taken to be one whole tile).
aoi_overlay['Area'] = aoi_overlay.area
max_area = aoi_overlay['Area'].max()
aoi_overlay['Per_Area'] = aoi_overlay['Area'] / max_area * 100
aoi_overlay.head()

As shown in table above, tiles completely within the ROI should show 100% in `Per_Area` column, while tiles with some portion inside the ROI should show the percentage of its intersection

In [None]:
# Keep only the clipped tiles that retain a high percentage of their area.
aoi_overlay_filt = aoi_overlay[aoi_overlay['Per_Area'] > 50.1]
# Look up the original (unclipped) footprints of the tiles that passed.
gdf_filt = gdf[gdf['image_id'].isin(aoi_overlay_filt['image_id'].values)]

# Selected footprints in grey, AOI on top in blue.
fig, ax = plt.subplots(figsize=(10,10))
gdf_filt.plot(color='grey', alpha=0.3, edgecolor='black', ax=ax)
aoi_df.plot(color='blue', alpha=0.7, edgecolor='white', ax=ax)
plt.show()
print(aoi_overlay_filt.shape)

- compared with the cx method, this only grabs a tile if more than half of its area is in the ROI
- it returns fewer tiles, but ensures no tile will also be present in a nearby ROI

## Generate AOI Stripes
we'll divide the whole region with 10 stripes and split image_ids based on which AOI they are located in

In [None]:
# get total bounds of gdf
def generate_AOI(split, gdf):
    """Cut the bounding box of ``gdf`` into ``split`` horizontal stripes.

    Args:
        split: Number of stripes to create.
        gdf: Geodataframe whose ``total_bounds`` define the region to divide.

    Returns:
        A geodataframe (CRS ``epsg:32631``) with one rectangular polygon per
        stripe, ordered from the bottom of the region to the top.
    """
    west, south, east, north = gdf.total_bounds
    # Horizontal stripes: the full width is kept, the height is divided.
    stripe_height = (north-south)/split

    def _stripe(i):
        # Stripe i starts i stripe-heights above the bottom edge.
        lower = south+(stripe_height*i)
        upper = lower+stripe_height
        return Polygon([
            (west, upper),
            (east, upper),
            (east, lower),
            (west, lower)])

    stripes = [_stripe(i) for i in range(split)]
    return gpd.GeoDataFrame({'geometry': stripes}, crs='epsg:32631')


# Number of stripes (and therefore of data splits); also used in the name of
# the saved .npy file.
SPLIT = 10
AOI_stripes_gdf = generate_AOI(SPLIT, gdf)
AOI_stripes_gdf.plot(color='grey', alpha=0.3, edgecolor='black')
plt.show()

## Grab tiles for each AOI stripe
`filter_tile` basically does everything done in the previous section. It returns the image_id of each tile contained in a given AOI stripe, and the results for all stripes are stored in `filtered_tiles`. In the figure below we can see each AOI stripe in gray, and tiles are colored based on their stripe

In [None]:
def filter_tile(aoi_df, gdf, min_percent=50.1):
    """Return the image_ids of the tiles that lie mostly inside an AOI.

    Every tile footprint in ``gdf`` is intersected with the AOI, and a tile is
    kept when the clipped part covers more than ``min_percent`` percent of the
    tile's own full area.

    The percentage is measured against each tile's own footprint rather than
    against the largest clipped area in the overlay. The latter is only
    correct when at least one tile lies completely inside the AOI and all
    tiles share the same size; otherwise it inflates the percentages and a
    tile could be assigned to more than one AOI.

    Args:
        aoi_df: Geodataframe holding the AOI polygon(s).
        gdf: Geodataframe of tile footprints with an ``image_id`` column.
        min_percent: Exclusive lower bound on the percentage of a tile's area
            that must fall inside the AOI. A value above 50 means a tile can
            pass for at most one of several non-overlapping AOIs.

    Returns:
        Array of ``image_id`` values of the tiles that pass the threshold.
    """
    # Record each tile's full area before clipping; overlay carries attribute
    # columns through to the intersection result.
    tiles = gdf.assign(_tile_area=gdf.area)
    aoi_overlay = gpd.overlay(tiles, aoi_df, how='intersection')

    # Percentage of each tile's own area that remains inside the AOI.
    aoi_overlay['Area'] = aoi_overlay.area
    aoi_overlay['Per_Area'] = aoi_overlay['Area'] / aoi_overlay['_tile_area'] * 100

    # Keep the tiles that are more than min_percent inside the AOI.
    aoi_overlay_filt = aoi_overlay[aoi_overlay['Per_Area'] > min_percent]
    return aoi_overlay_filt.image_id.values

In [None]:
filtered_tiles = []

# Run filter_tile once per stripe. Each stripe is taken as a one-row
# geodataframe slice, which keeps the CRS of AOI_stripes_gdf, and is bound to
# its own name so the demo `aoi_df` from the earlier section is not overwritten.
for stripe_no in range(len(AOI_stripes_gdf)):
    stripe_df = AOI_stripes_gdf.iloc[[stripe_no]]
    filtered_tiles.append(filter_tile(stripe_df, gdf))

print(f'num of splits: {len(filtered_tiles)}')
print(filtered_tiles[0][:5])

In [None]:
col_map = ['aqua','beige','brown','coral','fuchsia','green','lime','orange','tan','tomato']
fig, ax = plt.subplots(figsize=(10,10))

# Color tiles based on their AOI stripe. The palette is cycled so that a SPLIT
# larger than the number of listed colors does not raise an IndexError.
for i,stripe in enumerate(filtered_tiles):
    stripe_gdf = gdf[gdf['image_id'].isin(stripe)]
    stripe_gdf.plot(
        color=col_map[i % len(col_map)], alpha=0.6, edgecolor='black', ax=ax
    )

# Draw the stripes themselves faintly on top for reference.
AOI_stripes_gdf.plot(color='grey', alpha=0.2, edgecolor='grey', ax=ax)
plt.show()

# Saving Filtered Tiles
Let's make sure we don't leave out any tiles. Each stripe has a different number of tiles; to make them the same size we could add a flag that starts populating the next stripe once the current one reaches a certain number of tiles, but that's something for another day

In [None]:
# Report the size of every stripe...
for i,stripes in enumerate(filtered_tiles):
    print(f'stripe {i}: {len(stripes)} tiles')

# ...then compare the number of tiles that landed in a split with the number
# of tiles available in the dataset.
total_split_tiles = sum(map(len, filtered_tiles))
print(f'{total_split_tiles} tiles in split vs {gdf.shape[0]} total tiles available')

In [None]:
# Save the filtered image ids. The per-stripe arrays have different lengths, so
# they are wrapped in an object-dtype array; the file name encodes SPLIT.
np.save(f'SN6_{SPLIT}_splits.npy', np.array(filtered_tiles,dtype='object'))

In [None]:
# Load the splits back. The file name is derived from SPLIT so it always
# matches the file written by the save cell, whatever SPLIT is set to.
# allow_pickle=True is required because the array has dtype object.
# NOTE: unpickling can execute arbitrary code, so only load .npy files that
# this notebook (or another trusted source) produced.
fil = np.load(f'./SN6_{SPLIT}_splits.npy', allow_pickle=True)
fil[0][:5]