## Shoreline extraction from binary Landsat images for Vietnam (1980-2021)

This notebook extracts subpixel contours from binary land/water images derived from Landsat imagery, which have been processed on Google Earth Engine (*reference to Javascript-file*). 

Content of the notebook:
+ Setup: Python libraries, directories, etc.
+ Contour extraction 
+ Removal of short LineStrings
+ ...

### 1.| Setup

In [107]:
# Libraries
import os
import glob
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely as shp
import rasterio as rio 
import matplotlib.pyplot as plt
from rasterio.plot import show 
from coasty import postprocess

In [127]:
# Directories (everything lives below <current working directory>/data)
data_dir = os.path.join(os.getcwd(), "data")                          # data folder with the auxiliary data
proc_tiles_path = os.path.join(data_dir, "VN_processing_polygons")    # processing tiles
country_bounds_path = os.path.join(data_dir, "VN_country_bounds")     # country bounds
osm_sl_path = os.path.join(data_dir, "VN_osm_coastline")              # reference shoreline
buffer_path = os.path.join(data_dir, "VN_buffer")                     # buffer around the reference shoreline
# NOTE(review): later cells also read `country_transects_path` and `test_box_path`,
# which are not defined in this cell - confirm where they are supposed to come from.

# Params
export_folder = "GEE"               # Google Drive folder holding the GEE images to download
crs = "EPSG:3857"                   # code of the projected coordinate system used throughout
min_length = 3000                   # minimum length of a shoreline to keep [m]
buffer_dist = 2500                  # buffer around the reference shoreline used to clip detected shorelines [m]
transect_len = 5000                 # length of the transects [m]
transect_dist = 200                 # spacing between transects [m]
transect_min_line_length = 10000    # minimum length of a polygon outline at which transects are drawn [m]
                                    # (used to skip small islands)

In [128]:
# Read / create the auxiliary data; every layer is brought into the projected `crs`
proc_tiles = gpd.read_file(proc_tiles_path).to_crs(crs)
country_bounds = gpd.read_file(country_bounds_path).to_crs(crs)
osm_sl = gpd.read_file(osm_sl_path).to_crs(crs)

# The shoreline buffer is cached on disk. Both branches yield a GeoDataFrame in `crs`,
# so downstream clipping behaves the same whether the buffer was just created or re-loaded.
if os.path.exists(buffer_path):
    buffer = gpd.read_file(buffer_path).to_crs(crs)
else:
    print("Create osm shoreline buffer:")
    # osm_sl.buffer() returns a GeoSeries (it carries the CRS of osm_sl); wrap it in a GeoDataFrame
    buffer = gpd.GeoDataFrame(geometry=osm_sl.buffer(buffer_dist))
    buffer.to_file(buffer_path,driver="GeoJSON")

print("Everything successfully read.")

### 2.| Download binary rasters from Google Drive

In [56]:
# Download the rasters of every processing tile and store them in one folder per tile
for i in proc_tiles.id:
    tile_name = "P" + str(i)                          # name of the processing tile
    folder_path = os.path.join(data_dir, tile_name)   # folder that receives the tile's rasters
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    # fetch the images of the current tile from Drive
    postprocess.download_from_drive(export_folder, folder_path, tile_name)
    # move rasters of this tile that ended up in the working directory into the tile folder
    for raster in os.listdir(os.getcwd()):
        if not raster.endswith(tile_name + ".tif"):
            continue
        source = os.path.join(os.getcwd(), raster)
        os.replace(source, os.path.join(folder_path, raster))
    print('Files moved to: data/',tile_name)

P0
P1
P2
P3
P4
P5
P6
P7
P8
P9
P10


### 3.| Reproject and quality check binary rasters

In [163]:
# Reproject the binary rasters of each tile in place and mask single-observation pixels
for i in proc_tiles.id:
    tile_name = "P"+str(i)
    print(("--")*10,"Treating",tile_name,("--")*10)
    folder_path = os.path.join(data_dir,tile_name)
    if os.path.exists(folder_path):
        # glob already returns full paths; use them directly instead of rebuilding the
        # path from an unrelated variable left over from the download cell
        raster_paths = sorted(glob.glob(os.path.join(folder_path,"*"+tile_name+".tif")))
        for raster_path in raster_paths:
            postprocess.reproject_raster(raster_path,raster_path,crs)
            postprocess.mask_single_observation_pixel(raster_path)
    else:
        print(folder_path,"does not exist.")
    # NOTE(review): this stops after the first tile - looks like a test leftover;
    # remove it to process every tile.
    break

-------------------- Treating P0 --------------------
1_2000_L5_P0.tif already projected to given CRS.
1_2000_L5_P0.tif masked.
1_2000_L5_P0_03avg_aq.tif saved.


In [176]:
# Display the shoreline GeoDataFrame that was extracted last.
# NOTE(review): `shoreline` is only assigned inside the contour-extraction loops further
# down, so this cell fails when the notebook is run top to bottom on a fresh kernel.
shoreline

Unnamed: 0,geometry
0,"LINESTRING (11663577.388 1029554.452, 11663588..."
1,"LINESTRING (11664280.754 1029603.379, 11664295..."
2,"LINESTRING (11664746.810 1029617.825, 11664761..."
3,"LINESTRING (11665829.559 1029617.531, 11665844..."
4,"LINESTRING (11666264.076 1029602.202, 11666251..."
...,...
37463,"LINESTRING (11670746.294 942766.199, 11670775...."
37464,"LINESTRING (11670669.358 942631.481, 11670654...."
37465,"LINESTRING (11670533.463 941474.445, 11670521...."
37466,"LINESTRING (11671977.970 939873.160, 11671988...."


### 4.| Subpixel contours

In [173]:
# Extract, clip and clean one shoreline per raster and store all shorelines of a tile in one file
for i in proc_tiles.id:
    tile_name = "P"+str(i)
    print(("--")*10,"Treating",tile_name,("--")*10)
    folder_path = os.path.join(data_dir,tile_name)
    if not os.path.exists(folder_path):
        print('Folder does not exist.')
        continue

    # Clip the osm shoreline buffer to the processing tile (cached on disk).
    # The loop runs over tile ids, so the tile is selected by id rather than by row index.
    buffer_clip_path = os.path.join(folder_path,tile_name+"_buffer")
    if not os.path.exists(buffer_clip_path):
        buffer_clip = gpd.clip(buffer,proc_tiles[proc_tiles.id == i])
        buffer_clip.to_file(buffer_clip_path,driver="GeoJSON")
        print("Buffer has been saved.")
    else:
        print("Buffer exists and has been loaded.")
        buffer_clip = gpd.read_file(buffer_clip_path)

    # Create shorelines (skipped when the result file already exists)
    shorelines = []
    shorelines_path = os.path.join(folder_path,tile_name+"_shorelines")
    if os.path.exists(shorelines_path):
        print("Shorelines already exist.")
        continue

    print("Process shorelines...")
    # look for the rasters inside the tile folder, not in the top-level data folder
    raster_paths = sorted(glob.glob(os.path.join(folder_path,"*aq.tif")))
    sl_path = os.path.join(folder_path,tile_name+"_single_shorelines")
    for r in raster_paths:
        raster_name = os.path.basename(r)
        # NOTE(review): `subpixel_contours` is called unqualified and with a file path here,
        # while the older cell uses postprocess.subpixel_contours(<open raster>, 30) -
        # confirm which function/signature is intended.
        shoreline = subpixel_contours(r,30)
        # save single shoreline without modifications as backup
        os.makedirs(sl_path, exist_ok=True)
        shoreline.to_file(os.path.join(sl_path,raster_name+"_shoreline"))
        # postprocess shoreline
        shoreline = gpd.clip(shoreline,buffer_clip)
        cleaned = postprocess.remove_small_lines(shoreline, min_size=min_length)
        if cleaned.empty:
            continue
        # Parse "<n>_<year>_<sat>_<tile>_<NN>avg_aq.tif" by field instead of by character
        # position, so two-digit tile names (e.g. P10) do not shift the fields.
        name_parts = raster_name.split("_")
        if len(name_parts) >= 5:
            year = name_parts[1]
            sat = name_parts[2]
            aq_field = name_parts[4]
            # leading digits of the field, e.g. "03avg" -> "03"
            avg_aq = aq_field[:len(aq_field)-len(aq_field.lstrip("0123456789"))]
        else:
            # unexpected file name: fall back to the fixed character positions
            year = raster_name[2:6]
            sat = raster_name[7:9]
            avg_aq = raster_name[13:15]
        cleaned['id']=year
        cleaned = cleaned.dissolve(by=cleaned.id,aggfunc="sum")
        cleaned['year']=year
        cleaned['sat']=sat
        cleaned['avg_aq']=avg_aq
        cleaned['proc_tile']=tile_name
        shorelines.append(cleaned)
        print(year+": shoreline processed.")

    # pd.concat raises on an empty list, so only write a file when something was extracted
    if shorelines:
        shorelines_gdf = pd.concat(shorelines,ignore_index=True)
        shorelines_gdf.to_file(shorelines_path,driver="GeoJSON")
        print("All shorelines have been created and saved.")
    else:
        print("No shorelines found for "+tile_name+".")

-------------------- Treating P0 --------------------
Buffer exists and has been loaded.
Process shorelines...
2000: Shoreline processed.
2000: Shoreline processed.
Shorelines have been created and saved.
-------------------- Treating P1 --------------------
Folder does not exist.
-------------------- Treating P2 --------------------
Folder does not exist.
-------------------- Treating P3 --------------------
Folder does not exist.
-------------------- Treating P4 --------------------
Folder does not exist.
-------------------- Treating P5 --------------------
Folder does not exist.
-------------------- Treating P6 --------------------
Folder does not exist.
-------------------- Treating P7 --------------------
Folder does not exist.
-------------------- Treating P8 --------------------
Folder does not exist.
-------------------- Treating P9 --------------------
Folder does not exist.
-------------------- Treating P10 --------------------
Folder does not exist.


In [3]:
# NOTE(review): earlier variant of the contour-extraction loop. It depends on
# `test_box_path`, which is not defined anywhere in the visible notebook.

# access raster files by processing tiles
proc_tiles = gpd.read_file(proc_tiles_path)
proc_tiles = proc_tiles.to_crs(crs)

# reference shoreline from which the per-tile buffer is built
osm_sl = gpd.read_file(osm_sl_path)
osm_sl = osm_sl.to_crs(crs)

for i in range(0,len(proc_tiles)):
    raster_folder_path = os.path.join(data_dir,"P"+str(i))
    if not os.path.exists(raster_folder_path):
        continue
    print("Images for P"+str(i)+" exist.")
    shorelines = []
    shorelines_path = os.path.join(data_dir,"P"+str(i),"P"+str(i)+"_shorelines")

    # Clip the osm reference shoreline to the tile and buffer it (cached on disk).
    # Tile-specific names are used so the notebook-level `buffer` / `buffer_path`
    # from the setup cells are not overwritten.
    # NOTE(review): the tile buffer is created/loaded but not used further in this cell.
    tile_buffer_path = os.path.join(data_dir,"P"+str(i),"P"+str(i)+"_buffer")
    if not os.path.exists(tile_buffer_path):
        osm_sl_clip = gpd.clip(osm_sl,proc_tiles[proc_tiles.index == i])
        tile_buffer = osm_sl_clip.buffer(buffer_dist)
        tile_buffer.to_file(tile_buffer_path,driver="GeoJSON")
        print("Buffer has been saved.")
    else:
        print("Buffer exists and has been loaded.")
        tile_buffer = gpd.read_file(tile_buffer_path)

    if os.path.exists(shorelines_path):
        print("Shorelines for P"+str(i)+" exist.\n")
        continue

    print("Process shorelines for P"+str(i)+"...")
    for r in os.listdir(raster_folder_path):
        if not r.endswith("aq.tif"):
            continue
        print(r)
        raster_path = os.path.join(raster_folder_path,r)
        raster_path_reproj = os.path.join(raster_folder_path,os.path.splitext(r)[0]+"_reproj.tif")
        raster_path_clip = os.path.join(raster_folder_path,os.path.splitext(r)[0]+"_clip.tif")
        if not os.path.exists(raster_path_reproj):
            postprocess.reproject_raster(raster_path,raster_path_reproj,crs)
            print("   Raster reprojected")
        if not os.path.exists(raster_path_clip):
            postprocess.crop_raster(raster_path_reproj,test_box_path,raster_path_clip)
            print("   Raster clipped")
        # contours are taken from the reprojected (not the clipped) raster;
        # the context manager closes the file handle again
        with rio.open(raster_path_reproj) as raster:
            shoreline = postprocess.subpixel_contours(raster,30)
        cleaned = postprocess.remove_small_lines(shoreline, min_size=min_length)
        if cleaned.empty:
            continue
        year = r[2:6]
        sat = r[7:9]
        # take the tile field by position in the "_"-separated name so two-digit
        # tiles (e.g. P10) are not truncated; fall back to the fixed slice otherwise
        name_parts = r.split("_")
        proc_tile = name_parts[3] if len(name_parts) >= 5 else r[10:12]
        cleaned['id']=year
        cleaned = cleaned.dissolve(by=cleaned.id,aggfunc="sum")
        cleaned['year']=year
        cleaned['sat']=sat
        cleaned['proc_tile']=proc_tile
        shorelines.append(cleaned)
        print("   Shoreline processed")

    # pd.concat raises on an empty list, so only write a file when something was extracted
    if shorelines:
        shorelines_gdf = pd.concat(shorelines,ignore_index=True)
        shorelines_gdf.to_file(shorelines_path,driver="GeoJSON")
        print("Shorelines have been created and saved.\n")
    else:
        print("No shorelines found for P"+str(i)+".\n")

Images for P1 exist.
Buffer exists and has been loaded.
Process shorelines for P1...
1_1995_L5_P1_24aq.tif
   Raster reprojected
   Raster clipped
1_1992_L5_P1_8aq.tif
   Raster reprojected
   Raster clipped
1_1996_L5_P1_27aq.tif
   Raster reprojected
   Raster clipped
1_1989_L5_P1_20aq.tif
   Raster reprojected
   Raster clipped
1_1994_L5_P1_18aq.tif
   Raster reprojected
   Raster clipped
1_1988_L5_P1_13aq.tif
   Raster clipped
1_1993_L5_P1_15aq.tif
   Raster reprojected
   Raster clipped
1_1990_L5_P1_14aq.tif
   Raster reprojected
   Raster clipped
1_1991_L5_P1_10aq.tif
   Raster reprojected
   Raster clipped
Shorelines have been created and saved.



### 5.| Transects between highest and lowest water extent

In [179]:
# Calculate raster with min and max water extent 

def calc_water_extent(files_list,min_file,max_file):
    """Write the pixel-wise minimum and maximum of a set of single-band rasters.

    Band 1 of every raster in ``files_list`` is read as float32 and reduced pixel by
    pixel, ignoring NaN values. The results are written with the metadata of the
    first raster.

    Args:
        files_list (iterable of str): paths of the input rasters (must share one shape).
        min_file (str): output path for the pixel-wise minimum.
        max_file (str): output path for the pixel-wise maximum.

    Raises:
        ValueError: if ``files_list`` is empty.
    """
    files_list = list(files_list)
    if not files_list:
        raise ValueError("calc_water_extent needs at least one raster file.")

    meta = None
    min_water_extent = None
    max_water_extent = None
    for file in files_list:
        print("Eating file: %s" % file)
        with rio.open(file, "r") as src:
            band = src.read(1).astype(np.float32)  # float32 so NaN values can be represented
            if meta is None:
                meta = src.meta  # metadata of the first raster is reused for both outputs
        if min_water_extent is None:
            min_water_extent = band.copy()
            max_water_extent = band.copy()
        else:
            # running reduction keeps only two arrays in memory; fmin/fmax ignore NaN
            # the same way nanmin/nanmax over a full stack would
            min_water_extent = np.fmin(min_water_extent, band)
            max_water_extent = np.fmax(max_water_extent, band)

    # write the masks to the paths given by the caller
    for arr, out_file in zip([min_water_extent, max_water_extent], [min_file, max_file]):
        with rio.open(out_file, "w", **meta) as tgt:
            tgt.write(arr,1)

# Compute the min/max water extent rasters for every tile folder that exists.
# The names `files`, `min_water_file` and `max_water_file` are kept because other cells refer to them.
for i in range(len(proc_tiles)):
    raster_folder_path = os.path.join(data_dir,"P"+str(i))
    if not os.path.exists(raster_folder_path):
        continue
    files = sorted(glob.glob(os.path.join(raster_folder_path,"*reproj.tif")))
    min_water_file = os.path.join(data_dir,"P"+str(i),"P"+str(i)+"_min_water_extent")
    max_water_file = os.path.join(data_dir,"P"+str(i),"P"+str(i)+"_max_water_extent")
    if os.path.exists(min_water_file):
        print("Files exist.")
    elif not files:
        # nothing to reduce for this tile; skip instead of failing inside calc_water_extent
        print("No reprojected rasters found in", raster_folder_path)
    else:
        calc_water_extent(files,min_water_file,max_water_file)

Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastline_dynamics/test_data/P1/1_1995_L5_P1_24aq_reproj.tif
Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastline_dynamics/test_data/P1/1_1993_L5_P1_15aq_reproj.tif
Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastline_dynamics/test_data/P1/1_1996_L5_P1_27aq_reproj.tif
Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastline_dynamics/test_data/P1/1_1991_L5_P1_10aq_reproj.tif
Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastline_dynamics/test_data/P1/1_1992_L5_P1_8aq_reproj.tif
Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastline_dynamics/test_data/P1/1_1988_L5_P1_13aq_reproj.tif
Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastline_dynamics/test_data/P1/1_1990_L5_P1_14aq_reproj.tif
Eating file: /Users/Ronjamac/Documents/02_Studium/Masterarbeit/Code/VN_coastl

In [151]:
# Open the min and max water extent rasters of tile P1
# (shorelines could be calculated from them at some point)
min_water_file = os.path.join(data_dir, "P1", "P1_min_water_extent")
max_water_file = os.path.join(data_dir, "P1", "P1_max_water_extent")
min_raster, max_raster = (rio.open(path) for path in (min_water_file, max_water_file))

#test_transects = gpd.read_file(os.path.join(data_dir,"transects_clip"))

#min_sl = postprocess.subpixel_contours(min_raster,30)
#min_sl = postprocess.remove_small_lines(min_sl,min_length)
#min_sl = gpd.clip(min_sl,buffer)
#max_sl = postprocess.subpixel_contours(max_raster,30)
#max_sl = postprocess.remove_small_lines(max_sl,min_length)
#max_sl = gpd.clip(max_sl,buffer)

In [246]:
# remove small pixel clusters
import skimage

def remove_pixel_cluster(raster_path,out_path,min_size1,min_size0,connectivity=0):
    """Remove small pixel clusters from band 1 of a raster and write the result.

    Two passes of ``skimage.morphology.remove_small_objects`` are applied:

    1. connected clusters of pixels whose value is not 1 that are smaller than
       ``min_size1`` are overwritten with 1;
    2. in the resulting image, connected clusters of non-zero pixels smaller than
       ``min_size0`` are removed.

    The output is written as a 0/1 ``int16`` raster with LZW compression.

    Args:
        raster_path (str): input raster.
        out_path (str): output raster.
        min_size1 (int): size threshold [pixels] used in the first pass.
        min_size0 (int): size threshold [pixels] used in the second pass.
        connectivity (int, optional): passed on to ``remove_small_objects``. Defaults to 0.
    """
    # Import the submodule explicitly: a bare `import skimage` does not guarantee
    # that `skimage.morphology` is available as an attribute.
    from skimage import morphology

    # read everything that is needed, then close the source before writing the output
    with rio.open(raster_path,'r') as src:
        im = src.read(1)
        meta = src.meta

    # pass 1: mask of everything that is not 1; small clusters of it are dropped ...
    inverted = im != 1
    kept_inverted = morphology.remove_small_objects(inverted,min_size=min_size1,connectivity=connectivity)
    # ... and every pixel outside the kept clusters becomes 1
    im[~kept_inverted]=1
    # pass 2: drop small clusters of non-zero pixels
    processed = morphology.remove_small_objects(im.astype(bool), min_size=min_size0, connectivity=connectivity).astype('int16')

    meta.update({
        "compress":"LZW",
        "dtype":"int16"
    })
    with rio.open(out_path,'w',**meta) as dst:
        dst.write(processed, 1)

# polygonize min water raster 
def polygonize_raster(raster_path,raster_value,min_length):
    """Polygonize band 1 of a raster and keep the polygons of one pixel value.

    Args:
        raster_path (str): raster to polygonize.
        raster_value (int): pixel value whose polygons are kept.
        min_length: currently unused; kept so existing calls keep working.

    Returns:
        GeoDataFrame: one polygon per connected region with value ``raster_value``,
        in the CRS of the raster.
    """
    # Import the submodule explicitly: `import rasterio` alone does not guarantee
    # that `rasterio.features` is available as an attribute.
    from rasterio import features

    with rio.open(raster_path) as src:
        crs = src.crs
        src_band = src.read(1)
        # `shapes()` yields (geometry, value) tuples
        shapes = list(features.shapes(src_band, transform=src.transform))
    # keep polygons with the specified raster pixel value
    polygons = [shp.geometry.shape(geom) for geom, value in shapes
                if value == raster_value]
    return gpd.GeoDataFrame(geometry=polygons,crs=crs)

In [219]:
%%time
# test polyonize min water raster with clip images:
#min_water_file_clip = os.path.join(data_dir,"P"+str(1),"P"+str(1)+"_min_water_extent_clip")
#max_water_file_clip = os.path.join(data_dir,"P"+str(1),"P"+str(1)+"_max_water_extent_clip")
min_water_file = os.path.join(data_dir,"P"+str(1),"P"+str(1)+"_min_water_extent")
min_water_file_out = os.path.join(data_dir,"P"+str(1),"P"+str(1)+"_min_water_extent_simple")

#max_water_file = os.path.join(data_dir,"P"+str(1),"P"+str(1)+"_max_water_extent")
#max_water_file_out = os.path.join(data_dir,"P"+str(1),"P"+str(1)+"_max_water_extent_simple")

remove_pixel_cluster(min_water_file,min_water_file_out,50000,100000,0)
print("Pixel cluster removed.")
min_water_poly = polygonize_raster(min_water_file_out,0,10000) #current:10000
min_water_poly.to_file(os.path.join(data_dir,"P1","P1_min_water_poly_simple"),driver="GeoJSON")
print("Minimum water extent polygon created.")

# clip transects to min and max water extent polygons
proc_tiles = gpd.read_file(proc_tiles_path)
proc_tiles = proc_tiles.to_crs(crs)
country_bounds = gpd.read_file(country_bounds_path).to_crs(crs)
tile1 = proc_tiles.geometry.iloc[1]
bounds = gpd.clip(country_bounds,tile1)#.buffer(buffer_dist)
transects = gpd.read_file(country_transects_path)
transects = gpd.clip(transects,tile1)

min_water_buffer = min_water_poly.buffer(200)
min_water_buffer.to_file(os.path.join(data_dir,"P1","P1_min_water_poly_simple_buffer"),driver="GeoJSON")
print("Minimum water extent polygon buffered.")

transects_trim = gpd.clip(transects,min_water_buffer)

transects_trim = transects_trim.explode().reset_index(drop=True)
transects_trim = transects_trim.drop_duplicates(subset="id",keep="last")
transects_trim.to_file(os.path.join(data_dir,"P1","P1_transects"),driver="GeoJSON")

print("Done!")


Pixel cluster removed.
Minimum water extent polygon created.
Minimum water extent polygon buffered.
Done!


### 6.| Time Series Analysis

In [268]:
# Transects: load the country-wide transects from disk, or draw and save them once
proc_tiles = gpd.read_file(proc_tiles_path).to_crs(crs)

if os.path.exists(country_transects_path):
    country_transects = gpd.read_file(country_transects_path)
    print("Transects for Vietnam exist and have been loaded.")
else:
    country_bounds = gpd.read_file(country_bounds_path).to_crs(crs)
    country_transects = postprocess.draw_transects_polygon(
        country_bounds, transect_len, transect_dist, transect_min_line_length
    )
    country_transects.to_file(country_transects_path, driver="GeoJSON")
    print("Transects for Vietnam have been created and saved.")

# Intersections: one file per tile that already has a shoreline file
# NOTE(review): `compute_intersections` is defined in a cell further down the notebook.
for i, tile in proc_tiles.iterrows():
    shorelines_path = os.path.join(data_dir, "P" + str(i), "P" + str(i) + "_shorelines")
    if not os.path.exists(shorelines_path):
        continue
    shorelines_gdf = gpd.read_file(shorelines_path)
    print("Calcualte intersections for", os.path.basename(shorelines_path)+"...")
    tile_poly = tile.geometry
    # only the transects inside the current tile are intersected
    transects = gpd.clip(country_transects, tile_poly)
    intersections = compute_intersections(transects, shorelines_gdf, remove_outliers=True)
    intersections.to_file(
        os.path.join(data_dir, "P" + str(i), "P" + str(i) + "_intersections"),
        driver="GeoJSON",
    )
print("Done!")

Transects for Vietnam exist and have been loaded.
Calcualte intersections for P1_shorelines...
Done!


In [243]:
def compute_intersections(transects, shorelines, remove_outliers=False,reference=None):
    """Intersect transects with shorelines and measure the intersections along each transect.

    For every transect all intersections with all shorelines are collected, and the
    distance of each intersection to the second vertex of the transect (treated as its
    origin) is stored. If the shorelines carry a "year" column, only the intersection
    farthest from that origin is kept per year. If ``reference`` is given, the distance
    to the reference intersection on the same transect is added and the candidates are
    first reduced to the one closest to the reference per year.

    Args:
        transects (GeoDataFrame): with LineStrings; the index value of each row is
            written to the "transect_id" column of the result.
        shorelines (GeoDataFrame): with LineStrings and/ or MultiLineStrings
            [recommended column: "year"]
        remove_outliers (bool, optional): drop intersections whose distance to the
            transect origin deviates more than 3 standard deviations from the median
            of that transect. Defaults to False.
        reference (GeoDataFrame, optional): with LineStrings and/ or MultiLineStrings.
            Defaults to None.

    Returns:
        GeoDataFrame: with the intersection geometries, "transect_id", ("year"),
        "dist_to_transect_origin" and, if a reference is given, "dist_to_osm_sl".
    """
    crs = shorelines.crs
    transects = transects.to_crs(crs)
    has_year = 'year' in shorelines
    ref_inter = None
    if reference is not None:
        reference = reference.to_crs(crs)
        ref_inter = compute_intersections(transects, reference)

    # one dataframe of intersections per transect
    all_intersections = []
    for transect_id, transect_row in transects.iterrows():
        transect = transect_row.geometry

        # collect the non-empty intersections with all shorelines
        geoms = []
        years = []
        for _, shoreline_row in shorelines.iterrows():
            shoreline = shoreline_row.geometry
            if isinstance(shoreline, shp.geometry.MultiLineString):
                # `.geoms` works with Shapely 1.x and 2.x; iterating the multi-part
                # geometry directly is no longer supported in Shapely 2
                parts = list(shoreline.geoms)
            elif isinstance(shoreline, shp.geometry.LineString):
                parts = [shoreline]
            else:
                parts = []
            for part in parts:
                hit = transect.intersection(part)
                if hit.is_empty:
                    continue
                geoms.append(hit)
                if has_year:
                    years.append(shoreline_row['year'])
        if not geoms:
            continue

        intersections_gdf = gpd.GeoDataFrame(geometry=geoms, crs=crs)
        intersections_gdf['transect_id'] = transect_id
        if has_year:
            intersections_gdf['year'] = years
        # separate multi-part geometries (e.g. MultiPoints) into single rows
        intersections_gdf = intersections_gdf.explode().reset_index(drop=True)

        # distance of each intersection to the transect origin
        # (second vertex; assumed to be the landward end - TODO confirm)
        origin = shp.geometry.Point(transect.coords[1])
        intersections_gdf['dist_to_transect_origin'] = intersections_gdf.geometry.distance(origin)

        # Option 1: distance to the reference shoreline, keep the closest point per year
        if ref_inter is not None:
            ref_points = ref_inter[ref_inter.transect_id == transect_id].geometry
            if ref_points.empty:
                # the reference does not cross this transect
                intersections_gdf["dist_to_osm_sl"] = np.nan
            else:
                intersections_gdf["dist_to_osm_sl"] = intersections_gdf.geometry.distance(ref_points.iloc[0])
                intersections_gdf = intersections_gdf.sort_values(by="dist_to_osm_sl")
                if has_year:
                    intersections_gdf = intersections_gdf.drop_duplicates(subset="year",keep="first")

        # Option 3: keep the point farthest from the transect origin per year
        # (a median-based selection was the alternative considered here)
        intersections_gdf = intersections_gdf.sort_values(by="dist_to_transect_origin")
        if has_year:
            intersections_gdf = intersections_gdf.drop_duplicates(subset="year",keep="last")

        # remove outliers; `<=` keeps the points when all distances are identical (std == 0)
        if remove_outliers:
            distances = intersections_gdf.dist_to_transect_origin.to_numpy()
            inter_median = np.median(distances)
            inter_std = np.std(distances)
            intersections_gdf = intersections_gdf[abs(distances-inter_median)<=abs(3*inter_std)]

        # sort dataframe by date
        if has_year:
            intersections_gdf = intersections_gdf.sort_values(by="year")
        all_intersections.append(intersections_gdf)

    # no transect produced an intersection: return an empty result instead of failing in concat
    if not all_intersections:
        return gpd.GeoDataFrame(geometry=[], crs=crs)
    # merge all dataframes
    new_gdf = gpd.GeoDataFrame(pd.concat(all_intersections,ignore_index=True), geometry="geometry", crs=crs)
    return new_gdf
    
#transects = transects.reset_index(drop=True)
#i = 1
#transects_trim = gpd.read_file(os.path.join(data_dir,"P1","P1_transects"))
#shorelines_gdf = gpd.read_file(os.path.join(data_dir,"P1","P1_shorelines"))

#transects_trim = transects_trim.explode().reset_index(drop=True)
#test = compute_intersections(transects_trim,shorelines_gdf)
#test.to_file(os.path.join(data_dir,"P"+str(i),"P"+str(i)+"_intersections"),driver="GeoJSON")

In [221]:
# Intersections of the trimmed transects with the shorelines, written to the folder of tile P<i>
i = 1
intersections = compute_intersections(transects_trim, shorelines_gdf)
intersections.to_file(
    os.path.join(data_dir, "P%d" % i, "P%d_intersections" % i),
    driver="GeoJSON",
)

In [244]:
# Test intersections function on clipped transects/shorelines of tile P1
# NOTE(review): `transects_clip` and `shorelines_clip` are only created by the
# commented-out lines below, so this cell fails on a fresh kernel unless they are restored.
transects_clip_file = os.path.join(data_dir,"P1","P1_transects_clip")
intersections_clip_file = os.path.join(data_dir,"P1","P1_clip_intersections")

#test_box = gpd.read_file(test_box_path).to_crs(crs)
#shorelines_clip = gpd.read_file(os.path.join(data_dir,"P1","P1_clip_shorelines"))
#transects_clip = gpd.clip(transects_trim,test_box)
#transects_clip.to_file(transects_clip_file,driver="GeoJSON")

# intersect with outlier removal enabled and write the result as GeoJSON
intersections_clip = compute_intersections(transects_clip,shorelines_clip,remove_outliers=True)
intersections_clip.to_file(intersections_clip_file,driver="GeoJSON")
print("Done!")

Done!
