In [1]:
import os

import geopandas as gpd
import fiona

import rasterio
import numpy as np
import matplotlib.pyplot as plt

In [2]:
in_annotations = r"/media/ross/ssd/00_2015_DAR_marinedebris/maui/labels/maui_md_label_envelopes.gpkg"
in_tile_index = r"/media/ross/ssd/00_2015_DAR_marinedebris/maui/tiles_shp/maui_512x512_tindex.gpkg"
in_window_retile_dir = r"/media/ross/ssd/00_2015_DAR_marinedebris/maui/04_window_retile"

out_dir = r"/media/ross/ssd/00_2015_DAR_marinedebris/maui/labels"

out_path_gpkg = os.path.join(out_dir, "maui_annotations.gpkg")
out_path_csv = os.path.join(out_dir, "maui_annotations_mltrainable.csv")

                               # Debris categories:
remap_debris_classes = {'B':0, # B = Buoys and floats
                       'C':1,  # C = Cloth
                       'F':2,  # F = Foam 
                       'L':3,  # L = Line (single pieces of rope, not net)
                       'M':4,  # M = Metal
                       'N':5,  # N = Net
                       'P':6,  # P = Plastic
                       'T':7,  # T = Tire
                       'W':8,  # W = Processed wood
                       'V':9,  # V = Vessel
                       'O':10} # O = Other 

remap_debris_classes_string = {'B':'buoy', # B = Buoys and floats
                       'C':'cloth',  # C = Cloth
                       'F':'foam',  # F = Foam 
                       'L':'line',  # L = Line (single pieces of rope, not net)
                       'M':'metal',  # M = Metal
                       'N':'net',  # N = Net
                       'P':'plastic',  # P = Plastic
                       'T':'tire',  # T = Tire
                       'W':'processed wood',  # W = Processed wood
                       'V':'vessel',  # V = Vessel
                       'O':'other'} # O = Other 
              

In [3]:
envelopes_raw = gpd.read_file(in_annotations)
in_tindex = gpd.read_file(in_tile_index)

In [5]:
envelopes_raw['label'] = envelopes_raw['type']

In [6]:
envelopes = envelopes_raw[['unique_pt_id', 'label', 'geometry']].replace({'label':remap_debris_classes_string})

In [7]:
envelopes.head()

Unnamed: 0,unique_pt_id,label,geometry
0,MA-001-0001,other,"POLYGON ((746767.042812153 2326430.389631396, ..."
1,MA-001-0002,buoy,"POLYGON ((746944.3821923686 2326443.440532876,..."
2,MA-001-0003,plastic,"POLYGON ((747264.7125466345 2326555.103654591,..."
3,MA-001-0004,plastic,"POLYGON ((747264.9353615083 2326554.11005451, ..."
4,MA-001-0005,processed wood,"POLYGON ((747449.5547359405 2326629.859620453,..."


In [8]:
#Find all the intersections of our envelopes and the tile index.

intersection = gpd.overlay(envelopes, in_tindex)

In [9]:
#a little eda showed that a single envelope could produce two valid annotations when split by a image tile edge. So we're going to throw away
# any funny little slivers that we find. We will identify funny slivers by looking at the x:y ratio

normalized_ratio_x_y = abs(1 - ((intersection.bounds['maxx'] - intersection.bounds['minx']) / (intersection.bounds['maxy'] - intersection.bounds['miny'])))

In [10]:
normalized_ratio_x_y

0       0.000000
1       0.000000
2       0.000000
3       0.201197
4       4.970264
          ...   
2229    0.461550
2228    1.081919
2230    0.538450
2232    0.094675
2231    0.905325
Length: 2233, dtype: float64

In [11]:
intersection['normalized_ratio_x_y'] = normalized_ratio_x_y

In [12]:
intersection.head()

Unnamed: 0,unique_pt_id,label,filename,geometry,normalized_ratio_x_y
0,MA-001-0001,other,maui_1038_6_15,"POLYGON ((746767.042812153 2326430.389631396, ...",0.0
1,MA-001-0002,buoy,maui_1038_23_14,"POLYGON ((746944.3821923686 2326443.440532876,...",0.0
2,MA-001-0003,plastic,maui_1039_25_3,"POLYGON ((747264.7125466345 2326555.103654591,...",0.0
3,MA-001-0004,plastic,maui_1039_25_3,"POLYGON ((747264.9353615083 2326554.277551278,...",0.201197
4,MA-001-0004,plastic,maui_1039_25_4,"POLYGON ((747264.9353615083 2326554.11005451, ...",4.970264


In [13]:
filtered = intersection[intersection['normalized_ratio_x_y'] <= .7]

In [14]:
filtered.to_file(out_path_gpkg, driver='GPKG')

In [15]:
#Create a couple placeholder columns to store pixel coordinates

filtered['xmin'] = 0
filtered['ymin'] = 0
filtered['xmax'] = 0
filtered['ymax'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [None]:
#final step is to pull the affine transformation grid from each envelope's corresponding image 
#and then convert utm coordinates to pixel coordinates

for i, row in filtered.iterrows():
    #get the mins and the maxs
    xmin = row.geometry.bounds[0]
    xmax = row.geometry.bounds[2]
    ymin = row.geometry.bounds[1]
    ymax = row.geometry.bounds[3]
    
    #
    xs = (xmin, xmax)
    ys = (ymin, ymax)
    
    image_path = os.path.join(in_window_retile_dir, row['filename'] + '.jpg')
    
    if os.path.exists(image_path) == True:
        with rasterio.open(image_path, 'r') as src:
            geotrans = src.transform

            pix_coords = rasterio.transform.rowcol(geotrans, xs, ys)

        filtered.loc[i, 'xmin'] = pix_coords[1][0]
        filtered.loc[i, 'xmax'] = pix_coords[1][1]

        filtered.loc[i, 'ymin'] = pix_coords[0][1]   
        filtered.loc[i, 'ymax'] = pix_coords[0][0]
    else:
        print(f'{image_path} nope.')
        filtered.drop(i)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [None]:
#Write to a gpkg for GIS display and a csv for ml training (ml training format matters!)

In [None]:
filtered.head()

In [None]:
filtered['label'] = filtered['type']

In [None]:
filtered_lbl = filtered.replace({'label':remap_debris_classes})

In [None]:

filtered_lbl[['unique_pt_id', 'label', 'filename', 'xmin', 'ymin','xmax', 'ymax', 'normalized_ratio_x_y', 'geometry']].to_file(out_path_gpkg, driver="GPKG")
filtered_lbl[['filename', 'xmin', 'ymin','xmax', 'ymax', 'label']].to_csv(out_path_csv)                                                                                                

In [None]:
# find all the positive tiles so only images with labels get uploaded to azure

In [None]:
all_tiles = in_tindex.filename.unique()
print(len(all_tiles))

In [None]:
pos_tiles = filtered_lbl.filename.unique().astype('str')
print(len(pos_tiles))

In [None]:
pos_list = pos_tiles.tolist()
pos_list_ext = []

for pos in pos_list:
    new_pos = str(pos) + str('.jpg')
    
    pos_list_ext.append(new_pos)
    
if len(pos_list_ext) == len(pos_tiles):
    print('positive lists match')
else:
    print('WARNING, INPUT AND OUTPUT POSTITIVE LIST LENGTHS DONT MATCH')

In [None]:
neg_tiles = np.setdiff1d(all_tiles, pos_tiles)
len(neg_tiles)

In [None]:
neg_list = neg_tiles.tolist()
neg_list_ext = []

for neg in neg_list:
    new_neg = str(neg) + str('.jpg')
    
    neg_list_ext.append(new_neg)
    
if len(neg_list_ext) == len(neg_tiles):
    print('positive lists match')
else:
    print('WARNING, INPUT AND OUTPUT POSTITIVE LIST LENGTHS DONT MATCH')

In [None]:
if len(pos_tiles) + len(neg_tiles) == len(all_tiles):
    print("Gottem.")
    
    out_path_pos_tiles = os.path.join(in_window_retile_dir, 'positive_tiles.txt')
    with open(out_path_pos_tiles, 'w') as f:
        for item in pos_list_ext:
            f.write("%s\n" % item)
    out_path_neg_tiles = os.path.join(in_window_retile_dir, 'negative_tiles.txt')
    with open(out_path_neg_tiles, 'w') as f:
        for item in neg_list_ext:
            f.write("%s\n" % item)
else:
    print(f'I think I lost some tiles: {len(pos_tiles)} pos/ {len(neg_tiles)} neg/ {len(all_tiles)} total')