In [67]:
import os
from glob import glob
import shutil
import pandas as pd
import geopandas as gpd

In [68]:
# Root directory of the label files; the cells below expect GeoJSON files one
# directory level beneath it.
# NOTE(review): hard-coded absolute path - adjust when running on another machine.
wfdp_path = '/datasets/rpartsey/satellite/planet/wfdp_labels'

In [69]:
# Show the entries that sit directly under the label root.
os.listdir(wfdp_path)

['Zaporizhzhia', 'Lugansk', 'Kharkiv']

In [70]:
# valid_invalid_split() writes its reject files next to each source file using
# these name prefixes. Skip them here so that re-running the notebook does not
# feed previously rejected rows back into the pipeline.
REJECT_PREFIXES = ('none_', 'inv_symbols_')

geojson_paths = [
    f for f in glob(os.path.join(wfdp_path, '*', '*.geojson'))
    if not os.path.basename(f).startswith(REJECT_PREFIXES)
]
# Display the collected file names in a stable, sorted order.
sorted([os.path.basename(f) for f in geojson_paths])

['Kharkiv_2017_autumn_Leiberiuk_2019-06-25.geojson',
 'Kharkiv_2017_spring_Leiberiuk_2019-07-17.geojson',
 'Kharkiv_2017_summer_Leiberiuk_2019-07-02.geojson',
 'Kharkiv_2018_spring_Burned_Areas.geojson',
 'Kharkiv_2019_spring_Leiberiuk_2019-08-25.geojson',
 'Lugansk_2017_autumn_Kuzminova_2019_06_22.geojson',
 'Lugansk_2017_spring_Leiberiuk_2019-10-08.geojson',
 'Lugansk_2017_summer_Leiberiuk_2019-09-13.geojson',
 'Lugansk_2018_autumn_Kuzminova_2019-02-22.geojson',
 'Lugansk_2018_autumn_Kuzminova_2019-03-27.geojson',
 'Lugansk_2018_autumn_Kuzminova_2019_04_30.geojson',
 'Lugansk_2018_spring_Kuzminova_2019-03-27.geojson',
 'Lugansk_2018_spring_Kuzminova_2019-06-05.geojson',
 'Lugansk_2018_summer_Kuzminova_2019-03-27.geojson',
 'Lugansk_2018_summer_Kuzminova_2019_04_30.geojson',
 'Zaporozhje_2017_autumn_Leiberiuk_2019-02-17.geojson',
 'Zaporozhje_2017_spring_Leiberiuk_2019-02-08.geojson',
 'Zaporozhje_2018_autumn_Leiberiuk_2019-01-16.geojson',
 'Zaporozhje_2018_spring_Leiberiuk_2019-01-16

In [71]:
# Load every label file and stack them into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each file keeps its own index, so labels repeat across files and label-based
# access or index comparisons become ambiguous.
poly_gdf = pd.concat([gpd.read_file(f) for f in geojson_paths], ignore_index=True)
poly_gdf.shape

(2277, 5)

In [53]:
# Preview the first rows of the combined frame.
poly_gdf.head()

Unnamed: 0,date,planet_img,firms_sour,editor,geometry
0,2018-08-28,20180831_075824_0f35_tms.xml,V1,Oleksandr Leiberiuk - 0001,"POLYGON ((36.62941 47.27804, 36.63887 47.27809..."
1,2018-08-22,20180823_075453_0e20_tms.xml,V1,Oleksandr Leiberiuk - 0002,"POLYGON ((36.85774 46.82754, 36.85798 46.82746..."
2,2018-08-23,20180824_075716_1011_tms.xml,V1,Oleksandr Leiberiuk - 0003,"POLYGON ((36.85279 46.81438, 36.85312 46.81543..."
3,2018-08-22,20180823_073520_0f3c_tms.xml,_,Oleksandr Leiberiuk - 0004,"POLYGON ((35.86952 46.65867, 35.86971 46.65775..."
4,2018-08-22,20180823_073521_0f3c_tms.xml,_,Oleksandr Leiberiuk - 0005,"POLYGON ((35.85320 46.66909, 35.85242 46.67009..."


In [76]:
def _sibling_path(path, prefix):
    """Return the path of a file in the same directory as `path`, named `prefix` + basename."""
    directory, file_name = os.path.split(path)
    return os.path.join(directory, '{}{}'.format(prefix, file_name))


def valid_invalid_split(path, invalid_symbols=('Sentinel', '10.04.18', ',', '?????', 'NULL', '(', ')')):
    """Split one GeoJSON label file into valid rows and rejected rows.

    Rows are rejected in two stages:

    1. rows whose ``planet_img`` or ``geometry`` is missing are written to
       ``none_<file name>`` in the same directory;
    2. of the remaining rows, those whose ``planet_img`` contains any of
       ``invalid_symbols`` are written to ``inv_symbols_<file name>``.

    The file at ``path`` is then overwritten with the rows that passed both
    checks, or deleted when no row passed. A reject file is only written when
    it would contain at least one row. The number of rejected rows of each
    stage is printed.

    Parameters
    ----------
    path : str
        Path of the GeoJSON file to process. The file is modified in place.
    invalid_symbols : iterable of str, optional
        Substrings that mark a ``planet_img`` value as invalid.

    Returns
    -------
    None
    """
    df = gpd.read_file(path)

    # Stage 1: rows with a missing image reference or missing geometry.
    none_fields_mask = df.planet_img.isna() | df.geometry.isna()
    print('Number of none fields: ', none_fields_mask.sum())

    none_df = df[none_fields_mask]
    if len(none_df) > 0:
        # Build the sibling path from the directory and basename. The earlier
        # path.replace(file_name, ...) would also rewrite any directory
        # component that happens to contain the file name.
        none_df.to_file(_sibling_path(path, 'none_'), driver='GeoJSON')

    df = df[~none_fields_mask]

    # Stage 2: rows whose image reference contains a known-bad marker.
    # str() keeps the substring test from raising on non-string values, and
    # astype(bool) guarantees a boolean mask even when no rows are left (an
    # empty non-boolean mask would be interpreted as a column selector).
    invalid_symbols_mask = df.planet_img.map(
        lambda p_img: any(s in str(p_img) for s in invalid_symbols)
    ).astype(bool)
    print('Number of invalid symbols fields: ', invalid_symbols_mask.sum())

    invalid_symbols_df = df[invalid_symbols_mask]
    if len(invalid_symbols_df) > 0:
        invalid_symbols_df.to_file(_sibling_path(path, 'inv_symbols_'), driver='GeoJSON')

    # Keep only the valid rows in the original file; drop the file when nothing is left.
    df = df[~invalid_symbols_mask]
    if len(df) > 0:
        df.to_file(path, driver='GeoJSON')
    else:
        os.remove(path)

In [77]:
# Run the valid/invalid split on every collected label file, printing the path
# before each file's counts and a blank line after.
# NOTE: valid_invalid_split() overwrites each file with its valid rows, or
# deletes the file when no valid rows remain, so this cell changes the dataset on disk.
for path in geojson_paths:
    print(path)
    valid_invalid_split(path)
    print()

/datasets/rpartsey/satellite/planet/wfdp_labels/Zaporizhzhia/Zaporozhje_2018_summer_Leiberiuk_2019-01-16.geojson
Number of none fields:  0
Number of invalid symbols fields:  0

/datasets/rpartsey/satellite/planet/wfdp_labels/Zaporizhzhia/Zaporozhje_2019_spring_Leiberiuk_2019-06-11.geojson
Number of none fields:  0
Number of invalid symbols fields:  0

/datasets/rpartsey/satellite/planet/wfdp_labels/Zaporizhzhia/Zaporozhje_2018_autumn_Leiberiuk_2019-01-16.geojson
Number of none fields:  0
Number of invalid symbols fields:  0

/datasets/rpartsey/satellite/planet/wfdp_labels/Zaporizhzhia/Zaporozhje_2018_spring_Leiberiuk_2019-01-16.geojson
Number of none fields:  0
Number of invalid symbols fields:  0

/datasets/rpartsey/satellite/planet/wfdp_labels/Zaporizhzhia/Zaporozhje_2017_autumn_Leiberiuk_2019-02-17.geojson
Number of none fields:  0
Number of invalid symbols fields:  0

/datasets/rpartsey/satellite/planet/wfdp_labels/Zaporizhzhia/Zaporozhje_2017_spring_Leiberiuk_2019-02-08.geojson
Nu

In [24]:
def check_duplicate_poly(df, epsilon=0.1):
    for row, poly in df.iterrows():
        for col, another in df.iterrows():
            if row >= col:
                continue
            