In [1]:
import pandas as pd
import glob as glob
import json
from pprint import pprint


import geojson


In [2]:
msrgb_train = glob.glob('./groundtruth/train/**/*rgb.json', recursive=True)
msrgb_val = glob.glob('./groundtruth/val/**/*rgb.json', recursive=True)

In [3]:
url = 'https://spacenet-dataset.s3.amazonaws.com/Hosted-Datasets/fmow/fmow-rgb/'

In [4]:
from shapely.wkt import loads
from shapely.geometry import mapping


def processJson(filelist, img_url_prefix, dataset):
    result = []
    
    if dataset == 'train':
        prefix = 'train'
    else:
        prefix = 'val'
        
    for f in filelist:
        # with open(f, "r") as infile:
        entry = json.load(open(f))

        category = entry['bounding_boxes'][0]['category']
        num = [str(i) for i in entry['img_filename'].split('_') if i.isdigit()]
        image_url = img_url_prefix + prefix + '/' + category + '/' + category + '_' + num[0] + '/' + category + '_' + num[0] + '_' + num[1] + '_msrgb.jpg' 

        geometry = json.loads(geojson.dumps(mapping(loads(entry['raw_location']))))

        coordinates = json.loads(geojson.dumps(loads(entry['raw_location']).centroid.coords[0]))[::-1]

        entry['category'] = category
        entry['geometry'] = geometry
        entry['coordinates'] = coordinates
        entry['image_url'] = image_url
        entry['set'] = prefix

        result.append(entry)
    return result
    # pprint(url_train + obj + '/' + obj + '_' + idnum + '/' + obj + '_' + idnum + '_' + foldernum + '_msrgb.jpg' )

fulldata = processJson(msrgb_train, url, 'train')
fulldata.extend(processJson(msrgb_val, url, 'value'))

In [5]:
len(fulldata)

416613

In [7]:
pprint(fulldata[0])

{'abs_cal_factors': [{'band': 'blue', 'value': 0.0150102},
                     {'band': 'green', 'value': 0.0120281},
                     {'band': 'red', 'value': 0.0097083},
                     {'band': 'nir', 'value': 0.0134302}],
 'approximate_wavelengths': [661, 545, 477],
 'bounding_boxes': [{'ID': -1,
                     'box': [466, 299, 72, 60],
                     'category': 'fountain',
                     'crowd_rank': 0.662536,
                     'epsg': '4326',
                     'raw_category': 'Fountain',
                     'raw_location': 'POLYGON ((8.5829783604202152 '
                                     '47.4491842338692322, 8.5833139643901841 '
                                     '47.4491842338692322, 8.5833139643901841 '
                                     '47.4489085080230737, 8.5829783604202152 '
                                     '47.4489085080230737, 8.5829783604202152 '
                                     '47.4491842338692322))',
             

In [8]:
with open("FullData.json", "w") as outfile:
    json.dump(fulldata, outfile)

In [9]:
json.dump(fulldata[:4000], open('sampletestingdata.json', 'w'))

In [10]:
load_json = pd.read_json('FullData.json')

In [11]:
load_json['bounding_boxes']

0         [{'raw_category': 'Fountain', 'category': 'fou...
1         [{'raw_category': 'Fountain', 'category': 'fou...
2         [{'raw_category': 'Fountain', 'category': 'fou...
3         [{'raw_category': 'Fountain', 'category': 'fou...
4         [{'raw_category': 'Fountain', 'category': 'fou...
                                ...                        
416608    [{'raw_category': 'Smokestack', 'category': 's...
416609    [{'raw_category': 'Smokestack', 'category': 's...
416610    [{'raw_category': 'Smokestack', 'category': 's...
416611    [{'raw_category': 'Smokestack', 'category': 's...
416612    [{'raw_category': 'Smokestack', 'category': 's...
Name: bounding_boxes, Length: 416613, dtype: object

In [12]:
reverseCoordinates = load_json['coordinates'].apply(lambda x: x[::-1])

In [13]:
load_json['coordinates'][0]

[47.4490394778, 8.583081799999999]

In [16]:
reverseCoordinates[0]

[8.583081799999999, 47.4490394778]

In [17]:
pd.set_option('display.max_colwidth', None)
load_json[load_json['category'] == 'multi-unit_residential']['image_url'].head()

102904    https://spacenet-dataset.s3.amazonaws.com/Hosted-Datasets/fmow/fmow-rgb/train/multi-unit_residential/multi-unit_residential_232/multi-unit_residential_232_8_msrgb.jpg
102905    https://spacenet-dataset.s3.amazonaws.com/Hosted-Datasets/fmow/fmow-rgb/train/multi-unit_residential/multi-unit_residential_232/multi-unit_residential_232_9_msrgb.jpg
102906    https://spacenet-dataset.s3.amazonaws.com/Hosted-Datasets/fmow/fmow-rgb/train/multi-unit_residential/multi-unit_residential_232/multi-unit_residential_232_2_msrgb.jpg
102907    https://spacenet-dataset.s3.amazonaws.com/Hosted-Datasets/fmow/fmow-rgb/train/multi-unit_residential/multi-unit_residential_232/multi-unit_residential_232_3_msrgb.jpg
102908    https://spacenet-dataset.s3.amazonaws.com/Hosted-Datasets/fmow/fmow-rgb/train/multi-unit_residential/multi-unit_residential_232/multi-unit_residential_232_5_msrgb.jpg
Name: image_url, dtype: object

In [18]:
load_json.columns

Index(['img_filename', 'bounding_boxes', 'gsd', 'img_width', 'img_height',
       'mean_pixel_height', 'mean_pixel_width', 'utm', 'country_code',
       'cloud_cover', 'timestamp', 'scan_direction', 'approximate_wavelengths',
       'pan_resolution_dbl', 'pan_resolution_start_dbl',
       'pan_resolution_end_dbl', 'pan_resolution_min_dbl',
       'pan_resolution_max_dbl', 'multi_resolution_dbl',
       'multi_resolution_start_dbl', 'multi_resolution_end_dbl',
       'multi_resolution_min_dbl', 'multi_resolution_max_dbl',
       'target_azimuth_dbl', 'target_azimuth_start_dbl',
       'target_azimuth_end_dbl', 'target_azimuth_min_dbl',
       'target_azimuth_max_dbl', 'sun_azimuth_dbl', 'sun_azimuth_min_dbl',
       'sun_azimuth_max_dbl', 'sun_elevation_dbl', 'sun_elevation_min_dbl',
       'sun_elevation_max_dbl', 'off_nadir_angle_dbl',
       'off_nadir_angle_start_dbl', 'off_nadir_angle_end_dbl',
       'off_nadir_angle_min_dbl', 'off_nadir_angle_max_dbl', 'catalog_id',
       'senso

In [20]:
load_json['sensor_platform_name'].head()

0            GEOEYE01
1         WORLDVIEW02
2    WORLDVIEW03_VNIR
3    WORLDVIEW03_VNIR
4         WORLDVIEW02
Name: sensor_platform_name, dtype: object

In [21]:
extracted_filters=load_json[['img_filename', 'category', 'country_code', 'sensor_platform_name', 'spatial_reference', 'set']]

In [23]:
extracted_filters['coordinates']=reverseCoordinates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_filters['coordinates']=reverseCoordinates


In [24]:
extracted_filters.to_json('./FilteringData.json', orient='records')

In [26]:
extracted_filters.iloc[0]

img_filename                        fountain_514_1_rgb.tif
category                                          fountain
country_code                                           CHE
sensor_platform_name                              GEOEYE01
spatial_reference                             GCS_WGS_1984
set                                                  train
coordinates             [8.583081799999999, 47.4490394778]
Name: 0, dtype: object