In [4]:
# Copy the three inputs used by this notebook from S3 into the local data/ directory:
# the image geometries (GeoJSON), the query results (line-delimited GeoJSON),
# and the nominatim CSV.
!aws s3 cp s3://cloverleaf-dev/image_geometries/7e3583e8-b579-4947-accd-e0685dd4cc57.geojson data/image_geometries.geojson
!aws s3 cp s3://cloverleaf-dev/tdr_cloverleaf_queries/7e3583e8-b579-4947-accd-e0685dd4cc57.geojsonl data/results.geojsonl
!aws s3 cp s3://cloverleaf-dev/nominatims/nominatim_dev.csv data/nominatim.csv

Collecting ujson
  Using cached ujson-3.0.0-cp37-cp37m-macosx_10_14_x86_64.whl (44 kB)
Collecting mocha
  Cloning ssh://****@github.com/maxar-analytics/mocha to /private/var/folders/zp/k9kcczd1733dhwv2dgyqqlmc0000gn/T/pip-install-dygm55_z/mocha
  Running command git clone -q 'ssh://****@github.com/maxar-analytics/mocha' /private/var/folders/zp/k9kcczd1733dhwv2dgyqqlmc0000gn/T/pip-install-dygm55_z/mocha
Collecting skyway
  Cloning ssh://****@github.com/maxar-analytics/skyway (to revision nominatim-load-from-file-object) to /private/var/folders/zp/k9kcczd1733dhwv2dgyqqlmc0000gn/T/pip-install-dygm55_z/skyway
  Running command git clone -q 'ssh://****@github.com/maxar-analytics/skyway' /private/var/folders/zp/k9kcczd1733dhwv2dgyqqlmc0000gn/T/pip-install-dygm55_z/skyway
  Running command git checkout -b nominatim-load-from-file-object --track origin/nominatim-load-from-file-object
  Switched to a new branch 'nominatim-load-from-file-object'
  Branch 'nominatim-load-from-file-object' set up 

In [5]:
from skyway.query.nominatim import Nominatim
import ujson as json

# load image geometries (a single GeoJSON document)
with open("data/image_geometries.geojson") as fh:
    image_geometries = json.load(fh)

# load results: one JSON record per line; blank lines (e.g. a stray empty
# line at the end of the file) are skipped because the parser rejects
# empty input
with open("data/results.geojsonl") as fh:
    results = [json.loads(line) for line in fh if line.strip()]

# load nominatim
nominatim = Nominatim.from_csv("data/nominatim.csv")

In [9]:
# quad_id => catalog_ids
from collections import defaultdict
from shapely.geometry import shape
from maxar_canvas_grid import covers

# Index catalog ids by the id of every zoom-18 grid cell returned by
# `covers` for the image geometry.
images_map = defaultdict(list)
for image_feature in image_geometries.get("features", []):
    image_catalog_id = image_feature["properties"]["catalog_id"]
    footprint = shape(image_feature["geometry"])
    for covered_cell in covers(footprint, zoom=18):
        images_map[covered_cell.id].append(image_catalog_id)

In [14]:
from typing import Dict, List

def match_tags(
    nominatim: "Nominatim", osm_type: str, osm_tags: Dict[str, str]
) -> List[str]:
    """
    Matches osm tags and element information to a nominatim, in order to map the osm results back to feature classes.
    Returns: list of class names (strings), in `nominatim.feature_map` order

    A class matches when its `elements` contain the element id for `osm_type`
    and, for every key in its `tag_profile`, the value of that key in
    `osm_tags` is one of the values the profile allows.

    Arguments:
    * `nominatim` - the nominatim of features used to map osm data to feature classes
    * `osm_type` - the element type of a feature, e.g. "node" or "way"
    * `osm_tags` - dictionary of osm tags returned for a feature; `None` is treated as no tags

    Raises:
    * `ValueError` - if `osm_type` is neither "node" nor "way"
    """
    if osm_type not in ["node", "way"]:
        raise ValueError(f"Invalid `osm_type`: {osm_type}")

    # a result without tags (e.g. a JSON null) is handled like an empty dict
    osm_tags = osm_tags or {}

    # element ids as checked against `feature.elements`: 1 for nodes, 2 for ways
    element_id = 1 if osm_type == "node" else 2

    return [
        class_name
        for class_name, feature in nominatim.feature_map.items()
        if element_id in feature.elements
        # `all` stops at the first tag that does not fit the profile
        and all(osm_tags.get(k) in v for k, v in feature.tag_profile.items())
    ]

In [15]:
# Index matched class names by the id of every zoom-18 grid cell returned
# by `covers` for the result geometry.
class_map = defaultdict(list)
total_results = len(results)
for i, result in enumerate(results):
    # report progress before any `continue`, so every 10000th result is
    # logged even when it matches no classes
    if i % 10000 == 0:
        print(f"Completed {i} of {total_results} results")

    properties = result["properties"]
    classes = match_tags(
        nominatim,
        properties["osm_type"],
        properties["osm_tags"],
    )
    if not classes:
        continue

    geom = shape(result["geometry"])
    for cell in covers(geom, zoom=18):
        # every matching result appends its classes, so a class name can
        # appear more than once in a cell's list
        class_map[cell.id] += classes

Completed 0 of 2485435 results
Completed 1000 of 2485435 results
Completed 2000 of 2485435 results
Completed 3000 of 2485435 results
Completed 4000 of 2485435 results
Completed 5000 of 2485435 results
Completed 6000 of 2485435 results
Completed 7000 of 2485435 results
Completed 8000 of 2485435 results
Completed 9000 of 2485435 results
Completed 10000 of 2485435 results
Completed 11000 of 2485435 results
Completed 12000 of 2485435 results
Completed 13000 of 2485435 results
Completed 14000 of 2485435 results
Completed 15000 of 2485435 results
Completed 16000 of 2485435 results
Completed 17000 of 2485435 results
Completed 18000 of 2485435 results
Completed 19000 of 2485435 results
Completed 20000 of 2485435 results
Completed 21000 of 2485435 results
Completed 22000 of 2485435 results
Completed 23000 of 2485435 results
Completed 24000 of 2485435 results
Completed 25000 of 2485435 results
Completed 26000 of 2485435 results
Completed 27000 of 2485435 results
Completed 28000 of 2485435 result

In [21]:
# Quad ids present in both maps, i.e. cells that have at least one image
# and at least one matched class.
images_keys = set(images_map.keys())
class_keys = set(class_map.keys())
valid_quad_ids = class_keys.intersection(images_keys)
len(valid_quad_ids)

1387271

In [22]:
# One category per nominatim feature class, with 1-based ids assigned in
# `feature_map` key order.
mocha_categories = [
    {"id": category_id, "name": feature_name}
    for category_id, feature_name in enumerate(nominatim.feature_map.keys(), start=1)
]

# Lookup from category name to category id.
categories_map = {entry["name"]: entry["id"] for entry in mocha_categories}

In [27]:
from datetime import datetime
from maxar_canvas_grid import Cell

# Build the image and annotation records: one image per (quad, catalog id)
# pair and one annotation per (image, class entry) pair. Ids are 1-based.
mocha_images, mocha_annotations = [], []
images_index, annotations_index = 1, 1
now = datetime.now().isoformat()

# Iterate in sorted order so image/annotation ids are reproducible between
# kernel sessions (set iteration order is not stable for string keys).
# NOTE(review): assumes quad ids are mutually comparable (e.g. all str) — confirm
for i, quad_id in enumerate(sorted(valid_quad_ids)):
    image_ids = []

    # compute bounds: the cell's bounding-box corners flattened into a
    # closed ring [x, y, x, y, ...] that ends on its starting corner
    x_min, y_min, x_max, y_max = Cell(quad_id).geom_WGS84.bounds
    image_bounds = [x_min, y_min, x_min, y_max, x_max, y_max, x_max, y_min, x_min, y_min]

    for catalog_id in images_map.get(quad_id, []):
        chip_name = f"{quad_id}_{catalog_id}.json"
        mocha_images.append({
            "id": images_index,
            "width": 256,
            "height": 256,
            "file_name": chip_name,
            # must be an f-string, otherwise the literal placeholders
            # "{quad_id}_{catalog_id}" end up in the dataset
            "image_path": f"s3://imagesim-storage/chips/0.1/{chip_name}",
            "date_captured": now,
            "image_bounds": image_bounds,
            "epsg_code": 4326,
        })

        image_ids.append(images_index)
        images_index += 1

    # the class list is the same for every image of this quad
    quad_categories = class_map.get(quad_id, [])
    for image_id in image_ids:
        for category in quad_categories:
            category_id = categories_map[category]
            mocha_annotations.append({
                "id": annotations_index,
                "image_id": image_id,
                "category_id": category_id,
            })
            annotations_index += 1

    if i % 10000 == 0:
        print(f'through {i} of {len(valid_quad_ids)} iterations')

through 0 of 1387271 iterations
through 10000 of 1387271 iterations
through 20000 of 1387271 iterations
through 30000 of 1387271 iterations
through 40000 of 1387271 iterations
through 50000 of 1387271 iterations
through 60000 of 1387271 iterations
through 70000 of 1387271 iterations
through 80000 of 1387271 iterations
through 90000 of 1387271 iterations
through 100000 of 1387271 iterations
through 110000 of 1387271 iterations
through 120000 of 1387271 iterations
through 130000 of 1387271 iterations
through 140000 of 1387271 iterations
through 150000 of 1387271 iterations
through 160000 of 1387271 iterations
through 170000 of 1387271 iterations
through 180000 of 1387271 iterations
through 190000 of 1387271 iterations
through 200000 of 1387271 iterations
through 210000 of 1387271 iterations
through 220000 of 1387271 iterations
through 230000 of 1387271 iterations
through 240000 of 1387271 iterations
through 250000 of 1387271 iterations
through 260000 of 1387271 iterations
through 270000 

In [29]:
from datetime import datetime

# Dataset-level metadata stored under the "info" key.
mocha_info = dict(
    version="0.1",
    description="osm-based dataset used for imagesim model, built with cloverleaf/skyway",
    contributor="cameron.derwin@maxar.com",
    s3_path="s3://imagesim-storage/datasets/0.1/dataset.json",
    date_created=datetime.now().isoformat(),
)

# Full dataset document: metadata plus the image, annotation and category lists.
mocha_json = dict(
    info=mocha_info,
    images=mocha_images,
    annotations=mocha_annotations,
    categories=mocha_categories,
)

# Serialize the whole document to a local file.
with open('mocha.json', 'w') as fh:
    json.dump(mocha_json, fh)

In [33]:
# Average number of annotations per image in the generated dataset.
len(mocha_annotations) / len(mocha_images)

3.8504479867813477