In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from google.colab import drive
import ee
import geemap.foliumap as geemap

In [2]:
# Authenticate with Earth Engine and initialise the session for this project,
# then mount Google Drive so the /content/drive paths below are reachable.
ee.Authenticate()
ee.Initialize(project="215656163750")
drive.mount('/content/drive')

# `explore` is imported here rather than in the imports cell because its
# folder has to be added to sys.path first (presumably explore.py lives in
# this Drive folder - verify).
sys.path.append('/content/drive/MyDrive/Colab Notebooks/')
import explore

# Directory the boundary GeoJSON files are written to. The Indonesia, Sudan and
# South Africa source shapefiles are also read from this directory.
shp_path = "/content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/"

Mounted at /content/drive


In [3]:
def get_relevant_boundaries(state_boundary, subareas, row_indexes, rename,
                            fname):
  """
  For application regions that aren't whole countries, get the boundaries of
  the lower-level admin units within the district of interest.

  The selected units are written to `fname` as GeoJSON and also returned.

  Args:
    state_boundary: GeoDataFrame with the boundary of the region of interest
      (a state/province, or a whole country).
    subareas: GeoDataFrame of lower-level admin units to select from.
    row_indexes: None to keep every matching subarea, or a two-item sequence
      [start, stop] applied as the positional slice start:stop to the
      matching subareas. A stop beyond the last row is allowed.
    rename: dict mapping existing column names to standardized names. Only
      the renamed columns and "geometry" are kept, in the dict's order.
    fname: path of the GeoJSON file to write.

  Returns:
    GeoDataFrame holding the standardized columns plus "geometry", with a
    fresh 0-based index.

  Raises:
    KeyError: if a standardized column or "geometry" is absent after
      renaming (e.g. a key of `rename` is not a column of the join result).
  """

  # Subareas that intersect the boundary. Under the "intersects" predicate a
  # unit that only touches the boundary from outside also matches.
  # NOTE(review): lsuffix=None presumably keeps the `subareas` column names
  # unsuffixed when both inputs share column names - confirm against the
  # installed geopandas version.
  gdf = gpd.sjoin(subareas, state_boundary, how="inner",
                  predicate="intersects", lsuffix=None)
  print(f"There are {len(gdf)} subareas in this administrative region")

  # If we want to take a subset of rows (subareas)
  if row_indexes is not None:
    start, stop = row_indexes[0], row_indexes[1]
    print(f"  - Selecting rows {start}:{stop}")
    gdf = gdf.iloc[start:stop]
    print(f"  - Selected {len(gdf)} rows")

  # Standardize the admin region name(s) and code(s). A new frame is built
  # instead of renaming in place, because gdf may be a slice of the join
  # result at this point.
  gdf = gdf.rename(columns=rename)

  # Drop extraneous columns. Check first so that a wrong `rename` mapping is
  # reported by column name rather than by a bare indexing error.
  keep = list(rename.values()) + ["geometry"]
  missing = [col for col in keep if col not in gdf.columns]
  if missing:
    raise KeyError(f"Columns {missing} not found after renaming; "
                   f"available columns: {list(gdf.columns)}")
  gdf = gdf[keep].reset_index(drop=True)

  # Save boundary file for use by getSentinel
  print(f"Saving boundary file to {fname}")
  gdf.to_file(fname, driver="GeoJSON")

  return gdf


In [4]:
# India - Gujarat: inputs shared by both Gujarat cells below

path = "/content/drive/MyDrive/CAFO_data/India/"

# ADM1 and ADM3 boundary files covering the whole of India
india_adm1 = gpd.read_file(path + "geoBoundaries-IND-ADM1.geojson")
india_adm3 = gpd.read_file(path + "geoBoundaries-IND-ADM3.geojson")

# Gujarat state boundary: the ADM1 row(s) whose ISO code is IN-GJ
state_boundary = india_adm1[india_adm1["shapeISO"] == "IN-GJ"]

In [5]:
# India - Gujarat, first half
# NOTE: state_boundary is rebound by the West Java and Western Cape cells, so
# the Gujarat prep cell must be the most recent one to have set it.

gujarat1 = {
  "Name": "Gujarat1",
  "Country code": "IND",
  "CRS": "EPSG:7761",
}

# Rows 0:150 of the 299 ADM3 regions matched to Gujarat
gujarat1["Boundary gdf"] = get_relevant_boundaries(
  state_boundary,
  india_adm3,
  row_indexes=[0, 150],
  rename={"shapeName": "ADM3", "shapeID": "ADM3_CODE"},
  fname=shp_path + "Gujarat1_ADM3.geojson",
)

# Columns of the boundary gdf that hold each region's code and name
gujarat1.update({"Region code": "ADM3_CODE", "Region name": "ADM3"})

There are 299 subareas in this administrative region
  - Selecting rows 0:150
  - Selected 150 rows
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/Gujarat1_ADM3.geojson


In [6]:
# India - Gujarat, second half
# NOTE: state_boundary is rebound by the West Java and Western Cape cells, so
# the Gujarat prep cell must be the most recent one to have set it.

gujarat2 = {
  "Name": "Gujarat2",
  "Country code": "IND",
  "CRS": "EPSG:7761",
}

# Rows 150:300 of the ADM3 regions matched to Gujarat, i.e. everything after
# the first 150 (the slice simply stops at the last available row)
gujarat2["Boundary gdf"] = get_relevant_boundaries(
  state_boundary,
  india_adm3,
  row_indexes=[150, 300],
  rename={"shapeName": "ADM3", "shapeID": "ADM3_CODE"},
  fname=f"{shp_path}Gujarat2_ADM3.geojson",
)

# Columns of the boundary gdf that hold each region's code and name
gujarat2.update({"Region code": "ADM3_CODE", "Region name": "ADM3"})

There are 299 subareas in this administrative region
  - Selecting rows 150:300
  - Selected 149 rows
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/Gujarat2_ADM3.geojson


In [7]:
# Indonesia - West Java

w_java = {
  "Name": "West_Java",
  "Country code": "IDN",
  "CRS": "EPSG:23830",
}

# ADM1 and ADM2 boundary shapefiles for the whole of Indonesia
indonesia_adm1 = gpd.read_file(shp_path + "idn_admbnda_adm1_bps_20200401.shp")
indonesia_adm2 = gpd.read_file(shp_path + "idn_admbnda_adm2_bps_20200401.shp")

# West Java boundary: the ADM1 row(s) named "Jawa Barat".
# NOTE: this rebinds the notebook-level state_boundary used by other cells.
state_boundary = indonesia_adm1[indonesia_adm1["ADM1_EN"] == "Jawa Barat"]

# ADM2 regions matched to West Java, with standardized column names
w_java["Boundary gdf"] = get_relevant_boundaries(
  state_boundary,
  indonesia_adm2,
  row_indexes=None,
  rename={
    "ADM2_EN": "ADM2",
    "ADM1_EN": "ADM1",
    "ADM0_EN": "ADM0",
    "ADM2_PCODE": "ADM2_CODE",
  },
  fname=f"{shp_path}West_Java_ADM2.geojson",
)

# Columns of the boundary gdf that hold each region's code and name
w_java.update({"Region code": "ADM2_CODE", "Region name": "ADM2"})

There are 36 subareas in this administrative region
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/West_Java_ADM2.geojson


In [15]:
# Sudan - the whole country, so the ADM0 boundary stands in for a state boundary

sudan = {
  "Name": "Sudan",
  "Country code": "SDN",
  "CRS": "EPSG:20135",
}

# Country (ADM0) boundary and ADM2 region boundaries
sudan_adm0 = gpd.read_file(f"{shp_path}sdn_admbnda_adm0_cbs_nic_ssa_20200831.shp")
sudan_adm2 = gpd.read_file(f"{shp_path}sdn_admbnda_adm2_cbs_nic_ssa_20200831.shp")

# ADM2 regions matched to the country boundary, with standardized column names
sudan["Boundary gdf"] = get_relevant_boundaries(
  sudan_adm0,
  sudan_adm2,
  row_indexes=None,
  rename={
    "ADM2_EN": "ADM2",
    "ADM1_EN": "ADM1",
    "ADM0_EN": "ADM0",
    "ADM2_PCODE": "ADM2_CODE",
  },
  fname=f"{shp_path}Sudan_ADM2.geojson",
)

# Columns of the boundary gdf that hold each region's code and name
sudan.update({"Region code": "ADM2_CODE", "Region name": "ADM2"})

There are 189 subareas in this administrative region
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/Sudan_ADM2.geojson


In [49]:
# South Africa - Western Cape
# NOTE(review): unlike the other regions, "Name" contains a space here, and it
# is used to build the output pickle file name in the export cell - confirm
# that this is intended.

w_cape = {
  "Name": "Western Cape",
  "Country code": "ZAF",
  "CRS": "EPSG:2055",
}

# ADM1 and ADM3 boundary shapefiles for the whole of South Africa
sa_adm1 = gpd.read_file(shp_path + "zaf_admbnda_adm1_sadb_ocha_20201109.shp")
sa_adm3 = gpd.read_file(shp_path + "zaf_admbnda_adm3_sadb_ocha_20201109.shp")

# Western Cape boundary: the ADM1 row(s) with that name.
# NOTE: this rebinds the notebook-level state_boundary used by other cells.
state_boundary = sa_adm1[sa_adm1["ADM1_EN"] == "Western Cape"]

# ADM3 regions matched to Western Cape, with standardized column names
w_cape["Boundary gdf"] = get_relevant_boundaries(
  state_boundary,
  sa_adm3,
  row_indexes=None,
  rename={
    "ADM3_EN": "ADM3",
    "ADM2_EN": "ADM2",
    "ADM1_EN": "ADM1",
    "ADM0_EN": "ADM0",
    "ADM3_PCODE": "ADM3_CODE",
  },
  fname=f"{shp_path}WesternCape_ADM3.geojson",
)

# Columns of the boundary gdf that hold each region's code and name
w_cape.update({"Region code": "ADM3_CODE", "Region name": "ADM3"})

There are 31 subareas in this administrative region
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/WesternCape_ADM3.geojson


In [16]:
# Size threshold handed to explore.get_buildings. Presumably a building
# footprint area in square metres - TODO confirm against explore.get_buildings.
min_building_size = 800
sentinel_bands = ['B4', 'B3', 'B2'] # just for visualization
sentinel_year = 2023 # just for visualization

# Region to process: one of the region dicts built in the cells above
# (gujarat1, gujarat2, w_java, sudan, w_cape). Every later cell reads `where`.
where = sudan

In [17]:

# Per-region results, keyed by admin region code. Regions without any large
# building get no entry.
data_dict = {}

# Full set of admin regions. The explore helpers are handed `where` with
# "Boundary gdf" narrowed to a single region, so the full set has to be put
# back after every iteration.
original_boundaries = where["Boundary gdf"].copy()

codes = original_boundaries[where["Region code"]]
names = original_boundaries[where["Region name"]]

for n, (adm_code, adm_name) in enumerate(zip(codes, names)):

  print(f'Processing {adm_code}:{adm_name} ({n}/{len(codes)})')

  # try/finally guarantees the full boundary set is restored even if a call
  # below raises. Otherwise `where` would be left holding one region, and
  # re-running this cell would snapshot that single row as the "original".
  try:
    # Get the large buildings for this region
    where["Boundary gdf"] = original_boundaries[
        original_boundaries[where["Region code"]] == adm_code]
    where["Boundary"] = where["Boundary gdf"].geometry.iloc[0]
    buildings_fc, boundary = explore.get_buildings(where, min_building_size,
                                                   where["Country code"])

    # Some regions have no large buildings, so continue w/o them
    # (the finally clause still runs before moving to the next region)
    if buildings_fc.first().getInfo() is None:
      print(f" -- No large buildings in {adm_code}:{adm_name}")
      continue

    # Merge buildings into "clusters", find largest building per cluster
    merged, largest = explore.merge_and_make_box(where, buildings_fc)

    # Get Sentinel data for visualization
    sentinel = explore.get_sentinel(where, boundary, sentinel_bands,
                                    sentinel_year)

    # Add output to data dict
    data_dict[adm_code] = {"Buildings": buildings_fc, "Boundary": boundary,
                           "Merged": merged, "Largest": largest,
                           "Sentinel": sentinel}
  finally:
    # Restore original admin regions
    where["Boundary gdf"] = original_boundaries

print(f'\nCreated large building images for {len(data_dict)} admin districts')

Processing SD07090:Abassiya (0/189)
Went from 3 buildings to 3 boxes
Processing SD16008:Abu Hamad (1/189)
Went from 56 buildings to 40 boxes
Processing SD14037:Abu Hujar (2/189)
Went from 21 buildings to 11 boxes
Processing SD05140:Abu Jabrah (3/189)
Went from 3 buildings to 3 boxes
Processing SD07088:Abu Jubayhah (4/189)
 -- No large buildings in SD07088:Abu Jubayhah
Processing SD05155:Abu Karinka (5/189)
 -- No large buildings in SD05155:Abu Karinka
Processing SD07104:Abu Kershola (6/189)
 -- No large buildings in SD07104:Abu Kershola
Processing SD18028:Abu Zabad (7/189)
Went from 3 buildings to 3 boxes
Processing SD18087:Abyei (8/189)
Went from 8 buildings to 4 boxes
Processing SD19001:Abyei PCA area (9/189)
 -- No large buildings in SD19001:Abyei PCA area
Processing SD17019:Ad Dabbah (10/189)
Went from 37 buildings to 29 boxes
Processing SD14039:Ad Dali (11/189)
Went from 1 buildings to 1 boxes
Processing SD16011:Ad Damar (12/189)
Went from 384 buildings to 166 boxes
Processing SD1

In [27]:
# Show the "Largest" buildings of every processed admin region on one
# interactive map. This "works" in the sense that it doesn't crash, but it can
# be very slow.

os.environ["HYBRID"] = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'

Map = geemap.Map()
Map.add_basemap("HYBRID")

# Visualization parameters per layer type. Only largest_viz is used while the
# optional layers in the loop below stay commented out.
# NOTE(review): fillColor '00000000' presumably means a fully transparent
# fill - confirm.
sentinel_viz = dict(min=0.0, max=3000, bands=['B4', 'B3', 'B2'])
boundary_viz = dict(color='red', width=2, fillColor='00000000')
buildings_viz = dict(color='yellow', width=2, fillColor='00000000')
merged_viz = dict(color='cyan', width=2, fillColor='00000000')
largest_viz = dict(color='blue', width=2, fillColor='00000000')

for n, adm_region in enumerate(data_dict):

  largest_fc = geemap.geopandas_to_ee(data_dict[adm_region]["Largest"])

  # Centre the view once, on the first feature of the first region
  if n == 0:
    Map.centerObject(largest_fc.first().geometry(), 8)

  # One layer per region; every layer is added under the same name
  Map.addLayer(largest_fc.style(**largest_viz), {}, "Largest")

  # Optional extra layers, disabled. Uncomment to add them:
  #buildings_fc = data_dict[adm_region]["Buildings"]
  #merged_fc = geemap.geopandas_to_ee(data_dict[adm_region]["Merged"])
  #Map.addLayer(data_dict[adm_region]["Sentinel"], sentinel_viz, "Sentinel")
  #Map.addLayer(data_dict[adm_region]["Boundary"].style(**boundary_viz), {}, "Boundary")
  #Map.addLayer(buildings_fc.style(**buildings_viz), {}, "Buildings")
  #Map.addLayer(merged_fc.style(**merged_viz), {}, "Merged")

Map

In [18]:
# Write the candidate buildings to a pickle file for use by getSentinel

# Stack the "Largest" frames of all processed regions, label every row, give
# the area column its display name, drop the unused source columns and
# renumber the rows from 0.
candidates = (
  pd.concat([data_dict[adm_region]["Largest"] for adm_region in data_dict])
  .assign(**{"Farm type": "Unlabeled", "Dataset name": where["Name"]})
  .rename(columns={"area_in_meters": "Area (sq m)"})
  .drop(columns=["bf_source", "boundary_id", "confidence"])
  .reset_index(drop=True)
)

# NOTE: this rebinds the notebook-level `path` that the Gujarat prep cell
# sets to the India data directory.
path = "/content/drive/MyDrive/CAFO_data/forTraining/interim_files/"
fname = f"{path}{where['Name']}_bldgs.pkl"

print(f'Saving {len(candidates)} large building images from {len(data_dict)} admin districts to {fname}')
candidates.to_pickle(fname)

Saving 5420 large building images from 156 admin districts to /content/drive/MyDrive/CAFO_data/forTraining/interim_files/Sudan_bldgs.pkl
