In [15]:
%load_ext autoreload
%autoreload 2

import sys, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from google.colab import drive
import ee
import geemap.foliumap as geemap

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# Authenticate with Earth Engine and initialise the client for this project
ee.Authenticate()
ee.Initialize(project="215656163750")
# Mount Google Drive in the Colab filesystem; all data paths below live under it
drive.mount('/content/drive')

# Make the Drive notebook folder importable, then load the project helper
# module (presumably explore.py lives in that folder -- confirm)
sys.path.append('/content/drive/MyDrive/Colab Notebooks/')
import explore

# Directory where boundary files will be written (the Indonesia shapefiles
# are also read from here)
shp_path = "/content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
def get_relevant_boundaries(state_boundary, subareas, row_indexes, old_col,\
                            new_col, fname):
  """
  Collect the lower-level admin units that belong to a sub-national region
  of interest and write them to a boundary file.

  The subareas are spatially joined (inner join, "intersects" predicate)
  against `state_boundary`, so any subarea that intersects the region is kept.

  Parameters
  ----------
  state_boundary : GeoDataFrame holding the outline of the region of interest
  subareas : GeoDataFrame of lower-level admin units to select from
  row_indexes : None to keep every matching subarea, or a two-element
      sequence [start, stop] used as a positional slice of the joined rows
  old_col : name of the column holding the subarea names after the join
  new_col : standardized name that column is renamed to
  fname : path of the GeoJSON file to write

  Returns
  -------
  GeoDataFrame with only the columns [new_col, "geometry"] and a fresh
  0..n-1 index.
  """

  # lsuffix=None is handed straight to sjoin; presumably this keeps the
  # subarea columns unsuffixed when both frames share column names --
  # verify against the installed geopandas version.
  joined = gpd.sjoin(subareas, state_boundary, how="inner",
                     predicate="intersects", lsuffix=None)
  print(f"There are {len(joined)} subareas in this administrative region")

  # Optionally keep only a positional slice of the matching subareas
  if row_indexes is not None:
    start, stop = row_indexes[0], row_indexes[1]
    print(f"  - Selecting rows {start}:{stop}")
    joined = joined.iloc[start:stop]
    print(f"  - Selected {len(joined)} rows")

  # Standardize the name column and discard everything except name + geometry
  boundaries = joined.rename(columns={old_col: new_col})[[new_col, "geometry"]]
  boundaries = boundaries.reset_index(drop=True)

  # Persist the boundaries for use by getSentinel
  print(f"Saving boundary file to {fname}")
  boundaries.to_file(fname, driver="GeoJSON")

  return boundaries


In [39]:
# India - Gujarat, prep: shared inputs for both Gujarat cells below

# Directory holding the India geoBoundaries files
path = "/content/drive/MyDrive/CAFO_data/India/"

# State outline: the ADM1 row whose ISO code is Gujarat's
india_adm1 = gpd.read_file(path + "geoBoundaries-IND-ADM1.geojson")
state_boundary = india_adm1.loc[india_adm1["shapeISO"] == "IN-GJ"]

# All ADM3-level units in India; narrowed to Gujarat later
india_adm3 = gpd.read_file(path + "geoBoundaries-IND-ADM3.geojson")

In [40]:
# India - Gujarat part 1

gujarat1 = {"Name": "Gujarat1", "Code": "IND", "CRS": "EPSG:7761"}

# Boundaries of the first 150 of the 299 ADM3 regions within Gujarat
gujarat1["Boundary gdf"] = get_relevant_boundaries(
    state_boundary,
    india_adm3,
    row_indexes=[0, 150],
    old_col="shapeName",
    new_col="ADM3",
    fname=shp_path + "Gujarat1_ADM3.geojson",
)
gujarat1["Region col"] = "ADM3"

There are 299 subareas in this administrative region
  - Selecting rows 0:150
  - Selected 150 rows
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/Gujarat1_ADM3.geojson


In [41]:
# India - Gujarat part 2

gujarat2 = {"Name": "Gujarat2", "Code": "IND", "CRS": "EPSG:7761"}

# Boundaries of the remaining ADM3 regions within Gujarat. The slice end
# (300) may exceed the number of rows; positional slicing just stops at
# the last row.
gujarat2["Boundary gdf"] = get_relevant_boundaries(
    state_boundary,
    india_adm3,
    row_indexes=[150, 300],
    old_col="shapeName",
    new_col="ADM3",
    fname=f"{shp_path}Gujarat2_ADM3.geojson",
)
gujarat2["Region col"] = "ADM3"

There are 299 subareas in this administrative region
  - Selecting rows 150:300
  - Selected 149 rows
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/Gujarat2_ADM3.geojson


In [48]:
# Indonesia - West Java

w_java = {"Name": "West_Java", "Code": "IDN", "CRS": "EPSG:23830"}

# Province outline: the ADM1 row named "Jawa Barat".
# NOTE: this rebinds `state_boundary`, replacing the Gujarat boundary set in
# the Gujarat prep cell; rerun that cell before rerunning the Gujarat cells.
indonesia_adm1 = gpd.read_file(shp_path + "idn_admbnda_adm1_bps_20200401.shp")
state_boundary = indonesia_adm1.loc[indonesia_adm1["ADM1_EN"] == "Jawa Barat"]

# Every ADM3 unit intersecting West Java (no row subset)
indonesia_adm3 = gpd.read_file(shp_path + "idn_admbnda_adm3_bps_20200401.shp")
w_java["Boundary gdf"] = get_relevant_boundaries(
    state_boundary,
    indonesia_adm3,
    row_indexes=None,
    old_col="ADM3_EN",
    new_col="ADM3",
    fname=f"{shp_path}West_Java_ADM3.geojson",
)

w_java["Region col"] = "ADM3"

There are 662 subareas in this administrative region
Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/West_Java_ADM3.geojson


In [None]:
# Sudan - entire country

sudan = {"Name": "Sudan", "Code": "SDN", "CRS": "EPSG:20135"}

# Whole-country case: read the ADM2 layer directly, standardize the name
# column and keep only name + geometry.
# NOTE(review): `path` is whatever an earlier cell last assigned (the India
# directory or the interim_files directory) -- confirm the Sudan shapefile
# really lives there.
sudan["Boundary gdf"] = (
    gpd.read_file(path + "sdn_admbnda_adm2_cbs_nic_ssa_20200831.shp")
    .rename(columns={"ADM2_EN": "ADM2"})
    [["ADM2", "geometry"]]
)
sudan["Region col"] = "ADM2"

In [None]:
# South Africa - Western Cape

w_cape = {"Name": "Western Cape", "Code": "ZAF"}
w_cape["CRS"] = "EPSG:2055"

# NOTE(review): `path` is whatever an earlier cell last assigned (the India
# directory or the interim_files directory) -- confirm the South Africa
# shapefiles really live there.

# Western Cape state boundary
sa_adm1 = gpd.read_file(f"{path}zaf_admbnda_adm1_sadb_ocha_20201109.shp")
state_boundary = sa_adm1[sa_adm1["ADM1_EN"] == "Western Cape"]

# ADM3 boundaries within Western Cape.
# Arguments are passed by keyword to match the current signature of
# get_relevant_boundaries (state_boundary, subareas, row_indexes, old_col,
# new_col, fname); the previous positional call put "ADM3_EN" in
# row_indexes and the directory in new_col.
sa_adm3 = gpd.read_file(f"{path}zaf_admbnda_adm3_sadb_ocha_20201109.shp")
w_cape["Boundary gdf"] = get_relevant_boundaries(state_boundary, sa_adm3,\
                                                 row_indexes=None,\
                                                 old_col="ADM3_EN",\
                                                 new_col="ADM3",\
                                                 fname=shp_path+"WesternCape_ADM3.geojson")
w_cape["Region col"] = "ADM3"

Saving boundary file to /content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/WesternCape_ADM3.geojson


In [49]:
# Building-size threshold handed to explore.get_buildings
# (presumably square metres -- confirm against explore)
min_building_size = 800

# Sentinel settings, used just for visualization
sentinel_year = 2023
sentinel_bands = ["B4", "B3", "B2"]

# Region dictionary to process in the cells below
where = w_java

In [50]:

# Per-region results, keyed by admin-region name
data_dict = {}

# Snapshot of the full boundary table. where["Boundary gdf"] is temporarily
# narrowed to one region inside the loop and must always be put back.
original_boundaries = where["Boundary gdf"].copy()
regions = original_boundaries[where["Region col"]].unique()
# NOTE(review): region names are not necessarily unique per row (the recorded
# run reports 662 subareas but 610 names); where["Boundary"] below uses only
# the first geometry of a name -- confirm this is what explore expects.
for n, adm_region in enumerate(regions):
  print(f'Processing {adm_region} ({n}/{len(regions)})')

  try:
    # Get the large buildings for this region. Filter from the snapshot so the
    # selection never depends on what a previous iteration left in `where`.
    where["Boundary gdf"] = original_boundaries[original_boundaries[where["Region col"]] == adm_region]
    where["Boundary"] = where["Boundary gdf"].geometry.iloc[0]
    buildings_fc, boundary = explore.get_buildings(where, min_building_size, where["Code"])

    # Some regions have no large buildings, so continue w/o them
    if buildings_fc.first().getInfo() is None:
      print(f" -- No large buildings in {adm_region}")
      continue

    # Merge buildings into "clusters", find largest building per cluster
    merged, largest = explore.merge_and_make_box(where, buildings_fc)

    # Get Sentinel data for visualization
    sentinel = explore.get_sentinel(where, boundary, sentinel_bands, sentinel_year)

    # Add output to data dict
    data_dict[adm_region] = {"Buildings": buildings_fc, "Boundary": boundary,\
                             "Merged": merged, "Largest": largest,\
                             "Sentinel": sentinel}
  finally:
    # Restore original admin regions on every exit path (normal, `continue`,
    # or an exception), so a failed run cannot leave `where` narrowed to a
    # single region for the next execution of this cell.
    where["Boundary gdf"] = original_boundaries


Processing Agrabinta (0/610)
Went from 3 buildings to 2 boxes
Processing Andir (1/610)
Went from 132 buildings to 40 boxes
Processing Anjatan (2/610)
Went from 40 buildings to 26 boxes
Processing Antapani (3/610)
Went from 45 buildings to 25 boxes
Processing Arahan (4/610)
Went from 13 buildings to 9 boxes
Processing Arcamanik (5/610)
Went from 124 buildings to 41 boxes
Processing Argapura (6/610)
Went from 51 buildings to 38 boxes
Processing Arjasari (7/610)
Went from 117 buildings to 34 boxes
Processing Arjawinangun (8/610)
Went from 91 buildings to 39 boxes
Processing Astanaanyar (9/610)
Went from 53 buildings to 23 boxes
Processing Astanajapura (10/610)
Went from 99 buildings to 40 boxes
Processing Babakan (11/610)
Went from 30 buildings to 17 boxes
Processing Babakan Ciparay (12/610)
Went from 406 buildings to 36 boxes
Processing Babakan Madang (13/610)
Went from 385 buildings to 125 boxes
Processing Babakancikao (14/610)
Went from 139 buildings to 43 boxes
Processing Babelan (15/

In [None]:
# Draw the processed regions on an interactive map over the whole area.
# This "works" in the sense that it doesn't crash, but it's very slow.

# Register the tile URL under the name "HYBRID" before requesting that basemap
os.environ["HYBRID"] = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'

Map = geemap.Map()
Map.add_basemap("HYBRID")

# Visualization parameters for the Sentinel imagery
sentinel_viz = dict(min=0.0, max=3000, bands=['B4', 'B3', 'B2'])

# Outline-only styles (fully transparent fill), one colour per layer type
boundary_viz = dict(color='red', width=2, fillColor='00000000')
buildings_viz = dict(color='yellow', width=2, fillColor='00000000')
merged_viz = dict(color='cyan', width=2, fillColor='00000000')
largest_viz = dict(color='blue', width=2, fillColor='00000000')

for n, adm_region in enumerate(data_dict.keys()):

  buildings_fc = data_dict[adm_region]["Buildings"]
  #largest_fc = geemap.geopandas_to_ee(data_dict[adm_region]["Largest"].to_crs("EPSG:4326"))
  #merged_fc = geemap.geopandas_to_ee(data_dict[adm_region]["Merged"].to_crs("EPSG:4326"))

  # Centre the view once, on the first building of the first region
  if n == 0:
    Map.centerObject(buildings_fc.first().geometry(), 8)

  # Only the region outline is drawn; the other layers are left disabled
  #Map.addLayer(data_dict[adm_region]["Sentinel"], sentinel_viz, "Sentinel")
  styled_boundary = data_dict[adm_region]["Boundary"].style(**boundary_viz)
  Map.addLayer(styled_boundary, {}, "Boundary")
  #Map.addLayer(buildings_fc.style(**buildings_viz), {}, "Buildings")
  #Map.addLayer(merged_fc.style(**merged_viz), {}, "Merged")
  #Map.addLayer(largest_fc.style(**largest_viz), {}, "Largest")

Map

In [51]:
# Build and save the table of candidate buildings consumed by getSentinel

# Stack the per-region "Largest" tables into a single frame
candidates = pd.concat([region_data["Largest"] for region_data in data_dict.values()])
candidates.loc[:, 'Farm type'] = "Unlabeled"
candidates = candidates.rename(columns={"area_in_meters": "Area (sq m)"})

# Placeholder columns filled with one constant value per column
coldict = {
    "Dataset name": where["Name"],
    "Parent coords": None,
    "Number of animals": np.nan,
    "Length (m)": np.nan,
    "Aspect ratio": np.nan,
}
for col, val in coldict.items():
  candidates.loc[:, col] = val

candidates = explore.re_order(candidates)

# Write the result as a pickle named after the dataset
path = "/content/drive/MyDrive/CAFO_data/forTraining/interim_files/"
fname = path + where["Name"] + "_bldgs.pkl"
print(f'Saving {len(candidates)} large building images to {fname}')
candidates.to_pickle(fname)

Saving 22271 large building images to /content/drive/MyDrive/CAFO_data/forTraining/interim_files/West_Java_bldgs.pkl


  candidates = pd.concat([data_dict[adm_region]["Largest"] for adm_region in data_dict.keys()])
