In [1]:
!pip install pyogrio==0.7.2 geopandas==0.14.3 unidecode -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.0/22.0 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
%load_ext autoreload
%autoreload 2

import sys, os
import math
from IPython.core.magic import register_cell_magic
import glob
import pickle
from unidecode import unidecode
import numpy as np
from google.colab import drive
import ee
import geemap
import scipy
import pandas as pd
import geopandas as gpd
import pyproj
import pyarrow

In [3]:
ee.Authenticate()
ee.Initialize(project="215656163750")
drive.mount('/content/drive')

gpd.options.io_engine = "pyogrio"
os.environ["PYOGRIO_USE_ARROW"] = "1"

sys.path.append('/content/drive/MyDrive/Colab Notebooks/')
import utils

@register_cell_magic
def skip(line, cell):
    return

interim_path = "/content/drive/MyDrive/CAFO_data/forTraining/interim_files/"
shp_path = "/content/drive/MyDrive/CAFO_data/"
out_path = "/content/drive/MyDrive/CAFO_data/forTraining/"

Mounted at /content/drive


In [4]:
# Model training regions

"""
prefix = 'iowa'
bldgs_file = f"{interim_path}Iowa_bldgs_filtered.pkl"
boundary_shapefile = 'USA/cb_2021_us_county_5m.shp'
adm_no = '2'
"""

"""
prefix = 'chl'
bldgs_file = f"{interim_path}Chile_bldgs_filtered.pkl"
boundary_shapefile = 'Chile/shapefiles/chl_admbnda_adm3_bcn_20211008.shp'
adm_no = '3'
"""

"""
prefix = 'mex'
bldgs_file = f"{interim_path}Mexico_bldgs_filtered.pkl"
boundary_shapefile = 'Mexico/shapefiles/mex_admbnda_adm2_govmex_20210618.shp'
adm_no = '2'
"""

"""
prefix = 'rou'
bldgs_file = f"{interim_path}Romania_bldgs_filtered.pkl"
boundary_shapefile = "Europe/shapefiles/geoBoundaries-ROU-ADM2_simplified.geojson"
adm_no = '2'
"""

# Held-out test regions

info_dict = {'col': 'Colombia', 'ind': 'India', 'jor': 'Jordan', 'kaz':\
             'Kazakhstan', 'per': 'Peru', 'per2': "Peru2", 'mys': 'Malaysia',\
             'sau': "Saudi", 'tha': 'Thailand', 'tur': 'Turkey',\
             'tur2': 'Turkey2', 'ven': 'Venezuela'}

"""
prefix = 'per2'
bldgs_file = f"{interim_path}{info_dict[prefix]}_bldgs.pkl"
boundary_shapefile = f"Misc_global/shapefiles/{info_dict[prefix]}.shp"
adm_no = '-1'
"""

# Model application regions
# Sudan, Gujarat (in 2 parts), Western Cape

"""
prefix = 'sdn'
bldgs_file = f"{interim_path}Sudan_bldgs.pkl"
boundary_shapefile = "Misc_global/shapefiles/sdn_admbnda_adm2_cbs_nic_ssa_20200831.shp"
adm_no = '2'
"""

"""
prefix = 'guj1'
bldgs_file = f"{interim_path}Gujarat1_bldgs.pkl"
boundary_shapefile = "Misc_global/shapefiles/Gujarat1_ADM3.geojson"
adm_no = '3'
"""

#"""
prefix = 'guj2'
bldgs_file = f"{interim_path}Gujarat2_bldgs.pkl"
boundary_shapefile = "Misc_global/shapefiles/Gujarat2_ADM3.geojson"
adm_no = '3'
#"""

"""
prefix = 'wcape'
bldgs_file = f"{interim_path}Western Cape_bldgs.pkl"
boundary_shapefile = "Misc_global/shapefiles/WesternCape_ADM3.geojson"
adm_no = '3'
"""

print("")




In [5]:
sentinel_bands = ['B4', 'B3', 'B2'] # mainly to reduce data volume/get max resolution
                                    # keep bands in this order

training_image_radius = 240 # m, to obtain (approx) 48 x ?? pixel images

In [6]:
# Get the df containing the farm and not-farm buildings/locations selected for
# training.

bldgs = pd.read_pickle(bldgs_file)

In [7]:
# Get the ADMx-level boundaries for each country. This is just to break things
# up into chunks that don't make Earth Engine barf. Mexico has to be done at
# the ADM2 level, Iowa at ADM2, and Chile at ADM3. This creates a ridiculous
# number of files and should be recoded to use a grid or something instead.

def modify_shapefile(shpfile):
  shp = gpd.read_file(f"{shp_path}{shpfile}")

  # Get consistent CRSs
  try:
    shp = shp.set_crs("EPSG:4326")
  except ValueError:
    shp = shp.to_crs("EPSG:4326")

  # Get consistent column names.

  # Iowa, assuming COUNTY-level
  try:
    shp.loc[:, "ADM2"] = shp["NAME"]
  except KeyError:
    pass

  # Romania
  try:
    shp.loc[:, "ADM2"] = shp["shapeName"]
  except KeyError:
    pass

  # Mexico, Chile
  shp.rename(columns={"ADM0_ES": "ADM0", "ADM1_ES": "ADM1",\
                      "ADM2_ES": "ADM2", "ADM3_ES": "ADM3"}, inplace=True)

  # Held-out regions
  if any(place in shpfile for place in info_dict.values()):
    shp.loc[:, "ADM-1"] = ""

  # Nothing to do for application regions other than Sudan
  try:
    shp.loc[:, "ADM2"] = shp["ADM2_EN"]
  except KeyError:
    pass

  return shp

areas = modify_shapefile(boundary_shapefile)

In [8]:
# Identify the administrative regions that contain farm/not-farm locations,
# because we don't want to bother iterating over ones with no farms.  This step
# is only necessary for the larger training datasets, not for the held-out
# regions

def narrow_down(bldgs_df, boundaries_df, adm_no):

  # preserve the building geometry, we will need this later
  bldgs_df.loc[:, 'Building geom'] = bldgs_df.loc[:, 'geometry']
  joined = boundaries_df.sjoin(bldgs_df, how='inner', predicate='intersects')

  # create a df containing only boundaries containing buildings
  bounds_w_data = joined.drop_duplicates(subset=[f'ADM{adm_no}'])

  # create a version of the buildings df in which ADMx is identified
  bldgs_df_2 = joined.drop(columns=['geometry'])
  bldgs_df_2.loc[:, 'geometry'] = bldgs_df_2.loc[:, 'Building geom']
  bldgs_df_2 = bldgs_df_2.set_geometry('geometry')
  bldgs_df_2 = bldgs_df_2.astype({'Building geom': object})

  # drop extraneous columns, but keep admin areas as they will be useful for
  # defining held-out regions later on
  columns_to_keep = bldgs_df.columns.to_list()
  additional = list(set([f'ADM{adm_no}', 'ADM2', 'ADM1', 'ADM0']))
  for column in additional:
    if column not in bldgs_df_2.columns:
      bldgs_df_2.loc[:, column] = np.nan
  columns_to_keep = columns_to_keep + additional
  bldgs_df_2 = bldgs_df_2[columns_to_keep]

  return bounds_w_data, bldgs_df_2

if prefix not in info_dict.keys():
  areas_2, bldgs_2 = narrow_down(bldgs, areas, adm_no=adm_no)
else:
  areas_2, bldgs_2 = areas, bldgs
  bldgs_2.loc[:, "ADM-1"] = ""

In [9]:
# Function to obtain Sentinel data for a specified region

# Data for 2023. Mexico farm data are from 2021, so this increases the risk of
# unregistered CAFOs appearing in the dataset. However, farms in Peru may have
# been constructed more recently than that, and they need to be in the images.
#   - May be better to restrict to a certain season
#   - May need to use better cloud masking:
# https://developers.google.com/earth-engine/tutorials/community/sentinel-2-s2cloudless

def get_sentinel(boundary, sentinel_bands):

  sentinel = (
      ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
      .filterDate('2023-01-01', '2023-12-31')
      .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 10))
      .select(sentinel_bands)
      .median() #crude cloud filter
      .clip(boundary)
  )

  return sentinel

In [10]:
# Function to clip Sentinel data to an area around each farm and not-farm
# building

def extract_snippets(buildings_fc, sentinel_data, filename=None, folder=None):

  # For each polygon, define a square region around its centroid
  def buffer_and_bound(feature, buffer_radius=training_image_radius):
    return feature.centroid().buffer(buffer_radius, 2).bounds()

  areas = buildings_fc.map(buffer_and_bound)

  # Obtain Sentinel data for the polygon
  pix = sentinel_data.sampleRegions(
      collection=areas,
      scale=10,
      geometries=True)

  return areas, pix

In [None]:
#%%skip
# Get the Sentinel snippets for each location in each AMDx-level area, and write
# to file. This is where we create the ridiculous number of files.

def create_interim_files(bldgs, bounds, prefix, adm_no, verbose=False):

  count = 0

  for place in bounds[f'ADM{adm_no}']:

    chunk_size = 500

    df = bldgs[bldgs[f'ADM{adm_no}']==place]
    df = df.reset_index(drop=True)
    num = math.ceil(len(df)/chunk_size)

    # File saving fails if we have too many buildings per region, so process in
    # chunks
    for i in range(num):
      df1 = df[i*chunk_size:i*chunk_size+chunk_size]

      # Convert building info and boundary to ee featureCollections
      geometry = geemap.gdf_to_ee(bounds[bounds[f'ADM{adm_no}']==place][['geometry']])
      bldgs_fc = geemap.gdf_to_ee(df1)

      # Get the Sentinel data for (this part of) this AMDx-level area
      sentinel = get_sentinel(geometry, sentinel_bands)

      # Extract Sentinel data around each building location
      _, pix = extract_snippets(bldgs_fc, sentinel)

      # Save a geojson file containing all the snippets for this region
      fname = ''.join([i for i in place if i.isalpha()])
      fname = unidecode(fname).title().replace(" ", "").strip()
      fname = f"{fname}_{i}"
      try:
        utils.write_to_file(pix, f"{prefix}_{fname}", 'interim_files')
        if verbose:
          print(f"Saving {prefix}: {place} (-->{fname}); {len(df1)} buildings")
      except:
        print(f" -- {place} failed")

      count += 1

  print(f"Created a total of {count} files")

  return count

n_files = create_interim_files(bldgs_2, areas_2, prefix, adm_no, verbose=True)

Saving guj2: Valod (-->Valod_0); 50 buildings
Saving guj2: Junagadh (-->Junagadh_0); 143 buildings
Saving guj2: Sojitra (-->Sojitra_0); 32 buildings
Saving guj2: Olpad (-->Olpad_0); 349 buildings
Saving guj2: Anklav (-->Anklav_0); 51 buildings
Saving guj2: Jhalod (-->Jhalod_0); 34 buildings
Saving guj2: Visavadar (-->Visavadar_0); 59 buildings
Saving guj2: Sagbara (-->Sagbara_0); 19 buildings
Saving guj2: Malia Hatina (-->Maliahatina_0); 46 buildings
Saving guj2: Talala (-->Talala_0); 54 buildings
Saving guj2: Kunkavav Vadia (-->Kunkavavvadia_0); 49 buildings
Saving guj2: Babra (-->Babra_0); 81 buildings
Saving guj2: Amreli (-->Amreli_0); 168 buildings
Saving guj2: Lilia (-->Lilia_0); 14 buildings
Saving guj2: Gariadhar (-->Gariadhar_0); 40 buildings
Saving guj2: Bagasara (-->Bagasara_0); 41 buildings
Saving guj2: Dhari (-->Dhari_0); 48 buildings
Saving guj2: Savar Kundla (-->Savarkundla_0); 83 buildings
Saving guj2: Khambha (-->Khambha_0); 14 buildings
Saving guj2: Botad (-->Botad_0);

In [12]:
utils.ee_task_status(n_tasks=n_files)

Task NVGJCORIT44GFX3RPYRUAETI started at 2024-12-14 01:13:24.700000
Current status: COMPLETED
Task ZCU427BGLPSJG52JF45MZ4CR started at 2024-12-14 01:13:07.346000
Current status: COMPLETED
Task 4MC7WCMEK7T3LWVCEIUQJVLC started at 2024-12-14 01:13:07.349000
Current status: COMPLETED
Task KELIQUY3WJUZZ4JYAXPDMOSO started at 2024-12-14 01:12:46.039000
Current status: COMPLETED
Task REIC7DC6ITOCQNEW7HJW2KAB started at 2024-12-14 01:12:46.040000
Current status: COMPLETED
Task IHF4XV42FQJH572VNV23ZA76 started at 2024-12-14 01:12:38.017000
Current status: COMPLETED
Task D3CNPQ64HCMH3JTRKQLOLLB6 started at 2024-12-14 01:12:21.875000
Current status: COMPLETED
Task 6GHKWCQ3ZYAFRJ54YJIJ6A4K started at 2024-12-14 01:12:12.737000
Current status: COMPLETED
Task 2XP2YDLBMNJUBZCXG3PJNVGL started at 2024-12-14 01:12:12.774000
Current status: COMPLETED
Task O6DE2OY627QHTLQKZOW73O2N started at 2024-12-14 01:11:47.346000
Current status: COMPLETED
Task IRWSPH23GHKZV426QY4IETP6 started at 2024-12-14 01:11:03

In [14]:
print(f"Created a total of {n_files} files for {prefix}")
files = glob.glob(f'{interim_path}{prefix}*.geojson')
print(f"Have saved the Sentinel snippets for {len(files)} of them")

Have saved the Sentinel snippets for 151 of them


In [15]:
# Convert pixel coords and values into a dictionary of 3D numpy array (height,
# width, channels). Band order is RGB, images are 64x64 pix, scaled to 0-255.
# This format should be suitable for keras preprocess_input functions, e.g.
# www.tensorflow.org/api_docs/python/tf/keras/applications/vgg16/preprocess_input

def create_images(gdf, index_start, verbose=False):

    # Remove suffixes from pixel ID numbers so we can group all pixels for a
    # given farm or not-farm
    # E.g, 1_1, 1_2, 1_3, 1_4 --> 1, 1, 1, 1

    gdf['id'] = gdf['id'].str.split('_').str[0]

    # Convert to int - this is very important, without it the gdf and image dict
    # end up out of sync!
    gdf['id'] = gdf['id'].astype(int)

    rejected = []
    arr_list = []

    # Here, a group will be an individual farm or not-farm
    groups = gdf.groupby(by='id')
    for n, group in groups:
      data = {'B4': [], 'B3': [], 'B2': []}
      # Identify image rows for each farm/not-farm
      _ = group.groupby(by=group.geometry.y)
      # Gather the group's pixels into a 3D array
      for coord, vals in _:
        for band in ['B4', 'B3', 'B2']:
          data[band].append([b for b in vals[band]])
      try:
        arr = np.stack([np.array(data['B4']), np.array(data['B3']), np.array(data['B2'])])
      except ValueError as e:
        # These are presumably groups that intersect with the boundary of the region
        # They cause problems because they aren't rectangular; might able to pad but
        # that seems like more trouble than it's worth
        rejected.append(group['id'].unique()[0])
        continue

      # Move the channels axis to the end
      arr = np.moveaxis(arr, [0], [2])

      # Resize the images to 64 x 64 pix
      arr = scipy.ndimage.zoom(
          arr,
          (64/arr.shape[0], 64/arr.shape[1], 1),
          mode='reflect'
          )

      # Rescale to 0-255 (using max over all bands)
      arr = (arr / np.max(arr)) * 255

      # The image is ready now, so add it to the dict
      arr_list.append(arr)

    # Create a version of the input array that has just one row per farm/not-farm
    # and doesn't include the now-redundant band info
    gdf = gdf[~gdf['id'].isin(rejected)]
    gdf = gdf.drop(columns=['B2', 'B3', 'B4']).drop_duplicates(subset=['id'])\
             .drop(columns=['id'])
    new_index = [i for i in range(index_start, index_start+len(gdf))]
    gdf.index = new_index

    # Make a dict of arrays in which keys are guaranteed to match gdf
    arr_dict = {key: value for key, value in zip(new_index, arr_list)}

    if verbose:
      print(f"Started with {len(groups)} farms/not-farms")
      print(f"Rejected {len(rejected)} images with irregular shapes")
      print(f'Retained {len(new_index)} images')

    return arr_dict, gdf

In [17]:
%%time

def images_and_metadata(prefix, verbose=False):
  """
  Save a geodataframe containing both metadata and Sentinel image for the
  specified region <prefix>
  """

  files = glob.glob(f'{interim_path}{prefix}*.geojson')

  gdf_list = []
  index_start = 0
  image_dict_list = []

  for n, f in enumerate(files):

    if verbose:
      print(f"Working on #{n}: {f}")
    gdf = gpd.read_file(f)
    image_dict, gdf = create_images(gdf, index_start)
    if len(gdf) == 0:
      # No images remain after rejecting irregular ones
      continue
    gdf_list.append(gdf)
    image_dict_list.append(image_dict)
    index_start += len(gdf)

  main_gdf = pd.concat(gdf_list)
  image_dict = image_dict_list[0]
  for d in image_dict_list[1:]:
    image_dict.update(d)

  try:
    main_gdf = main_gdf.drop(columns=["ADM-1"])
  except KeyError:
    pass
  main_gdf["Sentinel"] = image_dict.values()

  if "Building geom" in main_gdf.columns:
    main_gdf = main_gdf.drop(columns=['geometry']).\
                        rename(columns={"Building geom": "geometry"})
    main_gdf['geometry'] = gpd.GeoSeries.from_wkt(main_gdf['geometry'])
    main_gdf = main_gdf.set_geometry("geometry")
    main_gdf.set_crs("EPSG:4326", inplace=True)

  main_gdf.to_pickle(f"{out_path}{prefix}_iter0.pkl")

images_and_metadata(prefix, verbose=True)


Working on #0: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Valod_0.geojson
Working on #1: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Sojitra_0.geojson
Working on #2: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Anklav_0.geojson
Working on #3: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Jhalod_0.geojson
Working on #4: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Junagadh_0.geojson
Working on #5: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Sagbara_0.geojson
Working on #6: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Visavadar_0.geojson
Working on #7: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Maliahatina_0.geojson
Working on #8: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Olpad_0.geojson
Working on #9: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/guj2_Talala_0.geojson
Working on #10: /content/drive

In [18]:
# Tidy up - run this cell and then empty trash in drive. While it takes a long
# time to create the geojson files of Sentinel snippets, they do take up a lot
# of space. And if I need to rerun this notebook, it's probably because I've
# changed the farm and not-farm locations anyway.

!rm /content/drive/MyDrive/CAFO_data/forTraining/interim_files/{prefix}_*.geojson