In [1]:
!pip install pyogrio==0.7.2 geopandas==0.14.3 unidecode -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.0/22.0 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
%load_ext autoreload
%autoreload 2

import sys, os
from IPython.core.magic import register_cell_magic
import glob
import pickle
from unidecode import unidecode
import numpy as np
from google.colab import drive
import ee
import geemap
import scipy
import pandas as pd
import geopandas as gpd
import pyproj
import pyarrow

In [37]:
# Authenticate with Earth Engine, initialise the client against the given
# project, and mount Google Drive inside the Colab VM.
ee.Authenticate()
ee.Initialize(project="215656163750")
drive.mount('/content/drive')

# Have geopandas read/write through pyogrio, and ask pyogrio to use Arrow.
gpd.options.io_engine = "pyogrio"
os.environ["PYOGRIO_USE_ARROW"] = "1"

# `utils` is imported here rather than in the imports cell because it only
# becomes importable after Drive is mounted and this folder is on sys.path.
# NOTE(review): presumably utils.py lives in this Drive folder - confirm.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/')
import utils

@register_cell_magic
def skip(line, cell):
    """Cell magic `%%skip`: do nothing with the cell body.

    Put `%%skip` on the first line of a cell to keep its code in the notebook
    without running it. Both arguments are ignored.
    """
    return None

# Google Drive locations, all derived from the same data root:
#   shp_path     - root folder; boundary files are read relative to it
#   out_path     - where the final metadata/image pickles are written
#   interim_path - per-region GeoJSON files and the input building pickles
shp_path = "/content/drive/MyDrive/CAFO_data/"
out_path = shp_path + "forTraining/"
interim_path = out_path + "interim_files/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
# Region to process in this run. Change `prefix` to any key of REGION_CONFIGS;
# nothing else in this cell needs editing.
prefix = 'tha'

# prefix -> (buildings pickle in interim_path,
#            boundary file relative to shp_path,
#            ADM level used to split the region into chunks)
# An ADM level of '-1' is used for the held-out regions (Turkey, India,
# Thailand), which are processed as a single area.
REGION_CONFIGS = {
    'iowa': ("Iowa_bldgs_filtered.pkl",
             'USA/cb_2021_us_county_5m.shp',
             '2'),
    'chl': ("Chile_bldgs_filtered.pkl",
            'Chile/shapefiles/chl_admbnda_adm3_bcn_20211008.shp',
            '3'),
    'mex': ("Mexico_bldgs_filtered.pkl",
            'Mexico/shapefiles/mex_admbnda_adm2_govmex_20210618.shp',
            '2'),
    'rou': ("Romania_bldgs_filtered.pkl",
            "Europe/shapefiles/geoBoundaries-ROU-ADM2_simplified.geojson",
            '2'),
    'tur': ("Turkey_bldgs.pkl", "/Misc_global/shapefiles/Turkey.shp", '-1'),
    'ind': ("India_bldgs.pkl", "/Misc_global/shapefiles/India.shp", '-1'),
    'tha': ("Thailand_bldgs.pkl", "/Misc_global/shapefiles/Thailand.shp", '-1'),
}

if prefix not in REGION_CONFIGS:
  raise ValueError(
      f"Unknown prefix '{prefix}'; expected one of {sorted(REGION_CONFIGS)}")

_bldgs_name, boundary_shapefile, adm_no = REGION_CONFIGS[prefix]
bldgs_file = f"{interim_path}{_bldgs_name}"




In [39]:
# Sentinel-2 bands to download. Only these are kept, mainly to reduce data
# volume/get max resolution. Keep the bands in this order.
sentinel_bands = ['B4', 'B3', 'B2']

# Buffer radius (m) around each building centroid. Sampled at a 10 m scale this
# gives (approx) 48 x 48 pixel images (2 * 240 m / 10 m = 48).
training_image_radius = 240

In [40]:
# Get the df containing the farm and not-farm buildings/locations selected for
# training.
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on files produced by our own pipeline.

bldgs = pd.read_pickle(bldgs_file)

In [41]:
# Get the ADMx-level boundaries for each country. This is just to break things
# up into chunks that don't make Earth Engine barf. Mexico has to be done at
# the ADM2 level, Iowa at ADM2, and Chile at ADM3. This creates a ridiculous
# number of files and should be recoded to use a grid or something instead.

def modify_shapefile(shpfile):
  """Read a boundary file and give it a consistent CRS and column names.

  Args:
    shpfile: path of the boundary file, relative to `shp_path`.

  Returns:
    The boundaries in EPSG:4326, with admin-name columns renamed/copied to
    ADM0..ADM3 where the source columns exist, plus an empty "ADM-1" column
    for the held-out regions (Turkey, India, Thailand).
  """
  shp = gpd.read_file(f"{shp_path}{shpfile}")

  # Get consistent CRSs: label the data as EPSG:4326, or reproject if set_crs
  # refuses (presumably because a different CRS is already set)
  try:
    shp = shp.set_crs("EPSG:4326")
  except ValueError:
    shp = shp.to_crs("EPSG:4326")

  # Get consistent column names. "NAME" is the Iowa (county-level) column and
  # "shapeName" the Romanian one; copy whichever is present into ADM2.
  for source_column in ("NAME", "shapeName"):
    if source_column in shp.columns:
      shp.loc[:, "ADM2"] = shp[source_column]

  # Mexico, Chile
  spanish_to_generic = {f"ADM{level}_ES": f"ADM{level}" for level in range(4)}
  shp = shp.rename(columns=spanish_to_generic)

  # Held-out regions are treated as one area with an empty name
  held_out_names = ("Turkey", "India", "Thailand")
  if any(name in shpfile for name in held_out_names):
    shp.loc[:, "ADM-1"] = ""

  return shp
# Boundaries for the selected region, with normalised CRS and column names
areas = modify_shapefile(boundary_shapefile)

In [42]:
# Identify the administrative regions that contain farm/not-farm locations,
# because we don't want to bother iterating over ones with no farms.  This step
# is only necessary for the larger training datasets, not for the held-out
# regions

def narrow_down(bldgs_df, boundaries_df, adm_no):
  """Keep only the admin areas that contain buildings, and tag each building
  with the admin area(s) it falls in.

  Args:
    bldgs_df: farm/not-farm locations with a 'geometry' column. Not modified.
    boundaries_df: admin boundaries with an f'ADM{adm_no}' column.
    adm_no: admin level (as a string) used to identify areas.

  Returns:
    (bounds_w_data, bldgs_df_2):
      bounds_w_data - one joined row per distinct f'ADM{adm_no}' value that
        intersects at least one building.
      bldgs_df_2 - the original building columns plus f'ADM{adm_no}', 'ADM2',
        'ADM1' and 'ADM0' (NaN where the boundary file lacks them), with the
        building geometry as the active geometry.
  """
  adm_col = f'ADM{adm_no}'

  # Work on a copy so the caller's frame does not silently gain a
  # 'location_geom' column. The extra column carries the building geometry
  # through the join, whose result keeps the boundary geometry.
  bldgs_w_geom = bldgs_df.copy()
  bldgs_w_geom['location_geom'] = bldgs_w_geom['geometry']
  joined = boundaries_df.sjoin(bldgs_w_geom, how='inner',
                               predicate='intersects')

  # create a df containing only boundaries containing buildings
  bounds_w_data = joined.drop_duplicates(subset=[adm_col])

  # create a version of the buildings df in which ADMx is identified
  bldgs_df_2 = (
      joined.drop(columns=['geometry'])
      .rename(columns={'location_geom': 'geometry'})
      .set_geometry('geometry')
  )

  # drop extraneous columns, but keep admin areas as they will be useful for
  # defining held-out regions later on. dict.fromkeys de-duplicates while
  # preserving order, so the output column order is the same on every run.
  additional = list(dict.fromkeys([adm_col, 'ADM2', 'ADM1', 'ADM0']))
  for column in additional:
    if column not in bldgs_df_2.columns:
      bldgs_df_2.loc[:, column] = np.nan

  original_columns = [c for c in bldgs_df.columns if c != 'location_geom']
  bldgs_df_2 = bldgs_df_2[original_columns + additional]

  return bounds_w_data, bldgs_df_2

if prefix in ["tur", "ind", "tha"]:
  # Held-out regions are processed as a single area, so no narrowing is
  # needed; just add the empty "ADM-1" label that the boundaries also carry.
  areas_2, bldgs_2 = areas, bldgs
  bldgs_2.loc[:, "ADM-1"] = ""
else:
  areas_2, bldgs_2 = narrow_down(bldgs, areas, adm_no=adm_no)


In [43]:
# Function to obtain Sentinel data for a specified region

# Data for 2020, because the infrastructure dataset (next cell) appears to be from 2021
#   - May be better to use an earlier year in the hope of excluding unregistered CAFOs,
#     which might be post-2021?
#   - May be better to restrict to a certain season
#   - May need to use better cloud masking:
# https://developers.google.com/earth-engine/tutorials/community/sentinel-2-s2cloudless

def get_sentinel(boundary, sentinel_bands, start_date='2020-01-01',
                 end_date='2021-01-01', max_cloud_pct=10):
  """Build a median Sentinel-2 composite clipped to a region.

  Args:
    boundary: Earth Engine geometry/feature(s) to clip the composite to.
    sentinel_bands: list of band names to keep.
    start_date: first date to include (inclusive).
    end_date: end of the date range (EXCLUSIVE, as filterDate defines it). The
      default covers all of 2020; the previous hard-coded '2020-12-31'
      dropped the last day of the year.
    max_cloud_pct: keep only scenes whose CLOUDY_PIXEL_PERCENTAGE is below
      this value.

  Returns:
    An ee.Image holding the per-pixel median of the selected scenes.
  """
  sentinel = (
      ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
      .filterDate(start_date, end_date)
      .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', max_cloud_pct))
      .select(sentinel_bands)
      .median() #crude cloud filter
      .clip(boundary)
  )

  return sentinel

In [44]:
# Function to clip Sentinel data to an area around each farm and not-farm
# building

def extract_snippets(buildings_fc, sentinel_data, filename=None, folder=None):
  """Sample Sentinel pixels in a box around each building.

  Args:
    buildings_fc: ee.FeatureCollection of farm/not-farm buildings.
    sentinel_data: ee.Image to sample from.
    filename: unused; kept for interface compatibility.
    folder: unused; kept for interface compatibility.

  Returns:
    (areas, pix): the per-building bounding boxes, and the collection of
    pixels sampled inside them at a 10 m scale (with point geometries).
  """
  radius = training_image_radius

  # For each polygon, take the bounding box of a buffer around its centroid.
  # The 2 is the maximum error passed to buffer().
  snippet_boxes = buildings_fc.map(
      lambda feature: feature.centroid().buffer(radius, 2).bounds())

  # Obtain Sentinel data for every box
  sampled_pixels = sentinel_data.sampleRegions(
      collection=snippet_boxes, scale=10, geometries=True)

  return snippet_boxes, sampled_pixels

In [45]:
#%%skip
# Get the Sentinel snippets for each location in each ADMx-level area, and write
# to file. This is where we create the ridiculous number of files.

# O'Brien county in Iowa fails, presumably b/c of data volume. This is tolerable
# because Iowa already has many more farms than the other datasets, but it
# underscores the need for a better way of doing this.

def create_interim_files(bldgs, bounds, prefix, adm_no, verbose=False):
  """Write one file of Sentinel snippets per admin area.

  For every value in bounds[f'ADM{adm_no}'], builds the Sentinel composite for
  that area, samples it around each building tagged with the same value, and
  hands the result to utils.write_to_file under the name
  f"{prefix}_{<ASCII, title-cased, space-free area name>}".

  A failure while writing one area is reported and the loop moves on, so a
  single bad area does not stop the whole run.

  Args:
    bldgs: buildings with an f'ADM{adm_no}' column.
    bounds: admin boundaries with an f'ADM{adm_no}' column.
    prefix: region prefix used in the output file names.
    adm_no: admin level (as a string).
    verbose: if True, print a line for every area written.

  NOTE(review): areas are matched by name only, so this assumes names in the
  ADM column are unique within `bounds` - confirm for each region.
  """
  adm_col = f'ADM{adm_no}'

  for place in bounds[adm_col]:

    # Convert building info and boundary to ee featureCollections
    geometry = geemap.gdf_to_ee(bounds[bounds[adm_col]==place][['geometry']])
    bldgs_fc = geemap.gdf_to_ee(bldgs[bldgs[adm_col]==place])

    # Get the Sentinel data for this area
    sentinel = get_sentinel(geometry, sentinel_bands)

    # Extract Sentinel data around each building location
    _, pix = extract_snippets(bldgs_fc, sentinel)

    # Save a file containing all the snippets for each region
    fname = unidecode(place).title().replace(" ", "").strip()
    try:
      utils.write_to_file(pix, f"{prefix}_{fname}", 'interim_files')
    except Exception as err:
      # Catch Exception, not everything: a bare except would also swallow
      # KeyboardInterrupt. Report the reason so failures can be diagnosed.
      print(f" -- {place} failed: {err!r}")
    else:
      if verbose:
        print(f"Saving {prefix}: {place} (-->{fname})")

# Run the export for the selected region; verbose=True prints one line per
# admin area written.
create_interim_files(bldgs_2, areas_2, prefix, adm_no, verbose=True)

Saving tha:  (-->)


In [57]:
# NOTE(review): presumably reports the status of the n_tasks most recent Earth
# Engine tasks (see utils) - re-run until all show COMPLETED before continuing.
utils.ee_task_status(n_tasks=10)

Task R3MHT4ED34RNRZSUQJFNZRB7 started at 2024-10-25 13:27:50.714000
Current status: COMPLETED
Task WKLF7JBMSSWJTJUC6ELOWRZJ started at 2024-10-25 12:32:28.296000
Current status: COMPLETED
Task HI5XBNBFAHQAP7QSAB32RNTR started at 2024-10-25 12:31:46.567000
Current status: COMPLETED
Task HKFY7JKWW3GHBY3ZOTJ6FXMR started at 2024-10-25 12:31:25.882000
Current status: COMPLETED
Task NIZ5JKHMV6WMNOE44VHY4DAY started at 2024-10-25 12:30:25.077000
Current status: COMPLETED
Task LBBNHF6FGZ5ETREIYEOURCLT started at 2024-10-25 12:29:35.846000
Current status: COMPLETED
Task 66X35EQZVOQDCD6DRQXF2O7X started at 2024-10-25 12:29:24.865000
Current status: COMPLETED
Task Y6E3OIM7WM2EAW45XILPHDCC started at 2024-10-25 12:28:36.365000
Current status: COMPLETED
Task BF5R2Y2KETZ3IL57OAQE5JXZ started at 2024-10-25 12:28:08.699000
Current status: COMPLETED
Task JKEO7EAQDMW33UMRNQU23RSP started at 2024-10-25 12:27:15.235000
Current status: COMPLETED


In [58]:
# Sanity check: compare the number of admin areas with the number of GeoJSON
# files found for this prefix in the interim folder
files = glob.glob(f'{interim_path}{prefix}*.geojson')
print(f"There are {len(areas_2)} districts in {prefix}")
print(f"Have saved the Sentinel snippets for {len(files)} of them")

There are 1 districts in tha
Have saved the Sentinel snippets for 1 of them


In [59]:
# Convert pixel coords and values into a dictionary of 3D numpy arrays (height,
# width, channels). Band order is RGB, images are 64x64 pix, scaled to 0-255.
# This format should be suitable for keras preprocess_input functions, e.g.
# www.tensorflow.org/api_docs/python/tf/keras/applications/vgg16/preprocess_input

def create_images(gdf, index_start, verbose=False, bands=('B4', 'B3', 'B2'),
                  image_size=64):
    """Turn per-pixel rows into one image array per farm/not-farm.

    Args:
      gdf: one row per sampled pixel, with a string 'id' column of the form
        '<location>_<pixel>', one column per band, and point geometries. The
        caller's frame is not modified.
      index_start: first integer key/index to assign to the retained images.
      verbose: if True, print how many locations were kept/rejected.
      bands: band columns to stack, in channel order.
      image_size: height and width (pixels) the images are resized to.

    Returns:
      (arr_dict, gdf):
        arr_dict - {index: array of shape (image_size, image_size, n_bands)},
          scaled so that each image's maximum over all bands is 255.
        gdf - one metadata row per retained location (band and 'id' columns
          removed), indexed with the same keys as arr_dict.

    Image rows are ordered by increasing y coordinate. Pixels within a row
    keep their input order.
    NOTE(review): this assumes pixels within a row arrive ordered by x -
    confirm against the export.
    """
    bands = list(bands)

    # Remove suffixes from pixel ID numbers so we can group all pixels for a
    # given farm or not-farm
    # E.g, 1_1, 1_2, 1_3, 1_4 --> 1, 1, 1, 1
    # Convert to int - this is very important: groupby sorts its keys, and the
    # metadata rows below must be sorted the same (numeric) way.
    # assign() returns a new frame, so the input is left untouched.
    gdf = gdf.assign(id=gdf['id'].str.split('_').str[0].astype(int))

    rejected = []
    arr_list = []

    # Here, a group will be an individual farm or not-farm
    groups = gdf.groupby(by='id')
    for location_id, group in groups:
        # Identify image rows for each farm/not-farm
        image_rows = [vals for _, vals in group.groupby(by=group.geometry.y)]

        # Rows of unequal length cannot form a rectangular image. These are
        # presumably groups that intersect with the boundary of the region;
        # might be able to pad, but that seems like more trouble than it's
        # worth.
        if len({len(vals) for vals in image_rows}) != 1:
            rejected.append(location_id)
            continue

        # Gather the group's pixels into a (channels, height, width) array
        arr = np.stack([
            np.array([list(vals[band]) for vals in image_rows])
            for band in bands
        ])

        # Move the channels axis to the end
        arr = np.moveaxis(arr, 0, 2)

        # Resize the images to image_size x image_size pix
        arr = scipy.ndimage.zoom(
            arr,
            (image_size/arr.shape[0], image_size/arr.shape[1], 1),
            mode='reflect'
            )

        # Rescale to 0-255 (using max over all bands). An all-zero snippet has
        # no usable maximum; keep it as zeros instead of dividing by zero.
        peak = np.max(arr)
        if peak > 0:
            arr = (arr / peak) * 255
        else:
            arr = np.zeros(arr.shape, dtype=float)

        arr_list.append(arr)

    # Create a version of the input that has just one row per farm/not-farm
    # and doesn't include the now-redundant band info. The stable sort by id
    # puts the rows in the same order as the groupby loop above, so row i
    # always describes image i even if the file is not ordered by id.
    gdf = gdf[~gdf['id'].isin(rejected)]
    gdf = (
        gdf.drop(columns=['B2', 'B3', 'B4'] if bands == ['B4', 'B3', 'B2']
                 else bands)
        .sort_values('id', kind='stable')
        .drop_duplicates(subset=['id'])
        .drop(columns=['id'])
    )
    assert len(gdf) == len(arr_list), "metadata and images out of sync"

    new_index = list(range(index_start, index_start + len(gdf)))
    gdf.index = new_index

    # Make a dict of arrays in which keys are guaranteed to match gdf
    arr_dict = dict(zip(new_index, arr_list))

    if verbose:
        print(f"Started with {len(groups)} farms/not-farms")
        print(f"Rejected {len(rejected)} images with irregular shapes")
        print(f'Retained {len(new_index)} images')

    return arr_dict, gdf

In [60]:
%%time

def images_and_metadata(prefix, verbose=False):
  """Combine every interim GeoJSON file for a region into images + metadata.

  Reads each f'{interim_path}{prefix}*.geojson' file, converts it with
  create_images, and writes two pickles to out_path:
    f'{prefix}_metadata_gdf.pkl'    - one metadata row per image
    f'{prefix}_sentinel_images.pkl' - {index: image array}

  Args:
    prefix: region prefix of the interim files.
    verbose: if True, print each file as it is processed.

  Returns:
    (main_gdf, image_dict), whose index/keys match.

  Raises:
    ValueError: if no file yields any image.
  """
  # glob returns files in arbitrary order; sort so that the index assigned to
  # each image is the same on every run.
  files = sorted(glob.glob(f'{interim_path}{prefix}*.geojson'))

  gdf_list = []
  image_dict = {}
  index_start = 0

  for n, path in enumerate(files):
    if verbose:
      print(f"Working on #{n}: {path}")
    file_images, gdf = create_images(gpd.read_file(path), index_start)
    if len(gdf) == 0:
      # No images remain after rejecting irregular ones
      continue
    gdf_list.append(gdf)
    image_dict.update(file_images)
    index_start += len(gdf)

  if not gdf_list:
    raise ValueError(
        f"No usable Sentinel snippets found for prefix '{prefix}' in "
        f"{interim_path}")

  main_gdf = pd.concat(gdf_list)

  main_gdf.to_pickle(f"{out_path}{prefix}_metadata_gdf.pkl")
  with open(f"{out_path}{prefix}_sentinel_images.pkl", 'wb') as out_file:
    pickle.dump(image_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

  return main_gdf, image_dict

# Build the images and metadata for the selected region (also writes both
# pickles to out_path)
df, image_dict = images_and_metadata(prefix, verbose=True)


Working on #0: /content/drive/MyDrive/CAFO_data/forTraining/interim_files/tha_.geojson
CPU times: user 24.8 s, sys: 631 ms, total: 25.4 s
Wall time: 26.7 s


In [61]:
# For the held-out test regions only, combine the images and metadata into a
# single df. This is because, for those regions, we skip the initial model
# training, application, and data-cleaning steps used on the Mexico, Chile, etc.
# datasets. So this merging needs to be done now.

if prefix in ["tur", "ind", "tha"]:
  # errors="ignore" keeps this cell re-runnable: on a second run `df` no
  # longer has the "ADM-1" column.
  df = df.drop(columns=["ADM-1"], errors="ignore")
  # Look each image up by its index key instead of relying on the dict's
  # iteration order matching the row order.
  df["Sentinel"] = [image_dict[key] for key in df.index]
  df.to_pickle(f"{out_path}{prefix}_final.pkl")

In [62]:
# Tidy up - run this cell and then empty trash in drive. While it takes a long
# time to create the geojson files of Sentinel snippets, they do take up a lot
# of space. And if I need to rerun this notebook, it's probably because I've
# changed the farm and not-farm locations anyway.

# Delete this region's interim GeoJSON files. One glob on `interim_path`
# replaces the per-region shell commands, so the folder is defined in a single
# place. Unknown prefixes are left alone.
if prefix in ('iowa', 'mex', 'chl', 'rou', 'tur', 'ind', 'tha'):
  for stale_file in glob.glob(f"{interim_path}{prefix}_*.geojson"):
    os.remove(stale_file)
else:
  print("Don't know what to delete, not doing anything")