In [20]:
%load_ext autoreload
%autoreload 2

import sys, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from google.colab import drive
import ee
import geemap.foliumap as geemap

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Authenticate with Earth Engine and open a session against the Cloud project
ee.Authenticate()
ee.Initialize(project="215656163750")
# Mount Google Drive; every path used below lives under /content/drive
drive.mount('/content/drive')

# Make the "Colab Notebooks" Drive folder importable. The `explore` helper
# module presumably lives there, so this import has to stay after the mount.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/')
import explore

# Folder the region boundary shapefiles are written to. Keep the trailing
# slash: file names are appended to it with plain string formatting.
path = "/content/drive/MyDrive/CAFO_data/Misc_global/shapefiles/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# TURKEY - region boundary coords defined by hand using Google Maps

turkey = {
    "Name": "Turkey",
    "Code": "TUR",
    "CRS": "EPSG:5636",
    # Vertices as (lat, lon) pairs
    "Boundary latlon": [
        (38.675596, 30.745101), (38.693221, 30.607857),
        (38.788433, 30.603912), (38.810095, 30.636428),
        (38.821603, 30.691116), (38.722434, 30.699930),
    ],
}
# Polygon is built from (lon, lat) pairs, so swap each pair
turkey["Boundary lonlat"] = [(lon, lat) for lat, lon in turkey["Boundary latlon"]]
turkey["Boundary"] = Polygon(turkey["Boundary lonlat"])

# Save the region boundary for later use
gdf = gpd.GeoDataFrame(geometry=[turkey["Boundary"]], crs="EPSG:4326")
turkey["Boundary gdf"] = gdf
gdf.to_file(f"{path}Turkey.shp")

In [23]:
# INDIA (Tamil Nadu)

india = {
    "Name": "India",
    "Code": "IND",
    "CRS": "EPSG:7785",
    # Vertices as (lat, lon) pairs
    "Boundary latlon": [
        (11.264466, 78.195514), (11.257338, 78.132436),
        (11.278384, 78.094542), (11.359775, 78.122547),
        (11.351684, 78.183257), (11.282473, 78.202641),
    ],
}
# Polygon is built from (lon, lat) pairs, so swap each pair
india["Boundary lonlat"] = [(lon, lat) for lat, lon in india["Boundary latlon"]]
india["Boundary"] = Polygon(india["Boundary lonlat"])

# Save the region boundary for later use
gdf = gpd.GeoDataFrame(geometry=[india["Boundary"]], crs="EPSG:4326")
india["Boundary gdf"] = gdf
gdf.to_file(f"{path}India.shp")

In [24]:
# THAILAND

thailand = {
    "Name": "Thailand",
    "Code": "THA",
    "CRS": "EPSG:24047",
    # Vertices as (lat, lon) pairs
    "Boundary latlon": [
        (13.274320, 99.816416), (13.262629, 99.723449),
        (13.473833, 99.591679), (13.489666, 99.724536),
    ],
}
# Polygon is built from (lon, lat) pairs, so swap each pair
thailand["Boundary lonlat"] = [(lon, lat) for lat, lon in thailand["Boundary latlon"]]
thailand["Boundary"] = Polygon(thailand["Boundary lonlat"])

# Save the region boundary for later use
gdf = gpd.GeoDataFrame(geometry=[thailand["Boundary"]], crs="EPSG:4326")
thailand["Boundary gdf"] = gdf
gdf.to_file(f"{path}Thailand.shp")

In [25]:
# PERU

peru = {
    "Name": "Peru",
    "Code": "PER",
    "CRS": "EPSG:5387",  # UTM 18S
    # Vertices as (lat, lon) pairs
    "Boundary latlon": [
        (-13.437699, -76.128270), (-13.463783, -76.130057),
        (-13.453825, -75.982573), (-13.107994, -76.077131),
        (-13.221282, -76.244480), (-13.322838, -76.241936),
        (-13.399845, -76.110958),
    ],
}
# Polygon is built from (lon, lat) pairs, so swap each pair
peru["Boundary lonlat"] = [(lon, lat) for lat, lon in peru["Boundary latlon"]]
peru["Boundary"] = Polygon(peru["Boundary lonlat"])

# Save the region boundary for later use
gdf = gpd.GeoDataFrame(geometry=[peru["Boundary"]], crs="EPSG:4326")
peru["Boundary gdf"] = gdf
gdf.to_file(f"{path}Peru.shp")

In [26]:
# Run configuration for the cells below
min_building_size=800  # lower bound for the 'area_in_meters' building filter
sentinel_bands = ['B4', 'B3', 'B2']  # bands selected from the Sentinel-2 collection
sentinel_year = 2023  # calendar year of Sentinel imagery to composite

# Region dict to process; swap in turkey, india or thailand to run another area
where = peru

In [27]:
# Obtain a feature collection of buildings > min_building_size within the
# specified boundary

def get_buildings(area, min_building_size, country_code):
  """Fetch large building footprints that fall inside a region boundary.

  Args:
    area: region dict; only area["Boundary"] (a polygon in EPSG:4326) is read.
    min_building_size: footprints are kept when their 'area_in_meters'
      property is strictly greater than this value.
    country_code: suffix of the VIDA_COMBINED asset to query (e.g. "PER").

  Returns:
    (buildings_fc, geom): the filtered Earth Engine feature collection and the
    boundary converted to an Earth Engine object.
  """
  boundary_gdf = gpd.GeoDataFrame(geometry=[area["Boundary"]], crs="EPSG:4326")
  geom = geemap.geopandas_to_ee(boundary_gdf[['geometry']])

  footprints = ee.FeatureCollection(
      f"projects/sat-io/open-datasets/VIDA_COMBINED/{country_code}"
  )
  is_large = ee.Filter.gt('area_in_meters', min_building_size)
  buildings_fc = footprints.filter(is_large).filterBounds(geom)

  return buildings_fc, geom

In [28]:
# Combine closely-spaced buildings into a single polygon, and define a box
# around the centroid of each clump. The box should be roughly the same size as
# the Sentinel snippets I eventually use in model training and application;
# they don't get used directly but are useful for visualizing what we're
# working with

def merge_and_make_box(area, buildings_fc, merge_buffer=50, box_half_width=200):
  """Fuse nearby buildings into clumps and draw a square box around each one.

  Args:
    area: region dict; reads area["CRS"] (the projected CRS used for all
      distance operations) and area["Boundary gdf"] (boundary GeoDataFrame in
      EPSG:4326).
    buildings_fc: Earth Engine feature collection of building footprints.
    merge_buffer: distance each footprint is buffered by before the union, in
      the units of area["CRS"] (presumably metres). Footprints whose buffers
      overlap end up in the same clump.
    box_half_width: half the side length of the square box drawn around each
      clump centroid, in the same units.

  Returns:
    (merged, boxes): `merged` is a GeoDataFrame in area["CRS"] with one clump
    polygon per row, held in a geometry column named 0; `boxes` is a
    GeoDataFrame in EPSG:4326 with one box per clump, restricted to boxes
    lying entirely within the region boundary.
  """
  buildings = geemap.ee_to_gdf(buildings_fc)

  dissolved = buildings.to_crs(area["CRS"]).buffer(merge_buffer).union_all()
  # union_all() gives a MultiPolygon when there are several clumps, but a bare
  # Polygon (which has no .geoms) when everything fuses into a single clump.
  clumps = list(getattr(dissolved, "geoms", [dissolved]))
  merged = gpd.GeoDataFrame(clumps).set_geometry(0).set_crs(area["CRS"])

  # envelope of a buffered point is the axis-aligned square around the centroid
  boxes = gpd.GeoDataFrame(merged.centroid.buffer(box_half_width).envelope)
  boxes = boxes.rename(columns={0: "geometry"}).set_geometry("geometry").to_crs("EPSG:4326")

  # remove boxes that intersect with boundary edges, as they'd be discarded
  # when the Sentinel images are created, anyway

  boxes = boxes.sjoin(area["Boundary gdf"], how="inner", predicate="within")
  boxes = boxes.drop(columns=["index_right"]).reset_index(drop=True)

  print(f"Went from {len(buildings)} buildings to {len(boxes)} boxes")

  return merged, boxes

In [29]:
# Get Sentinel data for the whole area (not bothering with snippets)

def get_sentinel(area, sentinel_bands, year=2023):
  """Build a one-year median Sentinel-2 composite clipped to a region.

  Args:
    area: region dict; only area["Boundary"] (a polygon in EPSG:4326) is read.
    sentinel_bands: list of band names to keep, e.g. ['B4', 'B3', 'B2'].
    year: calendar year to composite over.

  Returns:
    The Earth Engine image produced by taking the per-pixel median of the
    low-cloud scenes for that year and clipping it to the region boundary.
  """
  gdf = gpd.GeoDataFrame(crs="EPSG:4326", geometry=[area["Boundary"]])
  geom = geemap.geopandas_to_ee(gdf[['geometry']])

  sentinel = (
      ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
      # only scenes touching the region can contribute to the clipped result
      .filterBounds(geom)
      # filterDate's end is exclusive, so run up to 1 Jan of the next year
      .filterDate(f'{year}-01-01', f'{int(year) + 1}-01-01')
      .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 10))
      .select(sentinel_bands)
      .median() #crude cloud filter
      # clip to this region's own boundary, not a notebook-level global
      .clip(geom)
  )

  return sentinel

In [30]:
# Run the pipeline for the selected region: fetch the large-building
# footprints (plus the boundary as an Earth Engine object), group them into
# clumps and boxes, and build the Sentinel composite
buildings_fc, boundary = get_buildings(where, min_building_size, where["Code"])
merged, boxes = merge_and_make_box(where, buildings_fc)
sentinel = get_sentinel(where, sentinel_bands, sentinel_year)

Went from 1334 buildings to 245 boxes


In [16]:
# Visualize the entire area, including its boundary, all the large buildings,
# the merged building polygons, and the boxes

boxes_fc = geemap.geopandas_to_ee(boxes)
merged_fc = geemap.geopandas_to_ee(merged.to_crs("EPSG:4326"))

os.environ["HYBRID"] = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'

sentinel_viz = dict(min=0.0, max=2500, bands=['B4', 'B3', 'B2'])


def _outline_style(color):
  """Return the shared outline style (width 2, fill '00000000') in `color`."""
  return {'color': color, 'width': 2, 'fillColor': '00000000'}


boundary_viz = _outline_style('red')
buildings_viz = _outline_style('yellow')
merged_viz = _outline_style('cyan')
boxes_viz = _outline_style('blue')

Map = geemap.Map()
Map.centerObject(buildings_fc.first().geometry(), 13)
Map.add_basemap("HYBRID")
Map.addLayer(sentinel, sentinel_viz, "Sentinel")

# Vector overlays, added in this order
for layer_fc, layer_viz, layer_name in (
    (boundary, boundary_viz, "Boundary"),
    (buildings_fc, buildings_viz, "Buildings"),
    (merged_fc, merged_viz, "Merged"),
    (boxes_fc, boxes_viz, "Boxes"),
):
  Map.addLayer(layer_fc.style(**layer_viz), {}, layer_name)

Map

In [31]:
# Step through each of the boxes. Use the "reject" option to mark the CAFOs,
# we'll use the returned list to label them afterwards
# NOTE(review): this is a manual, interactive step (it prompts for input on
# every box), and cafos1 is re-assigned from hard-coded index lists in a later
# cell -- confirm whether the value returned here is still needed.

cafos1 = explore.loop_over_buildings(boxes, sentinel=sentinel)

Working on feature 1 of 245


Unnamed: 0,geometry
0,"POLYGON ((-76.2145 -13.33859, -76.2108 -13.338..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 2 of 245


Unnamed: 0,geometry
1,"POLYGON ((-76.21377 -13.33713, -76.21007 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 3 of 245


Unnamed: 0,geometry
2,"POLYGON ((-76.21614 -13.33624, -76.21245 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 4 of 245


Unnamed: 0,geometry
3,"POLYGON ((-76.21304 -13.33563, -76.20935 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 5 of 245


Unnamed: 0,geometry
4,"POLYGON ((-76.2154 -13.3344, -76.21171 -13.334..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 6 of 245


Unnamed: 0,geometry
5,"POLYGON ((-76.21224 -13.33404, -76.20855 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 7 of 245


Unnamed: 0,geometry
6,"POLYGON ((-76.2147 -13.33294, -76.21101 -13.33..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 8 of 245


Unnamed: 0,geometry
7,"POLYGON ((-76.21151 -13.33257, -76.20781 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 9 of 245


Unnamed: 0,geometry
8,"POLYGON ((-76.21398 -13.33145, -76.21029 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 10 of 245


Unnamed: 0,geometry
9,"POLYGON ((-76.23777 -13.32485, -76.23408 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 11 of 245


Unnamed: 0,geometry
10,"POLYGON ((-76.22383 -13.32049, -76.22014 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  
Working on feature 12 of 245


Unnamed: 0,geometry
11,"POLYGON ((-76.22204 -13.31791, -76.21835 -13.3..."


here


Enter reject to reject, exit to exit, or any key to continue  exit


In [None]:

# Row indices of `boxes` labelled as CAFOs, hard-coded in five batches.
# NOTE(review): presumably recorded by hand from the interactive review; they
# are only meaningful for the exact `boxes` ordering they were recorded against.
cafos1 = [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    30, 32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
]
cafos2 = [
    50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
    70, 72, 73, 76, 78, 79,
    80, 81, 82, 83, 84, 85, 86, 87, 89,
    91, 92, 93, 94, 95, 96, 97, 98, 99,
]
cafos3 = [
    100, 101, 102, 103, 107, 109,
    110, 111, 114, 115, 116, 117, 118, 119,
    124, 126, 128, 129,
    131, 132, 133, 134, 135, 136, 137, 138,
    141, 142, 144, 146, 149,
]
cafos4 = [
    150, 151, 152, 155, 156, 157, 158, 159,
    160, 161,
    170, 172, 173, 174, 175, 177, 178, 179,
    180, 181, 185, 186,
    190, 191, 192, 193, 194, 199,
]
cafos5 = [
    200, 201, 202, 203, 204, 205, 206, 207, 209,
    210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
    220, 221, 222, 223, 224, 225, 227, 228, 229,
    230, 232, 233, 234, 235, 236, 237, 238, 239,
    240, 241, 242, 243, 244,
]
# Flatten the batches, in order, into the full list of CAFO box indices
cafos = [idx for batch in (cafos1, cafos2, cafos3, cafos4, cafos5) for idx in batch]

In [None]:
# Create a df where each row contains the centroid of one of the above boxes,
# is labelled as Unknown CAFO or Non-farm, and has the same columns as all the
# training datasets (Mexico, Iowa, etc.)

# This df doesn't identify any particular building as the main or central one
# in any of the boxes, or contain info about building sizes etc. It could be
# made to, but for now it does not.

def make_final_df(area, boxes, cafos,
                  out_dir="/content/drive/MyDrive/CAFO_data/forTraining/interim_files/"):
  """Label each box centroid as CAFO / non-farm and pickle the result.

  Args:
    area: region dict; reads area["CRS"] (projected CRS used for the centroid
      calculation) and area["Name"] (dataset name and output file prefix).
    boxes: GeoDataFrame of candidate boxes in EPSG:4326. Its index is what
      `cafos` refers to.
    cafos: iterable of index labels of `boxes` to mark "Unknown CAFO"; every
      other row is marked "Non-farm".
    out_dir: directory the pickle is written to, as
      "<out_dir>/<area name>_bldgs.pkl".

  Returns:
    None. The labelled frame is written to disk and the counts are printed.
  """
  candidates = boxes.copy()

  # Compute centroids in the projected CRS, then store them back as lon/lat
  centroids = candidates.to_crs(area["CRS"]).geometry.centroid.to_crs("EPSG:4326")
  candidates.loc[:, "geometry"] = centroids

  # Labels that match no row were silently ignored before; that usually means
  # they were recorded against a different set of boxes, so say so.
  unmatched = sorted(set(cafos) - set(candidates.index))
  if unmatched:
    print(f"Warning: {len(unmatched)} CAFO indices not found in boxes: {unmatched}")

  is_cafo = candidates.index.isin(cafos)
  candidates.loc[is_cafo, 'Farm type'] = "Unknown CAFO"
  candidates.loc[~is_cafo, 'Farm type'] = "Non-farm"

  # Placeholder columns so the frame has the same columns as the training sets
  coldict = {"Dataset name": area["Name"], "Parent coords": None,
             "Number of animals": np.nan, "Area (sq m)": np.nan,
             "Length (m)": np.nan, "Aspect ratio": np.nan}
  for col, val in coldict.items():
    candidates.loc[:, col] = val

  # NOTE(review): presumably puts the columns in the training-set order -- confirm
  candidates = explore.re_order(candidates)
  print(f'Saving {len(candidates[candidates["Farm type"] == "Unknown CAFO"])} farm coords')
  print(f'Saving {len(candidates[candidates["Farm type"] == "Non-farm"])} non-farm coords')

  candidates.to_pickle(os.path.join(out_dir, f'{area["Name"]}_bldgs.pkl'))

In [None]:
make_final_df(where, boxes, cafos)

Saving 194 farm coords
Saving 51 non-farm coords
