# Processing Microsoft's building footprints data by county

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
from pathlib import Path

import pandas as pd
import geopandas as gpd

---

### Get buildings data

In [3]:
# Download the California state file from https://github.com/microsoft/USBuildingFootprints
# and save the GeoJSON as ../../data/GIS/californiabuildings.geojson

### Convert the Microsoft GeoJSON to a shapefile

In [4]:
# !ogr2ogr -f "ESRI Shapefile" ../../data/GIS/cabuildings.shp ../../data/GIS/californiabuildings.geojson -progress

### Read the new data format

In [5]:
# Load the building footprints from the shapefile produced by the ogr2ogr conversion
src = gpd.read_file(filename="../../data/GIS/cabuildings.shp")

### Drop columns we don't need

In [6]:
# Drop the `release` and `capture_da` attribute columns, which are not needed below.
# Rebinding `src` with errors="ignore" (instead of an in-place drop) lets this cell
# be re-run without raising KeyError once the columns are already gone.
src = src.drop(columns=["release", "capture_da"], errors="ignore")

---

### Simplify counties GeoJSON file

In [7]:
# !mapshaper -i ../../data/GIS/ca-counties.geojson -simplify percentage=.25  -o ../../data/GIS/ca-counties-simplified.geojson

### Read the simplified counties file

In [8]:
# Load the simplified county boundaries written by the mapshaper step
ca_counties = gpd.read_file(filename="../../data/GIS/ca-counties-simplified.geojson")

### Clean up the counties file

In [9]:
# Normalise column labels: trim whitespace, lowercase, and turn spaces into underscores
ca_counties.columns = (
    ca_counties.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)

In [10]:
# Preview the `countyfips`/`name` pairs, ordered alphabetically by county name
ca_counties[["countyfips", "name"]].sort_values(by="name").head(5)

Unnamed: 0,countyfips,name
50,1,Alameda
27,3,Alpine
7,5,Amador
44,7,Butte
8,9,Calaveras


In [11]:
# Keep only the `name`, `countyfips` and `geometry` columns, as an independent copy
county_columns = ["name", "countyfips", "geometry"]
ca_counties = ca_counties[county_columns].copy()

---

### Merge the counties with their buildings

In [12]:
# Spatial join: keep every building whose geometry lies within a county polygon and
# attach that county's columns to it. `predicate` replaces the `op` keyword, which was
# deprecated in geopandas 0.10 and removed in later releases.
counties_buildings = gpd.sjoin(src, ca_counties, how="inner", predicate="within")

In [13]:
# Remove the `index_right` helper column left behind by the spatial join.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError for the missing column.
counties_buildings = counties_buildings.drop(columns=["index_right"], errors="ignore")

### How many buildings in each county?

In [15]:
# Show the first few joined rows (building geometry plus its county name and code)
counties_buildings.head(n=5)

Unnamed: 0,geometry,name,countyfips
14,"POLYGON ((-114.14520 34.28770, -114.14517 34.2...",San Bernardino,71
15,"POLYGON ((-114.16526 34.25458, -114.16525 34.2...",San Bernardino,71
16,"POLYGON ((-114.16532 34.25362, -114.16532 34.2...",San Bernardino,71
17,"POLYGON ((-114.16900 34.25050, -114.16912 34.2...",San Bernardino,71
18,"POLYGON ((-114.17083 34.24926, -114.17061 34.2...",San Bernardino,71


In [20]:
# Count rows per (name, countyfips) pair; the resulting `geometry` column holds
# the number of building footprints in each county.
county_groups = counties_buildings.groupby(["name", "countyfips"])
bldg_counties = county_groups.count().reset_index()

In [21]:
# Counties with the most buildings first
bldg_counties.sort_values(by="geometry", ascending=False).head(5)

Unnamed: 0,name,countyfips,geometry
18,Los Angeles,37,2525265
36,San Diego,73,883488
32,Riverside,65,786378
29,Orange,59,757044
35,San Bernardino,71,685786


### Slice them up and save county files

In [22]:
# Write one GeoJSON file per county, named after its `countyfips` value.
output_dir = Path("output/places")
# Make sure the destination exists before writing, so a fresh checkout can run this cell.
output_dir.mkdir(parents=True, exist_ok=True)

for county in counties_buildings["countyfips"].unique():
    # Select only the buildings that were joined to this county.
    county_buildings = counties_buildings[counties_buildings["countyfips"] == county]
    # Interpolate the code inside the f-string so the path is built correctly whether
    # `countyfips` holds strings or integers (str + int concatenation would raise).
    county_path = output_dir / f"{county}.geojson"
    county_buildings.to_file(str(county_path), driver="GeoJSON")