In [1]:
import pandas as pd
import geopandas as gpd

### Save dbi permits as feather 

Also convert the date, cost, and unit columns to proper datetime/numeric dtypes to reduce file size.

In [3]:
# Load the DBI permits table from its feather file.
dbi = gpd.read_feather('./data/dbi_permits.geofeather')

In [4]:
# Group columns by a substring of their name.
# NOTE(review): this is a plain substring test, so e.g. a column whose name
# contains "update" also matches 'date' — confirm the matched lists are as
# intended before trusting the coerced values.
date_cols = [name for name in dbi.columns if 'date' in name]
cost_cols = [name for name in dbi.columns if 'cost' in name]
unit_cols = [name for name in dbi.columns if 'unit' in name]

# Convert each group in turn (dates first, then costs, then units).
# errors='coerce' turns unparseable values into NaT / NaN instead of raising.
for cols, converter in (
    (date_cols, pd.to_datetime),
    (cost_cols, pd.to_numeric),
    (unit_cols, pd.to_numeric),
):
    dbi[cols] = dbi[cols].apply(converter, errors='coerce')

In [5]:
# Write the converted frame back over the same file it was loaded from.
dbi.to_feather('./data/dbi_permits.geofeather')

### Geospatial Blue Sky

In [17]:
bluesky = pd.read_csv('./data/Blue Sky Code and Inputs/SF_Logistic_Data.csv')

allParcels = gpd.read_feather('./data/allParcels.geofeather')
for col in ('date_map_a', 'date_rec_a'):
    allParcels[col] = pd.to_datetime(allParcels[col])

# Double check this line: it drops every parcel whose date_map_a is missing.
has_map_a_year = allParcels['date_map_a'].dt.year.notna()
allParcels = allParcels[has_map_a_year]

# Year range covered by the Blue Sky table.
first_year = bluesky['year'].min()
last_year = bluesky['year'].max()

# Drop parcels whose *_d date falls in a year before the Blue Sky range.
# The mask is negated (rather than written as `>=`) so rows with a missing
# date compare False and are therefore kept.
for col in ('date_map_d', 'date_rec_d'):
    before_window = pd.to_datetime(allParcels[col]).dt.year < first_year
    allParcels = allParcels[~before_window]

# Drop parcels whose *_a date falls in the last Blue Sky year or later.
for col in ('date_rec_a', 'date_map_a'):
    at_or_past_end = allParcels[col].dt.year >= last_year
    allParcels = allParcels[~at_or_past_end]

In [18]:
# Calendar year of date_map_a; used as the as-of key when joining to bluesky.
allParcels['year'] = allParcels['date_map_a'].dt.year

In [19]:
# pd.merge_asof (used below) requires its inputs to be sorted on the `on` key.
allParcels = allParcels.sort_values(by='year')

In [20]:
# Give both `year` columns the same float dtype, since they are the `on` key
# of the merge_asof below.
bluesky['year'] = bluesky.year.astype('float64')
allParcels['year'] = allParcels['date_map_a'].dt.year.astype('float64')

In [26]:
# Size of the Blue Sky table before it is joined to parcels.
bluesky.shape

(2450918, 17)

In [79]:
# As-of join: for each Blue Sky row, attach the parcel record with the same
# MapBlkLot whose `year` is the latest one not after the row's year
# (merge_asof's default direction='backward'). Rows with no such parcel
# record come back with missing geometry / mapblklot.
#
# merge_asof raises a ValueError unless both frames are sorted by the `on`
# key, so sort here instead of relying on whatever order earlier cells left
# behind. A stable sort keeps equal-year rows in their existing order, which
# makes this a no-op for frames that are already sorted.
df = pd.merge_asof(
    bluesky.sort_values('year', kind='stable'),
    allParcels[['geometry', 'year', 'mapblklot']].sort_values('year', kind='stable'),
    left_by='MapBlkLot_Master', right_by='mapblklot', on='year')

In [80]:
# Number of rows left without a geometry by the as-of merge.
df.geometry.isna().sum()

38940

In [81]:
# Rows the as-of merge could not give a geometry to.
cantID = df.loc[df['geometry'].isna()]

In [82]:
# Shape of the unmatched subset; its row count should equal the missing-geometry count.
cantID.shape

(38940, 19)

In [83]:
# Fallback match: pair every unmatched row with all parcel records that share
# its MapBlkLot, regardless of year. Both sides carry a `year` column, so the
# result has `year_x` (Blue Sky) and `year_y` (parcel).
canID = cantID.drop(columns=['geometry', 'mapblklot']).merge(
    allParcels[['geometry', 'mapblklot', 'year']],
    how='inner',
    left_on='MapBlkLot_Master',
    right_on='mapblklot',
)

In [84]:
# Keep one parcel record per (MapBlkLot, Blue Sky year): after sorting, the
# first row in each group is the one with the smallest parcel year.
key_cols = ['MapBlkLot_Master', 'year_x']
canID = (
    canID
    .sort_values(key_cols + ['year_y'])
    .drop_duplicates(subset=key_cols, keep='first')
)

In [85]:
# The parcel-side year was only needed to pick a record; remove it in place.
del canID['year_y']

In [86]:
# Restore the Blue Sky year column to its original name (in place).
canID.rename(columns={'year_x': 'year'}, inplace=True)

In [103]:
# Column check: after the drop/rename only a single `year` column should remain.
canID.columns

Index(['MapBlkLot_Master', 'year', 'Developed', 'Historic',
       'Residential_Dummy', 'Zillow_Price_Real', 'Const_FedReserve_Real',
       'Envelope_1000', 'Upzone_Ratio', 'zp_OfficeComm', 'zp_DensRestMulti',
       'zp_FormBasedMulti', 'zp_PDRInd', 'zp_Public', 'zp_Redev', 'zp_RH2',
       'zp_RH3_RM1', 'geometry', 'mapblklot'],
      dtype='object')

In [90]:
# Sanity check on the fallback matches. canID comes from an inner merge
# (pd.merge's default), so cantID rows with no matching mapblklot are absent
# here rather than counted as missing; a zero only says that every matched
# parcel record has a geometry.
canID.geometry.isna().sum()

0

In [107]:
# Bring the fallback matches alongside the as-of result. Columns that exist
# on both sides keep their name on the left and get a `_df2` suffix on the
# right (geometry_df2, mapblklot_df2).
fallback_matches = canID[['MapBlkLot_Master', 'year', 'mapblklot', 'geometry']]
df = pd.merge(
    df,
    fallback_matches,
    how='left',
    on=['MapBlkLot_Master', 'year'],
    suffixes=('', '_df2'),
)

In [108]:
# Coalesce each pair of columns: take the fallback (`*_df2`) value where one
# exists, otherwise keep the value from the as-of merge.
for col in ('geometry', 'mapblklot'):
    df[col] = df[f'{col}_df2'].combine_first(df[col])

# The helper columns are no longer needed once coalesced.
df.drop(columns=['geometry_df2', 'mapblklot_df2'], inplace=True)

In [112]:
# Build a GeoDataFrame from the merged table, using the same CRS as allParcels.
df = gpd.GeoDataFrame(df, crs=allParcels.crs)

In [118]:
# Persist the result. Note: this path is relative to the working directory,
# not under ./data/ like the files read above.
df.to_feather('geobluesky.geofeather')

In [None]:
# Save to feather

### Geospatial merge between tax data and zillow neighborhoods