In [3]:
import pandas as pd
import geopandas as gpd
import dask_geopandas as dgpd

# Save tax data as feather

In [26]:
# Convert the raw GeoJSON export to Feather once, so later cells can reload it
# without re-parsing the GeoJSON.
raw_geojson_path = './data/tax.geojson'
untouched_feather_path = './data/tax_untouched.geofeather'

tax = gpd.read_file(raw_geojson_path)
tax.to_feather(untouched_feather_path)

In [31]:
# Start the cleaning steps from the untouched snapshot written by the cell above.
# './data/tax.geofeather' is only produced further down in this notebook and
# already lacks the columns dropped below, so loading it here breaks a
# top-to-bottom run (missing file on a fresh checkout, KeyError on the drops
# otherwise).
tax = gpd.read_feather('./data/tax_untouched.geofeather')

In [19]:
# Reduce file size by casting to numeric
to_numeric = [
    'percent_of_ownership', 'year_property_built',
    'number_of_rooms', 'number_of_stories', 'misc_exemption_value',
    'number_of_units', 'property_area', 'basement_area',
    'assessed_improvement_value', 'assessed_land_value',
    'assessed_personal_property_value',
    'assessed_fixtures_value', 'homeowner_exemption_value',
    'lot_frontage', 'number_of_bathrooms', 'lot_depth', 'lot_area',
    'number_of_bedrooms', 'closed_roll_year', 'volume_number',
]

# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
tax[to_numeric] = tax[to_numeric].apply(pd.to_numeric, errors='coerce')

# Remove redundant columns (These columns use codes described in plain english by other columns)
redundant_code_columns = [
    'assessor_neighborhood_code', 'property_class_code',
    'exemption_code', 'use_code',
]
tax = tax.drop(columns=redundant_code_columns)

# Remove other irrelevant columns
bookkeeping_columns = ['row_id', 'data_as_of', 'data_loaded_at']
tax = tax.drop(columns=bookkeeping_columns)

In [32]:
# The supervisor district identifiers are stored as strings rather than numbers.
district_cols = ['supervisor_district_2012', 'supervisor_district']
tax[district_cols] = tax[district_cols].astype(str)

# Every object- or category-typed column is cast to str. Despite the variable
# name, nothing is converted to the pandas `category` dtype here.
is_text_like = tax.dtypes.eq('object') | tax.dtypes.eq('category')
to_category = tax.columns[is_text_like].tolist()
tax[to_category] = tax[to_category].astype(str)

In [33]:
# Elapsed time between a fixed reference date and each row's `current_sales_date`,
# expressed in years of 365.25 days.
current_date = pd.to_datetime('2023-01-01')
one_year = pd.Timedelta(days=365.25)

time_difference = current_date - tax['current_sales_date']
tax['years_since_last_sale'] = time_difference / one_year

In [35]:
# Normalise missing-value markers in every non-geometry column: the string
# 'None' and real None objects both become the string 'NaN'.
#
# The result is assigned back to the frame explicitly. Calling
# `tax[c].replace(..., inplace=True)` mutates a temporary column selection,
# which pandas warns about (2.2) and silently ignores under Copy-on-Write.
#
# NOTE(review): whether the `None` key also matches NaN/NaT in numeric and
# datetime columns depends on the pandas version -- check the dtypes afterwards.
for c in tax.columns:
    if c != 'geometry':
        tax[c] = tax[c].replace({'None': 'NaN', None: 'NaN'})

In [36]:
# Checkpoint: write the current state of `tax` to a Feather file.
tax.to_feather('./data/tax.geofeather')

### Geospatial merge between tax data and Zillow neighborhoods

In [22]:
# Load the Zillow neighbourhood polygons and keep only the rows whose CITY
# contains 'San Francisco', retaining just the name and geometry columns.
neigh = gpd.read_file('./data/ZillowNeighborhoods-CA/ZillowNeighborhoods-CA.shp')

# NOTE(review): this is a substring match, so any CITY value that merely
# contains 'San Francisco' is kept too -- confirm that is intended.
in_san_francisco = neigh['CITY'].str.contains('San Francisco')
sf = neigh[in_san_francisco]

sf_neighborhoods = sf[['NAME', 'geometry']].rename(
    columns={'NAME': 'zillow_neighborhood_name'}
)

In [23]:
# Wrap `tax` in a dask-geopandas frame split into 10 partitions
# (presumably so the reprojection below can be processed partition by partition).
ddf = dgpd.from_geopandas(tax, npartitions=10)

In [24]:
# Reproject the partitioned frame to EPSG:3310, the same CRS the neighbourhood
# polygons are converted to before the spatial join.
ddf = ddf.to_crs(epsg=3310)

In [25]:
# Materialise the dask frame and rebind `tax` to the reprojected result.
tax = ddf.compute()

In [26]:
# Bring the neighbourhood polygons into EPSG:3310 so both sides of the spatial
# join share one CRS.
sf_neighborhoods = sf_neighborhoods.to_crs(epsg=3310)

In [27]:
# Attach the columns of the nearest neighbourhood polygon to each tax row;
# how='left' keeps every tax row.
# NOTE(review): sjoin_nearest emits one row per equidistant match, so tax rows
# can be duplicated on ties -- confirm the row count is unchanged.
tax = tax.sjoin_nearest(sf_neighborhoods, how='left')

In [28]:
# Discard the join bookkeeping column `index_right`; it is not needed downstream.
tax = tax.drop(columns='index_right')

In [29]:
# Inspect the column set after the join (expect `zillow_neighborhood_name` to be present).
tax.columns

Index(['property_class_code_definition', 'lot_code', 'property_area',
       'volume_number', 'percent_of_ownership', 'misc_exemption_value',
       'zoning_code', 'year_property_built', 'analysis_neighborhood',
       'number_of_units', 'use_definition', 'closed_roll_year', 'status_code',
       'number_of_bedrooms', 'assessor_neighborhood', 'number_of_stories',
       'assessor_neighborhood_district', 'exemption_code_definition', 'block',
       'current_sales_date', 'lot_area', 'number_of_rooms', 'lot_depth',
       'assessed_personal_property_value', 'supervisor_district_2012',
       'number_of_bathrooms', 'construction_type', 'lot_frontage',
       'homeowner_exemption_value', 'tax_rate_area_code', 'lot',
       'property_location', 'parcel_number', 'assessed_fixtures_value',
       'supervisor_district', 'assessed_land_value', 'basement_area',
       'assessed_improvement_value', 'geometry', 'zillow_neighborhood_name'],
      dtype='object')

In [30]:
# Save in efficient format (Feather).
# Note: this writes to the same path as the earlier checkpoint and overwrites it.
tax.to_feather('./data/tax.geofeather')

### Save dbi permits as feather 

Also convert the date, cost, and unit columns to proper dtypes to reduce file size.

In [3]:
# Load the DBI permits table from its Feather file.
dbi = gpd.read_feather('./data/dbi_permits.geofeather')

In [4]:
# Group columns by a substring of their name: 'date' columns become datetimes,
# 'cost' and 'unit' columns become numeric.
date_cols, cost_cols, unit_cols = (
    [name for name in dbi.columns if token in name]
    for token in ('date', 'cost', 'unit')
)

# errors='coerce' maps unparseable values to NaT / NaN instead of raising.
conversions = (
    (date_cols, pd.to_datetime),
    (cost_cols, pd.to_numeric),
    (unit_cols, pd.to_numeric),
)
for columns, converter in conversions:
    dbi[columns] = dbi[columns].apply(converter, errors='coerce')

In [5]:
# Write the converted table back to the path it was loaded from (overwrites the input file).
dbi.to_feather('./data/dbi_permits.geofeather')

### Geospatial Blue Sky

In [17]:
# Load the Blue Sky table and the parcel geometries, then trim the parcels
# using the year range covered by `bluesky`.
bluesky = pd.read_csv('./data/Blue Sky Code and Inputs/SF_Logistic_Data.csv')

allParcels = gpd.read_feather('./data/allParcels.geofeather')
for date_column in ('date_map_a', 'date_rec_a'):
    allParcels[date_column] = pd.to_datetime(allParcels[date_column])

# Double check this line.
has_map_a_year = allParcels['date_map_a'].dt.year.notna()
allParcels = allParcels[has_map_a_year]

first_year = bluesky['year'].min()
last_year = bluesky['year'].max()

# Each filter is written as ~(comparison), so rows whose date is missing
# (comparison evaluates to False) are kept.
# Filters are applied one after another, each on the already-filtered frame.
map_d_year = pd.to_datetime(allParcels['date_map_d']).dt.year
allParcels = allParcels[~(map_d_year < first_year)]

rec_d_year = pd.to_datetime(allParcels['date_rec_d']).dt.year
allParcels = allParcels[~(rec_d_year < first_year)]

allParcels = allParcels[~(allParcels['date_rec_a'].dt.year >= last_year)]
allParcels = allParcels[~(allParcels['date_map_a'].dt.year >= last_year)]

In [18]:
# Year component of `date_map_a`; `year` is the `on` key of the as-of merge below.
allParcels['year'] = allParcels.date_map_a.dt.year

In [19]:
# pd.merge_asof requires its inputs to be sorted by the `on` key (`year`).
allParcels = allParcels.sort_values('year')

In [20]:
# Give the `year` key the same float dtype on both sides of the as-of merge.
bluesky['year'] = bluesky['year'].astype('float64')

parcel_year = allParcels['date_map_a'].dt.year
allParcels['year'] = parcel_year.astype('float64')

In [26]:
# Row/column count of the Blue Sky table before geometries are attached.
bluesky.shape

(2450918, 17)

In [79]:
# For every Blue Sky row, attach the parcel record with the same lot id
# (MapBlkLot_Master == mapblklot) whose `year` is the latest one not after the
# Blue Sky row's year (merge_asof's default backward search).
#
# merge_asof raises unless the left frame is sorted by the `on` key, and
# `bluesky` is never sorted in this notebook. The stable sort is a no-op for
# row order when the data is already sorted, so existing results do not change.
bluesky_by_year = bluesky.sort_values('year', kind='stable')
df = pd.merge_asof(bluesky_by_year, allParcels[['geometry', 'year', 'mapblklot']],
                   left_by='MapBlkLot_Master', right_by='mapblklot', on='year')

In [80]:
# Number of Blue Sky rows for which the as-of merge found no geometry.
df.geometry.isna().sum()

38940

In [81]:
# Rows that are still missing a geometry after the as-of merge.
cantID = df[df.geometry.isna()]

In [82]:
# Sanity check: the row count should equal the missing-geometry count above.
cantID.shape

(38940, 19)

In [83]:
# Second attempt for the unmatched rows: join on the lot id alone, ignoring the
# year. Both inputs carry a `year` column, so the result has `year_x` (Blue Sky)
# and `year_y` (parcel).
unmatched_rows = cantID.drop(columns=['geometry', 'mapblklot'])
parcel_lookup = allParcels[['geometry', 'mapblklot', 'year']]

canID = unmatched_rows.merge(
    parcel_lookup,
    left_on='MapBlkLot_Master',
    right_on='mapblklot',
)

In [84]:
# Keep one parcel record per (lot, Blue Sky year): after sorting, the first row
# of each group is the one with the smallest parcel year (`year_y`).
sort_keys = ['MapBlkLot_Master', 'year_x', 'year_y']
canID = canID.sort_values(sort_keys)
canID = canID.drop_duplicates(subset=['MapBlkLot_Master', 'year_x'])

In [85]:
# The parcel-side year was only needed for choosing a row; discard it.
canID = canID.drop(columns='year_y')

In [86]:
# Restore the Blue Sky year column to its original name.
canID = canID.rename(columns={'year_x': 'year'})

In [103]:
# Inspect the fallback table's columns (expect a single `year` column again).
canID.columns

Index(['MapBlkLot_Master', 'year', 'Developed', 'Historic',
       'Residential_Dummy', 'Zillow_Price_Real', 'Const_FedReserve_Real',
       'Envelope_1000', 'Upzone_Ratio', 'zp_OfficeComm', 'zp_DensRestMulti',
       'zp_FormBasedMulti', 'zp_PDRInd', 'zp_Public', 'zp_Redev', 'zp_RH2',
       'zp_RH3_RM1', 'geometry', 'mapblklot'],
      dtype='object')

In [90]:
# Count fallback rows that are still missing a geometry.
canID.geometry.isna().sum()

0

In [107]:
# Bring the fallback geometry / lot id alongside the as-of results. Overlapping
# column names from the fallback side receive the '_df2' suffix.
fallback_columns = canID[['MapBlkLot_Master', 'year', 'mapblklot', 'geometry']]
df = pd.merge(
    df,
    fallback_columns,
    how='left',
    on=['MapBlkLot_Master', 'year'],
    suffixes=('', '_df2'),
)

In [108]:
# Prefer the fallback ('_df2') value where one exists; otherwise keep the value
# from the as-of merge.
column_pairs = (
    ('geometry', 'geometry_df2'),
    ('mapblklot', 'mapblklot_df2'),
)
for target, fallback in column_pairs:
    df[target] = df[fallback].combine_first(df[target])

# The temporary '_df2' columns are no longer needed.
df = df.drop(columns=['geometry_df2', 'mapblklot_df2'])

In [112]:
# Promote the merged frame to a GeoDataFrame, reusing the CRS of `allParcels`.
df = gpd.GeoDataFrame(df, crs=allParcels.crs)

In [118]:
# Save the geocoded Blue Sky table.
# NOTE(review): this path is relative to the working directory, unlike the other
# outputs in this notebook, which go under ./data/ -- confirm that is intended.
df.to_feather('geobluesky.geofeather')