In [319]:
import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import contextily as ctx
from shapely.wkt import load

import clean_utils

In [2]:
# Load the building-permit GeoJSON; `dbi` is filtered on `units`, `permit_type`
# and `blocklot` further down when matching the parcels left unidentified.
dbi = gpd.read_file('./data/Building Permits.geojson')

In [3]:
# Load the SF Planning permitting records (GeoJSON).
# `low_memory` is a pandas.read_csv option, not a geopandas.read_file parameter;
# read_file forwards unknown keywords to the I/O engine, so it is dropped here.
permits = gpd.read_file('./data/SF_Planning_Permitting_Data.geojson')

In [312]:
# The return value is discarded, so this presumably normalises the date
# columns of `permits` in place -- TODO confirm against clean_utils.clean_dates.
clean_utils.clean_dates(permits)

In [5]:
# Parcel-by-year panel used to build the training set below
# (filtered on `year`, grouped on `MapBlkLot_Master`, target column `Developed`).
parcels = pd.read_csv('./data/Blue Sky Code and Inputs/SF_Logistic_Data.csv')

In [304]:
# Parcel shapefile ("Active and Retired" per the directory name).
# NOTE(review): `allParcels` is not referenced by any later code cell in this notebook view.
allParcels = gpd.read_file('./data/Parcels   Active and Retired/parcels.shp')

In [7]:
# Housing-opportunity site inventory shapefile (file name is punycode-encoded on disk).
# NOTE(review): `sites` is not referenced by any later code cell in this notebook view.
sites = gpd.read_file('./data/site_inventory/xn--Bay_Area_Housing_Opportunity_Sites_Inventory__20072023_-it38a.shp')

In [323]:
# Development convenience: re-import clean_utils so edits to the module are
# picked up without restarting the kernel.
import importlib
importlib.reload(clean_utils)

<module 'clean_utils' from 'C:\\Users\\sadamerdji\\Desktop\\dissertation\\clean_utils.py'>

### Training Set is RHNA 4

In [422]:
# Training window: parcel-years from 2007 through 2014 (2015 excluded).
in_train_window = (parcels['year'] >= 2007) & (parcels['year'] < 2015)
trainParcels = parcels[in_train_window]

# Target: per lot, the number of parcel-years in the window with a non-zero `Developed` flag.
trainY = (
    trainParcels['Developed']
    .ne(0)
    .groupby(trainParcels['MapBlkLot_Master'])
    .sum()
)

# Features: the 2007 snapshot of each lot.
trainX = trainParcels[trainParcels['year'] == 2007]
trainY.sum()

253

Check that no lot ID is duplicated in the 2007 rows (one row per `MapBlkLot_Master`).

In [424]:
# trainX is the 2007 slice of trainParcels; it must hold exactly one row per lot
# so that joining the per-lot target cannot fan out.
nunique_lots = trainX['MapBlkLot_Master'].nunique()
n_lots = len(trainX)
assert nunique_lots == n_lots

# Swap the single-year `Developed` flag for the per-lot count aggregated over the window.
trainDf = trainX.drop(columns='Developed').merge(
    trainY, left_on='MapBlkLot_Master', right_index=True
)
trainDf['Developed'].value_counts()

0    152965
1       251
2         1
Name: Developed, dtype: int64

### Make BlueSky data geospatial

In [428]:
# Attach parcel geometry to the training table. The result is a GeoDataFrame
# (see `type(built)` below) that carries extra columns such as `mapblklot`,
# `blklot`, `active`, `geometry` and the `CANTID_*` backups.
df = clean_utils.transform_bluesky_to_geospatial(trainDf)

In [429]:
df.CANTID_blklot_backup.notna().sum()

7756

### Developed parcels

In [364]:
# Keep only the parcels that were developed at least once in the training window.
built = df[df['Developed'] > 0]

In [365]:
type(built)

geopandas.geodataframe.GeoDataFrame

In [None]:
permits = clean_utils.get_rhna_permits()

In [None]:
round(built.MapBlkLot_Master.isin(permits.blocklot).mean(), 2)

In [386]:
success = ['complete', 'issued', 'approved', 'granted', 'issuing']
completed_projects = permits[permits['status'].isin(success)].copy()

## Use blklot and apn. Compare

In [387]:
built.columns

Index(['MapBlkLot_Master', 'mapblklot', 'blklot', 'active', 'geometry', 'year',
       'Historic', 'Residential_Dummy', 'Zillow_Price_Real',
       'Const_FedReserve_Real', 'Envelope_1000', 'Upzone_Ratio',
       'zp_OfficeComm', 'zp_DensRestMulti', 'zp_FormBasedMulti', 'zp_PDRInd',
       'zp_Public', 'zp_Redev', 'zp_RH2', 'zp_RH3_RM1', 'Developed',
       'CANTID_blklot_backup', 'CANTID_geometry_backup'],
      dtype='object')

In [388]:
completed_projects['blocklot'].nunique()

2564

In [411]:
# One unit count per block-lot: the median across that lot's completed permits.
dbi_units = completed_projects.groupby('blocklot', sort=False)['units'].median()

In [413]:
# Match developed parcels to permit unit counts via the parcel's `blklot` key.
built_poisson_blklot = built.merge(dbi_units, left_on='blklot', right_on='blocklot')
built_poisson_blklot['units'].sum()

9417.0

In [414]:
# Same match, keyed on the parcel's `mapblklot` instead.
built_poisson_mapblklot = built.merge(dbi_units, left_on='mapblklot', right_on='blocklot')
built_poisson_mapblklot['units'].sum()

9590.0

In [415]:
# Same match, keyed on the master block-lot id.
built_poisson_mapblklotm = built.merge(dbi_units, left_on='MapBlkLot_Master', right_on='blocklot')
built_poisson_mapblklotm['units'].sum()

9590.0

In [416]:
# Median permitted units per block-lot among completed projects.
dbi_units = completed_projects.groupby('blocklot', sort=False)['units'].median()

# NOTE(review): despite the "_apn" name this joins on `blklot`, exactly like
# `built_poisson_blklot`, so the two frames are identical. `built` has no APN
# column to join on -- confirm which key was intended.
built_poisson_apn = built.merge(dbi_units, left_on='blklot', right_on='blocklot')
built_poisson_apn['units'].sum()

In [396]:
# Stack every key-based match. Order matters: later de-duplication keeps the
# first row seen for each lot.
key_based_matches = [
    built_poisson_apn,
    built_poisson_mapblklotm,
    built_poisson_mapblklot,
    built_poisson_blklot,
]
all_match1 = pd.concat(key_based_matches)

In [397]:
all_match1.MapBlkLot_Master.nunique()

197

In [398]:
built_poisson_mapblklot.MapBlkLot_Master.nunique()

197

In [399]:
completed_projects.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [400]:
# Turn the block-lot index into a regular column.
# NOTE(review): this rebinds `dbi_units` from itself (re-running the cell
# changes the result), and `dbi_units` is not used by any later cell in this view.
dbi_units = dbi_units.reset_index()

In [401]:
# Reproject the developed parcels to EPSG:4326, the CRS reported for
# `completed_projects` above, so the spatial join below compares like with like.
built = built.to_crs('EPSG:4326')

In [402]:
# Spatial match: keep parcel/permit pairs where the parcel geometry contains
# the permit geometry. A parcel appears once per contained permit.
built_poisson_geo = built.sjoin(completed_projects, how='inner', predicate='contains')

In [403]:
# The key-based matches were built from `built` before it was reprojected;
# bring them to EPSG:4326 as well so they can be stacked with the spatial matches.
all_match1 = all_match1.to_crs('EPSG:4326')

In [404]:
all_match1.MapBlkLot_Master.nunique()

197

In [405]:
built_poisson_geo.MapBlkLot_Master.nunique()

276

In [406]:
built_poisson_geo[list(trainDf.columns.values) + ['units']]

Unnamed: 0,MapBlkLot_Master,year,Historic,Residential_Dummy,Zillow_Price_Real,Const_FedReserve_Real,Envelope_1000,Upzone_Ratio,zp_OfficeComm,zp_DensRestMulti,zp_FormBasedMulti,zp_PDRInd,zp_Public,zp_Redev,zp_RH2,zp_RH3_RM1,Developed,units
280,0041103,2007,1,1,93.227099,92.120253,7.727339,2.000000,0,1,0,0,0,0,0,0,1,9.0
2522,0129013A,2007,0,1,93.227099,92.120253,3.466252,2.000000,0,1,0,0,0,0,0,0,1,2.0
3904,0184042,2007,0,1,93.227099,92.120253,11.762343,0.172653,0,1,0,0,0,0,0,0,1,23.0
4230,0196030,2007,0,0,93.227099,92.120253,9.604406,1.115106,1,0,0,0,0,0,0,0,1,12.0
4230,0196030,2007,0,0,93.227099,92.120253,9.604406,1.115106,1,0,0,0,0,0,0,0,1,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153076,8720117,2007,0,0,93.227099,92.120253,176.964381,1.920000,0,0,0,0,0,1,0,0,1,319.0
153076,8720117,2007,0,0,93.227099,92.120253,176.964381,1.920000,0,0,0,0,0,1,0,0,1,10.0
153076,8720117,2007,0,0,93.227099,92.120253,176.964381,1.920000,0,0,0,0,0,1,0,0,1,98.0
153076,8720117,2007,0,0,93.227099,92.120253,176.964381,1.920000,0,0,0,0,0,1,0,0,1,319.0


In [407]:
# Key-based matches first, spatial matches second, so key-based rows win on de-duplication.
all_match = pd.concat([all_match1, built_poisson_geo])

In [408]:
all_match.MapBlkLot_Master.nunique()

276

In [409]:
# One row per lot: keep the first occurrence of each master block-lot id.
is_repeat = all_match['MapBlkLot_Master'].duplicated(keep='first')
all_match_dbi = all_match[~is_repeat]

In [410]:
all_match_dbi.MapBlkLot_Master.nunique()

276

I can capture all but 35 permits using DBI dataset.

#### How many matches do I get if I use SF Planning Permits

In the SF Planning Permits dataset, almost all `mapblocklot` values are block + lot. A few are lot + block, and 29 match neither pattern because they contain some non-digit character that I need to strip out.

Also, about 7% of `mapblocklot` values are NaN.

In [133]:
permits[
    ~((permits.mapblocklot == (permits.block + permits.lot))
       | permits.block.isna()
       | (permits.mapblocklot == (permits.lot + permits.block)))
].shape

(29, 139)

In [134]:
# NOTE: this is an alias, not a copy -- every column added to or converted on
# `ppermits` in the cells below also changes `permits`.
ppermits = permits

In [135]:
permits.mapblocklot.isna().mean()

0.07083529838209902

In [136]:
permits.related_building_permit

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
256562    NaN
256563    NaN
256564    NaN
256565    NaN
256566    NaN
Name: related_building_permit, Length: 256567, dtype: object

In [137]:
permits.new_construction.value_counts()

CHECKED      2250
UNCHECKED     365
Name: new_construction, dtype: int64

In [146]:
ppermits.record_status.value_counts()[:35]

Closed                            106899
Closed - Approved                  67776
Closed - Informational             15156
Closed - CEQA Clearance Issued     14751
Closed - Abated                    13450
Closed - Issued                     6574
Closed - Withdrawn                  5562
Under Review                        5157
Open                                2905
Closed - Disapproved                2429
Closed - No Violation               2355
Closed - Cancelled                  2206
Closed - DR taken-Approved          1516
Withdrawn                           1342
Closed - Appeal Upheld              1266
Closed - DR not taken-Approved      1064
Closed - Appeal Denied               988
On Hold                              811
Submitted                            710
Approved                             664
Accepted                             512
Pending Review                       438
Permitted                            421
Complete                             222
Closed - Removed

In [138]:
# Coerce every column whose name contains 'num' to a numeric dtype
# (pd.to_numeric raises on unparseable values by default).
numeric_like = [col for col in ppermits.columns if 'num' in col]
for numerical in numeric_like:
    ppermits[numerical] = pd.to_numeric(ppermits[numerical])

In [139]:
# Flag records with no existing-unit count, then compute net new units as
# proposed units minus existing units, treating missing counts on either side as 0.
existing_units = ppermits['number_of_units_exist']
ppermits['NA_NUMBER_OF_UNITS_EXIST'] = existing_units.isna()
ppermits['units'] = ppermits['number_of_units'].fillna(0).sub(existing_units.fillna(0))

In [140]:
# Parse every column whose name contains 'date'; unparseable values become NaT.
date_cols = [col for col in ppermits.columns if 'date' in col]
for date in date_cols:
    parsed = pd.to_datetime(ppermits[date], errors='coerce')
    ppermits[date] = parsed

In [147]:
# Record statuses counted as an approved / completed planning record.
statuses = [
    'Closed - Approved', 'Closed',
    'Closed - Issued', 'Closed - DR taken-Approved',
    'Closed - Appeal Upheld', 'Closed - DR not taken-Approved',
    'Approved', 'Permitted', 'Complete',
    'Accepted', 'Application Accepted', 'Closed - No DR action-Approved',
]

# Add / rename columns to fit ABAG format
ppermits['permyear'] = ppermits['close_date'].dt.year


def _in_permit_window(years):
    """Boolean mask: True where `years` lies in 2007-2015 inclusive."""
    return (years >= 2007) & (years <= 2015)


# A record qualifies on timing if ANY of its close / open / accepted /
# submitted years falls inside the window.
any_date_in_window = (
    _in_permit_window(ppermits['permyear'])
    | _in_permit_window(ppermits['open_date'].dt.year)
    | _in_permit_window(ppermits['date_application_accepted'].dt.year)
    | _in_permit_window(ppermits['date_application_submitted'].dt.year)
)
adds_units = ppermits['units'] > 0
status_ok = ppermits['record_status'].isin(statuses)

rhna_ppermits = ppermits[adds_units & status_ok & any_date_in_window].copy()



In [148]:
# Spatial match against the planning records: parcel geometry must contain the record geometry.
built_poisson_geo2 = built.sjoin(rhna_ppermits, how='inner', predicate='contains')

In [149]:
# NOTE(review): this is the same spatial join as `built_poisson_geo2`, and
# `built_poisson_blk2` is not used by any later cell in this view. The name
# suggests a block-lot key merge was intended -- confirm or remove.
built_poisson_blk2 = gpd.sjoin(built, rhna_ppermits, how="inner", predicate='contains')

In [150]:
built_poisson_geo2.shape

(123, 165)

In [151]:
# Median net units per `mapblocklot` in the planning records, joined to the
# developed parcels on their `mapblklot` key.
plan_units = rhna_ppermits.groupby('mapblocklot', sort=False)['units'].median()
built_poisson_mapblklot2 = built.merge(plan_units, left_on='mapblklot', right_on='mapblocklot')
built_poisson_mapblklot2['units'].sum()

4944.5

In [152]:
built_poisson_mapblklot2.MapBlkLot_Master.nunique()

65

In [153]:
len(set(built_poisson_mapblklot2.MapBlkLot_Master) | set(built_poisson_geo2.MapBlkLot_Master))

104

In [154]:
# DBI-based matches come first so they take priority over planning-record
# matches when duplicates are dropped.
all_match3 = pd.concat([all_match, built_poisson_mapblklot2, built_poisson_geo2])

In [155]:
all_match3.MapBlkLot_Master.nunique()

276

In [156]:
# Keep the first row seen for each master block-lot id.
already_seen = all_match3['MapBlkLot_Master'].duplicated(keep='first')
all_match3 = all_match3[~already_seen]

In [158]:
# Keep the parcel columns plus the matched unit count.
# Columns are taken from `built` (the geospatial parcel frame) rather than
# `trainDf`: `trainDf` is the CSV-derived table without `geometry`, and the
# final frame needs geometry for the area computation and the GeoJSON export.
finalDf = all_match3[list(built.columns) + ['units']]

In [160]:
# Developed parcels that no permit source has matched so far.
matched_lots = finalDf['MapBlkLot_Master']
cantID = built[~built['MapBlkLot_Master'].isin(matched_lots)]

In [161]:
cantID.shape

(31, 23)

In [171]:
cantID

Unnamed: 0,MapBlkLot_Master,mapblklot,blklot,active,geometry,year,Historic,Residential_Dummy,Zillow_Price_Real,Const_FedReserve_Real,...,zp_DensRestMulti,zp_FormBasedMulti,zp_PDRInd,zp_Public,zp_Redev,zp_RH2,zp_RH3_RM1,Developed,CANTID_blklot_backup,CANTID_geometry_backup
13110,776044,776044,776044,F,"POLYGON ((-122.43625 37.77816, -122.43636 37.7...",2007.0,0.0,1.0,93.227099,92.120253,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,
13204,785051,785051,785051,F,"POLYGON ((-122.42297 37.77923, -122.42290 37.7...",2007.0,0.0,0.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
13206,785053,785053,785053,T,"POLYGON ((-122.42244 37.77929, -122.42250 37.7...",2007.0,0.0,0.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
14562,855053,855053,855091,T,"POLYGON ((-122.42321 37.77227, -122.42336 37.7...",2007.0,0.0,0.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
19879,1100129,1100129,1100129,T,"POLYGON ((-122.43900 37.78169, -122.43901 37.7...",2007.0,0.0,0.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
84351,3508040,3508040,3508050,T,"POLYGON ((-122.41525 37.77595, -122.41555 37.7...",2007.0,0.0,1.0,93.227099,92.120253,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
85543,3542062,3542062,3542062,T,"POLYGON ((-122.43033 37.76664, -122.43045 37.7...",2007.0,1.0,0.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
86779,3568092,3568092,3568092,F,"POLYGON ((-122.42267 37.76404, -122.42268 37.7...",2007.0,0.0,1.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
88073,3589120,3589120,3589133,T,"POLYGON ((-122.42003 37.76147, -122.42005 37.7...",2007.0,0.0,0.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
89798,3616083,3616083,3616083,T,"POLYGON ((-122.41897 37.75627, -122.41896 37.7...",2007.0,1.0,0.0,93.227099,92.120253,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,


In [163]:
cantID[cantID.blklot.isin(rhna_permits[rhna_permits.units > 0].blocklot)].shape

(0, 23)

In [172]:
cantID.CANTID_blklot_backup.notna().sum()

0

In [174]:
built.CANTID_geometry_backup.notna().sum()


0

#### Can I identify missing permits by geo? Only one more.

In [166]:
cantID.crs == rhna_permits.crs

True

In [167]:
gpd.sjoin(cantID, rhna_permits).MapBlkLot_Master.unique()

array(['4044031'], dtype=object)

#### Do these unidentified parcels have diff geometries I can try in Parcels?

No. I tried looking at parcels where mapblocklot matched multiple rows in AllParcels, and none of those are the parcels that were developed 2007-2015.

In [190]:
# Raw DBI permits that add units, have one of the selected permit types, and
# sit on a block-lot belonging to a still-unmatched parcel.
adds_units_dbi = dbi['units'] > 0
selected_type = dbi['permit_type'].isin([1, 2, 3, 8])
on_unmatched_lot = dbi['blocklot'].isin(cantID['MapBlkLot_Master'])
remaining = dbi[adds_units_dbi & selected_type & on_unmatched_lot]

In [213]:
remaining.blocklot.nunique()

9

In [248]:
# Median units per block-lot among the leftover DBI permits, attached to the
# unmatched parcels by master block-lot id.
dbi_units_remain = remaining.groupby('blocklot', sort=False)['units'].median()

built_poisson_remainder = cantID.merge(
    dbi_units_remain, left_on='MapBlkLot_Master', right_on='blocklot'
)
built_poisson_remainder['MapBlkLot_Master'].nunique()

9

In [249]:
# Restrict to the parcel columns plus `units`.
# Columns are taken from `built` rather than `trainDf` so that `geometry` (and
# the other geospatial parcel columns) survive; `trainDf` is the CSV-derived
# table and has no geometry column.
built_poisson_remainder = built_poisson_remainder[list(built.columns) + ['units']]

In [256]:
# Append the parcels recovered from the raw DBI permits to the earlier matches.
all_match_last = pd.concat([finalDf, built_poisson_remainder])

In [257]:
all_match_last.shape

(285, 24)

In [258]:
# Same unit / permit-type filter on the raw DBI permits, without the
# block-lot restriction, for a spatial match.
has_new_units = dbi['units'] > 0
has_selected_type = dbi['permit_type'].isin([1, 2, 3, 8])
remaining2 = dbi[has_new_units & has_selected_type]

In [259]:
# Parcels still unmatched after the block-lot join on raw DBI permits.
recovered_lots = built_poisson_remainder['MapBlkLot_Master']
cantID2 = cantID[~cantID['MapBlkLot_Master'].isin(recovered_lots)].copy()

In [260]:
# Last resort: spatially match the leftover parcels to any qualifying DBI permit they contain.
lastgeo = cantID2.sjoin(remaining2, how='inner', predicate='contains')

In [261]:
# One row per parcel: keep the first permit matched to each master block-lot id.
repeated_lot = lastgeo['MapBlkLot_Master'].duplicated(keep='first')
lastgeoUq = lastgeo[~repeated_lot]

In [262]:
# Restrict to the parcel columns plus `units`.
# Columns are taken from `built` rather than `trainDf` so that `geometry` is
# retained; `trainDf` is the CSV-derived table and has no geometry column.
lastgeoUq = lastgeoUq[list(built.columns) + ['units']]

In [264]:
# Final table: all earlier matches plus the parcels recovered by the last spatial join.
fdf = pd.concat([all_match_last, lastgeoUq])

In [265]:
fdf.shape

(290, 24)

In [275]:
# Parcel area, measured after reprojecting to EPSG:5070.
# NOTE(review): presumably chosen as an equal-area projected CRS so that areas
# are not computed in degrees -- confirm the CRS choice and resulting units.
fdf['area'] = fdf.to_crs(5070).geometry.area

In [279]:
# Persist the matched developed parcels; the path is relative to the
# notebook's working directory.
fdf.to_file('clean_built_data.geojson')

In [282]:
trainDf.MapBlkLot_Master.isin(fdf.MapBlkLot_Master).sum()

580