In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import contextily as ctx
from shapely.wkt import load

import clean_utils
from importlib import reload


In [14]:
# Re-execute the already-imported clean_utils module so that edits made to
# clean_utils.py take effect without restarting the kernel.
reload(clean_utils)

<module 'clean_utils' from 'C:\\Users\\sadamerdji\\Desktop\\dissertation\\clean_utils.py'>

### Load Data

In [6]:
# Yearly parcel records (columns include MapBlkLot_Master, year and Developed);
# get_basic_df slices this frame by year and aggregates Developed per parcel.
parcels = pd.read_csv('./data/Blue Sky Code and Inputs/SF_Logistic_Data.csv')

In [7]:
# Parcel shapefile; handed to clean_utils.merge_tax as `parcels=` further down.
allParcels = gpd.read_file('./data/Parcels   Active and Retired/parcels.shp')

In [8]:
# Site inventory shapefile; consumed by clean_utils.get_site_inventory_feature below.
sites = gpd.read_file('./data/site_inventory/xn--Bay_Area_Housing_Opportunity_Sites_Inventory__20072023_-it38a.shp')

In [9]:
# 2007-2008 secured roll workbook (no sheet_name given, so pandas reads the
# first sheet); merged into the cycle-4 frame by clean_utils.merge_tax below.
tax = pd.read_excel('./data/tax_assessor/2019.8.20__SF_ASR_Secured_Roll_Data_2007-2008.xlsx')

In [10]:
# 2015-2016 secured roll workbook (no sheet_name given, so pandas reads the
# first sheet); merged into the cycle-5 frame by clean_utils.merge_tax below.
tax15 = pd.read_excel('./data/tax_assessor/2020.7.10_SF_ASR_Secured_Roll_Data_2015-2016.xlsx')

In [11]:
# These codes are the same in both 2007-2008 and 2015-2016, so only the
# 2007-2008 workbook is read. Passing a list of sheet names makes pandas open
# the workbook once and return a dict of DataFrames keyed by sheet name,
# instead of re-opening the same file for every sheet.
code_sheets = pd.read_excel('./data/tax_assessor/2019.8.20__SF_ASR_Secured_Roll_Data_2007-2008.xlsx',
                            sheet_name=['Class & Use Code', 'Neigborhood Code'])
use_codes = code_sheets['Class & Use Code']
# 'Neigborhood Code' is the sheet name used to address the workbook; it is a
# lookup key, so its spelling must not be "corrected".
neighborhood_codes = code_sheets['Neigborhood Code']

### Geospatial version of bluesky data

In [12]:
def get_basic_df(cycle):
    """Build the per-parcel base frame for one cycle from the global ``parcels``.

    The cycle's window is the eight years starting in 2007 when ``cycle == 4``;
    every other value of ``cycle`` falls through to the window starting in 2015.

    Feature columns come from the rows of the window's first year. The
    ``Developed`` column is replaced by, for each ``MapBlkLot_Master``, the
    number of rows inside the window whose ``Developed`` value is non-zero.
    The merge is an inner join, so only parcels that have a first-year row
    are kept.

    Parameters
    ----------
    cycle : int
        Cycle number; also forwarded unchanged to
        ``clean_utils.transform_bluesky_to_geospatial``.

    Returns
    -------
    The object returned by ``clean_utils.transform_bluesky_to_geospatial``
    for the merged frame.
    """
    first_year = 2007 if cycle == 4 else 2015
    end_year_exclusive = first_year + 8

    in_window = (parcels.year >= first_year) & (parcels.year < end_year_exclusive)
    window_rows = parcels[in_window]

    # Target: count of window rows per parcel with a non-zero Developed flag.
    developed_counts = window_rows.groupby('MapBlkLot_Master')['Developed'].agg(
        lambda values: values.ne(0).sum()
    )

    # Features: first-year rows only, without their own Developed column so
    # the aggregated one can take its place.
    first_year_rows = window_rows[window_rows.year == first_year]
    features = first_year_rows.drop('Developed', axis=1)

    combined = pd.merge(features, developed_counts,
                        left_on='MapBlkLot_Master', right_index=True)
    return clean_utils.transform_bluesky_to_geospatial(combined, cycle)

In [15]:
# Base frame for cycle 4 (get_basic_df uses the window starting in 2007).
df4 = get_basic_df(4)

In [16]:
# Base frame for cycle 5 (get_basic_df uses the window starting in 2015).
df5 = get_basic_df(5)

In [17]:
# Sanity check: total of the aggregated Developed column in the cycle-5 frame.
df5.Developed.sum()

88

In [18]:
# Sanity check: total of the aggregated Developed column in the cycle-4 frame.
df4.Developed.sum()

253

#### Get DBI pipeline permits from the previous cycle

In [24]:
# Cycle-3 pipeline permits; merged into the cycle-4 frame (df4) below.
pipeline3 = clean_utils.get_pipeline_permits(cycle=3)

In [25]:
# Cycle-4 pipeline permits; merged into the cycle-5 frame (df5) below.
pipeline4 = clean_utils.get_pipeline_permits(cycle=4)

### Merge pipeline permits into bluesky data

In [26]:
def merge_pipeline(df, pipeline):
    """Left-join ``pipeline`` onto ``df`` and zero-fill the joined columns.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``blklot`` column used as the join key.
    pipeline : DataFrame
        Frame whose index holds the values matched against ``df['blklot']``.

    Returns
    -------
    DataFrame
        A new frame with every row of ``df`` plus the columns of
        ``pipeline``; rows with no match get 0 in those columns. ``df``
        itself is not modified.
    """
    permit_cols = pipeline.columns
    merged = pd.merge(df, pipeline, how='left',
                      left_on='blklot', right_index=True)
    # Unmatched rows come back as NaN from the left join; treat them as 0.
    merged[permit_cols] = merged[permit_cols].fillna(0)
    return merged

In [27]:
# Attach the cycle-3 pipeline permit columns to the cycle-4 frame.
# df4 is rebound here, so re-running this cell without first re-creating df4
# merges the pipeline columns a second time.
df4 = merge_pipeline(df4, pipeline3)

In [28]:
# Attach the cycle-4 pipeline permit columns to the cycle-5 frame.
# df5 is rebound here, so re-running this cell without first re-creating df5
# merges the pipeline columns a second time.
df5 = merge_pipeline(df5, pipeline4)

### Merge Tax Data

In [29]:
# Combine the cycle-4 frame with the 2007-2008 secured roll.
full_df4 = clean_utils.merge_tax(df4, tax, cycle=4, parcels=allParcels)

In [30]:
# Combine the cycle-5 frame with the 2015-2016 secured roll.
full_df5 = clean_utils.merge_tax(df5, tax15, cycle=5, parcels=allParcels)

In [34]:
# Show the columns whose dtype differs between the two frames, reported with
# the dtype they have in full_df5. The element-wise comparison relies on both
# frames having identically labelled columns.
full_df5.dtypes[~(full_df4.dtypes == full_df5.dtypes)]

ROOMS       object
STOREYNO    object
FBA         object
dtype: object

In [65]:
# ROOMS, STOREYNO and FBA are object-typed in full_df5 only (see the dtype
# comparison above). Convert each to numbers; values that cannot be parsed
# become NaN because of errors='coerce'.
for numeric_col in ('ROOMS', 'STOREYNO', 'FBA'):
    full_df5[numeric_col] = pd.to_numeric(full_df5[numeric_col],
                                          errors='coerce',
                                          downcast='integer')

In [66]:
# Keep only the rows where FBA, STOREYNO and ROOMS are all non-null, i.e. drop
# every row in which any of the three failed the numeric coercion above (or
# was missing to begin with). This filter is applied to full_df5 only.
#
# .copy() makes the result an independent frame: later cells assign new
# columns to full_df5, and doing that on the bare result of a boolean-mask
# selection raises SettingWithCopyWarning on pandas without copy-on-write.
full_df5 = full_df5[full_df5.FBA.notna() &
                    full_df5.STOREYNO.notna() &
                    full_df5.ROOMS.notna()].copy()

### Feature Engineering

#### Use Codes

In [68]:
# Remove rows in which every cell is missing.
use_codes = use_codes.dropna(how='all')

In [69]:
# Preview the USE / CLASS code table after the blank rows were removed.
use_codes.head()

Unnamed: 0,USE,DESC,CLASS,DESC.1
0,SRES,Single Family Residential,CO,Coop Units Unsegregated
1,SRES,Single Family Residential,COS,Coop Units Segregated
2,SRES,Single Family Residential,D,Dwelling
3,SRES,Single Family Residential,DBM,Dwelling BMR
4,SRES,Single Family Residential,LZ,Live/Work Condominium


In [70]:
# Discard rows whose CLASS cell holds the literal text 'CLASS'
# (presumably header rows repeated inside the sheet - TODO confirm).
use_codes = use_codes.loc[use_codes['CLASS'].ne('CLASS')]
# Build a CLASS -> USE dict. When a CLASS appears on several rows, the USE of
# the last such row wins.
use_lookup = {
    class_code: uses.tolist()[-1]
    for class_code, uses in use_codes.groupby('CLASS')['USE']
}

In [71]:
# Translate each parcel's class code (RP1CLACDE) into its general use code.
# Class codes that are not keys of use_lookup map to NaN.
for frame in (full_df4, full_df5):
    frame['general_use_code'] = frame['RP1CLACDE'].map(use_lookup)

#### New features

In [72]:
# Flag parcels whose class code ends in 'BM' (e.g. 'DBM', listed as
# "Dwelling BMR" in the use-code table).
# NOTE(review): rows with a missing RP1CLACDE get NaN here rather than False -
# confirm that downstream code expects that.
for frame in (full_df4, full_df5):
    frame['hasBMR'] = frame['RP1CLACDE'].str.endswith('BM')

#### Neighborhood Code

In [73]:
# Normalise neighborhood names: trim surrounding whitespace, lower-case, and
# turn every space into an underscore.
neighborhood_codes['NEIGHBORHOOD'] = (
    neighborhood_codes['NEIGHBORHOOD']
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

In [74]:
# Preview the neighborhood code table with the normalised names.
neighborhood_codes.head()

Unnamed: 0,DISTRICT,CODE,NEIGHBORHOOD,BOUNDRIES
0,1,1A,central_richmond,"South of California, Park Presidio, south of F..."
1,1,1B,inner_richmond,"South of California, Arguello, south of Fulton..."
2,1,1C,jordan_park/laurel_heights,"California, west of Presidio, Geary, Arguello"
3,1,1D,lake_--the_presidio,"West and south of Presidio Terrace, Arguello, ..."
4,1,1E,outer_richmond,"The Ocean, west of 32nd Avenue, south of Fulton"


In [75]:
# CODE -> NEIGHBORHOOD dict, taking the first name listed for each code.
# Codes whose length is not exactly 3 get a single '0' prepended so that, for
# example, '1A' is keyed as '01A' (the form RP1NBRCDE uses, e.g. '08H').
neighborhoods = {
    (code if len(code) == 3 else '0' + code): names.tolist()[0]
    for code, names in neighborhood_codes.groupby('CODE')['NEIGHBORHOOD']
}

In [76]:
def lookup_neighborhood(x):
    """Translate a neighborhood code into its name.

    ``x`` is looked up in the module-level ``neighborhoods`` dict. A value
    that has no entry there is returned unchanged.
    """
    if x in neighborhoods:
        return neighborhoods[x]
    return x

In [77]:
# Add the neighborhood name for each parcel's RP1NBRCDE; codes without an
# entry in `neighborhoods` are carried over as-is by lookup_neighborhood.
for frame in (full_df4, full_df5):
    frame['neighborhood'] = frame['RP1NBRCDE'].apply(lookup_neighborhood)

In [78]:
# CODE -> DISTRICT dict, taking the first district listed for each code.
# Codes whose length is not exactly 3 get a single '0' prepended, matching the
# keys used for the neighborhood lookup.
districts = {
    (code if len(code) == 3 else '0' + code): district_values.tolist()[0]
    for code, district_values in neighborhood_codes.groupby('CODE')['DISTRICT']
}

In [79]:
def lookup_district(x):
    """Return the label ``'district<value>'`` for a neighborhood code.

    ``<value>`` is the entry for ``x`` in the module-level ``districts`` dict,
    or ``x`` itself when there is no entry; either way it is converted with
    ``str`` before being appended to ``'district'``.
    """
    resolved = districts.get(x, x)
    return f'district{resolved!s}'

In [80]:
# Add the 'district<...>' label derived from each parcel's RP1NBRCDE.
for frame in (full_df4, full_df5):
    frame['district'] = frame['RP1NBRCDE'].apply(lookup_district)

In [81]:
# The raw neighborhood code has been expanded into `neighborhood` and
# `district`, so remove it from both frames.
full_df4 = full_df4.drop(columns='RP1NBRCDE')
full_df5 = full_df5.drop(columns='RP1NBRCDE')

### Treat inclusion in site inventory as a feature

In [82]:
# Add the site-inventory feature to each frame using that frame's own cycle.
# The cycle-5 call previously passed cycle=4, which looks like a copy-paste
# slip: every other paired call in this notebook passes 4 for full_df4 and 5
# for full_df5.
# NOTE(review): if the cycle-4 inventory was intended for full_df5 as a
# lagged feature, revert this to cycle=4 and say so here.
full_df4 = clean_utils.get_site_inventory_feature(full_df4, sites, cycle=4)
full_df5 = clean_utils.get_site_inventory_feature(full_df5, sites, cycle=5)

### Drop constant columns

In [83]:
# Remove every column whose name contains 'CANTID' from both frames.
cantid_cols4 = [name for name in full_df4.columns if 'CANTID' in name]
cantid_cols5 = [name for name in full_df5.columns if 'CANTID' in name]
full_df4 = full_df4.drop(columns=cantid_cols4)
full_df5 = full_df5.drop(columns=cantid_cols5)

In [84]:
# Remove columns with at most one distinct value. nunique() does not count
# NaN by default, so all-NaN columns and "one value plus NaN" columns are
# removed as well.
# NOTE(review): constancy is judged per frame, so the two frames can end up
# with different column sets - confirm that is acceptable downstream.
constant_cols4 = full_df4.columns[full_df4.nunique() <= 1]
constant_cols5 = full_df5.columns[full_df5.nunique() <= 1]
full_df4 = full_df4.drop(columns=constant_cols4)
full_df5 = full_df5.drop(columns=constant_cols5)

In [85]:
# Persist both cleaned frames to the working directory. No `index=` argument
# is given, so the frame index is written as the first CSV column.
full_df4.to_csv('cleaned_rhna4_data.csv')
full_df5.to_csv('cleaned_rhna5_data.csv')

In [88]:
# Ad-hoc inspection of the raw parcel frame (pandas truncates the display).
parcels

Unnamed: 0,MapBlkLot_Master,year,Developed,Historic,Residential_Dummy,Zillow_Price_Real,Const_FedReserve_Real,Envelope_1000,Upzone_Ratio,zp_OfficeComm,zp_DensRestMulti,zp_FormBasedMulti,zp_PDRInd,zp_Public,zp_Redev,zp_RH2,zp_RH3_RM1
0,0001001,2001,0,0,0,66.843479,74.025851,42.497730,2.000000,0,0,0,0,1,0,0,0
1,0005001,2001,0,1,0,66.843479,74.025851,86.389615,1.666667,1,0,0,0,0,0,0,0
2,0011008,2001,0,0,0,66.843479,74.025851,72.496845,0.365177,1,0,0,0,0,0,0,0
3,0004002,2001,0,0,0,66.843479,74.025851,39.108627,1.666667,1,0,0,0,0,0,0,0
4,0012003A,2001,0,0,0,66.843479,74.025851,53.853557,1.666667,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450913,VACSTWIL,2016,0,0,0,100.000000,100.000000,56.306155,5.200000,0,1,0,0,0,0,0,0
2450914,VACSTMIN,2016,0,0,0,100.000000,100.000000,3.405669,1.920000,1,0,0,0,0,0,0,0
2450915,VACST22S,2016,0,0,0,100.000000,100.000000,2.000000,0.315050,0,0,0,0,1,0,1,0
2450916,9900502,2016,0,1,0,100.000000,100.000000,763.745552,1.666667,1,0,0,0,1,0,0,0


In [89]:
# Ad-hoc inspection of the 2007-2008 secured roll (pandas truncates the display).
tax

Unnamed: 0,PROPLOC,RP1NBRCDE,RP1PRCLID,RP1VOLUME,RP1CLACDE,YRBLT,BATHS,BEDS,ROOMS,STOREYNO,...,RP1STACDE,RP1EXMVL2,RP1EXMVL1,ROLLYEAR,RECURRSALD,RP1FXTVAL,RP1IMPVAL,RP1LNDVAL,RP1PPTVAL,MapBlkLot_Master
0,0000 0000 0000,08H,0001 001,1,G,1900,0,0,0,0,...,N,0,0,7,0,0,0,0,0,0001001
1,0000 0000 0000,08H,0002 001,1,G,1900,0,0,0,0,...,N,0,0,7,0,0,0,0,0,0002001
2,0000 0000 0000,08H,0004 002,1,V,0,0,0,0,0,...,N,0,0,7,0,0,0,0,0,0004002
3,0000 0000 0000,08H,0005 001,1,G,1900,0,0,0,0,...,N,0,0,7,0,0,0,0,0,0005001
4,0000 0000 0000,08H,0006 001,1,G,1900,0,0,0,0,...,N,0,0,7,0,0,0,0,0,0006001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197773,0000 0000 0000,,9999 163,45,,0,0,0,0,0,...,,0,0,7,80101,0,0,0,0,9999163
197774,0000 0000 0000,,9999 164,45,,0,0,0,0,0,...,,0,0,7,80101,0,0,0,0,9999164
197775,0000 0000 0000,,9999 165,45,,0,0,0,0,0,...,,0,0,7,80101,0,0,0,0,9999165
197776,0000 0000 0000,,9999 166,45,,0,0,0,0,0,...,,0,0,7,80101,0,0,0,0,9999166


In [91]:
# Load the DBI permit data through clean_utils for exploration.
dbi = clean_utils.get_dbi_data()

In [94]:
# Frequency of each permit type in the DBI data, most common first.
dbi.permit_type_definition.value_counts()

otc alterations permit                 891927
additions alterations or repairs       266538
sign - erect                            19410
new construction wood frame             12928
demolitions                              7129
wall or painted sign                     3956
new construction                         2420
grade or quarry or fill or excavate       819
Name: permit_type_definition, dtype: int64