In [217]:
import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import contextily as ctx
from shapely.wkt import load

import clean_utils
from importlib import reload

In [313]:
# Re-execute the clean_utils module so that edits made to it on disk are
# picked up without restarting the kernel.
reload(clean_utils)

<module 'clean_utils' from 'C:\\Users\\sadamerdji\\Desktop\\dissertation\\clean_utils.py'>

### Load Data

In [2]:
# Load the SF planning permitting GeoJSON as a GeoDataFrame.
# `low_memory` is a pandas.read_csv option; geopandas.read_file does not define
# it and would only forward it to the underlying I/O engine, so it is omitted.
plan_permits = gpd.read_file('./data/SF_Planning_Permitting_Data.geojson')

In [3]:
# Run the project's date-cleaning helper on the permit table.
# NOTE(review): the return value is discarded, so this presumably mutates
# plan_permits in place — confirm in clean_utils.clean_dates.
clean_utils.clean_dates(plan_permits)

In [4]:
# Parcel-by-year table. get_basic_df reads its `year`, `MapBlkLot_Master`
# and `Developed` columns through this module-level name.
parcels = pd.read_csv('./data/Blue Sky Code and Inputs/SF_Logistic_Data.csv')

In [5]:
# Parcel shapefile; later handed to clean_utils.merge_tax as its `parcels` argument.
allParcels = gpd.read_file('./data/Parcels   Active and Retired/parcels.shp')

In [6]:
# Housing opportunity site inventory shapefile; later passed to
# clean_utils.get_site_inventory_feature.
sites = gpd.read_file('./data/site_inventory/xn--Bay_Area_Housing_Opportunity_Sites_Inventory__20072023_-it38a.shp')

In [107]:
# 2007-2008 secured roll workbook (default sheet); merged into the cycle-4
# frame by clean_utils.merge_tax further down.
tax = pd.read_excel('./data/tax_assessor/2019.8.20__SF_ASR_Secured_Roll_Data_2007-2008.xlsx')

In [269]:
# 2015-2016 secured roll workbook (default sheet); merged into the cycle-5
# frame by clean_utils.merge_tax further down.
tax15 = pd.read_excel('./data/tax_assessor/2020.7.10_SF_ASR_Secured_Roll_Data_2015-2016.xlsx')

In [285]:
# Code lookup sheets stored inside the 2007-2008 assessor workbook.
# These codes are the same in both 2007-2008 and 2015-2016
# Both sheets are requested in one read_excel call so the workbook is opened
# and parsed once; passing a list as sheet_name returns a dict keyed by sheet
# name. The sheet names (including the 'Neigborhood' spelling) must match the
# workbook exactly.
_code_sheets = pd.read_excel('./data/tax_assessor/2019.8.20__SF_ASR_Secured_Roll_Data_2007-2008.xlsx',
                             sheet_name=['Class & Use Code', 'Neigborhood Code'])
use_codes = _code_sheets['Class & Use Code']
neighborhood_codes = _code_sheets['Neigborhood Code']

### Geospatial version of bluesky data

In [208]:
def get_basic_df(cycle):
    """Build the one-row-per-parcel frame for a planning cycle.

    Reads the module-level ``parcels`` table, keeps the eight years starting
    at the cycle's first year, and combines:

    * the start-year row of every parcel (all columns except ``Developed``), and
    * a ``Developed`` count: the number of years in the window in which the
      parcel's ``Developed`` value is non-zero.

    The merge is an inner join on ``MapBlkLot_Master``, so parcels without a
    start-year row are dropped. The merged frame is then passed to
    ``clean_utils.transform_bluesky_to_geospatial`` together with ``cycle``.

    Parameters
    ----------
    cycle : int
        4 (window starts in 2007) or 5 (window starts in 2015).

    Returns
    -------
    Whatever ``clean_utils.transform_bluesky_to_geospatial`` returns.

    Raises
    ------
    ValueError
        If ``cycle`` is not 4 or 5. Previously any other value was silently
        treated as cycle 5.
    """
    cycle_start_years = {4: 2007, 5: 2015}
    if cycle not in cycle_start_years:
        raise ValueError(
            f"Unsupported cycle {cycle!r}; expected one of {sorted(cycle_start_years)}"
        )
    start_year = cycle_start_years[cycle]
    window_years = 8  # length of the observation window, in years

    in_window = (parcels.year >= start_year) & (parcels.year < (start_year + window_years))
    trainParcels = parcels[in_window]

    # Target: number of years in the window with a non-zero Developed value.
    trainY = trainParcels.groupby('MapBlkLot_Master')['Developed'].agg(lambda x: x.ne(0).sum())

    # Features: the parcel's attributes as of the first year of the window.
    trainX = trainParcels[trainParcels.year == start_year]

    trainDf = pd.merge(trainX.drop('Developed', axis=1), trainY,
                       left_on='MapBlkLot_Master', right_index=True)
    return clean_utils.transform_bluesky_to_geospatial(trainDf, cycle)

In [297]:
# Cycle-4 frame (get_basic_df uses a 2007 start year for cycle 4).
df4 = get_basic_df(4)

In [298]:
# Cycle-5 frame (get_basic_df uses a 2015 start year for cycle 5).
df5 = get_basic_df(5)

In [301]:
# Sanity check: total of the per-parcel Developed values in the cycle-5 frame.
df5.Developed.sum()

88

In [300]:
# Sanity check: total of the per-parcel Developed values in the cycle-4 frame.
df4.Developed.sum()

253

#### Get dbi permits from past cycle

In [231]:
# Pipeline permit table for cycle 3, later merged into the cycle-4 frame.
# NOTE(review): `dbi` is not assigned in any cell visible in this notebook, so
# on a fresh kernel this line raises NameError unless it is loaded elsewhere.
pipeline3 = clean_utils.get_pipeline_permits(cycle=3, dbi=dbi)

In [233]:
# Pipeline permit table for cycle 4, later merged into the cycle-5 frame.
# NOTE(review): `dbi` is not assigned in any cell visible in this notebook, so
# on a fresh kernel this line raises NameError unless it is loaded elsewhere.
pipeline4 = clean_utils.get_pipeline_permits(cycle=4, dbi=dbi)

### Merge into bluesky data

In [237]:
def merge_pipeline(df, pipeline):
    """Left-join the pipeline columns onto a parcel frame.

    ``df['blklot']`` is matched against the index of ``pipeline``. Every row of
    ``df`` is kept; rows with no match in ``pipeline`` get 0 in each of the
    pipeline columns. ``df`` itself is not modified.
    """
    pipeline_cols = pipeline.columns
    merged = pd.merge(
        df,
        pipeline,
        how='left',
        left_on='blklot',
        right_index=True,
    )
    # Unmatched parcels come back as NaN in the pipeline columns; store 0 instead.
    merged[pipeline_cols] = merged[pipeline_cols].fillna(0)
    return merged

In [302]:
# Attach the cycle-3 pipeline columns to the cycle-4 frame.
# NOTE(review): df4 is rebound to the merged result, so re-running this cell
# merges the same pipeline columns a second time.
df4 = merge_pipeline(df4, pipeline3)

In [303]:
# Attach the cycle-4 pipeline columns to the cycle-5 frame.
# NOTE(review): df5 is rebound to the merged result, so re-running this cell
# merges the same pipeline columns a second time.
df5 = merge_pipeline(df5, pipeline4)

### Merge Tax Data

In [304]:
# Join the 2007-2008 roll (`tax`) onto the cycle-4 frame; the parcel shapefile
# is supplied to the helper as `parcels`.
full_df4 = clean_utils.merge_tax(df4, tax, cycle=4, parcels=allParcels)

In [314]:
# Join the 2015-2016 roll (`tax15`) onto the cycle-5 frame; the parcel shapefile
# is supplied to the helper as `parcels`.
full_df5 = clean_utils.merge_tax(df5, tax15, cycle=5, parcels=allParcels)

### Feature Engineering

#### Use Codes

In [166]:
# Keep only the rows that carry at least one non-missing cell, i.e. discard
# rows that are empty across every column.
_has_any_value = use_codes.notna().any(axis=1)
use_codes = use_codes[_has_any_value]

In [167]:
# Preview the first rows of the use-code table.
use_codes.head()

Unnamed: 0,USE,DESC,CLASS,DESC.1
0,SRES,Single Family Residential,CO,Coop Units Unsegregated
1,SRES,Single Family Residential,COS,Coop Units Segregated
2,SRES,Single Family Residential,D,Dwelling
3,SRES,Single Family Residential,DBM,Dwelling BMR
4,SRES,Single Family Residential,LZ,Live/Work Condominium


In [173]:
# Discard rows whose CLASS cell is the literal string 'CLASS'
# (presumably header rows repeated inside the sheet).
use_codes = use_codes.loc[use_codes['CLASS'].ne('CLASS')]

# Map each CLASS code to a single USE code. When a class is listed under
# several USE values, the one on the last row for that class is kept.
use_lookup = {
    class_code: use_values.iloc[-1]
    for class_code, use_values in use_codes.groupby('CLASS')['USE']
}

In [318]:
# Translate each parcel's class code (RP1CLACDE) into its general use code;
# class codes missing from use_lookup map to NaN.
for _frame in (full_df4, full_df5):
    _frame['general_use_code'] = _frame['RP1CLACDE'].map(use_lookup)

#### New features

In [319]:
# Flag parcels whose class code ends in 'BM' (e.g. 'DBM', listed in the use-code
# table as "Dwelling BMR").
# NOTE(review): rows with a missing RP1CLACDE get NaN here rather than False.
for _frame in (full_df4, full_df5):
    _frame['hasBMR'] = _frame['RP1CLACDE'].str.endswith('BM')

#### Neighborhood Code

In [320]:
# Normalise neighbourhood names into identifier-like labels:
# trim surrounding whitespace, lower-case, and turn every space into '_'.
neighborhood_codes['NEIGHBORHOOD'] = (
    neighborhood_codes['NEIGHBORHOOD']
    .str.strip()
    .str.lower()
    .str.split(' ')
    .str.join('_')
)

In [321]:
# Preview the neighbourhood table after the name normalisation.
neighborhood_codes.head()

Unnamed: 0,DISTRICT,CODE,NEIGHBORHOOD,BOUNDRIES
0,1,1A,central_richmond,"South of California, Park Presidio, south of F..."
1,1,1B,inner_richmond,"South of California, Arguello, south of Fulton..."
2,1,1C,jordan_park/laurel_heights,"California, west of Presidio, Geary, Arguello"
3,1,1D,lake_--the_presidio,"West and south of Presidio Terrace, Arguello, ..."
4,1,1E,outer_richmond,"The Ocean, west of 32nd Avenue, south of Fulton"


In [322]:
# Lookup from neighbourhood code to neighbourhood name. For each CODE the first
# listed name is used. Codes that are not exactly three characters long get a
# single '0' prepended (e.g. '1A' -> '01A').
neighborhoods = {
    (code if len(code) == 3 else '0' + code): names[0]
    for code, names in neighborhood_codes.groupby('CODE')['NEIGHBORHOOD'].agg(list).items()
}

In [323]:
def lookup_neighborhood(x):
    """Return the neighbourhood name for code ``x``.

    Looks ``x`` up in the module-level ``neighborhoods`` dict; when the code is
    not present, ``x`` itself is returned unchanged.
    """
    if x in neighborhoods:
        return neighborhoods[x]
    return x

In [324]:
# Add a `neighborhood` column by translating each parcel's RP1NBRCDE;
# unrecognised codes are kept as-is by lookup_neighborhood.
for _frame in (full_df4, full_df5):
    _frame['neighborhood'] = _frame['RP1NBRCDE'].apply(lookup_neighborhood)

In [325]:
# Lookup from neighbourhood code to district number. For each CODE the first
# listed DISTRICT is used. Codes that are not exactly three characters long get
# a single '0' prepended (e.g. '1A' -> '01A').
districts = {
    (code if len(code) == 3 else '0' + code): district_values[0]
    for code, district_values in neighborhood_codes.groupby('CODE')['DISTRICT'].agg(list).items()
}

In [326]:
def lookup_district(x):
    """Return the label ``'district<N>'`` for neighbourhood code ``x``.

    ``<N>`` is the value stored for ``x`` in the module-level ``districts``
    dict. When ``x`` is not a known code, ``x`` itself is stringified and
    appended instead (so an unknown code ``'99Z'`` yields ``'district99Z'``).
    """
    district = districts[x] if x in districts else x
    return f'district{district!s}'

In [327]:
# Add a `district` column (e.g. 'district8') derived from each parcel's RP1NBRCDE.
for _frame in (full_df4, full_df5):
    _frame['district'] = _frame['RP1NBRCDE'].apply(lookup_district)

In [328]:
# The raw neighbourhood code has been expanded into `neighborhood` and
# `district`, so the original column is removed from both frames.
full_df4 = full_df4.drop(columns='RP1NBRCDE')
full_df5 = full_df5.drop(columns='RP1NBRCDE')

### Treat inclusion in site inventory as a feature

In [329]:
# Add the site-inventory feature to each frame, using the inventory of the
# frame's own cycle. The cycle-5 frame must be built with cycle=5; passing
# cycle=4 for both frames would give full_df5 the cycle-4 inventory flag.
# NOTE(review): assumes get_site_inventory_feature accepts cycle=5 like the
# other clean_utils helpers used in this notebook — confirm.
full_df4 = clean_utils.get_site_inventory_feature(full_df4, sites, cycle=4)
full_df5 = clean_utils.get_site_inventory_feature(full_df5, sites, cycle=5)

### Drop constant columns

In [332]:
# Remove every column whose name contains 'CANTID'.
full_df4 = full_df4.drop(columns=[col for col in full_df4.columns if 'CANTID' in col])
full_df5 = full_df5.drop(columns=[col for col in full_df5.columns if 'CANTID' in col])

In [333]:
# Remove columns that hold at most one distinct value.
# NOTE(review): nunique() ignores NaN by default, so a column containing one
# value plus missing entries is also treated as constant and dropped.
full_df4 = full_df4.drop(columns=full_df4.columns[full_df4.nunique().le(1)])
full_df5 = full_df5.drop(columns=full_df5.columns[full_df5.nunique().le(1)])

In [335]:
# Persist both cleaned frames as CSV files in the working directory
# (the index is written as the first column, as with a plain to_csv call).
for _out_path, _frame in (
    ('cleaned_rhna4_data.csv', full_df4),
    ('cleaned_rhna5_data.csv', full_df5),
):
    _frame.to_csv(_out_path)

In [342]:
# Show the row(s) with the largest RP1LNDVAL, displaying only the columns from
# position 20 onward. Series.max() is used instead of the builtin max(): it is
# vectorised and skips NaN, whereas builtin max() over a Series gives an
# order-dependent result when missing values are present.
full_df4[full_df4.RP1LNDVAL == full_df4.RP1LNDVAL.max()].iloc[:, 20:]

Unnamed: 0,pipeline2,pipeline3,pipeline4,pipeline5,pipeline6,pipeline7,pipeline8,PROPLOC,RP1PRCLID,RP1VOLUME,...,RP1FXTVAL,RP1IMPVAL,RP1LNDVAL,RP1PPTVAL,area,general_use_code,hasBMR,neighborhood,district,inInventory
5071,0.0,33.0,0.0,0.0,0.0,0.0,17.0,0000 0555 CALIFORNIA ST0000,0259 026,3,...,0,612000000,256020000,1052350,10640.232701,COMO,False,financial_district_north,district8,False
