In [217]:
import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import contextily as ctx
from shapely.wkt import load

import clean_utils
from importlib import reload

In [224]:
reload(clean_utils)

<module 'clean_utils' from 'C:\\Users\\sadamerdji\\Desktop\\dissertation\\clean_utils.py'>

### Load Data

In [2]:
plan_permits = gpd.read_file('./data/SF_Planning_Permitting_Data.geojson', low_memory=False)

In [3]:
clean_utils.clean_dates(plan_permits)

In [4]:
parcels = pd.read_csv('./data/Blue Sky Code and Inputs/SF_Logistic_Data.csv')

In [5]:
allParcels = gpd.read_file('./data/Parcels   Active and Retired/parcels.shp')

In [6]:
sites = gpd.read_file('./data/site_inventory/xn--Bay_Area_Housing_Opportunity_Sites_Inventory__20072023_-it38a.shp')

In [107]:
tax = pd.read_excel('./data/tax_assessor/2019.8.20__SF_ASR_Secured_Roll_Data_2007-2008.xlsx')

In [165]:
use_codes = pd.read_excel('./data/tax_assessor/2019.8.20__SF_ASR_Secured_Roll_Data_2007-2008.xlsx', 
                          sheet_name='Class & Use Code')
neighborhood_codes = pd.read_excel('./data/tax_assessor/2019.8.20__SF_ASR_Secured_Roll_Data_2007-2008.xlsx', 
                                   sheet_name='Neigborhood Code')

### Geospatial version of bluesky data

In [208]:
def get_basic_df(cycle):
    if cycle == 4:
        start_year = 2007
    else:
        start_year = 2015
    trainParcels = parcels[(parcels.year >= start_year) & (parcels.year < (start_year + 8))]
    trainY = trainParcels.groupby('MapBlkLot_Master')['Developed'].agg(lambda x: x.ne(0).sum())
    trainX = trainParcels[trainParcels.year == start_year]
    trainDf = pd.merge(trainX.drop('Developed', axis=1), trainY, left_on='MapBlkLot_Master', right_index=True)
    df = clean_utils.transform_bluesky_to_geospatial(trainDf, cycle)
    return df

In [235]:
df4 = get_basic_df(4)

In [236]:
df5 = get_basic_df(5)

#### Get dbi permits from past cycle

In [231]:
pipeline3 = clean_utils.get_pipeline_permits(cycle=3, dbi=dbi)

In [233]:
pipeline4 = clean_utils.get_pipeline_permits(cycle=4, dbi=dbi)

### Merge into bluesky data

In [237]:
def merge_pipeline(df, pipeline):
    df = pd.merge(df, pipeline, left_on='blklot', right_index=True, how='left')
    df[pipeline.columns] = df[pipeline.columns].fillna(0)
    return df

In [None]:
merge_pipeline(df4, pipeline3)

In [None]:
merge_pipeline(df5, pipeline4)

In [93]:
df['area'] = df.to_crs(5070).geometry.area

### Merge Tax Data

98% of this dataset is easy to identify.

In [109]:
def clean_apn(apn):
    apn = ''.join(apn.split(' '))
    if len(apn) < 9:
        return apn
    block_length = 4
    #if apn[block_length].isalpha():
    #    return apn[:block_length] + apn[block_length+1:]
    return apn
 
    
tax['MapBlkLot_Master'] = tax.RP1PRCLID.apply(clean_apn)

df.MapBlkLot_Master.isin(tax.MapBlkLot_Master).mean().round(2)

In [110]:
cantID = df[~df.MapBlkLot_Master.isin(tax.MapBlkLot_Master)]

In [128]:
cantID.shape

(3514, 33)

In [131]:
cantID.Developed.sum()

150

The remaining 2% is harder.

In [125]:
# Make sure I filter out irrel parcels before doing this.
allParcels2 = allParcels.dissolve(by='mapblklot')
allParcels2 = allParcels2[['geometry']]
taxGeo = allParcels2.merge(tax,
                           how='inner',
                           right_on='MapBlkLot_Master',
                           left_on='mapblklot',
                           validate='one_to_one')

In [141]:
canID = gpd.sjoin(cantID, taxGeo, how="inner", predicate='intersects')

In [142]:
canID = canID.drop_duplicates('MapBlkLot_Master_left')

In [148]:
canID['MapBlkLot_Master'] = canID['MapBlkLot_Master_left']

In [150]:
canID = canID.drop(['MapBlkLot_Master_left', 'MapBlkLot_Master_right', 'index_right'], axis=1)

### Merge with Tax

In [155]:
full_df = df.merge(tax, how='inner', on='MapBlkLot_Master')

In [156]:
full_df = pd.concat((full_df, canID), axis=0)

In [160]:
full_df.Developed.sum()

253

In [164]:
trainDf.Developed.sum()

253

### Feature Engineering

#### Use Codes

In [166]:
use_codes = use_codes[~use_codes.isna().all(axis=1)]

In [167]:
use_codes.head()

Unnamed: 0,USE,DESC,CLASS,DESC.1
0,SRES,Single Family Residential,CO,Coop Units Unsegregated
1,SRES,Single Family Residential,COS,Coop Units Segregated
2,SRES,Single Family Residential,D,Dwelling
3,SRES,Single Family Residential,DBM,Dwelling BMR
4,SRES,Single Family Residential,LZ,Live/Work Condominium


In [173]:
use_codes = use_codes[use_codes['CLASS'] != 'CLASS']
use_lookup = use_codes.groupby('CLASS')['USE'].agg(list).to_dict()
use_lookup = {k: v[-1] for k, v in use_lookup.items()}

In [175]:
full_df['general_use_code'] = full_df.RP1CLACDE.map(use_lookup)

#### New features

In [176]:
full_df['hasBMR'] = full_df.RP1CLACDE.str.endswith('BM')

#### Neighborhood Code

In [177]:
neighborhood_codes.NEIGHBORHOOD = neighborhood_codes.NEIGHBORHOOD.str.strip().str.lower().str.split(' ').str.join('_')

In [178]:
neighborhood_codes.head()

Unnamed: 0,DISTRICT,CODE,NEIGHBORHOOD,BOUNDRIES
0,1,1A,central_richmond,"South of California, Park Presidio, south of F..."
1,1,1B,inner_richmond,"South of California, Arguello, south of Fulton..."
2,1,1C,jordan_park/laurel_heights,"California, west of Presidio, Geary, Arguello"
3,1,1D,lake_--the_presidio,"West and south of Presidio Terrace, Arguello, ..."
4,1,1E,outer_richmond,"The Ocean, west of 32nd Avenue, south of Fulton"


In [179]:
neighborhoods = {k: v[0] for k, v in neighborhood_codes.groupby('CODE')['NEIGHBORHOOD'].agg(list).to_dict().items()}
neighborhoods = {(k if len(k) == 3 else '0'+k): v for k, v in neighborhoods.items()} 

In [180]:
def lookup_neighborhood(x):
    return neighborhoods.get(x, x)

In [181]:
full_df['neighborhood'] = full_df.RP1NBRCDE.apply(lookup_neighborhood)

In [182]:
districts = {k: v[0] for k, v in neighborhood_codes.groupby('CODE')['DISTRICT'].agg(list).to_dict().items()}
districts = {(k if len(k) == 3 else '0'+k): v for k, v in districts.items()} 

In [183]:
def lookup_district(x):
    return 'district' + str(districts.get(x, x))

In [184]:
full_df['district'] = full_df['RP1NBRCDE'].apply(lookup_district)

In [185]:
full_df = full_df.drop('RP1NBRCDE', axis=1)

### Drop constant columns

In [186]:
full_df.shape[0] 

153204

In [187]:
full_df = full_df.drop(trainX.columns[trainX.nunique() <= 1], axis=1)

In [199]:
backups = [c for c in full_df.columns if 'CANTID' in c]

In [207]:
full_df.drop(backups, axis=1).to_csv('cleaned_rhna4_data.csv')