In [12]:
import pandas as pd
import geopandas as gpd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from statsmodels.tools.tools import add_constant
import numpy as np
import statsmodels.api as sm

### Load data

Load dataset with snapshot of 2014 pipeline snapshot and outcomes.

In [2]:
mv_history = pd.read_csv('./data/pipeline_history.csv')

In [3]:
mv_history.head()

Unnamed: 0,Address,Pg,Built,Units
0,420 San Antonio,3,1.0,373
1,2580 California St,4,0.26,632
2,1701 ECR,7,1.0,24
3,1101 ECR,8,1.0,52
4,801 ECR,8,1.0,164


Load site inventory datasets.

In [4]:
si_geo = gpd.read_file('./data/MV_Site_Inventory/MV_Site_Inventory.shp')

In [5]:
si = pd.read_csv('./data/hcd_table_a.csv', low_memory=False)
si = si[~si.isna().all(axis=1)]
si = si[:-1]

In [65]:
permits = gpd.read_file('./data/all_permits.json')

In [7]:
zoning = gpd.read_file('./data/Zoning_Districts/Zoning_Districts.shp')

### Pipeline History Analysis

In [8]:
sum(mv_history.Units * mv_history.Built) / mv_history.Units.sum()

0.6719286871961102

In [9]:
pearsonr(mv_history.Units, mv_history.Built)

(-0.24156975644156864, 0.3048589611212049)

### Pipeline Predictions

In [14]:
reg = sm.Logit(mv_history.Built, add_constant(mv_history.Units)).fit_regularized()

def predict_success(n_units):
    """P(devs) adjusted for number of units in project."""
    return reg.predict([1, n_units])

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4578744955743205
            Iterations: 50
            Function evaluations: 480
            Gradient evaluations: 50


In [15]:
def predict_success_floor(n_units):
    """
    P(devs) adjusted for number of units in project, with a floor that 
    pipeline sites are at least as likely to be developed as opportunity sites.
    """
    return max(reg.predict([1, n_units]), .206)

In [16]:
pipeline = si[si['Optional Information 1'].str.contains('Pending')]

This dataset excludes approved projects, which is why it has less total capacity than the draft claims.

In [17]:
pipeline['Total Capacity'].sum()

6913.0

In [18]:
pipeline['Site Status'].value_counts()

Pending Project    66
Name: Site Status, dtype: int64

#### Pipeline capacity with plain Logistic Regression

In [19]:
n_approved_units = 1847
n_approved_li_units = 226
p_devs = pipeline['Total Capacity'].apply(predict_success)

In [20]:
sum(p_devs * pipeline['Total Capacity']).item() + n_approved_units

3961.4157200269683

In [21]:
sum(p_devs * pipeline['Lower Income Capacity']).item() + n_approved_li_units

1145.655594727962

1847 is the number of total units in approved projects. This capacity is excluded from excel sheet.

#### Pipeline capacity with constraint that P(dev | pipeline) > P(dev | opp) for all sites

In [22]:
p_devs = pipeline['Total Capacity'].apply(predict_success_floor)

In [23]:
sum(p_devs * pipeline['Total Capacity']).item() + n_approved_units

4651.169153669649

In [24]:
sum(p_devs * pipeline['Lower Income Capacity']).item() + n_approved_li_units

1253.8843652965609

### Opportunity Sites

In [25]:
opps = si[~si['Site Status'].str.contains('Pending')]

In [27]:
opps['Zoning Designation (Current)'].value_counts()

P(38) - El Camino Real Precise Plan    55
P(41) - East Whisman Precise Plan      15
P(40) - San Antonio Precise Plan        9
P(39) - North Bayshore Precise Plan     6
CN                                      5
CRA                                     4
P(27) - Grant-Phyllis Precise Plan      2
P(19) - Downtown Precise Plan           1
Name: Zoning Designation (Current), dtype: int64

Find opportunity sites for each of four major precise plan areas.

In [28]:
ecr = opps[opps['Zoning Designation (Current)'].str.contains("El Camino Real")]

In [29]:
ew = opps[opps['Zoning Designation (Current)'].str.contains("East Whisman")]

In [30]:
sa = opps[opps['Zoning Designation (Current)'].str.contains("San Antonio")]

In [31]:
nb = opps[opps['Zoning Designation (Current)'].str.contains("North Bayshore")]

There are 55 opportunity sites in El Camino Real Precise Plan, 15 in the East Whisman precise plan, 9 in the San Antonio precise plan, and 6 in the North Bayshore precise plan.

In [32]:
len(ecr), len(ew), len(sa), len(nb)

(55, 15, 9, 6)

### Cleaning permits dataset

Half of older permits have same geometry.

In [66]:
permits.apn = permits.apn.str.split('-').str.join('')

In [67]:
permits.apn = permits.apn.str.replace('Â\xa0', '').values.tolist()

In [68]:
permits.apn = permits.apn.str.strip()

In [69]:
permits = permits[(~permits.apn.duplicated()) | (permits.apn.isnull())]

In [70]:
pp_permits = gpd.sjoin(permits, zoning.to_crs('EPSG:4326'))['PRECPLAN'].value_counts()

In [71]:
ecr_ppid = 'P(38)'
nbs_ppid = 'P(39)'
sa_ppid = 'P(40)'
ew_ppid = 'P(41)'

In [79]:
ecr_yrs, ew_yrs, sa_yrs, nbs_yrs = 8, 3, 8, 5

In [80]:
ecr_exp = pp_permits[ecr_ppid] / ecr_yrs * 8

In [81]:
nbs_exp = pp_permits[nbs_ppid] / ew_yrs * 8

In [82]:
sa_exp = pp_permits[sa_ppid] / sa_yrs * 8

In [83]:
ew_exp = pp_permits[ew_ppid] / nbs_yrs * 8

In [84]:
for pp, name, expect in zip([ecr, ew, sa, nb], ['ecr', 'ew', 'sa', 'nb'], [ecr_exp, ew_exp, sa_exp, nbs_exp]):
    print('For', name, 'the city claims', len(pp), 'projects in 8 years', ' but historical trends suggest')
    print(int(round(expect, 0)), "is more reasonable. That'd discount their site capacity claims by", round(1 - (int(round(expect,0)) / len(pp)), 2), '%')
    print('\n')

For ecr the city claims 55 projects in 8 years  but historical trends suggest
25 is more reasonable. That'd discount their site capacity claims by 0.55 %


For ew the city claims 15 projects in 8 years  but historical trends suggest
3 is more reasonable. That'd discount their site capacity claims by 0.8 %


For sa the city claims 9 projects in 8 years  but historical trends suggest
7 is more reasonable. That'd discount their site capacity claims by 0.22 %


For nb the city claims 6 projects in 8 years  but historical trends suggest
5 is more reasonable. That'd discount their site capacity claims by 0.17 %




In [85]:
for pp, name, expect in zip([ecr, ew, sa, nb], ['ecr', 'ew', 'sa', 'nb'], [ecr_exp, ew_exp, sa_exp, nbs_exp]):
    print('For', name, 'the city claims', len(pp), 'projects in 8 years, but historical trends suggest')
    print(int(round(expect, 0)), "is more reasonable. That'd inflate their site capacity claims by", 
          round(len(pp) / expect, 3)*100, '%')
    print('\n')

For ecr the city claims 55 projects in 8 years, but historical trends suggest
25 is more reasonable. That'd inflate their site capacity claims by 220.00000000000003 %


For ew the city claims 15 projects in 8 years, but historical trends suggest
3 is more reasonable. That'd inflate their site capacity claims by 468.79999999999995 %


For sa the city claims 9 projects in 8 years, but historical trends suggest
7 is more reasonable. That'd inflate their site capacity claims by 128.6 %


For nb the city claims 6 projects in 8 years, but historical trends suggest
5 is more reasonable. That'd inflate their site capacity claims by 112.5 %


