# Analysis from Brian Goggin (uses newly-produced data as of September 2017)

In [None]:
import pandas as pd
import logging
import dateutil
from dateutil import parser
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [None]:
# Import dataset of project times from the "data" folder.
# The path is relative to the notebook's working directory; the file name
# indicates one record per project across all quarters.
file = "../../../data/cleaned/all_quarters__one_record_per_project.csv"
df = pd.read_csv(file)

In [None]:
# First, filter out those projects that are exclusively non-residential
# (defined as those without units). Rows with a missing 'units' value are
# dropped as well, because NaN > 0 evaluates to False.
df = df[df['units'] > 0]

In [None]:
#Next, keep only those that reached completion at some point over the time period
df=df[pd.notnull(df['comp_date'])]

In [None]:
# Row/column count after the completion filter.
df.shape

In [None]:
# Keep only projects with a non-null 'firstfiled' value.
df=df[pd.notnull(df['firstfiled'])]

In [None]:
# Row/column count after the 'firstfiled' filter.
df.shape

In [None]:
#drop duplicates for now, until manual cleaning is done
# keep=False flags every row whose 'dbi_permit' occurs more than once, so ALL
# copies of a duplicated permit are removed, not just the extra ones.
df= df[~ df.duplicated('dbi_permit', keep=False)]

In [None]:
# Row/column count after removing duplicated permits.
df.shape

In [None]:
# Summary statistics for the existing 'project_duration_days' column.
# NOTE(review): the earlier comment here spoke of converting completion dates
# to a date type, but no conversion happens in this cell.
df['project_duration_days'].describe()

In [None]:
# Express the project duration in years (365-day years; leap days are ignored).
df['project_time_years']=df['project_duration_days']/365

In [None]:
df['project_time_years'].describe()

In [None]:
# Figure 1: histogram of total development time in years, one observation per project.
plt.figure(1, figsize=(8,6), )
plt.suptitle('Figure 1. SF Housing Development Times', fontsize=20)
plt.xlabel('Development Time (Years)', fontsize = 20)
plt.ylabel('Number of Developments', fontsize = 20)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
# NOTE(review): plt.hist returns (counts, bin_edges, patches), so `ax` holds
# that tuple rather than an Axes object; the name is not used afterwards here.
ax = plt.hist(df['project_time_years'], bins=25, alpha=.6, color='g')
# Write the image to disk before displaying it.
plt.savefig('../../../hitogram.png')
plt.show()

In [None]:
# Project review time, normalized by amount of units
#create unit-year category so that we normalize by units when comparing geographies
# The division is safe from zero denominators: df was restricted to
# units > 0 when the non-residential projects were filtered out.
df['years_per_unit']=df['project_time_years']/df['units']

In [None]:
# Summary statistics of the normalized development time.
df['years_per_unit'].describe()

In [None]:
# Figure 2: histogram of development time per unit added (years / units).
plt.figure(1, figsize=(8,6), )
plt.suptitle('Figure 2. SF Housing Development Times', fontsize=20)
plt.xlabel('Years per Unit Added', fontsize=20)
plt.ylabel('Number of Developments', fontsize=20)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
# NOTE(review): `ax` receives plt.hist's (counts, bin_edges, patches) tuple,
# not an Axes object.
ax = plt.hist(df['years_per_unit'], bins=25, alpha=.6, color='g')
# Write the image to disk before displaying it.
plt.savefig('../../../hitogram_yearunits.png')
plt.show()

In [None]:
# Create a dataset of net units added with one row per unit, so that the
# histogram below counts units rather than projects.
#
# Each project's development time is repeated once per unit (the unit count
# is truncated to an integer). np.repeat replaces the earlier row-by-row loop,
# which bound its accumulator to the name `list` and thereby shadowed the
# builtin for every later cell. It also produces the 'id'/'time' columns even
# when df is empty.
unit_counts = df['units'].astype(int).values
unit_times = np.repeat(df['project_time_years'].values, unit_counts)
df_units = pd.DataFrame(
    {
        'id': np.arange(1, len(unit_times) + 1),  # sequential 1-based id per unit
        'time': unit_times,                       # development time in years
    },
    columns=['id', 'time'],
)

# Histogram of development time weighted by units: df_units has one row per
# unit added, so bar heights count units instead of projects.
plt.figure(1, figsize=(10,10), )
plt.suptitle('SF Housing Development Times', fontsize=20)
plt.xlabel('Review Time (Years)', fontsize = 20)
plt.ylabel('Number of Units Added', fontsize = 20)
# NOTE(review): `ax` receives plt.hist's (counts, bin_edges, patches) tuple,
# not an Axes object.
ax = plt.hist(df_units['time'], bins=25, alpha=.6, color='g')
plt.savefig('../../../hitogram_units.png')
plt.show()

In [None]:
# Summary statistics of development time across units (unit-weighted).
df_units['time'].describe()

In [None]:
# Summary statistics of project size (units per project).
df['units'].describe()

# Plot by unit category

In [None]:
# Ascending upper bounds of the project-size buckets used throughout the notebook.
unit_cutoffs=[10, 50, 100, 200]
def unit_categories(value, cutoffs=None):
    """Return the size-category label for one project row.

    Buckets are half-open: a project with ``units`` equal to a cutoff falls
    into the bucket that starts at that cutoff. Anything below the first
    cutoff is labelled ``'0-<first>'`` and anything at or above the last
    cutoff is labelled ``'><last>'``.

    Args:
        value: A row-like object (e.g. a DataFrame row or a dict) exposing
            the number of units under the key ``'units'``.
        cutoffs: Optional non-empty, ascending sequence of bucket bounds.
            Defaults to the module-level ``unit_cutoffs``.

    Returns:
        The category label as a string, e.g. ``'10-50'`` or ``'>200'``.

    Raises:
        ValueError: If ``units`` is NaN. Previously this surfaced as an
            UnboundLocalError because no branch assigned the label.
    """
    bounds = unit_cutoffs if cutoffs is None else cutoffs
    units = value['units']
    # NaN is the only value that is not equal to itself. It must be rejected
    # explicitly, otherwise it would fail every "<" test and land in the top bucket.
    if units != units:
        raise ValueError("cannot categorize a project with missing 'units'")
    lower = 0
    for upper in bounds:
        if units < upper:
            return str(lower) + '-' + str(upper)
        lower = upper
    # The open-ended top bucket uses the last cutoff rather than a literal 200.
    return '>' + str(bounds[-1])
    
# Label every project with its size category (row-wise apply of unit_categories).
df['unitcat']=df.apply(unit_categories, axis=1)

In [None]:
# Number of projects in each size category.
df['unitcat'].value_counts()

In [None]:
# Median development time (years) for each size category, in display order.
categories=['0-10', '10-50', '50-100', '100-200', '>200']
median_by_cat = df.groupby('unitcat')['project_time_years'].median()
# reindex keeps one entry per label in `categories`; a category with no
# projects yields NaN instead of disappearing.
cat_medians = median_by_cat.reindex(categories).tolist()
objects = df['unitcat'].unique()
# One bar position per entry in `categories`. Sizing this from the categories
# actually present in the data (len(objects)) made the positions shorter than
# cat_medians whenever a bucket was empty, which breaks the bar chart.
y_pos = np.arange(len(categories))

In [None]:
# Size categories that actually occur in the data (in order of first appearance).
df['unitcat'].unique()

In [None]:
# Figure 5: median development time for each size category.
# y_pos, cat_medians and categories come from the preceding cell and must all
# have the same length for the bars and tick labels to line up.
plt.figure(1, figsize=(8,6), )
plt.suptitle('Figure 5. Development Time by Size Category', fontsize=20)
plt.ylabel('Median Development Time (Years)', fontsize=20)
plt.xlabel('Units Added', fontsize=20)
plt.bar(y_pos, cat_medians, align='center', alpha=0.5)
plt.xticks(y_pos, categories, fontsize=14)
plt.yticks(fontsize = 14)
plt.savefig('../../../bar_chart_times.png')
plt.show()

# Scatter Plot

In [None]:
# Figure 4: development time (years) against number of units, one dot per project.
plt.figure(1, figsize=(10,10))
plt.scatter(df['units'], df['project_time_years'], s=100)
plt.suptitle('Figure 4. Development Time by Units Added', fontsize=20)
plt.ylabel('Development Time (Years)', fontsize = 18)
plt.xlabel('Number of Units Added', fontsize = 18)
plt.xticks(fontsize=14)
plt.yticks(fontsize = 14)
# Fixed axis ranges: projects above 800 units or 25 years fall outside the
# visible area of the plot.
plt.xlim([0,800])
plt.ylim([0,25])
plt.savefig('../../../scatter.png')

In [None]:
# Top 5 longest projects (descending sort on development time)
df.sort_values('project_time_years', ascending = False)[0:5]

In [None]:
# Top 5 shortest projects (ascending sort on development time)
df.sort_values('project_time_years', ascending = True)[0:5]

# Breakdown by place. Possibly by average time per unit

In [None]:
# First, turn the project table into a GeoDataFrame: one point per project,
# built from the 'x' and 'y' columns and tagged with EPSG:4326.
crs = {'init' :'epsg:4326'}
geometry = [Point(x_val, y_val) for x_val, y_val in zip(df['x'], df['y'])]
# The frame is created in EPSG:4326 and then converted to EPSG:4326 again.
devs = GeoDataFrame(df, crs=crs, geometry=geometry).to_crs({'init': 'epsg:4326'})

In [None]:
# Row/column count of the point GeoDataFrame.
devs.shape

In [None]:
#import neighborhoods
# Shapefile path is relative to the notebook's working directory.
neighborhoods = gpd.read_file('../../../data/gis/41_neighborhoods/41_neighborhoods.shp')

In [None]:
#convert boundaries to geographic coordinate system to conform to points
# (EPSG:4326, the same CRS assigned to the development points)
neighborhoods = neighborhoods.to_crs({'init': 'epsg:4326'}) 

In [None]:
# Row/column count of the neighborhood boundaries.
neighborhoods.shape

In [None]:
# First, spatial join between points and neighborhood boundaries.
# how='inner' keeps only developments that fall within a neighborhood polygon;
# points that match no polygon are dropped from df_nb.
# NOTE(review): the earlier comment said to set 'how' to 'left' to preserve all
# developments, but the call uses 'inner' -- confirm which is intended.
df_nb = gpd.sjoin(devs, neighborhoods, how = 'inner', op='within')
df_nb.shape

# Breakdown time by stage of process

In [None]:
# The stage breakdown needs every milestone date, so drop projects whose
# 'BP_date' is missing.
df = df[pd.notnull(df['BP_date'])]

In [None]:
# Row/column count after the 'BP_date' filter.
df.shape

In [None]:
# Likewise drop projects whose 'con_date' is missing.
df = df[pd.notnull(df['con_date'])]

In [None]:
# Row/column count after the 'con_date' filter.
df.shape

In [None]:
def _years_between(row, start_col, end_col):
    """Return the time from row[start_col] to row[end_col] in 365-day years.

    Both cells are parsed with dateutil; the difference is taken in whole days.
    """
    end = dateutil.parser.parse(row[end_col])
    start = dateutil.parser.parse(row[start_col])
    return ((end - start).days)/365

# (new column, start-date column, end-date column) for each stage of the process
stage_specs = [
    ('permit_time', 'first_date', 'BP_date'),
    ('bp_time', 'BP_date', 'con_date'),
    ('con_time', 'con_date', 'comp_date'),
]
for stage_col, start_col, end_col in stage_specs:
    df[stage_col] = df.apply(_years_between, axis=1, args=(start_col, end_col))

In [None]:
# Summary statistics for 'bp_time' (years from 'BP_date' to 'con_date').
df['bp_time'].describe()

In [None]:
# Summary statistics for 'con_time' (years from 'con_date' to 'comp_date').
df['con_time'].describe()

In [None]:
# Total development time for the projects that remain after the date filters.
df['project_time_years'].describe()

In [None]:
# Figure 3: one histogram per development stage, sharing the same axis limits
# so the three panels can be compared directly.
plt.figure(1, figsize=(16,14), )
plt.suptitle("Figure 3. SF Development Times by Stage", fontsize=24)

# (subplot position, panel title, duration column, bar color)
stage_panels = [
    (221, "Entitlement Time", 'permit_time', 'b'),
    (222, "Construction Prep Time", 'bp_time', 'r'),
    (223, "Construction Time", 'con_time', 'g'),
]
for panel_pos, panel_title, stage_col, bar_color in stage_panels:
    ax = plt.subplot(panel_pos)
    ax.set_title(panel_title, fontsize=18)
    ax.set_xlabel('Time in Years', fontsize=18)
    ax.set_ylabel('Number of Developments', fontsize=18)
    plt.xlim([0,14])
    plt.ylim([0,80])
    plt.xticks(fontsize = 16)
    plt.yticks(fontsize = 16)
    ax.hist(df[stage_col], bins=50, color=bar_color)

plt.savefig('../../../big_hist.png')

In [None]:
# Median duration (years) of each development stage per size category,
# in display order.
categories=['0-10', '10-50', '50-100', '100-200', '>200']
stage_medians = (
    df.groupby('unitcat')[['permit_time', 'bp_time', 'con_time']]
    .median()
    # One row per label in `categories`; a category with no projects left
    # after the date filters yields NaN instead of disappearing.
    .reindex(categories)
)
ent_medians = stage_medians['permit_time'].tolist()
bp_medians = stage_medians['bp_time'].tolist()
cons_medians = stage_medians['con_time'].tolist()
objects = df['unitcat'].unique()
# One bar position per entry in `categories`. Sizing this from the categories
# still present in the data (len(objects)) made the positions shorter than the
# median lists whenever a bucket was empty, which breaks the bar charts.
y_pos = np.arange(len(categories))

In [None]:
# Figure 6: median duration of each stage by size category, one panel per
# stage, all on the same 0-4 year scale.
plt.figure(1, figsize=(16,14), )
plt.suptitle("Figure 6. Development Stages by Size Category", fontsize=24)

# (subplot position, panel title, per-category medians, bar color)
median_panels = [
    (221, 'Median Entitlement Time', ent_medians, 'b'),
    (222, 'Median Construction Prep Time', bp_medians, 'r'),
    (223, 'Median Construction  Time', cons_medians, 'g'),
]
for panel_pos, panel_title, panel_medians, bar_color in median_panels:
    ax = plt.subplot(panel_pos)
    ax.set_title(panel_title, fontsize=18)
    ax.set_xlabel('Number of Units', fontsize=18)
    ax.set_ylabel('Years', fontsize=18)
    ax.set_xticks(y_pos)
    ax.set_xticklabels(categories)
    ax.set_ylim([0,4])
    ax.bar(y_pos, panel_medians, align='center', color=bar_color)

plt.savefig('../../../big_bar.png')

# Create Neighborhood Graphs

In [None]:
# Create a separate geodataframe for just those developments with a non-null
# 'BP_date' (the date needed to compute permit_time further down).
# NOTE(review): only 'BP_date' is checked here; 'con_date' is not filtered.
df_nb_full = df_nb[pd.notnull(df_nb['BP_date'])]

In [None]:
# For these graphs, drop every neighborhood whose sample is smaller than 10.
# Count the non-null 'BP_date' entries per neighborhood once, then remove all
# undersized neighborhoods in a single filter.
bp_counts = df_nb_full.groupby('nhood')['BP_date'].count()
small_nhoods = bp_counts[bp_counts < 10].index
df_nb_full = df_nb_full[~df_nb_full['nhood'].isin(small_nhoods)]

In [None]:
# Years from 'first_date' to 'BP_date' for each development (365-day years);
# this is the quantity plotted as "Entitlement Time" in the figures.
# Both columns are parsed from strings row by row with dateutil.
df_nb_full['permit_time']=df_nb_full.apply(lambda x: ((dateutil.parser.parse(x['BP_date']) - dateutil.parser.parse(x['first_date'])).days)/365, axis=1)


In [None]:
# Normalize by project size so neighborhoods are compared in years per unit.
df_nb_full['ptime_unityears']=df_nb_full['permit_time']/df_nb_full['units']

In [None]:
# Neighborhood names ordered from lowest to highest median entitlement time
# per unit; p_medians is filled in the next cell.
p_medians=[]
nhood_names = df_nb_full['nhood'].unique().tolist()
# Compute each neighborhood's median once and sort on the stored values.
ptime_median_by_nhood = {
    name: df_nb_full.loc[df_nb_full['nhood'] == name, 'ptime_unityears'].median()
    for name in nhood_names
}
pcategories = sorted(nhood_names, key=ptime_median_by_nhood.get)

In [None]:
# Append the median entitlement time per unit of every neighborhood, in the
# sorted order of pcategories, and build matching bar positions.
p_medians.extend(
    df_nb_full.loc[df_nb_full['nhood'] == name, 'ptime_unityears'].median()
    for name in pcategories
)
y_pos = np.arange(len(pcategories))

In [None]:
# Figure 8: median entitlement time per unit for each neighborhood.
# pcategories is sorted by that median, so bars rise from left to right.
plt.figure(1, figsize=(12,8))
plt.suptitle('Figure 8. SF Entitlement Times by Neighborhood', fontsize=20)
plt.ylabel('Median Entitlement Time (Years per Unit)', fontsize=18)
plt.xlabel('Neighborhood', fontsize=18)
plt.bar(y_pos, p_medians, align='center', alpha=1, width=0.8, color='r')
# Neighborhood names are drawn vertically (rotation=90) as tick labels.
plt.xticks(y_pos, pcategories, rotation=90, fontsize=12)
plt.savefig('../../../nhood2.png')
plt.show()

In [None]:
# Neighborhood names ordered from lowest to highest median development time
# per unit. This rebinds `categories` and `cat_medians`, which held the
# size-category values earlier in the notebook.
cat_medians=[]
nhood_names = df_nb_full['nhood'].unique().tolist()
# Compute each neighborhood's median once and sort on the stored values.
years_median_by_nhood = {
    name: df_nb_full.loc[df_nb_full['nhood'] == name, 'years_per_unit'].median()
    for name in nhood_names
}
categories = sorted(nhood_names, key=years_median_by_nhood.get)

In [None]:
# Append the median development time per unit of every neighborhood, in the
# sorted order of categories.
cat_medians.extend(
    df_nb_full.loc[df_nb_full['nhood'] == name, 'years_per_unit'].median()
    for name in categories
)

In [None]:
# One bar position per neighborhood in `categories`.
y_pos=np.arange(len(categories))

In [None]:
# Figure 7: median total development time per unit for each neighborhood.
# categories is sorted by that median, so bars rise from left to right.
plt.figure(1, figsize=(12,8))
plt.suptitle('Figure 7. SF Development Times by Neighborhood', fontsize=20)
plt.ylabel('Median Development Time (Years per Unit)', fontsize=18)
plt.xlabel('Neighborhood', fontsize=18)
plt.bar(y_pos, cat_medians, align='center', alpha=1, width=0.8, color='b')
# Neighborhood names are drawn vertically (rotation=90) as tick labels.
plt.xticks(y_pos, categories, rotation=90, fontsize=12)
plt.savefig('../../../nhood1.png')
plt.show()

In [None]:
# Row/column count of the neighborhood sample used in Figures 7 and 8.
df_nb_full.shape