In [63]:
import csv
import dateutil
import glob
import json
import os
import pandas as pd
import logging
import numpy as np
from dateutil import parser
pd.set_option('display.max_columns', 500)

In [64]:
logging.basicConfig(level=logging.INFO)

In [65]:
def load_csv_with_mapping(csvfile, column_mapping):
    mapped_data = []
    for row in csv.DictReader(csvfile):
        mapped_row = dict((column_mapping[k], v) for k, v in row.items())
        mapped_data.append(mapped_row)
    return mapped_data

In [66]:
def clean_df(df):
    # Standardize best_stat
    df.best_stat = df.best_stat.apply(lambda x: x.strip().upper())

    # Standardize best_date
    df.best_date = df.best_date.apply(maybe_format_date)

    # Standardize APN
    # Some APN values are "APN XXXXXX", some are just "XXXXXXX", standardize this.
    records_with_apn = df.apn.notnull()
    df.apn = df.apn[records_with_apn].apply(lambda x: x.split()[-1])

    # Standardize Lat/Long
    records_with_location_attribute = df.location.notnull()
    if 'x' not in df.columns:
        df.x = df.location[records_with_location_attribute].apply(get_lat_from_glob)
        df.y = df.location[records_with_location_attribute].apply(get_long_from_glob)
    elif 'x' in df.columns:
        df.x[records_with_location_attribute] = df.location[records_with_location_attribute].apply(get_lat_from_glob)
        df.y[records_with_location_attribute] = df.location[records_with_location_attribute].apply(get_long_from_glob)

    #Brian: Not necessary for now. We can lump this into the lines above by changing the key files (I think)
    #records_with_geography_attribute = df.geography.notnull()
    #df.x = df.geography[records_with_geography_attribute].apply(get_lat_from_glob)
    #df.y = df.geography[records_with_geography_attribute].apply(get_long_from_glob)

    # Get address into separate fields for cases where it is concatenated with the lat,long
    if 'address' not in df.columns:
        records_with_address_in_location_attribute = df.location.apply(lambda x: not pandas.isnull(x) and '\n' in x)
        df.address = df.location[records_with_address_in_location_attribute].apply(get_address_from_glob)
    else:
        df['address2'] = df.address #df.address2 syntax would result in chained assignment
        records_with_address_in_location_attribute = df.location.apply(lambda x: not pandas.isnull(x) and '\n' in x) #and df.address.apply(lambda x: pandas.isnull(x)) 
        df.address[records_with_address_in_location_attribute] = df.location[records_with_address_in_location_attribute].apply(get_address_from_glob)
        records_with_address_field = df.address2.notnull() #need additional layer of cleaning because some filed have zip codes in location field instead of lat long. These usually have separate address field to use. 
        df.address[records_with_address_field] = df.address2[records_with_address_field]

    # Standardize address by uppercasing and removing punctuation
    df.address = df.address.apply(standardize_address)

    # Classify neighborhoods using point-in-polygon approach. Brian: Skipping this for now
    #df['classified_neighborhood'] = geo_classifier.classify_df(df)

    return df

In [86]:
column_mappings = []
list = ['2016Q1.txt']
for mapping_filename in list:
    with open('raw/columnnames/' + mapping_filename, 'r') as mapping_file:
        column_mapping = {}
        column_mapping['year'] = mapping_filename[0:4]
        column_mapping['quarter'] = mapping_filename[5:6]
        for row in csv.DictReader(mapping_file):
            column_mapping[row['key']] = row['value']
        column_mappings.append(column_mapping)
        

In [100]:
csvfile = "raw/San_Francisco_Development_Pipeline_2016_Quarter_1_v2.csv"
with open(csvfile, 'r', encoding='mac_roman') as csvfile:
    housing_data = load_csv_with_mapping(csvfile, column_mapping)
    df = pd.DataFrame(housing_data)

In [101]:
df.head()

Unnamed: 0,address,aff,affnet,alias,apn,best_date,best_stat,caseno,contactphone,cost,cult_inst_educ,dbi_permit,dbidesc,district,districtname,entitled,firstfiled,fullname,heightlimit,location,medical,neighborhood,net_cult_inst_educ,net_medical,net_office,net_prod_dist_rep,net_ret_ent,net_visitor,office,planarea,planner,plndesc,prod_dist_rep,proj_type,propuse,ret_ent,sponsor_name,supdist,units,unitsnet,visitor,zoning
0,1 ARDATH COURT (COMMUNITY CENTER),0,0,,APN 4712008,12/17/15,BP FILED,2015-003310PRJ,415-702-1609,"$600,000.00",5588,202000000000.0,"ERECT 1-STORY, NO BASEMENT, TYPE 5-1, COMMUNIT...",South Bayshore,"RESIDENTIAL- HOUSE, TWO FAMILY",0,11/17/15,Hafsa Burt,40-X,"(37.7342431939, -122.382430)",0,Bayview,5588,0,0,0,0,0,0,Bayview Hunters Point,JDISALVO,New construction of a community center to be l...,0,CIE,RECREATION BLDG,0,HB&A Architects,SUPERVISORIAL DISTRICT 10,0,0,0,RH-2
1,1 EDGAR AV,0,0,,APN 6978069,5/8/15,BP FILED,,,"$300,000.00",0,202000000000.0,CONSTRUCT (N) 3-STORY SINGLE FAMILY DWELLING UNIT,Ingleside,"RESIDENTIAL- HOUSE, ONE FAMILY",0,5/8/15,,40-X,"(37.7203939917, -122.452521)",0,Ocean View,0,0,0,0,0,0,0,,,,0,Resident,1 FAMILY DWELLING,0,,SUPERVISORIAL DISTRICT 11,1,1,0,RH-1
2,1 FRANKLIN ST,5,5,,APN 0837003,3/28/16,CONSTRUCTION,2010.0102,415-252-7063,"$11,000,000.00",0,201000000000.0,CONSTRUCTION OF 8-STORY 35 DWELLING UNITS W/RE...,Buena Vista,MODERATE SCALE NEIGHBORHOOD COMMERCIAL TRANSIT...,-1,12/3/09,Warner Schmalz,85-X,"(37.7744289168, -122.421110)",0,Downtown/Civic Center,0,0,0,0,2384,0,0,Market and Octavia,TFRYE,"Construct a new 8-story, 35-unit mixed-use bui...",0,Mixres,,2384,John Ramsbacher,SUPERVISORIAL DISTRICT 5,35,35,0,NCT-3
3,1 HENRY ADAMS ST,0,0,,APN 3911001,3/17/16,CONSTRUCTION,2012.0701,415-788-2777,"$45,000,000.00",0,201000000000.0,Multiple Permits,South of Market,URBAN MIXED USE,-1,5/31/12,Robert Meyers,68-X,"(37.7692014331, -122.403473)",0,South of Market,0,0,0,0,0,0,0,Showplace Square/Potrero Hill (EN),DSIDER,Demolish an existing building (Concourse Exhib...,0,Resident,APARTMENTS,0,Robert Meyers,SUPERVISORIAL DISTRICT 10,560,560,0,UMU
4,1 HORACE ST,0,0,,APN 6525001A,9/24/14,BP FILED,2014.1301,415-920-1839,"$550,000.00",0,201000000000.0,CONSTRUCT (N) 3-STORY SINGLE FAMILY RESIDENCE.,Mission,"RESIDENTIAL- HOUSE, TWO FAMILY",0,8/22/14,Yakuh Askew,40-X,"(37.7507228458, -122.414398)",0,Mission,0,0,0,0,0,0,0,Mission (EN),,Variance request to the rear yard requirement ...,0,Resident,1 FAMILY DWELLING,0,Chris Giouzelis,SUPERVISORIAL DISTRICT 9,2,1,0,RH-2


In [27]:
df['planning_filed'] = pd.to_datetime(df['planning_filed'])
df['dbi_filed'] = pd.to_datetime(df['dbi_filed'])

In [28]:
df = pd.read_csv('cleaned/all_quarters_merged_PRECLEAN.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
df['firstfiled'] = pd.to_datetime(df['firstfiled'])
df['planning_filed'] = pd.to_datetime(df['planning_filed'])
df['dbi_filed'] = pd.to_datetime(df['dbi_filed'])

In [48]:
df[['firstfiled', 'planning_filed', 'dbi_filed']][400:450]

Unnamed: 0,firstfiled,planning_filed,dbi_filed
400,2012-02-17 08:00:00,NaT,2012-02-17 08:00:00
401,2012-02-17 08:00:00,NaT,2012-02-17 08:00:00
402,2012-02-28 08:00:00,2012-02-28 08:00:00,2012-03-02 08:00:00
403,2012-02-29 08:00:00,NaT,2012-02-29 08:00:00
404,2012-03-05 08:00:00,NaT,2012-03-05 08:00:00
405,2012-03-05 08:00:00,NaT,2012-03-05 08:00:00
406,2012-03-12 07:00:00,NaT,2012-03-12 07:00:00
407,2012-03-14 07:00:00,NaT,2012-03-14 07:00:00
408,2012-03-19 07:00:00,NaT,2012-03-19 07:00:00
409,2012-03-26 07:00:00,NaT,2012-03-26 07:00:00


In [39]:
def rule(value):
    if pd.isnull(value['firstfiled']):
        if pd.isnull(value['planning_filed']) & pd.notnull(value['dbi_filed']):
            return value['dbi_filed']
        elif pd.isnull(value['dbi_filed']) & pd.notnull(value['planning_filed']):
            return value['planning_filed']
        elif pd.notnull(value['dbi_filed']) & pd.notnull(value['planning_filed']):
            return value[['planning_filed', 'dbi_filed']].min()
    else:
        return value['firstfiled']
    
df['firstfiled'] = df.apply(rule, axis = 1)

In [56]:
# Check where first filed is missing (no first date information)
df[df['firstfiled'].isnull()][['firstfiled', 'planning_filed', 'dbi_filed', 'report_quarter', 'report_year', 'address', 'apn']]

Unnamed: 0,firstfiled,planning_filed,dbi_filed,report_quarter,report_year,address,apn
538,NaT,NaT,NaT,1,2012,,1101007
1481,NaT,NaT,NaT,2,2012,,1101007
2152,NaT,NaT,NaT,3,2012,1301 Divisadero St,1101007
2352,NaT,NaT,NaT,4,2012,,1101007
3082,NaT,NaT,NaT,1,2013,,1101007
6215,NaT,NaT,NaT,4,2013,,8724001
7124,NaT,NaT,NaT,1,2014,,8724001
7991,NaT,NaT,NaT,2,2014,,8724001
8807,NaT,NaT,NaT,3,2014,,8724001
9024,NaT,NaT,NaT,4,2014,515 JOHN MUIR DR,7282005


In [187]:
df['planning_filed'] = pd.to_datetime(df['planning_filed'])
df['dbi_filed'] = pd.to_datetime(df['dbi_filed'])

In [188]:
planning_not_dbi = df['planning_filed'].notnull() & df['dbi_filed'].isnull()
df['test'] = df['planning_filed'][planning_not_dbi]

In [189]:
dbi_not_planning = df['planning_filed'].isnull() & df['dbi_filed'].notnull()
df['test'][dbi_not_planning] = df['dbi_filed'][dbi_not_planning]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [190]:
both = df['planning_filed'].notnull() & df['dbi_filed'].notnull()
df['test'][both] = df[['planning_filed', 'dbi_filed']].min(axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [192]:
df[['planning_filed', 'dbi_filed', 'test']][10:50]

Unnamed: 0,planning_filed,dbi_filed,test
10,2004-07-08 07:00:00,2010-12-08 08:00:00,2004-07-08 07:00:00
11,2004-09-16 07:00:00,2010-12-30 08:00:00,2004-09-16 07:00:00
12,NaT,2004-11-09 08:00:00,2004-11-09 08:00:00
13,2006-09-28 07:00:00,2004-12-01 08:00:00,2004-12-01 08:00:00
14,2005-02-03 08:00:00,2004-12-27 08:00:00,2004-12-27 08:00:00
15,2005-03-16 08:00:00,2006-08-23 07:00:00,2005-03-16 08:00:00
16,NaT,2005-03-29 08:00:00,2005-03-29 08:00:00
17,2006-02-09 08:00:00,2005-05-03 07:00:00,2005-05-03 07:00:00
18,NaT,2005-05-27 07:00:00,2005-05-27 07:00:00
19,2005-09-01 07:00:00,2005-07-06 07:00:00,2005-07-06 07:00:00


In [None]:
#filename="raw/San_Francisco_Development_Pipeline_2014_Quarter_4.csv"
#housing_data = load_csv_with_mapping(filename, column_mapping)
#df = pandas.read_csv("raw/San_Francisco_Development_Pipeline_2014_Quarter_4.csv")
#df.head()

In [None]:
#Scratch code:
#df = pandas.read_csv("raw/San_Francisco_Development_Pipeline_2012_Quarter_4.csv")
#records_with_apn = df['Block Lot'].notnull()
#df['Block Lot'] = df['Block Lot'].apply(lambda x: x.split()[-1])
#construction = ['CONSTRUCTION']
#records_construction = df['Best Stat'].isin(construction)
#records_construction = df['Best Stat']=='CONSTRUCTION'
#records_construction = df['Best Stat'].notnull()
#df['Under Construction'] = df['Best Stat'][records_construction].apply(lambda x: "YES")
#df.head()
#records_construction.head()
#df[records_construction].head()