In [8]:
import csv
import dateutil
import glob
import json
import os
import pandas as pd
import logging
import numpy as np
from dateutil import parser
pd.set_option('display.max_columns', 500)

In [9]:
logging.basicConfig(level=logging.INFO)

In [10]:
    def load_csv_with_mapping(csvfile, column_mapping):
        mapped_data = []
        for row in csv.DictReader(csvfile):
            mapped_row = dict((column_mapping[k], v) for k, v in row.items())
            mapped_data.append(mapped_row)
        return mapped_data

In [33]:
def clean_df(df):
    # Standardize best_stat
    df.best_stat = df.best_stat.apply(lambda x: x.strip().upper())

    # Standardize best_date
    df.best_date = df.best_date.apply(maybe_format_date)

    # Standardize APN
    # Some APN values are "APN XXXXXX", some are just "XXXXXXX", standardize this.
    records_with_apn = df.apn.notnull()
    df.apn = df.apn[records_with_apn].apply(lambda x: x.split()[-1])

    # Standardize Lat/Long
    records_with_location_attribute = df.location.notnull()
    if 'x' not in df.columns:
        df.x = df.location[records_with_location_attribute].apply(get_lat_from_glob)
        df.y = df.location[records_with_location_attribute].apply(get_long_from_glob)
    elif 'x' in df.columns:
        df.x[records_with_location_attribute] = df.location[records_with_location_attribute].apply(get_lat_from_glob)
        df.y[records_with_location_attribute] = df.location[records_with_location_attribute].apply(get_long_from_glob)

    #Brian: Not necessary for now. We can lump this into the lines above by changing the key files (I think)
    #records_with_geography_attribute = df.geography.notnull()
    #df.x = df.geography[records_with_geography_attribute].apply(get_lat_from_glob)
    #df.y = df.geography[records_with_geography_attribute].apply(get_long_from_glob)

    # Get address into separate fields for cases where it is concatenated with the lat,long
    if 'address' not in df.columns:
        records_with_address_in_location_attribute = df.location.apply(lambda x: not pandas.isnull(x) and '\n' in x)
        df.address = df.location[records_with_address_in_location_attribute].apply(get_address_from_glob)
    else:
        df['address2'] = df.address #df.address2 syntax would result in chained assignment
        records_with_address_in_location_attribute = df.location.apply(lambda x: not pandas.isnull(x) and '\n' in x) #and df.address.apply(lambda x: pandas.isnull(x)) 
        df.address[records_with_address_in_location_attribute] = df.location[records_with_address_in_location_attribute].apply(get_address_from_glob)
        records_with_address_field = df.address2.notnull() #need additional layer of cleaning because some filed have zip codes in location field instead of lat long. These usually have separate address field to use. 
        df.address[records_with_address_field] = df.address2[records_with_address_field]

    # Standardize address by uppercasing and removing punctuation
    df.address = df.address.apply(standardize_address)

    # Classify neighborhoods using point-in-polygon approach. Brian: Skipping this for now
    #df['classified_neighborhood'] = geo_classifier.classify_df(df)

    return df

In [28]:
column_mappings = []
list = ['2011Q4.txt']
for mapping_filename in list:
    with open('raw/columnnames/' + mapping_filename, 'r') as mapping_file:
        column_mapping = {}
        column_mapping['year'] = mapping_filename[0:4]
        column_mapping['quarter'] = mapping_filename[5:6]
        for row in csv.DictReader(mapping_file):
            column_mapping[row['key']] = row['value']
        column_mappings.append(column_mapping)

In [29]:
column_mapping

{'ACTION_SEQ': 'action_seq',
 'Address': 'address',
 'BLOCK': 'block',
 'Best Date': 'best_date',
 'Best Stat': 'best_stat',
 'Block Lot': 'apn',
 'CONTACTADD': 'contactadd',
 'CONTACTCITY': 'contactcity',
 'CONTACTPHONE': 'contactphone',
 'Cult, Inst, Educ': 'cult_inst_educ',
 'DBI Filed': 'dbi_filed',
 'DBI Permit': 'dbi_permit',
 'DBI Project Description': 'dbi_project_description',
 'EntitlementStatus': 'entitled',
 'Expr1022': 'expr1022',
 'Expr1023': 'expr1023',
 'Expr1028': 'expr2018',
 'FULLNAME': 'fullname',
 'FirstFiled': 'firstfiled',
 'HEIGHTLIMIT': 'heightlimit',
 'LANDUSE': 'landuse',
 'LOT': 'lot',
 'Latitude': 'y',
 'Longitude': 'x',
 'Medical': 'medical',
 'Net Added SF': 'net_added_sf',
 'Net Added Units': 'net_added_units',
 'Net Cult, Inst, Educ': 'net_cult_inst_educ',
 'Net Medical': 'net_medical',
 'Net Office': 'net_office',
 'Net Prod, Dist, Rep': 'net_prod_dist_rep',
 'Net Ret, Ent': 'net_ret_ent',
 'Net Visitor': 'net_visitor',
 'ObjectID': 'objectid',
 'Offic

In [30]:
csvfile = "raw/San_Francisco_Development_Pipeline_2011_Quarter_4.csv"
with open(csvfile, 'r') as csvfile:
    housing_data = load_csv_with_mapping(csvfile, column_mapping)
    df = pd.DataFrame(housing_data)

In [31]:
df.head()

Unnamed: 0,action_seq,address,apn,best_date,best_stat,block,contactadd,contactcity,contactphone,cult_inst_educ,dbi_filed,dbi_permit,dbi_project_description,entitled,expr1022,expr1023,expr2018,firstfiled,fullname,heightlimit,landuse,lot,medical,neighborhood,net_added_sf,net_added_units,net_cult_inst_educ,net_gsf,net_medical,net_office,net_prod_dist_rep,net_ret_ent,net_visitor,objectid,office,parking,parkingnet,planning_filed,planning_id,planning_project_description,prod_dist_rep,ret_ent,sitearea,sort,sponsor_firm,sponsor_name,supdist,taz,tempselect,total_gsf_commercial,units,visitor,x,y,zoning,zoning_generalized,zoning_simplified
0,2,425 Mission St,3720001,01-Jul-08,PL Filed,3720,"201 MISSION ST, STE 2750","SAN FRANCISCO, CA 94105",597-4620,,,,,0,30-X/80-X,P,72.0,01-Jul-08,ROBERT BECK 597-4620,30-X/80-X,MIPS,1,,TB Combo,1742950,0,,1742950,,1700000,,42950.0,,1776,1700000,350.0,350.0,01-Jul-08,2008.0789,"Transbay Tower project is a 1,200-ft tall, 80-...",,43000.0,96376,Planning Filed,TJPA,ROBERT BECK,6.0,944,72.0,1743000,0.0,,-122.396833,37.790167,P,Public,P
1,5,300 16th St,8722001,20-Dec-11,PL Filed,8722,,,,6000.0,,,,0,MB-RA,MB-RA,,20-Dec-11,Salesforce.com,MB-RA,MIPS,1,,Mission Bay,1314998,0,6000.0,1314998,,1259680,,49318.0,,3149,1259680,,,20-Dec-11,2011.1423,To construct a new campus for the Salesforce.c...,,49318.0,522185,Planning Filed,Salesforce.com,Salesforce.com,6.0,930,,1314998,,,-122.389264,37.766969,MB-RA,Mixed Use,MB-RA
2,2,181 Fremont St,3719010,15-May-07,PL Filed,3719,"601 California, Suite 1310","San Francisco, CA 94108",415.421.8200,,,,,0,350-S,C-3-O(SD),72.0,15-May-07,Daniel Kingsley 415.421.8200,350-S,Mixres,10,,TB Combo,492866,140,,492866,,492866,,,,1773,530316,241.0,241.0,15-May-07,2007.0456,"66-story office mixed-use high-rise project, 7...",,,15313,Planning Filed,SKS Investments,Daniel Kingsley,6.0,943,72.0,530316,140.0,,-122.395424,37.789645,C-3-O(SD),Commercial,C-3-O(SD)
3,2,601 Townsend St,3799001,13-Oct-11,PL Filed,3799,"One Bush Street, Ste. 600","San Francisco, CA 94104",415.567.9000,,,,,0,68-X,UMU,,13-Oct-11,John Kelvin 415.567.9000,68-X,MIPS,1,,Showpl/Potrero,72600,0,,72600,,72600,,,,4398,288458,,,13-Oct-11,2011.1147,Conversion of basement level into office (appr...,,,0,Planning Filed,"Reuben & Junius, LLP",John Kelvin,,618,,288458,,,-122.4020173,37.7713728,UMU,Mixed Use,UMU
4,5,1100 Van Ness Ave,694005,10-Jun-10,PL Filed,694,"633 Folsom Street, 5th Floor","San Francisco, CA 94107",415-600-7206,900200.0,,,,0,130-V,RC-4,,10-Jun-10,Geoffrey Nelson 415-600-7206,130-V,Mixed,5,0.0,Downtown,702067,-25,900200.0,702067,0.0,19831,50182.0,-55493.0,-212653.0,3243,277873,,,10-Jun-10,2009.0885,Demolition of the exisitng Cathedral Hill hote...,50182.0,14747.0,104203,Planning Filed,,Geoffrey Nelson,6.0,700,,1243002,0.0,0.0,-122.421333,37.785753,RC-4,Mixed Use,RC-4


In [27]:
df['planning_filed'] = pd.to_datetime(df['planning_filed'])
df['dbi_filed'] = pd.to_datetime(df['dbi_filed'])

In [28]:
df = pd.read_csv('cleaned/all_quarters_merged_PRECLEAN.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
df['firstfiled'] = pd.to_datetime(df['firstfiled'])
df['planning_filed'] = pd.to_datetime(df['planning_filed'])
df['dbi_filed'] = pd.to_datetime(df['dbi_filed'])

In [48]:
df[['firstfiled', 'planning_filed', 'dbi_filed']][400:450]

Unnamed: 0,firstfiled,planning_filed,dbi_filed
400,2012-02-17 08:00:00,NaT,2012-02-17 08:00:00
401,2012-02-17 08:00:00,NaT,2012-02-17 08:00:00
402,2012-02-28 08:00:00,2012-02-28 08:00:00,2012-03-02 08:00:00
403,2012-02-29 08:00:00,NaT,2012-02-29 08:00:00
404,2012-03-05 08:00:00,NaT,2012-03-05 08:00:00
405,2012-03-05 08:00:00,NaT,2012-03-05 08:00:00
406,2012-03-12 07:00:00,NaT,2012-03-12 07:00:00
407,2012-03-14 07:00:00,NaT,2012-03-14 07:00:00
408,2012-03-19 07:00:00,NaT,2012-03-19 07:00:00
409,2012-03-26 07:00:00,NaT,2012-03-26 07:00:00


In [39]:
def rule(value):
    if pd.isnull(value['firstfiled']):
        if pd.isnull(value['planning_filed']) & pd.notnull(value['dbi_filed']):
            return value['dbi_filed']
        elif pd.isnull(value['dbi_filed']) & pd.notnull(value['planning_filed']):
            return value['planning_filed']
        elif pd.notnull(value['dbi_filed']) & pd.notnull(value['planning_filed']):
            return value[['planning_filed', 'dbi_filed']].min()
    else:
        return value['firstfiled']
    
df['firstfiled'] = df.apply(rule, axis = 1)

In [56]:
# Check where first filed is missing (no first date information)
df[df['firstfiled'].isnull()][['firstfiled', 'planning_filed', 'dbi_filed', 'report_quarter', 'report_year', 'address', 'apn']]

Unnamed: 0,firstfiled,planning_filed,dbi_filed,report_quarter,report_year,address,apn
538,NaT,NaT,NaT,1,2012,,1101007
1481,NaT,NaT,NaT,2,2012,,1101007
2152,NaT,NaT,NaT,3,2012,1301 Divisadero St,1101007
2352,NaT,NaT,NaT,4,2012,,1101007
3082,NaT,NaT,NaT,1,2013,,1101007
6215,NaT,NaT,NaT,4,2013,,8724001
7124,NaT,NaT,NaT,1,2014,,8724001
7991,NaT,NaT,NaT,2,2014,,8724001
8807,NaT,NaT,NaT,3,2014,,8724001
9024,NaT,NaT,NaT,4,2014,515 JOHN MUIR DR,7282005


In [187]:
df['planning_filed'] = pd.to_datetime(df['planning_filed'])
df['dbi_filed'] = pd.to_datetime(df['dbi_filed'])

In [188]:
planning_not_dbi = df['planning_filed'].notnull() & df['dbi_filed'].isnull()
df['test'] = df['planning_filed'][planning_not_dbi]

In [189]:
dbi_not_planning = df['planning_filed'].isnull() & df['dbi_filed'].notnull()
df['test'][dbi_not_planning] = df['dbi_filed'][dbi_not_planning]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [190]:
both = df['planning_filed'].notnull() & df['dbi_filed'].notnull()
df['test'][both] = df[['planning_filed', 'dbi_filed']].min(axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [192]:
df[['planning_filed', 'dbi_filed', 'test']][10:50]

Unnamed: 0,planning_filed,dbi_filed,test
10,2004-07-08 07:00:00,2010-12-08 08:00:00,2004-07-08 07:00:00
11,2004-09-16 07:00:00,2010-12-30 08:00:00,2004-09-16 07:00:00
12,NaT,2004-11-09 08:00:00,2004-11-09 08:00:00
13,2006-09-28 07:00:00,2004-12-01 08:00:00,2004-12-01 08:00:00
14,2005-02-03 08:00:00,2004-12-27 08:00:00,2004-12-27 08:00:00
15,2005-03-16 08:00:00,2006-08-23 07:00:00,2005-03-16 08:00:00
16,NaT,2005-03-29 08:00:00,2005-03-29 08:00:00
17,2006-02-09 08:00:00,2005-05-03 07:00:00,2005-05-03 07:00:00
18,NaT,2005-05-27 07:00:00,2005-05-27 07:00:00
19,2005-09-01 07:00:00,2005-07-06 07:00:00,2005-07-06 07:00:00


In [None]:
#filename="raw/San_Francisco_Development_Pipeline_2014_Quarter_4.csv"
#housing_data = load_csv_with_mapping(filename, column_mapping)
#df = pandas.read_csv("raw/San_Francisco_Development_Pipeline_2014_Quarter_4.csv")
#df.head()

In [None]:
#Scratch code:
#df = pandas.read_csv("raw/San_Francisco_Development_Pipeline_2012_Quarter_4.csv")
#records_with_apn = df['Block Lot'].notnull()
#df['Block Lot'] = df['Block Lot'].apply(lambda x: x.split()[-1])
#construction = ['CONSTRUCTION']
#records_construction = df['Best Stat'].isin(construction)
#records_construction = df['Best Stat']=='CONSTRUCTION'
#records_construction = df['Best Stat'].notnull()
#df['Under Construction'] = df['Best Stat'][records_construction].apply(lambda x: "YES")
#df.head()
#records_construction.head()
#df[records_construction].head()