In [87]:
from match_eas import join_data_on_address_GPS
import pandas as pd
import numpy as np
from sodapy import Socrata
import math
import pickle

# so I wanted to pull the data from data.sfgov directly but it was super time consuming 
# and I just downloaded the csv files

# Fire Incidents: https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric
incidentdf = pd.read_csv('./raw_data/Fire_Incidents.csv', low_memory=False)

# https://data.sfgov.org/Housing-and-Buildings/-Known-Issue-Assessor-Historical-Secured-Property-/wv5m-vpq2
taxrolldf = pd.read_csv('./raw_data/_Known_Issue__Assessor_Historical_Secured_Property_Tax_Rolls.csv', low_memory=False)

# https://data.sfgov.org/Housing-and-Buildings/Fire-Inspections/wb4c-6hwj
inspectiondf = pd.read_csv('./raw_data/Fire_Inspections.csv', low_memory=False)

# https://data.sfgov.org/Housing-and-Buildings/Building-Permits/i98e-djp9#
# bldg_permits_df = pd.read_csv('./raw_data/Building_Permits.csv', low_memory=False)

# First thing we need to do is prepare dataframes to run through join_data_on_address_GPS function

In [88]:
""" Prepare fire incident data. """

# drop rows for which we don't know the location of the fire
# row_instance: string that describes each row, i.e. 'fire incidents'
def drop_nulls(df, row_instance):
    oglen = len(df)
    df = df[pd.notnull(df['Location'])]
    print('%i rows dropped' % (oglen - len(df)))
    print('%i %s' % (len(df), row_instance))
    # NaN Address values were breaking the join_data_on_address_GPS function
    if df.Address.isnull().any():
        df.Address.fillna("None", inplace=True)
    return df

incidentdf = incidentdf[['Primary Situation', 'Incident Number', 'Incident Date', 
                     'Address', 'Property Use', 'Location']]

incidentdf = drop_nulls(incidentdf, 'fire incidents')

incidentdf.columns = incidentdf.columns.str.replace(' ', '_')

# change location feature to two variables: Lat & Long
def split_coords(inputstr, ctype):
    coords = inputstr.strip('()').split(',')
    coords = [float(i) for i in coords]
    if ctype=='Latitude':
        return coords[0]
    else:
        return coords[1]


incidentdf['Latitude'] = incidentdf['Location'].apply(lambda x: split_coords(x, ctype='Latitude'))
incidentdf['Longitude'] = incidentdf['Location'].apply(lambda x: split_coords(x, ctype='Longitude'))
incidentdf.drop(['Location'], axis=1, inplace=True)

# and finally, reset index
incidentdf.reset_index(drop=True, inplace=True)

# now we can find the EAS code for each of the fire incidents 
# incidentdf.head()

63714 rows dropped
399011 fire incidents


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [89]:
""" Match fire incident data """ 

# commented to avoid running again
# matched_fire_incidents = join_data_on_address_GPS(df=incidentdf)

# store dataframe with pickle so we don't have to wait for matching function 
# to store:
# matched_fire_incidents.to_pickle('fire_incidents.pickle')

# to load:
matched_fire_incidents = pd.read_pickle('fire_incidents.pickle')

matched_fire_incidents.head()

Unnamed: 0,Primary_Situation,Incident_Number,Incident_Date,Property_Use,Latitude,Longitude,Address,EAS,Score
0,"711 - municipal alarm system, street box false",12044490,05/13/2012,963 - street or road in commercial area,37.776746,-122.407844,298 07TH ST,469080.0,86.0
1,"711 - municipal alarm system, street box false",11101416,11/01/2011,"960 - street, other",37.789619,-122.420497,1507 PINE ST,287429.0,86.0
2,"700 - false alarm or false call, other",12077793,08/22/2012,"400 - residential, other",37.796023,-122.439983,2824 SCOTT ST,284635.0,100.0
3,"711 - municipal alarm system, street box false",7021713,03/15/2007,"962 - residential street, road or residential dr",37.712995,-122.4021,1 TEDDY AVE,474800.0,86.0
4,"113 cooking fire, confined to container",15065212,06/20/2015,161 restaurant or cafeteria,37.754149,-122.480073,1521 NORIEGA ST,479659.0,82.0


In [90]:
""" Prepare tax roll dataset. 

Construction Type Codes: 
    {D: 'Wood or steel studs in bearing wall, full or partial open wood or steel frame, 
        primarily combustible construction', 
     C: 'Masonry or concrete load-bearing walls with or without pilasters', 
     B: 'Reinforced concrete columns and beams. Fire resistant construction',
     A: 'Structural steel columns and beams, fireproofed with masonry, concrete, plaster,
        or other noncombustible material',
     S: 'Metal bents, columns, girders without fireproofing. Generally incombustible',
     WOO*: 'Wood',
     STE*: 'Steel'}

* don't see these definitions listed so I'm making assumptions                          
""" 

taxrolldf = taxrolldf[['Year Property Built', 'Number of Bathrooms', 'Number of Bedrooms', 'Number of Rooms',
                        'Number of Rooms', 'Number of Stories', 'Number of Units', 'Construction Type',
                        'Property Area in Square Feet', 'Neighborhoods - Analysis Boundaries', 
                        'Closed Roll Assessed Improvement Value', 'Closed Roll Assessed Land Value', 
                        'Location', 'Property Location']]

taxrolldf.rename(columns={'Neighborhoods - Analysis Boundaries': 'Neighborhood', 
                          'Property Location': 'Address'}, inplace=True)

taxrolldf = drop_nulls(taxrolldf, 'properties in SF')

taxrolldf.columns = taxrolldf.columns.str.replace(' ', '_')

taxrolldf['Latitude'] = taxrolldf['Location'].apply(lambda x: split_coords(x, ctype='Latitude'))
taxrolldf['Longitude'] = taxrolldf['Location'].apply(lambda y: split_coords(y, ctype='Longitude'))

taxrolldf.drop(['Location'], axis=1, inplace=True)

taxrolldf.reset_index(drop=True, inplace=True)

taxrolldf.head()

829 rows dropped
1817067 properties in SF


Unnamed: 0,Year_Property_Built,Number_of_Bathrooms,Number_of_Bedrooms,Number_of_Rooms,Number_of_Rooms.1,Number_of_Stories,Number_of_Units,Construction_Type,Property_Area_in_Square_Feet,Neighborhood,Closed_Roll_Assessed_Improvement_Value,Closed_Roll_Assessed_Land_Value,Address,Latitude,Longitude
0,2005.0,2.0,2,5,5,0,0,,1670,Financial District/South Beach,760483.0,1140725.0,0000 0188 MINNA ST0024C,37.786291,-122.401375
1,1907.0,3.0,3,6,6,1,1,D,1450,Haight Ashbury,346562.0,519843.0,0000 1006 COLE ST0000,37.764694,-122.449439
2,1900.0,0.0,0,0,0,0,0,D,0,Japantown,17572.0,182948.0,0000 0000VWEBSTER ST0000,37.786008,-122.43065
3,1982.0,3.0,3,5,5,2,1,C,1037,Western Addition,295002.0,295002.0,0000 0601 VAN NESS AV0044,37.781386,-122.421406
4,2004.0,0.0,0,0,0,0,0,,1185,South of Market,249383.0,424483.0,0000 1221 HARRISON ST0014,37.773103,-122.408674


In [65]:
""" Match tax roll data. """

# # takes ages!
# matched_taxroll_data = join_data_on_address_GPS(df=taxrolldf)

# # to store:
# matched_taxroll_data.to_pickle('taxroll_data.pickle')

# to load:
matched_taxroll_data = pd.read_pickle('taxroll_data.pickle')

In [91]:
""" Prepare fire inspection dataset. 

Variable Definitions:
    Battalion: Emergency Response District (9 Fire Emergency Response districts)
    Station Area: Fire Station First Response Area associated with address of incident
    Box: Fire box associated with address of incident. More than 2,400 boxes in the city
    
"""

inspectiondf = inspectiondf[['Address', 'Battalion', 'Station Area', 'Fire Prevention District',
                                'Inspection Status', 'Location']]

inspectiondf = drop_nulls(inspectiondf, 'fire inspections')

inspectiondf.columns = inspectiondf.columns.str.replace(' ', '_')

inspectiondf['Latitude'] = inspectiondf['Location'].apply(lambda x: split_coords(x, ctype='Latitude'))
inspectiondf['Longitude'] = inspectiondf['Location'].apply(lambda x: split_coords(x, ctype='Longitude'))
inspectiondf.drop(['Location'], axis=1, inplace=True)

inspectiondf.reset_index(drop=True, inplace=True)

# inspectiondf.head()

3532 rows dropped
230028 fire inspections


In [92]:
""" Match fire inspection data. """

# commented to avoid rerunning
# matched_inspections = join_data_on_address_GPS(df=inspectiondf)

# to store:
# matched_inspections.to_pickle('inspection_data.pickle')

# to load:
matched_inspections = pd.read_pickle('inspection_data.pickle')

matched_inspections.head()

Unnamed: 0,Battalion,Station Area,Fire Prevention District,Inspection Status,Latitude,Longitude,Address,EAS,Score
0,4,3,04,completed,37.787297,-122.418083,1014 LARKIN ST,471952.0,90.0
1,4,38,04,completed,37.790479,-122.423064,1700 CALIFORNIA ST,286833.0,97.0
2,1,13,01S,completed,37.79472,-122.395452,,,
3,1,2,01N,pending,37.793953,-122.407202,855 CLAY ST,458457.0,90.0
4,2,6,02S,completed,37.765472,-122.425956,255 DOLORES ST,359429.0,97.0


In [94]:
# and now turn them back into CSV files to send easily

matched_fire_incidents.to_csv('raw_data/matched_fire_incidents.csv')

matched_inspections.to_csv('raw_data/matched_inspections.csv')

matched_taxroll_data.to_csv('raw_data/matched_taxroll_data.csv')

In [12]:


""" Function to fetch data from SODA API. Parameters consist of the API endpoint, upper limit on the dataset's 
    number of rows, number of rows to fetch at a time (limit), and a list to store the results """

# def fetch_data(endpoint, upperlim, limit, lst):
#     # personal app token
#     appToken = "GhH2TZgjbeJ5y6Zz2jAW65Jam"
#     # loop through to grab data in batches
#     for batch in range(0, upperlim, limit):
#         query = ("https://data.sfgov.org/resource/%s.json?$$app_token=%s&$limit=%i&$offset=%i" % 
#                                                      (endpoint, appToken, limit, batch))
#         rawdata = pd.read_json(query)
#         lst.append(rawdata)
#         if len(lst)%10==0:
#             print('%s batches collected' % len(lst))
#     return pd.concat(lst)

# """ Fetch fire incident dataset. """

# fire_incident_endpoint = "wbb6-uh78"
# # we'll grab dataset in batches and append to list; then concatenate
# fire_incidents = []
# # batch size
# limit = 5000
# # fire incident dataset is 463K rows
# upperlim = 500000

# # careful... takes a long time to fetch the whole dataset
# incidentdf = fetch_data(fire_incident_endpoint, upperlim, limit, fire_incidents)


# """ Fetch taxroll dataset. """

# tax_endpoint = "fk72-cxc3"

# taxdata_list = []

# upperlim = 2000000

# fetch_data(tax_endpoint, upperlim, limit, taxdata_list)

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>