# Distance From Firestation
This notebook computes the distance from each EAS record to its closest fire station and adds that distance to the data frame as a new column.

In [1]:
import pandas as pd
import os
pd.set_option("display.max_columns", 100)

In [2]:
# Directory holding the input data. Despite the name this is a local,
# relative path (it is joined with FILE_NAME via os.path.join), not a URL.
DATA_URL = 'data/'
# File name of the master CSV that is loaded into `df`.
FILE_NAME = 'masterdf_20170920.csv'

In [3]:
df = pd.read_csv(os.path.join(DATA_URL, FILE_NAME), index_col=0, low_memory=False)

In [4]:
df.head()

Unnamed: 0,Incident Date,EAS,Incident_Year,Incident_Cat,Incident_Dummy,Neighborhood,Location_y,Address,Building_Cat,Yr_Property_Built,Num_Bathrooms,Num_Bedrooms,Num_Rooms,Num_Stories,Num_Units,Perc_Ownership,Land_Value,Property_Area,Assessed_Improvement_Val,Tot_Rooms,landval_psqft,count potential fire control,count all complaints,count all complaints not corrected,count potential fire control not corrected,count fire emergency safety,count potential fire cause,count fire emergency safety not corrected,count potential fire cause not corrected
0,2015-06-20,451005.0,2015.0,COOKING FIRE,1.0,SUNSET/PARKSIDE,"(37.7543289339354, -122.480327187833)",1532 NORIEGA ST,COMMERCIAL USE,1989.0,0.0,0.0,11.0,3.0,2.777778,1.0,438434.3,4135.0,262181.666667,11.0,106.030069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2010-11-28,360149.0,2010.0,COOKING FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
2,2011-04-26,360149.0,2011.0,COOKING FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
3,2006-03-09,360149.0,2006.0,BUILDING FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
4,2004-05-28,360149.0,2004.0,OUTDOOR FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0


In [5]:
len(df)

195308

## Fire Stations
* Distance of closest fire station to instance

### Getting address of all fire stations in SF
[sf-fire.org](http://sf-fire.org/fire-station-locations) lists all fire stations on their website. A simple webscraper should do the trick to get the addresses. 

In [6]:
import urllib2
from bs4 import BeautifulSoup
URL = "http://sf-fire.org/fire-station-locations"

In [7]:
def get_fire_station_addresses(URL):
    """
    Scrape the fire station street addresses from the page at ``URL``.

    Every ``<a>`` tag whose ``href`` contains ``'propertymap'`` is treated as
    a fire station link. The address is taken from the third ``=``-separated
    piece of that ``href`` and suffixed with ``", San Francisco"``.

    Parameters
    ----------
    URL : str
        Address of the page listing the fire stations.

    Returns
    -------
    list of str
        One ``"<address>, San Francisco"`` string per matching link, in
        document order.
    """
    page = urllib2.urlopen(URL)
    try:
        soup = BeautifulSoup(page, "lxml")
    finally:
        # The document is fully parsed at this point; release the connection.
        page.close()

    fire_station_adds = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href (None) or pointing elsewhere are not stations.
        if not href or 'propertymap' not in href:
            continue
        pieces = href.split("=")
        # A propertymap link without the expected query layout carries no
        # address; skip it instead of failing with an IndexError.
        if len(pieces) < 3:
            continue
        fire_station_adds.append(pieces[2] + ", San Francisco")
    return fire_station_adds

In [8]:
fire_station_adds = get_fire_station_addresses(URL)

In [9]:
fire_station_adds[:5] #show first 5 addresses

['935 FOLSOM, San Francisco',
 '1340 POWELL, San Francisco',
 '1067 POST, San Francisco',
 '449 MISSION ROCK, San Francisco',
 '1301 TURK, San Francisco']

The site lists 45 fire stations (numbered from 1 to 51, with some numbers left out). Let's double-check that we got all 45 addresses.

In [10]:
len(fire_station_adds)

45

### Getting location information for fire stations
For all 45 fire stations in SF, the geolocations are fetched from two services: geopy (Nominatim) and Google Maps. Since both services are hit-or-miss, the function below works iteratively through all addresses it has not yet received a response for. Once a result has been returned for an address, that address is removed from the queue. If the queue stops shrinking for 5 iterations, the process is stopped.

In [11]:
import time
import random
import requests

def fetch_address_info(address_list, service='google', verbose=True, max_iter=5, timer=False):
    """
    Geocode every address in ``address_list``, retrying the ones that fail.

    All addresses start in a queue. On each pass every queued address is sent
    to the selected service; addresses that yield a non-empty result leave the
    queue, the others are retried on the next pass. The loop ends when the
    queue is empty or when it has stopped shrinking (see ``max_iter``).

    Parameters
    ----------
    address_list : iterable of str
        Addresses to geocode. The iterable itself is not modified.
    service : {'google', 'geopy'}
        Geocoding backend: the Google Maps geocoding web API (via
        ``requests``) or geopy's ``Nominatim`` geocoder.
    verbose : bool
        If true, print one status line per request.
    max_iter : int
        Stall budget. The run is aborted once the number of passes that
        started with the same queue size as the pass before reaches
        ``max_iter - 1``. If nothing ever resolves this means exactly
        ``max_iter`` passes.
    timer : bool
        If true, sleep a random 2-4 seconds after every request.

    Returns
    -------
    dict
        Maps each address to the last result returned by the service. For
        addresses that never resolved this is an empty/falsy value.

    Raises
    ------
    AttributeError
        If ``service`` is neither ``'google'`` nor ``'geopy'``.
    """
    def _fetch_geopy(address):
        # Any geocoder failure counts as "no result" so the address is retried.
        try:
            tmp_result = geolocator.geocode(address)
        except Exception:
            tmp_result = []
        return tmp_result

    def _fetch_google(address):
        URL = "https://maps.googleapis.com/maps/api/geocode/json"
        # Let requests build the query string so that spaces, commas and
        # other special characters in the address are URL-encoded.
        response = requests.get(URL, params={'address': address})
        resp_json_payload = response.json()
        return resp_json_payload.get('results', [])

    # Pick the backend once, up front, so that a bad `service` value fails
    # before any request is made.
    if service == 'google':
        fetch = _fetch_google
    elif service == 'geopy':
        # Imported lazily: geopy is only required when it is actually used.
        from geopy.geocoders import Nominatim
        geolocator = Nominatim()
        fetch = _fetch_geopy
    else:
        raise AttributeError("You need to specify either 'google' or 'geopy' as service attribute.")

    address_dict = {}
    non_succ_set = list(address_list)
    iterations = 1
    len_counter = 1
    len_val = -1

    while non_succ_set:
        # A pass that starts with the same queue size as the previous one
        # means the previous pass made no progress.
        if len_val == len(non_succ_set):
            len_counter += 1
        len_val = len(non_succ_set)
        print("{} addresses in the queue (Iteration {})".format(len_val, iterations))

        # Collect failures in a new list instead of removing entries from the
        # list being iterated: removing while iterating makes the loop skip
        # the element that follows every removed one.
        still_pending = []
        for address in non_succ_set:
            fetch_verbose_string = "Fetching data for: " + address
            address_dict[address] = fetch(address)
            if address_dict[address]:
                fetch_verbose_string += "\t\t\t ... successful"
            else:
                fetch_verbose_string += "\t\t\t ... not successful, queueing up again"
                still_pending.append(address)
            if verbose:
                print(fetch_verbose_string)
            if timer:
                sleep_time = random.randint(2, 4)
                time.sleep(sleep_time)
        non_succ_set = still_pending

        iterations += 1
        if non_succ_set and len_counter > max_iter-1:
            print("Termination: {} addresses couldn't be found".format(len(non_succ_set)))
            return address_dict
    return address_dict

In [12]:
# Switch for the geocoding cells: set to True to query the services again.
# When False, the lookups are skipped and the previously pickled results are
# loaded in the pickle cell further down.
# NOTE(review): the output stored beneath this cell looks like it comes from
# an earlier run with refetch enabled -- confirm before relying on it.
refetch = False
if refetch:
    geopy_address_dict = fetch_address_info(fire_station_adds, service='geopy', verbose=False)

45 addresses in the queue (Iteration 1)
23 addresses in the queue (Iteration 2)
12 addresses in the queue (Iteration 3)
7 addresses in the queue (Iteration 4)
4 addresses in the queue (Iteration 5)
2 addresses in the queue (Iteration 6)
2 addresses in the queue (Iteration 7)
2 addresses in the queue (Iteration 8)
2 addresses in the queue (Iteration 9)
2 addresses in the queue (Iteration 10)
Termination: 2 addresses couldn't be found


The Google API seems to return better results when a short delay is added between calls.

In [13]:
if refetch:
    google_address_dict = fetch_address_info(fire_station_adds, service='google', verbose=False, timer=True)

45 addresses in the queue (Iteration 1)
24 addresses in the queue (Iteration 2)
13 addresses in the queue (Iteration 3)
6 addresses in the queue (Iteration 4)
3 addresses in the queue (Iteration 5)
2 addresses in the queue (Iteration 6)
1 addresses in the queue (Iteration 7)


In [None]:
import pickle

# Cache handling for the geocoding results: restore them from disk when the
# services were not queried in this session, otherwise persist the fresh ones.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickle files that were written by this notebook.
if not refetch:
    with open('google_address_dict.pickle', 'rb') as handle:
        google_address_dict = pickle.load(handle)
    with open('geopy_address_dict.pickle', 'rb') as handle:
        geopy_address_dict = pickle.load(handle)
else:
    with open('google_address_dict.pickle', 'wb') as handle:
        pickle.dump(google_address_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('geopy_address_dict.pickle', 'wb') as handle:
        pickle.dump(geopy_address_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Geopy has some issues with two of the addresses we ingested. Google does a better job at matching address and geolocation.

In [15]:
def get_lat_long(address_dict):
    """
    Extract a ``(lat, lng)`` tuple per address from Google geocoding results.

    Parameters
    ----------
    address_dict : dict
        Maps an address to a list of result dicts; the first result of each
        list is read at ``['geometry']['location']['lat'/'lng']``.

    Returns
    -------
    dict
        Maps each address that has at least one result to ``(lat, lng)``.
        Addresses whose result list is empty (lookups that never succeeded)
        are left out instead of raising an IndexError.
    """
    lat_lng_dict = {}
    for address, results in address_dict.items():
        if not results:
            # Unresolved address: there is no first result to read from.
            continue
        location = results[0]['geometry']['location']
        lat_lng_dict[address] = (location['lat'], location['lng'])
    return lat_lng_dict

In [16]:
get_lat_long(google_address_dict) # additional step for google results to get latitude and longitude values

{'100 Hoffman Avenue, San Francisco': (37.7531106, -122.4410957),
 '1000 Ocean Avenue, San Francisco': (37.7232908, -122.4529247),
 '1067 POST, San Francisco': (37.7866445, -122.4193329),
 '109 Oak Street, San Francisco': (37.7749469, -122.4212156),
 '1091 Portola Drive, San Francisco': (37.7400312, -122.4586575),
 '1145 Stanyan Street, San Francisco': (37.7634628, -122.4526316),
 '1290 16th Avenue , San Francisco': (37.7639903, -122.4736446),
 '1295 Shafter Avenue, San Francisco': (37.7275225, -122.385028),
 '1298 Girard Street, San Francisco': (37.7166417, -122.4004505),
 '1301 TURK, San Francisco': (37.7804435, -122.430725),
 '1325 Leavenworth Street , San Francisco': (37.7933971, -122.4165178),
 '1340 POWELL, San Francisco': (37.7970513, -122.4099507),
 '1348 45th Avenue, San Francisco': (37.7614112, -122.5046487),
 '135 SANCHEZ, San Francisco': (37.767088, -122.4307689),
 '1415 Evans Avenue, San Francisco': (37.740742, -122.3853284),
 '1443 GROVE, San Francisco': (37.7754351, -122

In [17]:
addytest = get_lat_long(google_address_dict) # additional step for google results to get latitude and longitude values

### Calculate Haversine Distance

In [18]:
from math import pi,sqrt,sin,cos,atan2

def haversine(pos1, pos2):
    """
    Haversine (great-circle) distance between two points.

    ``pos1`` and ``pos2`` are indexable as ``(latitude, longitude)`` in
    degrees; the values may be anything ``float()`` accepts. The result is
    the distance in kilometres on a sphere of radius 6367.
    """
    rad_per_deg = float(pi / 180.0)

    lat_a, lon_a = float(pos1[0]), float(pos1[1])
    lat_b, lon_b = float(pos2[0]), float(pos2[1])

    # Half of the latitude / longitude differences, expressed in radians.
    half_dlat = (lat_b - lat_a) * rad_per_deg / 2
    half_dlon = (lon_b - lon_a) * rad_per_deg / 2

    # Haversine term: squared half-chord length between the two points.
    chord = sin(half_dlat) ** 2 + cos(lat_a * rad_per_deg) * cos(lat_b * rad_per_deg) * sin(half_dlon) ** 2
    central_angle = 2 * atan2(sqrt(chord), sqrt(1 - chord))

    return 6367 * central_angle

In [19]:
pos1 = addytest[addytest.keys()[1]]
pos2 = addytest[addytest.keys()[2]]

In [23]:
def hav_all(row):
    """
    Distance from one location to the closest fire station.

    Parameters
    ----------
    row : str
        A location formatted as ``"(<lat>, <long>)"``, as found in the
        ``Location_y`` column.

    Returns
    -------
    float
        The smallest ``haversine`` distance between the location and any
        position stored in ``addytest``. If ``addytest`` is empty,
        ``float('inf')`` is returned.
    """
    # Pull the two numbers out of the "(lat, long)" text.
    first_second = row.split(",")
    first = float(first_second[0].split('(')[1])
    second = float(first_second[1].split(')')[0])
    pos1 = (first, second)

    # Start from infinity rather than a fixed number of kilometres: the
    # Earth's diameter (12742 km) is NOT an upper bound for distances along
    # the surface, which can reach roughly 20000 km, so a finite start value
    # could silently cap the result.
    min_distance = float('inf')
    for fire_station in addytest:
        distance = haversine(pos1, addytest[fire_station])
        if distance < min_distance:
            min_distance = distance
    return min_distance
    
df["distance_next_fire_dpt_hav"] = df["Location_y"].apply(hav_all)

In [25]:
df["distance_next_fire_dpt_hav"].describe()

count    195308.000000
mean          0.648027
std           0.312564
min           0.007126
25%           0.417233
50%           0.630113
75%           0.841236
max           2.471300
Name: distance_next_fire_dpt_hav, dtype: float64

# Hydrants
* Hydrant count in district
* closest hydrant to instance