In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import geocoder
import matplotlib.pyplot as plt

In [2]:
# Load the train and test parking CSVs from the project data folder.
train_csv, test_csv = "../project-data/train-parking.csv", "../project-data/test-no-label-parking.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

### Manual street fixes for rows where From or To repeats the street itself

In [3]:
#fix streets using tim's manual fix
# Keys are the broken "Street,From,To" triples in which From or To repeats the street
# itself; values are the corrected triples in the same comma-separated layout.
# Two replacement values had typos and are spelled correctly here:
# "Maidne Lane" -> "Maiden Lane" and "Sutter Avenue" -> "Sutter Street".
fixed = {
    'Geary Street,Jones Street,Geary Street': 'Geary Street,Jones Street,Leavenworth Street',
    'Jones Street,Sutter Street,Jones Street': 'Jones Street,Sutter Street,Post Street',
    'Kearny Street,Kearny Street,Columbus Avenue': 'Kearny Street,Jackson Street,Columbus Avenue',
    'Kearny Street,Kearny Street,Post Street': 'Kearny Street,Maiden Lane,Post Street',
    'Kearny Street,Kearny Street,Sutter Street': 'Kearny Street,Post Street,Sutter Street',
    'Larkin Street,Larkin Street,Golden Gate Avenue': 'Larkin Street,Fulton Street,Golden Gate Avenue',
    "Larkin Street,O'Farrell Street,Larkin Street": "Larkin Street,O'Farrell Street,Myrtle Street",
    'Larkin Street,Sutter Street,Larkin Street': 'Larkin Street,Sutter Street,Bush Street',
    'Mason Street,Sutter Street,Mason Street': 'Mason Street,Sutter Street,Bush Street',
    'Mission Street,11th Street,Mission Street': 'Mission Street,11th Street,12th Street',
    'Montgomery Street,Jackson Street,Montgomery Street': 'Montgomery Street,Jackson Street,Washington Street',
    'Montgomery Street,Montgomery Street,Jackson Street': 'Montgomery Street,Pacific Avenue,Jackson Street',
    'Montgomery Street,Washington Street,Montgomery Street': 'Montgomery Street,Washington Street,Clay Street',
    'Post Street,Kearny Street,Post Street': 'Post Street,Kearny Street,Montgomery Street',
    'Stockton Street,Stockton Street,Jackson Street': 'Stockton Street,Washington Street,Jackson Street',
    'Stockton Street,Stockton Tunnel,Stockton Street': 'Stockton Street,Stockton Tunnel,Clay Street',
    'Sutter Street,Powell Street,Sutter Street': 'Sutter Street,Powell Street,Mason Street',
    'Van Ness Avenue,Golden Gate Avenue,Van Ness Avenue': 'Van Ness Avenue,Golden Gate Avenue,Turk Street',
    'Van Ness Avenue,Hayes Street,Van Ness Avenue': 'Van Ness Avenue,Hayes Street,Grove Street',
    'Van Ness Avenue,McAllister Street,Van Ness Avenue': 'Van Ness Avenue,McAllister Street,Golden Gate Avenue',
    'Van Ness Avenue,Van Ness Avenue,Bush Street': 'Van Ness Avenue,Sutter Street,Bush Street',
    'Van Ness Avenue,Van Ness Avenue,Eddy Street': 'Van Ness Avenue,Turk Street,Eddy Street',
    'Van Ness Avenue,Van Ness Avenue,Fell Street': 'Van Ness Avenue,Oak Street,Fell Street',
    'Van Ness Avenue,Van Ness Avenue,Golden Gate Avenue': 'Van Ness Avenue,McAllister Street,Golden Gate Avenue',
    'Van Ness Avenue,Van Ness Avenue,Grove Street': 'Van Ness Avenue,Hayes Street,Grove Street',
    'Van Ness Avenue,Van Ness Avenue,Turk Street': 'Van Ness Avenue,Golden Gate Avenue,Turk Street',
}

In [4]:
#concatenate into one column
# Build the "Street,From,To" key used to look rows up in the manual-fix dictionary.
for frame in (train, test):
    frame["streets_concat"] = frame["Street"] + "," + frame["From"] + "," + frame["To"]

In [5]:
#map dictionaries to data fix streets
# .map(fixed) returns NaN for every row whose key is not in the dictionary (the rows
# that were already fine), so those are filled back with the value the column held
# before mapping. Falling back to the column itself rather than rebuilding the key
# from Street/From/To keeps the cell idempotent: re-running it leaves rows that were
# already corrected untouched instead of restoring their broken key.
for frame in (train, test):
    before_fix = frame['streets_concat']
    frame['streets_concat'] = before_fix.map(fixed).fillna(before_fix)

In [6]:
#replace cols with fixed vals
# Split the corrected key once per frame and write its first three pieces back
# into the Street / From / To columns.
for frame in (train, test):
    pieces = frame['streets_concat'].str.split(",", expand=True)
    for position, column in enumerate(['Street', 'From', 'To']):
        frame[column] = pieces[position]

In [7]:
#delete unnecessary column
# The helper key has been split back into Street / From / To, so remove it in place.
for frame in (train, test):
    frame.drop('streets_concat', axis=1, inplace=True)

### get longitude and latitude coordinates of streets

In [8]:
#add street_from and street_to intersections for geocoder to use
# Each query string has the form "<Street> & <cross street>, San Francisco CA".
city_suffix = ", San Francisco CA"
for frame in (train, test):
    frame['street_from'] = frame['Street'] + " & " + frame['From'] + city_suffix
    frame['street_to'] = frame['Street'] + " & " + frame['To'] + city_suffix

In [9]:
#build dictionaries of lat/long for street intersections
# Every unique *train* intersection is sent to geocoder.google once. Lookups whose
# .latlng comes back as None are dropped from the dictionary so they can be retried
# separately. The test frame is only counted here, not geocoded.
street_from_unique = train['street_from'].unique()
street_from_unique_test = test['street_from'].unique()
dfrom = {name: geocoder.google(name).latlng for name in street_from_unique}
dfrom = {k: v for k, v in dfrom.items() if v is not None}
print(f'There are {len(street_from_unique)} unique street-from intersections')
print(f'There are {len(street_from_unique_test)} unique street-from intersections in test')
print(f'from dict has {len(dfrom)} entries')

street_to_unique = train['street_to'].unique()
# Count the test frame here; this line previously re-read `train`, so the
# "in test" figure printed below was just the train count repeated.
street_to_unique_test = test['street_to'].unique()
d2 = {name: geocoder.google(name).latlng for name in street_to_unique}
d2 = {k: v for k, v in d2.items() if v is not None}
print(f'There are {len(street_to_unique)} unique street-to intersections')
print(f'There are {len(street_to_unique_test)} unique street-to intersections in test')
print(f'to dict has {len(d2)} entries')

There are 88 unique street-from intersections
There are 88 unique street-from intersections in test
from dict has 81 entries
There are 91 unique street-to intersections
There are 91 unique street-to intersections in test
to dict has 83 entries


In [10]:
#map dictionaries to data to get lat/long
# Intersections missing from a dictionary come out as NaN and are retried later.
first_pass_lookups = (
    ('coord_from', 'street_from', dfrom),
    ('coord_to', 'street_to', d2),
)
for column, key_column, lookup in first_pass_lookups:
    train[column] = train[key_column].map(lookup)

In [11]:
#find still empties in street_from and rerun
# Unique intersection strings whose first-pass coordinates are null.
missing_from = train['coord_from'].isnull()
stillempty_from = train.loc[missing_from, 'street_from'].unique()

missing_to = train['coord_to'].isnull()
stillempty_to = train.loc[missing_to, 'street_to'].unique()

In [12]:
# Number of from / to intersections that still need coordinates, one per line.
print(len(stillempty_from), len(stillempty_to), sep="\n")

7
8


#### deal with street-from

In [13]:
# Retry the street-from names that got no coordinates on the first pass:
# successes are collected in new_from, failures are queued in still_to_find.
new_from, still_to_find = {}, []
for name in stillempty_from:
    coords = geocoder.google(name).latlng
    if coords is None:
        still_to_find.append(name)
    else:
        new_from[name] = coords

In [14]:
#combine dicts
# Start from the first-pass results and layer the retried lookups on top.
dfrom2 = dict(dfrom)
dfrom2.update(new_from)
print(len(new_from), len(dfrom2), sep="\n")

6
87


In [15]:
# Street-from names whose retry lookup still returned no latlng.
still_to_find

['Taylor Street & Turk Street, San Francisco CA']

In [16]:
# Another pass over the street-from names left in still_to_find:
# resolved names go to a fresh new_from, the rest are queued in still_to_find2.
new_from, still_to_find2 = {}, []
for name in still_to_find:
    coords = geocoder.google(name).latlng
    if coords is None:
        still_to_find2.append(name)
    else:
        new_from[name] = coords

In [17]:
#combine dicts
# Extend the accumulated street-from lookups with the latest retry results.
dfrom3 = dict(dfrom2)
dfrom3.update(new_from)
print(len(new_from), len(dfrom3), sep="\n")

1
88


In [18]:
# Final street-from pass over whatever is left in still_to_find2:
# resolved names go to a fresh new_from, the rest are queued in still_to_find3.
new_from, still_to_find3 = {}, []
for name in still_to_find2:
    coords = geocoder.google(name).latlng
    if coords is None:
        still_to_find3.append(name)
    else:
        new_from[name] = coords

In [19]:
#combine dicts
# dfrom4 is the street-from lookup after the last retry pass.
dfrom4 = dict(dfrom3)
dfrom4.update(new_from)
print(len(new_from), len(dfrom4), sep="\n")

0
88


In [20]:
# Street-from names whose lookup still returned no latlng in the last retry pass.
still_to_find3

[]

#### deal with street-to

In [21]:
# Retry the street-to names that got no coordinates on the first pass:
# successes are collected in new_to, failures are queued in still_to_find
# (the name is reused from the street-from section and overwritten here).
new_to, still_to_find = {}, []
for name in stillempty_to:
    coords = geocoder.google(name).latlng
    if coords is None:
        still_to_find.append(name)
    else:
        new_to[name] = coords

In [22]:
#combine dicts
# Start from the first-pass street-to results and layer the retried lookups on top.
dto2 = dict(d2)
dto2.update(new_to)
print(len(new_to), len(dto2), sep="\n")

8
91


In [23]:
# Another pass over the street-to names left in still_to_find:
# resolved names go to a fresh new_to, the rest are queued in still_to_find2.
new_to, still_to_find2 = {}, []
for name in still_to_find:
    coords = geocoder.google(name).latlng
    if coords is None:
        still_to_find2.append(name)
    else:
        new_to[name] = coords


In [24]:
#combine dicts
# Extend the accumulated street-to lookups with the latest retry results.
dto3 = dict(dto2)
dto3.update(new_to)
print(len(new_to), len(dto3), sep="\n")

0
91


In [25]:
# Final street-to pass over whatever is left in still_to_find2:
# resolved names go to a fresh new_to, the rest are queued in still_to_find3.
new_to, still_to_find3 = {}, []
for name in still_to_find2:
    coords = geocoder.google(name).latlng
    if coords is None:
        still_to_find3.append(name)
    else:
        new_to[name] = coords

In [26]:
#combine dicts
# Merge the last street-to retry results (new_to) into the street-to lookup.
# This previously merged new_from, the leftover dictionary from the street-from
# section, which would have put street-from entries into the street-to lookup.
dto4 = {**dto3, **new_to}
len(dto4)

91

In [27]:
# Street-to names whose lookup still returned no latlng in the last retry pass.
# still_to_find3 is the list filled by that pass; still_to_find2 was its input.
still_to_find3

[]

#### map dicts to data

In [28]:
#create dictionary for each coordinate
# Element 0 of each stored pair feeds the *_lat lookup, element 1 the *_long lookup.
dfrom_lat = {name: pair[0] for name, pair in dfrom4.items()}
dfrom_long = {name: pair[1] for name, pair in dfrom4.items()}
print(len(dfrom_lat), len(dfrom_long), sep="\n")

dto_lat = {name: pair[0] for name, pair in dto4.items()}
dto_long = {name: pair[1] for name, pair in dto4.items()}
print(len(dto_lat), len(dto_long), sep="\n")

88
88
91
91


In [29]:
#map dictionaries to data to get lat/long
# (new column, intersection column used as key, lookup dictionary)
coordinate_lookups = [
    ('from_lat', 'street_from', dfrom_lat),
    ('from_long', 'street_from', dfrom_long),
    ('to_lat', 'street_to', dto_lat),
    ('to_long', 'street_to', dto_long),
]
for column, key_column, lookup in coordinate_lookups:
    train[column] = train[key_column].map(lookup)
    test[column] = test[key_column].map(lookup)

In [30]:
#check to make sure everything got filled
# Collect every intersection string that still has no latitude, on the from- and the
# to- side and in both frames. The previous check filtered on from_lat but displayed
# street_to, and never looked at to_lat or at the test frame.
# An empty array means every intersection was resolved.
unfilled = pd.concat([
    train.loc[train['from_lat'].isnull(), 'street_from'],
    train.loc[train['to_lat'].isnull(), 'street_to'],
    test.loc[test['from_lat'].isnull(), 'street_from'],
    test.loc[test['to_lat'].isnull(), 'street_to'],
])
unfilled.unique()

array([], dtype=object)

#### add block midpoint

In [37]:
# NOTE(review): this cell carries execution count 37 and its saved output already lists
# block_coord / block_lat / block_long, which are only created further down. On a
# top-to-bottom run those columns will not be present here yet.
test.head()

Unnamed: 0,Street,From,To,Date,Time,Street.Length,street_from,street_to,from_lat,from_long,to_lat,to_long,block_coord,block_lat,block_long
0,Stockton Street,Stockton Tunnel,Clay Street,3/28/2014,16:34,35.786472,"Stockton Street & Stockton Tunnel, San Francis...","Stockton Street & Clay Street, San Francisco CA",37.792771,-122.407649,37.794064,-122.407922,"[37.7934179, -122.40778505]",37.793418,-122.407785
1,Van Ness Avenue,Hayes Street,Grove Street,3/28/2014,21:34,63.787968,"Van Ness Avenue & Hayes Street, San Francisco CA","Van Ness Avenue & Grove Street, San Francisco CA",37.777313,-122.419633,37.778279,-122.419829,"[37.777795600000005, -122.4197312]",37.777796,-122.419731
2,Van Ness Avenue,McAllister Street,Golden Gate Avenue,3/28/2014,19:50,56.007236,"Van Ness Avenue & McAllister Street, San Franc...","Van Ness Avenue & Golden Gate Avenue, San Fran...",37.780095,-122.42019,37.781046,-122.420392,"[37.7805708, -122.42029124999999]",37.780571,-122.420291
3,Mission Street,11th Street,12th Street,3/28/2014,20:02,139.6519,"Mission Street & 11th Street, San Francisco CA","Mission Street & 12th Street, San Francisco CA",37.774332,-122.417137,37.773067,-122.41872,"[37.77369985, -122.41792865]",37.7737,-122.417929
4,Hyde Street,Golden Gate Avenue,McAllister Street,3/28/2014,19:43,105.14411,"Hyde Street & Golden Gate Avenue, San Francisc...","Hyde Street & McAllister Street, San Francisco CA",37.781668,-122.415524,37.780706,-122.415334,"[37.781187, -122.41542895]",37.781187,-122.415429


In [33]:
#add midpoint
def midpoint(train):
    """Return ``[lat, long]`` halfway between the from- and to- coordinates.

    Only ``+`` and ``/`` are applied to the ``from_lat`` / ``to_lat`` /
    ``from_long`` / ``to_long`` fields, so the argument may be a single row
    (giving two scalars) or a whole DataFrame (giving two Series).
    """
    return ([(train["from_lat"] + train["to_lat"])/2, (train["from_long"] + train["to_long"])/2])

# Evaluate the helper once on the whole frame instead of row by row through
# DataFrame.apply; the resulting values are the same.
train_mid_lat, train_mid_long = midpoint(train)
# block_coord keeps the per-row [lat, long] list of plain Python floats.
train['block_coord'] = pd.Series(
    [list(pair) for pair in zip(train_mid_lat.tolist(), train_mid_long.tolist())],
    index=train.index,
)
train['block_lat'] = train_mid_lat
train['block_long'] = train_mid_long

In [34]:
#add midpoint
# The `midpoint` helper is already defined in the preceding cell, so it is not
# redefined here. The same arithmetic is applied to whole columns instead of row by
# row through DataFrame.apply; the resulting values are the same.
test_mid_lat = (test["from_lat"] + test["to_lat"]) / 2
test_mid_long = (test["from_long"] + test["to_long"]) / 2
# block_coord keeps the per-row [lat, long] list of plain Python floats.
test['block_coord'] = pd.Series(
    [list(pair) for pair in zip(test_mid_lat.tolist(), test_mid_long.tolist())],
    index=test.index,
)
test['block_lat'] = test_mid_lat
test['block_long'] = test_mid_long

In [39]:
#train = train.drop(['coord_from','coord_to','block_coord'], axis = 1)
#test = test.drop(['block_coord'], axis = 1)

### export to csv

In [42]:
# Write both geocoded frames next to the notebook, without the index column.
exports = (
    (train, "./geocoded_train.csv"),
    (test, "./geocoded_test.csv"),
)
for frame, destination in exports:
    frame.to_csv(destination, index=False)