In [None]:
# !pip freeze
# would check the name & version of all libraries available for import

In [None]:
import pandas as pd
from geopy import distance

In [None]:
# Load the full hospital table, then keep only the rows whose COUNTY is
# 'NEW YORK'.
df = pd.read_csv('/content/drive/My Drive/Metis Spring 2020/Hospitals.csv')
df_hosp = df[df['COUNTY'] == 'NEW YORK']
# Spot-check six randomly drawn rows (unseeded, so the rows differ per run).
df_hosp.sample(6)

In [None]:
# Load the MTA station table; its 'the_geom' column is parsed into
# longitude/latitude in the cells below.
stat_df = pd.read_csv('/content/drive/My Drive/Metis Spring 2020/MTAstations.csv')

In [None]:
# Look at the raw geometry strings before splitting them into coordinates.
stat_df['the_geom'].tail()

In [None]:
# Split each geometry string on its first two spaces into three pieces:
# a leading token ('junk'), the longitude piece and the latitude piece.
# `n` is passed by keyword because newer pandas releases make every
# str.split argument except `pat` keyword-only, and expand=True builds the
# three-column frame directly (missing values stay missing instead of
# breaking a tolist() round-trip).
stat_coords_df = stat_df['the_geom'].str.split(' ', n=2, expand=True)
stat_coords_df.columns = ['junk', 'long', 'lat']

In [None]:
# Check the split result; the parentheses are stripped in the next cell.
stat_coords_df.tail()

In [None]:
# Remove the surrounding parenthesis from each coordinate string and store
# the values on the station frame as floats.
stat_df['long'] = stat_coords_df['long'].str.strip('(').astype(float)
stat_df['lat'] = stat_coords_df['lat'].str.strip(')').astype(float)
stat_df.tail(3)

In [None]:
# Confirm the new 'long' and 'lat' columns on the first rows as well.
stat_df.head()

In [None]:
# Pair every station's latitude with its longitude as a (lat, long) tuple.
station_pairs = zip(stat_df['lat'], stat_df['long'])
stat_loc = pd.Series(station_pairs)
stat_loc.head()

In [None]:
# Start the station location table from the station names alone.
stat_loc_df = stat_df['NAME'].to_frame()
stat_loc_df.head()

In [None]:
# Attach the subway lines and the (lat, long) tuples next to each name.
stat_loc_df = stat_loc_df.assign(lines=stat_df['LINE'], coords=stat_loc)
stat_loc_df.head()

Right. Haven't been doing any kind of job of commenting thus far. I have so far pulled in both the hospital and the MTA station location data and have cleaned up the latter to form ordered pairs suitable for geopy. I need to do that for the hospital data as well, then get to work writing a routine for finding the shortest distance from a station to one of the hospitals on the list.

So far the hospital list only covers New York County (Manhattan), while the station list has not been restricted to any one county. We can either add the hospitals from the other boroughs' counties (Kings for Brooklyn, Queens, Bronx, and Richmond for Staten Island), or we can confine the station data to just Manhattan Island by finding some code that identifies points within a given polygon. PAG

First, let's knock out the closed hospitals.

In [None]:
# Boolean mask: True where the hospital's STATUS is exactly 'OPEN'.
still_open = df_hosp['STATUS'].eq('OPEN')
# Row label 5995 is The Addiction Institute of New York, which I noticed
# was closed in my sample.
still_open.loc[5995]

In [None]:
# Keep only the open hospitals, then compare the shapes before and after.
df_hosp_open = df_hosp.loc[still_open]
print(df_hosp.shape)
df_hosp_open.shape

In [None]:
# Preview the filtered hospital rows (X and Y are zipped into coords below).
df_hosp_open.head(2)

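Now we'll zip up the coordinates for the hospital locations just like we did for station locations. The only problem is the indexing: the filtered hospital frame keeps its original row labels, so the zipped Series has to be built with that same index or it will not line up with the hospital rows when it is added as a column.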

In [None]:
# Build (Y, X) tuples for the open hospitals, in the same order used for the
# stations' (lat, long) tuples. The hospital frame's own index is reused so
# the Series aligns with the filtered rows.
hospital_pairs = zip(df_hosp_open['Y'], df_hosp_open['X'])
hosp_loc = pd.Series(hospital_pairs, index=df_hosp_open.index)
hosp_loc.head()

In [None]:
# Assemble the hospital location table in one step: name, street address and
# coordinate tuple, all sharing the open-hospital index.
hosp_loc_df = pd.DataFrame({
    'NAME': df_hosp_open['NAME'],
    'address': df_hosp_open['ADDRESS'],
    'coords': hosp_loc,
})

In [None]:
# Preview the hospital location table.
hosp_loc_df.head()

In [None]:
# Check column dtypes and non-null counts for the hospital location table.
hosp_loc_df.info()

Now the task is basically to create columns in the station dataframe with the name and distance of the closest hospital to each station. Iterate over each station, find the distance to each hospital, save the name and distance of the closest hospital in the station dataframe.

In [None]:
# First, confirm that a single station-to-hospital distance can be computed.
# Start by looking at the station coordinate tuples.

stat_loc_df['coords']

In [None]:
# Show the first twenty station rows.
stat_loc_df[0:20]

In [None]:
# Row 0, column 2 ('coords'): the first station's coordinate tuple.
stat_loc_df.iloc[0,2]

In [None]:
# Distance in miles between the first station and the first open hospital
# (column 2 is 'coords' in both tables).
distance.distance(stat_loc_df.iloc[0,2],hosp_loc_df.iloc[0,2]).miles

In [None]:
# code to try the .apply(...) method to create a series of distances

def get_minimum_distance(location1, location_series, max_dist = 10):
  """Find the entry of ``location_series`` closest to ``location1``.

  Parameters
  ----------
  location1 : tuple
      Coordinate tuple, passed as the first argument to
      ``distance.distance``.
  location_series : pandas.Series
      Coordinate tuples to search. The index labels identify the locations.
  max_dist : float, optional
      Search radius in miles (default 10). Only locations strictly closer
      than this are considered.

  Returns
  -------
  list
      ``[miles, label]``: the distance in miles to the closest location and
      that location's index label in ``location_series``. When no location
      is closer than ``max_dist`` (including an empty series) the result is
      ``[nan, None]`` rather than an IndexError.
  """
  best_dist = max_dist
  best_label = None
  found = False
  # Carry the index label along while scanning, so the winner never has to
  # be recovered afterwards by comparing coordinate tuples for equality.
  for label, location in location_series.items():
    mi_away = distance.distance(location1, location).miles
    # Strict '<' keeps the first location seen when distances tie.
    if mi_away < best_dist:
      best_dist = mi_away
      best_label = label
      found = True
  if not found:
    return [float('nan'), None]
  return [best_dist, best_label]

In [None]:
# Row and column counts of the station table.
stat_loc_df.shape

In [None]:
# Try the function on the first station against every hospital coordinate
# (column 2 is 'coords'); element 1 of the result is the hospital's index label.
test_list = get_minimum_distance(stat_loc_df.iloc[0,2], hosp_loc_df.iloc[:,2])
test_list[1]

In [None]:
# Look up the name of the hospital nearest to station 0 by its index label
# and display the type of the value that comes back.
type(hosp_loc_df['NAME'].loc[test_list[1]])

Somehow, I have to iterate over each location in the station location dataframe, run this function, push the returned distance into one new column and the name called back by index into a second new column. There should be a way to do this with .apply().

Let's rewrite that function to make it easier.

In [None]:
def get_mindist_name(location1, loc_series, name_series, max_dist = 10):
  """Find the closest location to ``location1`` and return its name.

  Parameters
  ----------
  location1 : tuple
      Coordinate tuple, passed as the first argument to
      ``distance.distance``.
  loc_series : pandas.Series
      Coordinate tuples to search.
  name_series : pandas.Series
      Names indexed identically to ``loc_series``.
  max_dist : float, optional
      Search radius in miles (default 10). Only locations strictly closer
      than this are considered.

  Returns
  -------
  list
      ``[miles, name]``: the distance in miles to the closest location and
      the entry of ``name_series`` stored under that location's index
      label. When no location is closer than ``max_dist`` (including an
      empty series) the result is ``[nan, None]`` rather than an IndexError.
  """
  best_dist = max_dist
  best_label = None
  found = False
  # Carry the index label along while scanning, so the winner never has to
  # be recovered afterwards by comparing coordinate tuples for equality.
  for label, location in loc_series.items():
    mi_away = distance.distance(location1, location).miles
    # Strict '<' keeps the first location seen when distances tie.
    if mi_away < best_dist:
      best_dist = mi_away
      best_label = label
      found = True
  if not found:
    return [float('nan'), None]
  # Label-based lookup: both series share the same index.
  return [best_dist, name_series.loc[best_label]]

In [None]:
# For every station, compute [distance, hospital name] for its nearest
# hospital and collect the pairs into a two-column frame (columns 0 and 1).
nearest_pairs = [
    get_mindist_name(station_coords,
                     loc_series=hosp_loc_df['coords'],
                     name_series=hosp_loc_df['NAME'])
    for station_coords in stat_loc_df['coords']
]
stat_hosp_df = pd.DataFrame(nearest_pairs)

In [None]:
# Column 0 is the distance in miles, column 1 the nearest hospital's name.
stat_hosp_df.head(3)

In [None]:
# Place the nearest-hospital columns beside the station columns; rows are
# matched on the index shared by the two frames.
frames_side_by_side = [stat_loc_df, stat_hosp_df]
stat_hosp_full_df = pd.concat(frames_side_by_side, axis='columns', sort=False)
stat_hosp_full_df.sample(10)

In [None]:
# Give the five columns presentation names. The assignment is positional, so
# it relies on the column order produced by the concat above.
# NOTE(review): the last column holds the hospital NAME returned by
# get_mindist_name, not an identifier, although it is labelled 'Hospital ID'.
# Consider renaming it once nothing reads the exported header.
stat_hosp_full_df.columns = ['Station Name', 'Lines', 'Lat / Long', 'Hospital Distance (mi)', 'Hospital ID']
stat_hosp_full_df.tail(3)

In [None]:
# Write the combined table next to the input files. to_csv is called with
# its defaults, so the row index is written as the first, unnamed column.
stat_hosp_full_df.to_csv('/content/drive/My Drive/Metis Spring 2020/station_hospital_distances.csv')