# What factors influence hospital popularity?

The state of New York has detailed databases about health care providers, which are updated quarterly and are freely available online on: https://www.health.ny.gov/

The data includes 


In [1]:
## import libraries
import requests

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np # library to handle data in a vectorized manner
import geocoder # import geocoder to get latitude and longitude
import json # library to handle JSON files

import geopy.geocoders
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
geopy.geocoders.options.default_timeout = 60 # otherwise it will timeout at some point

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
## Define foursquare credentials and version (date)
CLIENT_ID = 'U1ALKNVSQ5OT5GCDQTE3NTUSVQVANLLFPZBOW3BMYR53AG20' # your Foursquare ID
CLIENT_SECRET = '2BGUAXJ14CEJNJQHSWRLNGA5IOJCMBSOAENHD5INTJOHLP45' # your Foursquare Secret
VERSION = '20191111'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: U1ALKNVSQ5OT5GCDQTE3NTUSVQVANLLFPZBOW3BMYR53AG20
CLIENT_SECRET:2BGUAXJ14CEJNJQHSWRLNGA5IOJCMBSOAENHD5INTJOHLP45


In [4]:
# Load in Institutional_Provider_Network_Data from the second quarter of 2019 of new york state. This dataset includes detailed information about health care
# providers in new york state, for detailed information see: https://www.health.ny.gov/health_care/managed_care/docs/dictionary.pdf
hosp_ny = pd.read_csv('../Institutional_Provider_Network_Data__2019_Quarter_2.csv', 
                      usecols=['Site Name', 'Address', 'Town/City', 'County FIPS', 'County Name', 'Zip Code', 'Latitude', 'Longitude', 'Designated Service', 
                              'Commercial Provider Indicator', 'Medicaid Provider Indicator', 'Plan Name', 'NYSOH Standard Essential Health Plan (EP) Indicator/Basic Health Program Indicator'], 
                      )
hosp_ny.head()

Unnamed: 0,Site Name,Address,Town/City,County FIPS,County Name,Zip Code,Latitude,Longitude,Designated Service,Commercial Provider Indicator,Medicaid Provider Indicator,NYSOH Standard Essential Health Plan (EP) Indicator/Basic Health Program Indicator,Plan Name
0,@PHARMACY.COM,7901 SE POWELL BLVD,PORTLAND,41051.0,Multnomah,97206.0,45.498412,-122.58035,760,1,0,0,"Crystal Run Health Plan, LLC"
1,02 SOLUTIONS,1406B ROUTE 9,CLIFTON PARK,36091.0,Saratoga,12065.0,42.825185,-73.733752,307,0,0,0,Nascentia Health Options
2,02 SOLUTIONS,1406B ROUTE 9,CLIFTON PARK,36091.0,Saratoga,12065.0,42.825185,-73.733752,307,0,0,0,Nascentia Health Options
3,ELDERWOOD AT HORNELL,1 BETHESDA DR,HORNELL,36101.0,Steuben,14843.0,42.346187,-77.660521,664,0,1,1,New York Quality Healthcare Corporation
4,ELDERWOOD AT HORNELL,1 BETHESDA DR,HORNELL,36101.0,Steuben,14843.0,42.346187,-77.660521,680,0,1,1,New York Quality Healthcare Corporation


In [5]:
hosp_ny.dtypes

Site Name                                                                              object
Address                                                                                object
Town/City                                                                              object
County FIPS                                                                           float64
County Name                                                                            object
Zip Code                                                                              float64
Latitude                                                                              float64
Longitude                                                                             float64
Designated Service                                                                      int64
Commercial Provider Indicator                                                           int64
Medicaid Provider Indicator                                 

In [6]:
len(hosp_ny)

611814

In [7]:
## This is still a lot to search through, so let's kick out places that are not hospitals
hosp_ny = hosp_ny[hosp_ny['Site Name'].str.lower().str.contains("hospital")]

In [10]:
# drop duplicate locations using latitude and longitude
hosp_ny.drop_duplicates(subset=['Latitude', 'Longitude'], inplace = True) # keeps only one occurence (the first by default) 
len(hosp_ny)

2190

In [11]:
# Get names of indexes for which column Age has value 30
del_indeces = hosp_ny[ hosp_ny['Latitude'] == 0 ].index 
del_indeces

Int64Index([406533], dtype='int64')

In [12]:
# Delete these row indexes from dataFrame
hosp_ny.drop(del_indeces, inplace=True)

In [143]:
results

{'meta': {'code': 400,
  'errorType': 'param_error',
  'errorDetail': 'Invalid geo coordinates (0.000000,0.000000)',
  'requestId': '5ddedd2914a126001bfd6d00'},
 'response': {}}

In [13]:
## Find the locations on Foursquare and get venue information using their API
fs_hospital_list = list()
radius = 50 # hospitals within 50 meters
server_errors = list()
for i in range(len(hosp_ny)):
    
    # create the API request URL
    lat = hosp_ny.Latitude.iloc[i]
    lng = hosp_ny.Longitude.iloc[i]
    search_query = "hospital"
    LIMIT = 10
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&query={}&ll={},{}&radius={}&limit={}'.format( 
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        search_query,
        lat, 
        lng, 
        radius, 
        LIMIT)

    # make the GET request
    results = requests.get(url).json()
    
    if results['meta']['code'] == 500:
        # keep track of servers errors (for which requests did we get them?)
        server_errors.append(i)
        print('Server error on {}: When requesting information for {}.'.format(i, hosp_ny.iloc[i, 0]))
    else:
        # put json results into dataframe
        hospitals = results['response']['groups'][0]['items']
        nearby_hospitals = json_normalize(hospitals)

        # accumulate dataframes from each foursquare API call in a list
        fs_hospital_list.append(nearby_hospitals)
        if (i % 10 == 0 and i < 100) or i % 100 == 0 or i == (len(hosp_ny)-1):
            print('Iteration {}: Requesting foursquare hospital information for {}.'.format(i, hosp_ny.iloc[i, 0]))

Iteration 0: Requesting foursquare hospital information for BROOKLYN HOSPITAL CENTER EXTENSION CLINIC.
Iteration 10: Requesting foursquare hospital information for ADVENTIST BOLINGBROOK HOSPITAL OUTPATIENT PHARMACY.
Iteration 20: Requesting foursquare hospital information for ALBANY MEDICAL CENTER HOSPITAL.
Iteration 30: Requesting foursquare hospital information for ALBANY MEMORIAL HOSPITAL.
Iteration 40: Requesting foursquare hospital information for ALLINA HEALTH REGINA HOSPITAL.
Iteration 50: Requesting foursquare hospital information for BROOKS-TLC HOSPITAL SYSTEM.
Iteration 60: Requesting foursquare hospital information for BAPTIST MEMORIAL HOSPITAL GOLDEN TRIANGLE OP PHARM.
Iteration 70: Requesting foursquare hospital information for ROCHESTER GENERAL HOSPITAL.
Iteration 80: Requesting foursquare hospital information for HEALTHALLIANCE HOSPITAL MARY'S AVENUE CAMPUS.
Iteration 90: Requesting foursquare hospital information for BINGHAMTON GENERAL HOSPITAL (UHS).
Iteration 100: Req

In [14]:
hosp_ny.head(2)

Unnamed: 0,Site Name,Address,Town/City,County FIPS,County Name,Zip Code,Latitude,Longitude,Designated Service,Commercial Provider Indicator,Medicaid Provider Indicator,NYSOH Standard Essential Health Plan (EP) Indicator/Basic Health Program Indicator,Plan Name
1967,BROOKLYN HOSPITAL CENTER EXTENSION CLINIC,771 61ST ST,BROOKLYN,36047.0,Kings,11220.0,40.635357,-74.010594,321,0,0,0,Commercial Travelers Life Insurance Company: C...
2654,SYOSSET HOSPITAL,4821 8TH AVE,BROOKLYN,36047.0,Kings,11220.0,40.641672,-74.002746,760,0,0,0,Centerlight Healthcare


In [15]:
## combine data frames from individual API calls into one large data frame
df_fs_hospitals = pd.concat(fs_hospital_list, sort = False)
df_fs_hospitals.head()

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.location.crossStreet,venue.venuePage.id,venue.location.neighborhood
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4b7de0a9f964a5204dd82fe3-0,"[{'id': '4bf58dd8d48988d196941735', 'name': 'H...",4b7de0a9f964a5204dd82fe3,1 Norton Ave,US,Oneonta,United States,35,"[1 Norton Ave, Oneonta, NY 13820, United States]","[{'label': 'display', 'lat': 42.45789622011591...",42.457896,-75.052299,13820,NY,A.O. Fox Hospital,0,[],,,
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4e7cfcedf5b954e0f64bceb3-0,"[{'id': '4bf58dd8d48988d196941735', 'name': 'H...",4e7cfcedf5b954e0f64bceb3,1555 Long Pond Rd,US,Rochester,United States,38,"[1555 Long Pond Rd, Rochester, NY 14626, Unite...","[{'label': 'display', 'lat': 43.191376, 'lng':...",43.191376,-77.702024,14626,NY,Unity Health System TCC,0,[],,,
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-5bb3ce2d1c0b34002c393773-0,"[{'id': '4bf58dd8d48988d196941735', 'name': 'H...",5bb3ce2d1c0b34002c393773,89 Genesee St,US,Rochester,United States,6,"[89 Genesee St, Rochester, NY 14611, United St...","[{'label': 'display', 'lat': 43.14874918934384...",43.148749,-77.637162,14611,NY,Daisy Marquis Jones Walk-In Care Center,0,[],,,
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4b9fcec9f964a520894037e3-1,"[{'id': '4bf58dd8d48988d196941735', 'name': 'H...",4b9fcec9f964a520894037e3,89 Genesee St,US,Rochester,United States,6,"[89 Genesee St, Rochester, NY 14611, United St...","[{'label': 'display', 'lat': 43.14873864577942...",43.148739,-77.637156,14611,NY,St. Mary's Unity Hospital,0,[],,,
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-5b6c953edff815002c849ce7-0,"[{'id': '4bf58dd8d48988d196941735', 'name': 'H...",5b6c953edff815002c849ce7,130 E 77th St,US,New York,United States,17,"[130 E 77th St, New York, NY 10075, United Sta...","[{'label': 'display', 'lat': 40.77368709367118...",40.773687,-73.960336,10075,NY,Lenox Hill Hospital Heart & Lung,0,[],,,


In [16]:
## extract columns of interest
filtered_columns = ['venue.id', 'venue.location.address', 'venue.location.city', 
                    'venue.location.lat', 'venue.location.lng', 'venue.location.state', 
                    'venue.location.distance', 'venue.name', 'venue.location.formattedAddress']
df_fs_hospitals = df_fs_hospitals.loc[:, filtered_columns]
df_fs_hospitals.head()

Unnamed: 0,venue.id,venue.location.address,venue.location.city,venue.location.lat,venue.location.lng,venue.location.state,venue.location.distance,venue.name,venue.location.formattedAddress
0,4b7de0a9f964a5204dd82fe3,1 Norton Ave,Oneonta,42.457896,-75.052299,NY,35,A.O. Fox Hospital,"[1 Norton Ave, Oneonta, NY 13820, United States]"
0,4e7cfcedf5b954e0f64bceb3,1555 Long Pond Rd,Rochester,43.191376,-77.702024,NY,38,Unity Health System TCC,"[1555 Long Pond Rd, Rochester, NY 14626, Unite..."
0,5bb3ce2d1c0b34002c393773,89 Genesee St,Rochester,43.148749,-77.637162,NY,6,Daisy Marquis Jones Walk-In Care Center,"[89 Genesee St, Rochester, NY 14611, United St..."
1,4b9fcec9f964a520894037e3,89 Genesee St,Rochester,43.148739,-77.637156,NY,6,St. Mary's Unity Hospital,"[89 Genesee St, Rochester, NY 14611, United St..."
0,5b6c953edff815002c849ce7,130 E 77th St,New York,40.773687,-73.960336,NY,17,Lenox Hill Hospital Heart & Lung,"[130 E 77th St, New York, NY 10075, United Sta..."


In [17]:
## delete redundant hospitals
#df_fs_hospitals.drop_duplicates(subset='venue.id', keep="last", inplace = True)
#df_fs_hospitals["venue.id"].value_counts()
len(df_fs_hospitals)

1118

In [19]:
## Rename columns in foursquare hospital data frame
df_fs_hospitals.rename(columns={"venue.id": "fs_id",
                                "venue.location.address": "fs_address",
                                      "venue.location.city": "fs_city",
                                      "venue.location.lat": "fs_lat",
                                      "venue.location.lng": "fs_lng",
                                      "venue.location.state": "fs_state",
                                      "venue.location.distance": "fs_distance",
                                      "venue.location.formattedAddress": "fs_formattedAddress",
                                      "venue.name": "fs_name"}, inplace = True)
df_fs_hospitals.head()

Unnamed: 0,fs_id,fs_address,fs_city,fs_lat,fs_lng,fs_state,fs_distance,fs_name,fs_formattedAddress
0,4b7de0a9f964a5204dd82fe3,1 Norton Ave,Oneonta,42.457896,-75.052299,NY,35,A.O. Fox Hospital,"[1 Norton Ave, Oneonta, NY 13820, United States]"
0,4e7cfcedf5b954e0f64bceb3,1555 Long Pond Rd,Rochester,43.191376,-77.702024,NY,38,Unity Health System TCC,"[1555 Long Pond Rd, Rochester, NY 14626, Unite..."
0,5bb3ce2d1c0b34002c393773,89 Genesee St,Rochester,43.148749,-77.637162,NY,6,Daisy Marquis Jones Walk-In Care Center,"[89 Genesee St, Rochester, NY 14611, United St..."
1,4b9fcec9f964a520894037e3,89 Genesee St,Rochester,43.148739,-77.637156,NY,6,St. Mary's Unity Hospital,"[89 Genesee St, Rochester, NY 14611, United St..."
0,5b6c953edff815002c849ce7,130 E 77th St,New York,40.773687,-73.960336,NY,17,Lenox Hill Hospital Heart & Lung,"[130 E 77th St, New York, NY 10075, United Sta..."


In [20]:
## even similar locations have slightly different geospatial coordinates...
df_fs_hospitals.loc[df_fs_hospitals["fs_name"] == "Brylin ospital", :]

Unnamed: 0,fs_id,fs_address,fs_city,fs_lat,fs_lng,fs_state,fs_distance,fs_name,fs_formattedAddress


In [21]:
hosp_ny.loc[hosp_ny["Address"] == "1263 DELAWARE AVE","Site Name":"Longitude"]

Unnamed: 0,Site Name,Address,Town/City,County FIPS,County Name,Zip Code,Latitude,Longitude
73452,BRY LIN HOSPITAL,1263 DELAWARE AVE,BUFFALO,36029.0,Erie,14209.0,42.918387,-78.867668


In [25]:
df_fs_hospitals.head()

Unnamed: 0,fs_id,fs_address,fs_city,fs_lat,fs_lng,fs_state,fs_distance,fs_name,fs_formattedAddress
0,4b7de0a9f964a5204dd82fe3,1 Norton Ave,Oneonta,42.457896,-75.052299,NY,35,A.O. Fox Hospital,"[1 Norton Ave, Oneonta, NY 13820, United States]"
0,4e7cfcedf5b954e0f64bceb3,1555 Long Pond Rd,Rochester,43.191376,-77.702024,NY,38,Unity Health System TCC,"[1555 Long Pond Rd, Rochester, NY 14626, Unite..."
0,5bb3ce2d1c0b34002c393773,89 Genesee St,Rochester,43.148749,-77.637162,NY,6,Daisy Marquis Jones Walk-In Care Center,"[89 Genesee St, Rochester, NY 14611, United St..."
1,4b9fcec9f964a520894037e3,89 Genesee St,Rochester,43.148739,-77.637156,NY,6,St. Mary's Unity Hospital,"[89 Genesee St, Rochester, NY 14611, United St..."
0,5b6c953edff815002c849ce7,130 E 77th St,New York,40.773687,-73.960336,NY,17,Lenox Hill Hospital Heart & Lung,"[130 E 77th St, New York, NY 10075, United Sta..."


In [46]:
df_fs_hospitals['test'] = [','.join(map(str, l)) for l in df_fs_hospitals['fs_formattedAddress']]
df_fs_hospitals.head()

Unnamed: 0,fs_id,fs_address,fs_city,fs_lat,fs_lng,fs_state,fs_distance,fs_name,fs_formattedAddress,test
0,4b7de0a9f964a5204dd82fe3,1 Norton Ave,Oneonta,42.457896,-75.052299,NY,35,A.O. Fox Hospital,"[1 Norton Ave, Oneonta, NY 13820, United States]","1 Norton Ave,Oneonta, NY 13820,United States"
0,4e7cfcedf5b954e0f64bceb3,1555 Long Pond Rd,Rochester,43.191376,-77.702024,NY,38,Unity Health System TCC,"[1555 Long Pond Rd, Rochester, NY 14626, Unite...","1555 Long Pond Rd,Rochester, NY 14626,United S..."
0,5bb3ce2d1c0b34002c393773,89 Genesee St,Rochester,43.148749,-77.637162,NY,6,Daisy Marquis Jones Walk-In Care Center,"[89 Genesee St, Rochester, NY 14611, United St...","89 Genesee St,Rochester, NY 14611,United States"
1,4b9fcec9f964a520894037e3,89 Genesee St,Rochester,43.148739,-77.637156,NY,6,St. Mary's Unity Hospital,"[89 Genesee St, Rochester, NY 14611, United St...","89 Genesee St,Rochester, NY 14611,United States"
0,5b6c953edff815002c849ce7,130 E 77th St,New York,40.773687,-73.960336,NY,17,Lenox Hill Hospital Heart & Lung,"[130 E 77th St, New York, NY 10075, United Sta...","130 E 77th St,New York, NY 10075,United States"


In [47]:
df_fs_hospitals.rename(columns={"test": "fs_longAddress"}, inplace = True)

In [48]:
df_fs_hospitals.head()

Unnamed: 0,fs_id,fs_address,fs_city,fs_lat,fs_lng,fs_state,fs_distance,fs_name,fs_formattedAddress,fs_longAddress
0,4b7de0a9f964a5204dd82fe3,1 Norton Ave,Oneonta,42.457896,-75.052299,NY,35,A.O. Fox Hospital,"[1 Norton Ave, Oneonta, NY 13820, United States]","1 Norton Ave,Oneonta, NY 13820,United States"
0,4e7cfcedf5b954e0f64bceb3,1555 Long Pond Rd,Rochester,43.191376,-77.702024,NY,38,Unity Health System TCC,"[1555 Long Pond Rd, Rochester, NY 14626, Unite...","1555 Long Pond Rd,Rochester, NY 14626,United S..."
0,5bb3ce2d1c0b34002c393773,89 Genesee St,Rochester,43.148749,-77.637162,NY,6,Daisy Marquis Jones Walk-In Care Center,"[89 Genesee St, Rochester, NY 14611, United St...","89 Genesee St,Rochester, NY 14611,United States"
1,4b9fcec9f964a520894037e3,89 Genesee St,Rochester,43.148739,-77.637156,NY,6,St. Mary's Unity Hospital,"[89 Genesee St, Rochester, NY 14611, United St...","89 Genesee St,Rochester, NY 14611,United States"
0,5b6c953edff815002c849ce7,130 E 77th St,New York,40.773687,-73.960336,NY,17,Lenox Hill Hospital Heart & Lung,"[130 E 77th St, New York, NY 10075, United Sta...","130 E 77th St,New York, NY 10075,United States"


In [None]:
def isMatch(row):
    for i in row['b']:
        if i == row['a']:
            return 'Match'
    return 'Not Match'

df['c'] = df.apply(lambda x: isMatch(x), axis=1)
print(df)

In [50]:
df_fs_hospitals.fs_longAddress in hosp_ny.Address

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [187]:
## I can use the geocoder library to find the gps coordinates by address in each dataset, hopefully after that they will
## match (this will mean I delete all datasets that do not have a valid address)

In [None]:
## set geocoder timeout quite high, to avoid errors
geopy.geocoders.options.default_timeout = 60 

In [23]:
## Let's start with the hosp_ny data, for this I need to combine the Address, with city and state
hosp_ny["Complete Address"] = hosp_ny["Address"] + ", " + hosp_ny["Town/City"]# + ", NY, USA"
hosp_ny["Complete Address"].head()

1967           771 61ST ST, BROOKLYN
2654          4821 8TH AVE, BROOKLYN
3645    1 FOXCARE CENTER DR, ONEONTA
3646          1 FOX CARE DR, ONEONTA
3647           1 NORTON AVE, ONEONTA
Name: Complete Address, dtype: object

In [24]:
addresses = hosp_ny.loc[:,"Complete Address"].unique()
len(addresses)

2178

In [234]:
## get latitude and longitude for each city in the hosp_states dataframe
latitudes = [None] * len(addresses)
longitudes = [None] * len(addresses)
lost_cities = list()
for i in range(len(addresses)):
    address = addresses[i]
    geolocator = Nominatim(user_agent="random_app_name")
    loc = geolocator.geocode(address)
    if loc:
        lat = loc.latitude
        lng = loc.longitude
        latitudes[i] = lat
        longitudes[i] = lng
        if (i % 10 == 0 and i < 100) or i % 100 == 0 or i == (len(addresses)-1):
            print('{}: The geograpical coordinates of {} are {}, {}.'.format(i, address, lat, lng))
    else:
        latitudes[i] = float('NaN')
        longitudes[i] = float('NaN')
        lost_cities.append(address)
        print('The geograpical coordinates of {} are not available.'.format(address))

0: The geograpical coordinates of 771 61ST ST, BROOKLYN are 40.6353895714286, -74.0107771428572.
The geograpical coordinates of 1 FOX CARE DR, ONEONTA are not available.
10: The geograpical coordinates of 500 REMINGTON BLVD, BOLINGBROOK are 41.68143085, -88.0845936435491.
The geograpical coordinates of 701 WINTHROP AVE, GLENDALE HEIGHT are not available.
The geograpical coordinates of 178 WASHINGTON AVENUE EXT, ALBANY are not available.
20: The geograpical coordinates of 66 HACKETT BLVD, ALBANY are 42.644409543877, -73.7805358552287.
The geograpical coordinates of 453 ROUTE 146, CLIFTON PARK are not available.
The geograpical coordinates of 1019 NEW LOUDON RD, COHOES are not available.
The geograpical coordinates of 600 NORTHERN BLVD, GREAT NECK are not available.
30: The geograpical coordinates of 279 TROY RD, RENSSELAER are 42.64009795, -73.7013338042844.
The geograpical coordinates of CAPITAL DISTRICT INTERNAL MEDICINE 1440, ALBANY are not available.
The geograpical coordinates of 8

KeyboardInterrupt: 

In [None]:
lost_cities

In [None]:
# create addresses data frame
df_addresses = pd.DataFrame({'address': addresses, 'lat': latitudes, 'lng': longitudes})
df_addresses.head()

In [None]:
df_fs_hospitals[df_fs_hospitals['Address'].notnull()]['Address'].apply(lambda s: s.replace('Ct', 'Court')

In [243]:
hosp_ny.head()

Unnamed: 0,Site Name,Address,Town/City,County FIPS,County Name,Zip Code,Latitude,Longitude,Designated Service,Commercial Provider Indicator,Medicaid Provider Indicator,NYSOH Standard Essential Health Plan (EP) Indicator/Basic Health Program Indicator,Plan Name,Complete Address
1967,BROOKLYN HOSPITAL CENTER EXTENSION CLINIC,771 61ST ST,BROOKLYN,36047.0,Kings,11220.0,40.635357,-74.010594,321,0,0,0,Commercial Travelers Life Insurance Company: C...,"771 61ST ST, BROOKLYN"
2654,SYOSSET HOSPITAL,4821 8TH AVE,BROOKLYN,36047.0,Kings,11220.0,40.641672,-74.002746,760,0,0,0,Centerlight Healthcare,"4821 8TH AVE, BROOKLYN"
3645,ARNOT HEALTH - ST JOSEPHS HOSPITAL LABORATORY,1 FOXCARE CENTER DR,ONEONTA,36077.0,Otsego,13820.0,42.454789,-75.060921,599,0,0,0,"Health Insurance Plan of Greater New York, Inc.","1 FOXCARE CENTER DR, ONEONTA"
3646,ARNOT HEALTH - ST JOSEPHS HOSPITAL LABORATORY,1 FOX CARE DR,ONEONTA,36077.0,Otsego,13820.0,42.456411,-75.029288,599,0,0,0,"Health Insurance Plan of Greater New York, Inc.","1 FOX CARE DR, ONEONTA"
3647,AURELIA OSBORN FOX MEMORIAL HOSPITAL,1 NORTON AVE,ONEONTA,36077.0,Otsego,13820.0,42.458118,-75.05199,11,0,0,0,"UnitedHealthcare of New York, Inc.: QHP","1 NORTON AVE, ONEONTA"


In [None]:
df_fs_hospitals

In [None]:
## create geo_dataframes including latitude and longitude in combined form so I can compare directly
!conda install --channel conda-forge geopandas
import geopandas
gdf_hosp_ny = geopandas.GeoDataFrame(
    hosp_ny, geometry=geopandas.points_from_xy(hosp_ny.Longitude, hosp_ny.Latitude))

In [None]:
## Finally, merge foursquare hospital list with Provider Network Data System (PNDS) list 
hosp_merged = pd.merge(left=hosp_ny,right=df_fs_hospitals, how='left', left_on='Site Name', right_on='fs_name')
hosp_merged

In [235]:
## check state names, adjust to fit with list from american hospital directory and kick out hospitals in adjacent states
df_fs_hospitals['fs_state'] = df_fs_hospitals['fs_state'].str.lower()
df_fs_hospitals['fs_state'].head()

0    ny
0    ny
0    ny
1    ny
0    ny
Name: fs_state, dtype: object

In [236]:
## what labels do we have for state?
df_fs_hospitals['fs_state'].unique()

array(['ny', 'ga', 'ar', 'wi', 'nc', 'ms', 'mo', 'mi', 'pa', 'ma',
       'massachusetts', 'mn', 'co', 'ca', 'al', 'il', 'new york', 'or',
       'in', 'oh', 'ky', 'michigan', 'nj', 'd.c.', 'minnesota', 'va',
       'ohio', 'ct', 'connecticut', 'md', 'fl', 'tennessee', 'tn', 'ia',
       'pennsylvania', 'az', 'ok', 'nd', 'nm', 'georgia', 'ut', 'sc',
       'south carolina', 'ne', 'colorado', 'wa', 'wisconsin', 'la',
       'illinois', 'tx', 'missouri', 'indiana', 'ks', 'nh'], dtype=object)

In [237]:
## substitute spelled-out new jersey or new york for abbreviations
df_fs_hospitals.loc[df_fs_hospitals['fs_state'] == 'new york', 'fs_state'] = 'ny'
## and kick out other states
df_fs_hospitals = df_fs_hospitals[df_fs_hospitals['fs_state'].isin(['ny'])]
## drop nans
df_fs_hospitals.dropna(inplace = True)
## reset index
df_fs_hospitals.reset_index(drop=True, inplace = True)
len(df_fs_hospitals)

538

In [238]:
## we should now only have ny as value for fs_state
df_fs_hospitals['fs_state'].unique()

array(['ny'], dtype=object)

In [239]:
df_fs_hospitals.head(1)

Unnamed: 0,fs_id,fs_address,fs_city,fs_lat,fs_lng,fs_state,fs_distance,fs_name
0,4b7de0a9f964a5204dd82fe3,1 NORTON AVE,Oneonta,42.457896,-75.052299,ny,35,A.O. Fox Hospital


In [240]:
hosp_ny.head(1)

Unnamed: 0,Site Name,Address,Town/City,County FIPS,County Name,Zip Code,Latitude,Longitude,Designated Service,Commercial Provider Indicator,Medicaid Provider Indicator,NYSOH Standard Essential Health Plan (EP) Indicator/Basic Health Program Indicator,Plan Name,Complete Address
1967,BROOKLYN HOSPITAL CENTER EXTENSION CLINIC,771 61ST ST,BROOKLYN,36047.0,Kings,11220.0,40.635357,-74.010594,321,0,0,0,Commercial Travelers Life Insurance Company: C...,"771 61ST ST, BROOKLYN"


In [241]:
## convert all address words to upper case in both dataframes
df_fs_hospitals['fs_address'] = df_fs_hospitals['fs_address'].str.upper()
hosp_ny['Address'] = hosp_ny['Address'].str.upper()


In [None]:
hosp_ny

In [None]:
## try to join dataframes by address. To do so first convert all abbreviations to full names, so when we find a
## blvd, convert to boulevard, rd = road, ave = avenue etc. To get an idea of what abbreviations exist check data
## visually:


In [None]:
edit[edit['Home'].notnull()]['Home'].apply(lambda s: s.replace('Ct', 'Court')



In [None]:
## Finally, merge foursquare hospital list with Provider Network Data System (PNDS) list 
hosp_merged = pd.merge(left=hosp_ny,right=df_fs_hospitals, how='left', left_on='Site Name', right_on='fs_name')
hosp_merged

In [90]:
## It's probably safer to convert all strings to lowercase
df_fs_hospitals['fs_name'] = df_fs_hospitals['fs_name'].str.upper()


In [74]:
hosp_ny.head()

Unnamed: 0,Site Name,Address,Town/City,County FIPS,County Name,Zip Code,Latitude,Longitude,Designated Service,Commercial Provider Indicator,Medicaid Provider Indicator,NYSOH Standard Essential Health Plan (EP) Indicator/Basic Health Program Indicator,Plan Name
1967,BROOKLYN HOSPITAL CENTER EXTENSION CLINIC,771 61ST ST,BROOKLYN,36047.0,Kings,11220.0,40.635357,-74.010594,321,0,0,0,Commercial Travelers Life Insurance Company: C...
3645,ARNOT HEALTH - ST JOSEPHS HOSPITAL LABORATORY,1 FOXCARE CENTER DR,ONEONTA,36077.0,Otsego,13820.0,42.454789,-75.060921,599,0,0,0,"Health Insurance Plan of Greater New York, Inc."
3646,ARNOT HEALTH - ST JOSEPHS HOSPITAL LABORATORY,1 FOX CARE DR,ONEONTA,36077.0,Otsego,13820.0,42.456411,-75.029288,599,0,0,0,"Health Insurance Plan of Greater New York, Inc."
3647,AURELIA OSBORN FOX MEMORIAL HOSPITAL,1 NORTON AVE,ONEONTA,36077.0,Otsego,13820.0,42.458118,-75.05199,11,0,0,0,"UnitedHealthcare of New York, Inc.: QHP"
3667,AURELIA OSBORN FOX MEMORIAL HOSPITAL - TRI TOW...,43 PEARL ST W,SIDNEY,36025.0,Delaware,13838.0,42.303881,-75.394419,11,0,0,0,Consolidated Health Plans: Cigna


In [75]:
df_fs_hospitals.head()

Unnamed: 0,fs_id,fs_city,fs_lat,fs_lng,fs_state,fs_name,fs_address
0,4b7de0a9f964a5204dd82fe3,Oneonta,42.457896,-75.052299,ny,A.O. FOX HOSPITAL,"oneonta, ny"
1,4c33793b66e40f47d810c88b,Sidney,42.304305,-75.395098,ny,TRI-TOWN REGIONAL HOSPITAL,"sidney, ny"
2,4c1b68b1b306c928db1462b7,Latham,42.738103,-73.785534,ny,ALBANY MEDICAL PRIMARY CARE OF INTERNAL MEDICI...,"latham, ny"
3,4bcc38853740b713b39d6365,New York,40.873344,-73.913031,ny,NEWYORK-PRESBYTERIAN/THE ALLEN HOSPITAL,"new york, ny"
4,520a502511d23e9294c427dd,New York,40.873451,-73.913857,ny,ALLEN HOSPITAL BREASTFEEDING FAIR,"new york, ny"


In [79]:
!conda install -c conda-forge fuzzywuzzy --yes 
from fuzzywuzzy import process

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\sbuer\Anaconda3

  added / updated specs:
    - fuzzywuzzy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    fuzzywuzzy-0.17.0          |             py_0          18 KB  conda-forge
    python-levenshtein-0.12.0  |py37hfa6e2cd_1001          80 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          98 KB

The following NEW packages will be INSTALLED:

  fuzzywuzzy         conda-forge/noarch::fuzzywuzzy-0.17.0-py_0
  python-levenshtein conda-forge/win-64::python-levenshtein-0.12.0-py37hfa6e2cd_1001



Downloading and Extracting Packages

fuzzywuzzy-0.17.0    | 18 KB     |            |   0% 
fuzzywuzzy-0.17.0    | 18 KB     | ########8  |  88%

In [None]:
str2Match = "apple inc"
strOptions = ["Apple Inc.","apple park","apple incorporated","iphone"]
Ratios = process.extract(str2Match,strOptions)
print(Ratios)
# You can also select the string with the highest matching percentage
highest = process.extractOne(str2Match,strOptions)
print(highest)

In [76]:
## Finally, merge foursquare hospital list with Provider Network Data System (PNDS) list 
hosp_merged = pd.merge(left=hosp_ny,right=df_fs_hospitals, how='left', left_on='Site Name', right_on='fs_name')
hosp_merged

Unnamed: 0,Site Name,Address,Town/City,County FIPS,County Name,Zip Code,Latitude,Longitude,Designated Service,Commercial Provider Indicator,Medicaid Provider Indicator,NYSOH Standard Essential Health Plan (EP) Indicator/Basic Health Program Indicator,Plan Name,fs_id,fs_city,fs_lat,fs_lng,fs_state,fs_name,fs_address
0,BROOKLYN HOSPITAL CENTER EXTENSION CLINIC,771 61ST ST,BROOKLYN,36047.0,Kings,11220.0,40.635357,-74.010594,321,0,0,0,Commercial Travelers Life Insurance Company: C...,,,,,,,
1,ARNOT HEALTH - ST JOSEPHS HOSPITAL LABORATORY,1 FOXCARE CENTER DR,ONEONTA,36077.0,Otsego,13820.0,42.454789,-75.060921,599,0,0,0,"Health Insurance Plan of Greater New York, Inc.",,,,,,,
2,ARNOT HEALTH - ST JOSEPHS HOSPITAL LABORATORY,1 FOX CARE DR,ONEONTA,36077.0,Otsego,13820.0,42.456411,-75.029288,599,0,0,0,"Health Insurance Plan of Greater New York, Inc.",,,,,,,
3,AURELIA OSBORN FOX MEMORIAL HOSPITAL,1 NORTON AVE,ONEONTA,36077.0,Otsego,13820.0,42.458118,-75.05199,11,0,0,0,"UnitedHealthcare of New York, Inc.: QHP",,,,,,,
4,AURELIA OSBORN FOX MEMORIAL HOSPITAL - TRI TOW...,43 PEARL ST W,SIDNEY,36025.0,Delaware,13838.0,42.303881,-75.394419,11,0,0,0,Consolidated Health Plans: Cigna,,,,,,,
5,LENOX HILL HOSPITAL,1155 NORTHERN BLVD,MANHASSET,36059.0,Nassau,11030.0,40.790982,-73.703647,914,1,0,0,"Oxford Health Plans (NY), Inc.: Metro",4a4cc69df964a52040ad1fe3,New York,40.773796,-73.961115,ny,LENOX HILL HOSPITAL,"new york, ny"
6,ADVENTIST BOLINGBROOK HOSPITAL OUTPATIENT PHAR...,500 REMINGTON BLVD,BOLINGBROOK,17197.0,Will,60440.0,41.679569,-88.083883,760,1,0,0,"Crystal Run Health Plan, LLC",,,,,,,
7,ADVENTIST GLENOAKS HOSPITAL OUTPATIENT PHARMACY,701 WINTHROP AVE,GLENDALE HEIGHT,17043.0,DuPage,60139.0,41.914654,-88.057238,760,1,0,0,"Crystal Run Health Plan, LLC",,,,,,,
8,ALBANY MEDICAL CENTER HOSPITAL,1367 WASHINGTON AVE,ALBANY,36001.0,Albany,12206.0,42.686774,-73.811284,914,0,0,0,"Health Insurance Plan of Greater New York, Inc.",,,,,,,
9,ALBANY MEDICAL CENTER HOSPITAL,178 WASHINGTON AVENUE EXT,ALBANY,36001.0,Albany,12203.0,42.699538,-73.851458,599,1,1,1,"Capital District Physicians' Health Plan, Inc....",,,,,,,


In [470]:
df_fs_hospitals.dtypes

fs_id          object
fs_city        object
fs_lat        float64
fs_lng        float64
fs_state       object
fs_name        object
fs_address     object
dtype: object

In [471]:
hosp_states.dtypes

name                      object
city                      object
staffed_beds               int64
total_discharges           int64
patient_days               int64
gross_patient_revenue     object
state                     object
address                   object
lat                      float64
lng                      float64
dtype: object

In [472]:
# Function to find all close matches of  
# input string in given list of possible strings 
from difflib import get_close_matches 
  
print(get_close_matches(hosp_states.loc[0,'name'], df_fs_hospitals.loc[:,'fs_name'])) 


['salem medical center', 'maimonides medical center', 'alice hyde medical center']


In [479]:
hosp_states.loc[0,'city']

'North Bergen'

In [481]:
print(get_close_matches(hosp_states.loc[0,'city'], df_fs_hospitals.loc[:,'fs_city']))

TypeError: object of type 'float' has no len()

In [414]:
venue_id = df_fs_hospitals.loc[1,'fs_id']
venue_id

'4af2b0def964a5203fe821e3'

In [425]:
url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
url    

'https://api.foursquare.com/v2/venues/4af2b0def964a5203fe821e3?client_id=U1ALKNVSQ5OT5GCDQTE3NTUSVQVANLLFPZBOW3BMYR53AG20&client_secret=2BGUAXJ14CEJNJQHSWRLNGA5IOJCMBSOAENHD5INTJOHLP45&v=20191111'

In [426]:
result = requests.get(url).json()
result['response']['venue']['likes']['count']

26

In [417]:
result['response']['venue']

{'id': '4af2b0def964a5203fe821e3',
 'name': "Mount Sinai St. Luke's",
 'contact': {'phone': '2125234000',
  'formattedPhone': '(212) 523-4000',
  'twitter': 'mountsinainyc',
  'facebook': '190647927632800',
  'facebookUsername': 'mountsinainyc',
  'facebookName': 'The Mount Sinai Hospital'},
 'location': {'address': '1111 Amsterdam Avenue',
  'crossStreet': 'at West 114th St.',
  'lat': 40.80557,
  'lng': -73.961321,
  'labeledLatLngs': [{'label': 'display', 'lat': 40.80557, 'lng': -73.961321}],
  'postalCode': '10025',
  'cc': 'US',
  'city': 'New York',
  'state': 'NY',
  'country': 'United States',
  'formattedAddress': ['1111 Amsterdam Avenue (at West 114th St.)',
   'New York, NY 10025',
   'United States']},
 'canonicalUrl': 'https://foursquare.com/v/mount-sinai-st-lukes/4af2b0def964a5203fe821e3',
 'categories': [{'id': '4bf58dd8d48988d196941735',
   'name': 'Hospital',
   'pluralName': 'Hospitals',
   'shortName': 'Hospital',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categ

In [418]:
# put json results into dataframe
test = results['response']['groups'][0]['items']
#nearby_hospitals = json_normalize(hospitals)

In [419]:
result = requests.get(url).json()
print(result['response']['venue'].keys())


dict_keys(['id', 'name', 'contact', 'location', 'canonicalUrl', 'categories', 'verified', 'stats', 'url', 'likes', 'dislike', 'ok', 'venueRatingBlacklisted', 'beenHere', 'specials', 'photos', 'reasons', 'description', 'storeId', 'page', 'hereNow', 'createdAt', 'tips', 'shortUrl', 'timeZone', 'listed', 'hours', 'popular', 'pageUpdates', 'inbox', 'attributes', 'bestPhoto', 'colors'])


In [448]:
result['response']['venues']['tips']['phrases']

KeyError: 'venues'

In [459]:
result['response']['venue']['tips']['groups']

[{'type': 'others',
  'name': 'All tips',
  'count': 26,
  'items': [{'id': '4d756820381fa35da227275d',
    'createdAt': 1299540000,
    'text': 'Best ER service',
    'type': 'user',
    'canonicalUrl': 'https://foursquare.com/item/4d756820381fa35da227275d',
    'lang': 'en',
    'likes': {'count': 6,
     'groups': [{'type': 'others',
       'count': 6,
       'items': [{'id': '44774553',
         'firstName': 'Jonathan',
         'lastName': 'Ramos',
         'gender': 'male',
         'photo': {'prefix': 'https://fastly.4sqi.net/img/user/',
          'suffix': '/44774553-ZG2Q5YOSUJFLAOGC.jpg'}},
        {'id': '8852447',
         'firstName': 'Santi',
         'gender': 'male',
         'photo': {'prefix': 'https://fastly.4sqi.net/img/user/',
          'suffix': '/8852447-3CQLDNH1SZVYIBWJ.jpg'}}]}],
     'summary': '6 likes'},
    'logView': True,
    'agreeCount': 2,
    'disagreeCount': 0,
    'todo': {'count': 0},
    'user': {'id': '6133945',
     'firstName': 'Jenna',
     'ge

In [None]:
## Not so sure if it actually makes sense to combine them anymore... simply continue only with the FS data
## What we need to know now is whether there is wifi access or not
for venue_id in df_fs_hospitals['fs_id']
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
    

In [336]:
## Now sub-select the hospitals from the original list (from the american hospital directory)
hosp_states.head()

Unnamed: 0,name,city,staffed_beds,total_discharges,patient_days,gross_patient_revenue,state,address,lat,lng
0,Palisades Medical Center,North Bergen,202,9575,41821,"$955,944",nj,"North Bergen, nj",40.804267,-74.012084
1,AtlantiCare Regional Medical Center - Mainland...,Pomona,0,0,0,$0,nj,"Pomona, nj",39.464194,-74.545266
2,AtlantiCare Regional Medical Center -Atlantic ...,Atlantic City,520,29917,129930,"$3,608,586",nj,"Atlantic City, nj",39.364285,-74.422935
3,Bayonne Medical Center,Bayonne,163,5083,23070,"$1,603,840",nj,"Bayonne, nj",40.668714,-74.114309
4,Bayshore Medical Center,Holmdel,169,7212,37434,"$853,103",nj,"Holmdel, nj",40.345109,-74.184032


In [78]:
len(df_filt)

50

In [2]:
tips = result['response']['tips']['items']

tip = result['response']['tips']['items'][0]
tip.keys()

NameError: name 'result' is not defined