In [30]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import re
from selenium import webdriver
import os
from time import sleep

In [31]:
# Shell escape: print the path of the chromedriver executable found on PATH.
!which chromedriver

/usr/local/bin/chromedriver


In [32]:
# Open the rankings page in Chrome, keep scrolling to the bottom until the
# page stops growing (each scroll triggers more hospitals to load), then
# capture the fully loaded HTML and parse it.
url = "https://health.usnews.com/best-hospitals/rankings/cancer"

# Seconds to wait after each scroll so newly requested content can render.
SCROLL_PAUSE_TIME = 4

browser = webdriver.Chrome()
try:
    browser.get(url)

    # Height of the document before the first scroll.
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        # Jump to the current bottom of the page, then give it time to load.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(SCROLL_PAUSE_TIME)

        # If the document did not grow, nothing new was loaded: we are done.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    html_source = browser.page_source
finally:
    # Always shut the browser down, even when navigation or scrolling raised,
    # so a failed run does not leave a Chrome/chromedriver process behind.
    browser.quit()

# Create soup object using HTML source
soup = bs(html_source, "html5lib")


In [33]:
# Pull the hospital names out of the parsed page. Each name is the text of an
# <h3 class="block-flush heading-large"> element; surrounding whitespace is
# stripped and the name is converted to uppercase.
name_tags = soup.find_all('h3', {'class' : 'block-flush heading-large'})
top50_list = [tag.get_text().strip().upper() for tag in name_tags]

top50_list

    

['UNIVERSITY OF TEXAS MD ANDERSON CANCER CENTER',
 'MEMORIAL SLOAN-KETTERING CANCER CENTER',
 'MAYO CLINIC',
 "DANA-FARBER/BRIGHAM AND WOMEN'S CANCER CENTER",
 'CLEVELAND CLINIC',
 'JOHNS HOPKINS HOSPITAL',
 'SEATTLE CANCER ALLIANCE/UNIVERSITY OF WASHINGTON MEDICAL CENTER',
 'H. LEE MOFFITT CANCER CENTER AND RESEARCH INSTITUTE',
 'UCSF MEDICAL CENTER',
 'HOSPITALS OF THE UNIVERSITY OF PENNSYLVANIA-PENN PRESBYTERIAN',
 'MAYO CLINIC-PHOENIX',
 'MASSACHUSETTS GENERAL HOSPITAL',
 'NORTHWESTERN MEMORIAL HOSPITAL',
 'STANFORD HEALTH CARE-STANFORD HOSPITAL',
 'SITEMAN CANCER CENTER',
 'UNIVERSITY OF MICHIGAN HOSPITALS-MICHIGAN MEDICINE',
 'USC NORRIS CANCER HOSPITAL-KECK MEDICAL CENTER OF USC',
 'UNIVERSITY OF IOWA HOSPITALS AND CLINICS',
 'WAKE FOREST BAPTIST MEDICAL CENTER',
 'OHIO STATE UNIVERSITY JAMES CANCER HOSPITAL',
 'CITY OF HOPE HELFORD CLINICAL RESEARCH HOSPITAL',
 'UCLA MEDICAL CENTER',
 'UPMC PRESBYTERIAN SHADYSIDE',
 'MUSC HEALTH-UNIVERSITY MEDICAL CENTER',
 'NEW YORK-PRESBYTERI

In [34]:
import pandas as pd
import requests
import json
import pprint
from pandas.io.json import json_normalize
from fuzzywuzzy import fuzz
from tqdm import tqdm

In [35]:
# Download the hospitals GeoJSON dataset from ArcGIS Open Data.
url = 'https://opendata.arcgis.com/datasets/a2817bf9632a43f5ad1c6b0c153b0fab_0.geojson'

# requests applies no timeout by default, so an unresponsive server would hang
# this cell forever; bound the wait explicitly.
data = requests.get(url, timeout=60)

# Surface HTTP errors (4xx/5xx) here rather than as a confusing JSON decode
# failure in a later cell.
data.raise_for_status()
data

<Response [200]>

In [36]:
# Decode the response body as JSON.
hospitals_json = data.json()

# Display the first feature to inspect the layout of a single record.
hospitals_json["features"][0]

{'type': 'Feature',
 'properties': {'OBJECTID': 14510,
  'ID': '0010602129',
  'NAME': 'SPAULDING REHABILITATION HOSPITAL',
  'ADDRESS': '300 1ST AVE',
  'CITY': 'BOSTON',
  'STATE': 'MA',
  'ZIP': '02129',
  'ZIP4': 'NOT AVAILABLE',
  'TELEPHONE': '(617) 573-7000',
  'TYPE': 'REHABILITATION',
  'STATUS': 'OPEN',
  'POPULATION': 132,
  'COUNTY': 'SUFFOLK',
  'COUNTYFIPS': '25025',
  'COUNTRY': 'USA',
  'LATITUDE': 42.3785090760001,
  'LONGITUDE': -71.049055546,
  'NAICS_CODE': '622310',
  'NAICS_DESC': 'REHABILITATION HOSPITALS (EXCEPT ALCOHOLISM, DRUG ADDICTION)',
  'SOURCE': 'http://www.mass.gov/eohhs/gov/departments/dph/programs/hcq/healthcare-quality/',
  'SOURCEDATE': '2018-02-16T00:00:00.000Z',
  'VAL_METHOD': 'IMAGERY',
  'VAL_DATE': '2018-03-19T00:00:00.000Z',
  'WEBSITE': 'http://spauldingrehab.org/',
  'STATE_ID': '2321',
  'ALT_NAME': 'NOT AVAILABLE',
  'ST_FIPS': '25',
  'OWNER': 'NON-PROFIT',
  'TTL_STAFF': -999,
  'BEDS': 132,
  'TRAUMA': 'NOT AVAILABLE',
  'HELIPAD': 'NO

In [37]:
# For each scraped hospital name, scan every feature in the dataset and keep
# the properties of those whose NAME or ALT_NAME scores above 95 on
# fuzz.partial_ratio against that name. Each match is printed as it is found;
# `count` tracks how many records were collected.
top50_json = []
count = 0
for ranked_name in top50_list:
    for feature in hospitals_json['features']:
        props = feature['properties']
        name_hit = fuzz.partial_ratio(props['NAME'], ranked_name) > 95
        # ALT_NAME is only scored when NAME did not already match.
        if name_hit or fuzz.partial_ratio(props['ALT_NAME'], ranked_name) > 95:
            print(props['NAME'])
            top50_json.append(props)
            count += 1
        


UNIVERSITY OF TEXAS M.D. ANDERSON CANCER CENTER
MAYO CLINIC
MAYO CLINIC HOSPITAL METHODIST CAMPUS
MAYO CLINIC HEALTH SYS AUSTIN
MAYO CLINIC HEALTH SYSTEM IN WAYCROSS, INC
MAYO CLINIC HLTH SYSTM FRANCISCAN HLTHCARE SPARTA
MAYO CLINIC HEALTH SYSTEM - NORTHLAND IN BARRON
MAYO CLINIC HEALTH SYSTEM- CHIPPEWA VALLEY  INC
MAYO CLINIC HEALTH SYSTEM EAU CLAIRE HOSPITAL
MAYO CLINIC HLTH SYSTEM- FRANCISCAN MED CTR
MAYO CLINIC HEALTH SYSTEM - RED CEDAR INC
MAYO CLINIC HEALTH SYSTEM-OAKRIDGE INC
MAYO CLINIC HEALTH SYS ALBT LE
MAYO CLINIC HEALTH SYS CF
MAYO CLINIC HEALTH SYS FAIRMNT
MAYO CLINIC HEALTH SYS MANKATO
MAYO CLINIC HEALTH SYS ST JAME
MAYO CLINIC HEALTH SYSTEM IN RED WING
MAYO CLINIC HEALTH SYSTEM S F
MAYO CLINIC ARIZONA
MAYO CLINIC HOSPITAL ROCHESTER ST MARY'S CAMPUS
MAYO CLINIC HEALTH SYS WASECA
MAYO CLINIC HEALTH SYS CF
MAYO CLINIC HEALTH SYS L C
CLEVELAND CLINIC AVON HOSPITAL
CLEVELAND CLINIC REHABILITATION HOSPITAL, LLC
CLEVELAND CLINIC HOSPITAL
CLEVELAND CLINIC CHILDREN'S HOSPITAL FOR

In [47]:
# Spot-check: pretty-print every record in the dataset whose ZIP is 33612.
# The variable is deliberately not called `zip`, which would shadow the
# built-in zip() for every cell run afterwards in this kernel.
target_zip = '33612'
for feature in hospitals_json['features']:
    # An exact match is wanted here, so compare directly instead of asking a
    # fuzzy matcher for a 100% score.
    if feature['properties']['ZIP'] == target_zip:
        pprint.pprint(feature['properties'])

{'ADDRESS': '12902 MAGNOLIA DR',
 'ALT_NAME': 'NOT AVAILABLE',
 'BEDS': 206,
 'CITY': 'TAMPA',
 'COUNTRY': 'USA',
 'COUNTY': 'HILLSBOROUGH',
 'COUNTYFIPS': '12057',
 'HELIPAD': 'NOT AVAILABLE',
 'ID': '0015033612',
 'LATITUDE': 28.064098717,
 'LONGITUDE': -82.421308026,
 'NAICS_CODE': '622110',
 'NAICS_DESC': 'GENERAL MEDICAL AND SURGICAL HOSPITALS',
 'NAME': 'H LEE MOFFITT CANCER CTR & RESEARCH INST',
 'OBJECTID': 14924,
 'OWNER': 'NON-PROFIT',
 'POPULATION': 206,
 'SOURCE': 'http://www.floridahealthfinder.gov/facilitylocator/facilitysearch.aspx',
 'SOURCEDATE': '2018-02-26T00:00:00.000Z',
 'STATE': 'FL',
 'STATE_ID': '10080',
 'STATUS': 'OPEN',
 'ST_FIPS': '12',
 'TELEPHONE': '(813) 745-4673',
 'TRAUMA': 'NOT AVAILABLE',
 'TTL_STAFF': -999,
 'TYPE': 'GENERAL ACUTE CARE',
 'VAL_DATE': '2018-02-27T00:00:00.000Z',
 'VAL_METHOD': 'IMAGERY',
 'WEBSITE': 'http://www.moffitt.org',
 'ZIP': '33612',
 'ZIP4': 'NOT AVAILABLE'}
{'ADDRESS': '12502 USF PINE DR',
 'ALT_NAME': 'NOT AVAILABLE',
 'BED

In [39]:
# Show the NAME of every record collected in top50_json, one per line.
for matched in top50_json:
    facility_name = matched['NAME']
    pprint.pprint(facility_name)

'UNIVERSITY OF TEXAS M.D. ANDERSON CANCER CENTER'
'MAYO CLINIC'
'MAYO CLINIC HOSPITAL METHODIST CAMPUS'
'MAYO CLINIC HEALTH SYS AUSTIN'
'MAYO CLINIC HEALTH SYSTEM IN WAYCROSS, INC'
'MAYO CLINIC HLTH SYSTM FRANCISCAN HLTHCARE SPARTA'
'MAYO CLINIC HEALTH SYSTEM - NORTHLAND IN BARRON'
'MAYO CLINIC HEALTH SYSTEM- CHIPPEWA VALLEY  INC'
'MAYO CLINIC HEALTH SYSTEM EAU CLAIRE HOSPITAL'
'MAYO CLINIC HLTH SYSTEM- FRANCISCAN MED CTR'
'MAYO CLINIC HEALTH SYSTEM - RED CEDAR INC'
'MAYO CLINIC HEALTH SYSTEM-OAKRIDGE INC'
'MAYO CLINIC HEALTH SYS ALBT LE'
'MAYO CLINIC HEALTH SYS CF'
'MAYO CLINIC HEALTH SYS FAIRMNT'
'MAYO CLINIC HEALTH SYS MANKATO'
'MAYO CLINIC HEALTH SYS ST JAME'
'MAYO CLINIC HEALTH SYSTEM IN RED WING'
'MAYO CLINIC HEALTH SYSTEM S F'
'MAYO CLINIC ARIZONA'
"MAYO CLINIC HOSPITAL ROCHESTER ST MARY'S CAMPUS"
'MAYO CLINIC HEALTH SYS WASECA'
'MAYO CLINIC HEALTH SYS CF'
'MAYO CLINIC HEALTH SYS L C'
'CLEVELAND CLINIC AVON HOSPITAL'
'CLEVELAND CLINIC REHABILITATION HOSPITAL, LLC'
'CLEVELAND CLI

In [40]:
# Save the collected records to hospitals.json, with keys sorted and a
# 4-space indent.
with open('hospitals.json', 'w') as json_file:
    json.dump(top50_json, json_file, indent=4, sort_keys=True)