In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import re
from selenium import webdriver
import os
from time import sleep

In [2]:
# Print the path of the chromedriver executable found on PATH; the next cell
# starts Chrome through selenium's webdriver.
!which chromedriver

/usr/local/bin/chromedriver


In [3]:
# Open the Ranker list in Chrome, scroll until the page stops growing,
# then capture the rendered HTML and parse it.
url = "https://www.ranker.com/list/u-s-news-best-hospitals-cancer/ceorick"

# Seconds to pause after each scroll so newly requested items can load.
SCROLL_PAUSE_TIME = 4

browser = webdriver.Chrome()
try:
    browser.get(url)

    # Document height before any scrolling; used to detect when the page
    # has stopped growing.
    last_height = browser.execute_script("return document.body.scrollHeight")

    # Scroll to bottom of page w/ a pause - this causes all hospitals to load
    while True:
        # Scroll down to the current bottom of the document
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the page to load more content
        sleep(SCROLL_PAUSE_TIME)

        # If the height did not change, nothing new was loaded: stop.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    html_source = browser.page_source
finally:
    # Always shut the browser session down, even if navigation or a script
    # call raised, so no Chrome/chromedriver process is left running.
    browser.quit()

# Create soup object using HTML source
soup = bs(html_source, "html5lib")


In [4]:
# Pull the hospital names out of the soup object: take the text of every
# element whose class matches the listItem__title pattern, strip surrounding
# whitespace and convert to uppercase.
title_class = re.compile('^listItem__title.*')

top50_list = [
    tag.get_text().strip().upper()
    for tag in soup.find_all(class_=title_class)
]

top50_list

    

['THE UNIVERSITY OF TEXAS M. D. ANDERSON CANCER CENTER',
 'MEMORIAL SLOAN KETTERING CANCER CENTER',
 'MAYO CLINIC',
 'JOHNS HOPKINS HOSPITAL',
 'UNIVERSITY OF WASHINGTON MEDICAL CENTER',
 'DANA–FARBER CANCER INSTITUTE',
 'MASSACHUSETTS GENERAL HOSPITAL',
 'UNIVERSITY OF CALIFORNIA, SAN FRANCISCO MEDICAL CENTER',
 'CLEVELAND CLINIC',
 'RONALD REAGAN UCLA MEDICAL CENTER',
 'STANFORD HOSPITAL AND CLINICS',
 'DUKE UNIVERSITY HOSPITAL',
 'UNIVERSITY OF MICHIGAN HOSPITALS AND HEALTH CENTERS',
 'HOSPITAL OF THE UNIVERSITY OF PENNSYLVANIA',
 'UNIVERSITY OF CHICAGO MEDICAL CENTER',
 "BRIGHAM AND WOMEN'S HOSPITAL",
 'BARNES-JEWISH HOSPITAL/WASHINGTON UNIVERSITY',
 'VANDERBILT UNIVERSITY MEDICAL CENTER',
 'MOFFITT CANCER CENTER',
 'CITY OF HOPE',
 'UNIVERSITY OF MARYLAND MEDICAL CENTER',
 'UNIVERSITY OF IOWA HOSPITALS AND CLINICS',
 'YALE–NEW HAVEN HOSPITAL',
 'NEW YORK-PRESBYTERIAN UNIVERSITY HOSPITAL OF COLUMBIA AND CORNELL',
 'NYU LANGONE MEDICAL CENTER',
 'OHIO STATE UNIVERSITY JAMES CANCER H

In [5]:
import pandas as pd
import requests
import json
import pprint
from pandas.io.json import json_normalize
from fuzzywuzzy import fuzz
from tqdm import tqdm

In [6]:
# URL of the ArcGIS Open Data GeoJSON dataset to download
url = 'https://opendata.arcgis.com/datasets/a2817bf9632a43f5ad1c6b0c153b0fab_0.geojson'

# Pass a timeout: without one, requests can wait indefinitely on an
# unresponsive server.
data = requests.get(url, timeout=60)

# Raise on an HTTP error status here instead of failing later when the
# body is parsed as JSON.
data.raise_for_status()
data

<Response [200]>

In [11]:
# Decode the downloaded response body as JSON.
hospitals_json = data.json()

# Display the second entry of the 'features' list as a sample record.
features = hospitals_json['features']
features[1]

{'type': 'Feature',
 'properties': {'OBJECTID': 14511,
  'ID': '0012602537',
  'NAME': 'SPAULDING REHABILITATION HOSPITAL CAPE COD',
  'ADDRESS': '311 SERVICE ROAD',
  'CITY': 'SANDWICH',
  'STATE': 'MA',
  'ZIP': '02537',
  'ZIP4': 'NOT AVAILABLE',
  'TELEPHONE': '(508) 833-4000',
  'TYPE': 'REHABILITATION',
  'STATUS': 'OPEN',
  'POPULATION': 60,
  'COUNTY': 'BARNSTABLE',
  'COUNTYFIPS': '25001',
  'COUNTRY': 'USA',
  'LATITUDE': 41.7329844970001,
  'LONGITUDE': -70.4681028369999,
  'NAICS_CODE': '622310',
  'NAICS_DESC': 'REHABILITATION HOSPITALS (EXCEPT ALCOHOLISM, DRUG ADDICTION)',
  'SOURCE': 'http://www.mass.gov/eohhs/gov/departments/dph/programs/hcq/healthcare-quality/',
  'SOURCEDATE': '2018-02-16T00:00:00.000Z',
  'VAL_METHOD': 'IMAGERY WITH OTHER',
  'VAL_DATE': '2018-03-19T00:00:00.000Z',
  'WEBSITE': 'http://spauldingrehab.org/locations/sandwich-ma/',
  'STATE_ID': '2FXY',
  'ALT_NAME': 'NOT AVAILABLE',
  'ST_FIPS': '25',
  'OWNER': 'NON-PROFIT',
  'TTL_STAFF': -999,
  'BE

In [66]:
# Collect the properties of every dataset feature whose NAME or ALT_NAME
# fuzzy-matches one of the scraped names in top50_list.

# A partial_ratio score must exceed this value to count as a match.
MATCH_THRESHOLD = 95

top50_json = []
seen_features = set()  # id() of features already added to top50_json

# tqdm displays progress over the scraped names while the loop runs
for ranked_name in tqdm(top50_list, desc="Matching hospitals"):
    for feature in hospitals_json['features']:
        properties = feature['properties']
        is_match = (
            fuzz.partial_ratio(properties['NAME'], ranked_name) > MATCH_THRESHOLD
            or fuzz.partial_ratio(properties['ALT_NAME'], ranked_name) > MATCH_THRESHOLD
        )
        # One feature can match several scraped names; add it only once so
        # the result holds no repeated copies of the same record.
        if is_match and id(feature) not in seen_features:
            seen_features.add(id(feature))
            top50_json.append(properties)

# Number of matched records (previously tallied by hand inside the loop).
count = len(top50_json)
        


In [67]:
# Print the NAME field of each matched record, one per line.
matched_names = (record['NAME'] for record in top50_json)
for hospital_name in matched_names:
    pprint.pprint(hospital_name)

'UNIVERSITY OF TEXAS M.D. ANDERSON CANCER CENTER'
'MAYO CLINIC'
'MAYO CLINIC HOSPITAL METHODIST CAMPUS'
'MAYO CLINIC HEALTH SYS AUSTIN'
'MAYO CLINIC HEALTH SYSTEM IN WAYCROSS, INC'
'MAYO CLINIC HLTH SYSTM FRANCISCAN HLTHCARE SPARTA'
'MAYO CLINIC HEALTH SYSTEM - NORTHLAND IN BARRON'
'MAYO CLINIC HEALTH SYSTEM- CHIPPEWA VALLEY  INC'
'MAYO CLINIC HEALTH SYSTEM EAU CLAIRE HOSPITAL'
'MAYO CLINIC HLTH SYSTEM- FRANCISCAN MED CTR'
'MAYO CLINIC HEALTH SYSTEM - RED CEDAR INC'
'MAYO CLINIC HEALTH SYSTEM-OAKRIDGE INC'
'MAYO CLINIC HEALTH SYS ALBT LE'
'MAYO CLINIC HEALTH SYS CF'
'MAYO CLINIC HEALTH SYS FAIRMNT'
'MAYO CLINIC HEALTH SYS MANKATO'
'MAYO CLINIC HEALTH SYS ST JAME'
'MAYO CLINIC HEALTH SYSTEM IN RED WING'
'MAYO CLINIC HEALTH SYSTEM S F'
'MAYO CLINIC ARIZONA'
"MAYO CLINIC HOSPITAL ROCHESTER ST MARY'S CAMPUS"
'MAYO CLINIC HEALTH SYS WASECA'
'MAYO CLINIC HEALTH SYS CF'
'MAYO CLINIC HEALTH SYS L C'
'JOHNS HOPKINS HOSPITAL'
"SEATTLE CHILDREN'S HOSPITAL"
'UNIVERSITY OF WASHINGTON MEDICAL CENTER

In [69]:
# Serialize first: if a value cannot be encoded as JSON, the error is raised
# before hospitals.json is opened, so the file is never left truncated or
# partially written.
serialized = json.dumps(top50_json, sort_keys=True, indent=4)

# Write the matched records to disk with an explicit encoding rather than
# the platform default.
with open('hospitals.json', 'w', encoding='utf-8') as outfile:
    outfile.write(serialized)