# The goal here is to nail down the part of the scraper that actually retrieves and parses each profile's webpage.

The only example I found of scraping health.usnews.com (granted, I didn't search very much): https://medium.com/analytics-vidhya/web-scraping-best-childrens-hospitals-for-cancer-5574db6d4090

HTML formatter to help with HTML parsing: https://webformatter.com/html

In [1]:
# NB: There is some overlap between the libraries used in the following tests
# For Test A
from bs4 import BeautifulSoup
import requests

import json
import pandas as pd

import pprint

# For Test B
from urllib.request import Request, urlopen

# For Test C

# Test A

In [None]:
# Test A1: download one doctor profile page and parse it with the lxml parser.
url = 'https://health.usnews.com/doctors/richard-duszak-130940'
prefix = 'https://health.usnews.com/'
# Browser-like User-Agent header sent with the request.
user_agent = {'User-agent' : 'Mozilla/5.0'}

# requests applies no timeout by default, so a stalled connection would block
# this cell forever; bound the wait explicitly.
page = requests.get(url, headers=user_agent, timeout=30)
soup = BeautifulSoup(page.text, 'lxml')

In [None]:
# Test A2: read the JSON page state out of the last <div class="mb5"> in `soup`.
# find_all() takes the tag name and the CSS class as separate arguments; passing
# 'div class="mb5"' as one string is treated as a (non-existent) tag name and
# matches nothing, which made div_tags[-1] fail.
div_tags = soup.find_all('div', class_='mb5')
div_text = div_tags[-1].text
# NOTE(review): assumes the marker below occurs in that div's text; if it does
# not, split() returns a single element and the [1] lookup raises IndexError.
div_data = json.loads(div_text.split("window['__PAGE_CONTEXT_QUERY_STATE__'] = ")[1].rstrip(';\n'))

# NOTE(review): each .get() returns None when its key is missing, so the next
# .get() in the chain would raise AttributeError -- confirm the key path exists
# for this page type.
matches = div_data.get('src/containers/pages/health/hospitals/search/index.js').get('data').get('matches')

# Test B

In [None]:
from urllib.request import Request, urlopen

# Test B: fetch the site's landing page using only the standard library.
url = 'https://health.usnews.com/'

# Send a browser-like User-Agent header with the request.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# Use the response as a context manager so the underlying connection is closed
# as soon as the body has been read, instead of being left open.
with urlopen(req) as response:
    webpage = response.read()

#soup.findAll("div", {"class": "EducationAndExperience__Item-dbww3o-0 eUTnkN"})
# soup = BeautifulSoup(html_content, "lxml")
# print(soup.prettify())

In [None]:
# Parse the raw bytes fetched in the previous cell with the lxml parser,
# then dump the whole document as indented HTML for inspection.
soup = BeautifulSoup(webpage, features="lxml")
print(soup.prettify())

# Takeaways
1. Ultimately I think I need to find a way to just request/pull the specific info from a page rather than the whole thing.
    - I think I can do this with py.scrapy
2. I should also look into cycling (a) IP address and (b) headers.

# Test C - This is the best so far

In [9]:
# Test C: download one doctor profile page and parse it with Python's built-in
# html.parser backend.
url = 'https://health.usnews.com/doctors/richard-duszak-130940'
prefix = 'https://health.usnews.com/'
# Browser-like User-Agent header sent with the request.
user_agent = {'User-agent' : 'Mozilla/5.0'}
# requests applies no timeout by default, so a stalled connection would block
# this cell forever; bound the wait explicitly.
page = requests.get(url, headers=user_agent, timeout=30)
soup = BeautifulSoup(page.text, "html.parser")

# What I originally had before 2.2.22...
# my_divs = soup.findAll("div", {"class": "EducationAndExperience__Item-dbww3o-0 eUTnkN"})
# print(my_divs)

In [17]:
# Collect every <div> whose class attribute is exactly the Education &
# Experience item class (covers med school & residency, licenses, etc.).
# NOTE(review): the class string looks auto-generated -- confirm it is stable
# across pages before relying on it.
ed_ex_elements = soup.find_all("div", class_="EducationAndExperience__Item-dbww3o-0 eUTnkN")
# Print each matched element, leaving one blank line after it.
for ed_ex_element in ed_ex_elements:
    print(ed_ex_element, end="\n\n")

<div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4"><p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">University of Pennsylvania Health System</p><p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Fellowship<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Vascular and Interventional Radiology</span></p></div>

<div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4"><p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">Duke University Hospital</p><p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Residency<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Radiology</span></p></div>

<div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4"><p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">York Hospital</p><p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Internship<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!

In [18]:
# Approach adapted from: https://realpython.com/beautiful-soup-web-scraper-python/

# For every Education & Experience entry, look up the three child tags of
# interest by their exact class string, then print them one per line followed
# by a blank separator line.
for ed_ex_element in ed_ex_elements:
    organization_element = ed_ex_element.find("p", {"class": "Paragraph-sc-1iyax29-0 jyTtTz"})
    education_element = ed_ex_element.find("p", {"class": "Paragraph-sc-1iyax29-0 kCHkVX"})
    specialization_element = ed_ex_element.find("span", {"class": "Span-sc-19wk4id-0 kukjnS"})
    print(organization_element, education_element, specialization_element, sep="\n")
    print()

<p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">University of Pennsylvania Health System</p>
<p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Fellowship<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Vascular and Interventional Radiology</span></p>
<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Vascular and Interventional Radiology</span>

<p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">Duke University Hospital</p>
<p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Residency<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Radiology</span></p>
<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Radiology</span>

<p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">York Hospital</p>
<p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Internship<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Transitional Year</span></p>
<span 

# Note 
The HTML tree for the Education and Experience section looks like it has an `<h3>` associated with each `<div class="mb5">`, where the div wraps the whole subsection and the h3 holds its title ("Medical School and Residency", etc.).
    
I could use this to get specifically just the first one, for med school and residency, somehow.