# Goal with this is to nail down the part of the scraper that actually retreives and parses each profile's webpage.

Only example scraping health.usnews.com (granted, I didn't search too much): https://medium.com/analytics-vidhya/web-scraping-best-childrens-hospitals-for-cancer-5574db6d4090

HTML formatter to help with HTML parsing: https://webformatter.com/html

In [1]:
# Once I'm at the radiologist's webpage
from bs4 import BeautifulSoup
import requests

import json
import pandas as pd

import pprint

from urllib.request import Request, urlopen

# Take Aways
1. Ultimately I think I need to find a way to just request/pull the specific info from a page rather than the whole thing.
    - I think I can do this with py.scrapy
2. I should also look into cycling (a) IP address and (b) headers.

In [2]:
url = 'https://health.usnews.com/doctors/richard-duszak-130940'
prefix = 'https://health.usnews.com/'
user_agent = {'User-agent' : 'Mozilla/5.0'}
page = requests.get(url, headers=user_agent)
soup = BeautifulSoup(page.text, "html.parser")

In [34]:
# Narrow down the HTML to the section I want using "id"
results = soup.find(id="experience")
print(results.prettify())

#NB could directly use:
# ed_ex_elements = soup.findAll("div", {"class": "EducationAndExperience__Item-dbww3o-0 eUTnkN"}) 

<section class="content__Section-sc-1g5jtse-0 iBkvsh" id="experience">
 <h2 class="Heading__HeadingStyled-sc-1w5xk2o-0-h2 jWflL Heading-sc-1w5xk2o-1 cRrhAX" size="4" spacing="4">
  Education &amp; Experience
 </h2>
 <div class="mb5">
  <h3 class="Heading__HeadingStyled-sc-1w5xk2o-0-h3 ipbGql Heading-sc-1w5xk2o-1 cRrhAX border-bottom pb1" size="2" spacing="3">
   Medical School &amp; Residency
  </h3>
  <div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4">
   <p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">
    University of Pennsylvania Health System
   </p>
   <p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">
    Fellowship
    <span class="Span-sc-19wk4id-0 kukjnS" size="5">
     ,
     <!-- -->
     Vascular and Interventional Radiology
    </span>
   </p>
  </div>
  <div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4">
   <p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">
    Duke University Ho

In [35]:
# Find all HTML relating to Education & Experience (includes med school & residency, licenses, etc.)
ed_ex_elements = results.find_all("div", class_= "EducationAndExperience__Item-dbww3o-0 eUTnkN")

# Take a look at all the elements
for ed_ex_element in ed_ex_elements:
    print(ed_ex_element, end="\n"*2)


<div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4"><p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">University of Pennsylvania Health System</p><p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Fellowship<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Vascular and Interventional Radiology</span></p></div>

<div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4"><p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">Duke University Hospital</p><p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Residency<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!-- -->Radiology</span></p></div>

<div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4"><p class="Paragraph-sc-1iyax29-0 jyTtTz" size="5" spacing="0">York Hospital</p><p class="Paragraph-sc-1iyax29-0 kCHkVX" color="#515767" size="5" spacing="0">Internship<span class="Span-sc-19wk4id-0 kukjnS" size="5">, <!

In [36]:
# using: https://realpython.com/beautiful-soup-web-scraper-python/

# Pick out the child element I'm interested in
for ed_ex_element in ed_ex_elements:
    organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 jyTtTz").text
    education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 kCHkVX").text
    print(organization_element)
    print(education_element)
    print() # v-space between elements

University of Pennsylvania Health System
Fellowship, Vascular and Interventional Radiology

Duke University Hospital
Residency, Radiology

York Hospital
Internship, Transitional Year

Pennsylvania State University College of Medicine
Medical School

NC State Medical License
Active through 1998

PA State Medical License
Active through 2014

MS State Medical License
Active through 2015

TN State Medical License
Active through 2015

SIR Fellow


AttributeError: 'NoneType' object has no attribute 'text'

In [42]:
# Gather the radiologists' data
radiologist_name = "Richard Duszak"
# create a list to store the data
scraped_data = [] 
for ed_ex_element in ed_ex_elements:
    # initialize the dictionary
    radiologist_details = {}
    
    # parse out the exact text we want
    organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 jyTtTz").text
    education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 kCHkVX").text
   
    # add data to the dictionary
    radiologist_details['radiologist_name'] = radiologist_name
    radiologist_details['organization_element'] = organization_element
    radiologist_details['education_element'] = education_element
    
    # append the scraped data to the list
    scraped_data.append(radiologist_details)

AttributeError: 'NoneType' object has no attribute 'text'

In [43]:
# Create a data frame from the list of dictionaries and save as CSV
dataFrame = pd.DataFrame.from_dict(scraped_data)
dataFrame = to_csv('radiology_tainee_scraped_data.csv', index=False)

# View DF
dataFrame.head()