# Goal with this is to nail down the part of the scraper that actually retreives and parses each profile's webpage.

Only example scraping health.usnews.com (granted, I didn't search too much): https://medium.com/analytics-vidhya/web-scraping-best-childrens-hospitals-for-cancer-5574db6d4090

HTML formatter to help with HTML parsing: https://webformatter.com/html

In [1]:
# Scraping
from bs4 import BeautifulSoup
import requests

import json
import pandas as pd

import pprint

from urllib.request import Request, urlopen

from time import sleep
from random import randint

In [2]:
# Progress counter/tracker
from contextlib import contextmanager
from timeit import default_timer
import sys
from datetime import timedelta

# Define a timer w/ decoractor to use with the loop
@contextmanager
def elapsed_timer():
    start=default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start

# Notes
2. I should also look into cycling (a) IP address and (b) headers.

# Test with one radiologist

In [8]:
child_url = 'doctors/richard-duszak-130940'
prefix = 'https://health.usnews.com/'
user_agent = {'User-agent' : 'Mozilla/5.0'}
page = requests.get(prefix+child_url, headers=user_agent)
soup = BeautifulSoup(page.text, "html.parser")

In [9]:
# Narrow down the HTML to the section I want using "id"
results = soup.find(id="experience")
print(results.prettify())

#NB could directly use:
# ed_ex_elements = soup.findAll("div", {"class": "EducationAndExperience__Item-dbww3o-0 eUTnkN"}) 

<section class="content__Section-sc-1g5jtse-0 iBkvsh" id="experience">
 <h2 class="Heading-sc-1w5xk2o-0 hqXNgZ">
  Education &amp; Experience
 </h2>
 <div class="mb5">
  <h3 class="Heading-sc-1w5xk2o-0 hemROW border-bottom pb1">
   Medical School &amp; Residency
  </h3>
  <div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4">
   <p class="Paragraph-sc-1iyax29-0 eRvRyE" size="5" spacing="0">
    University of Southern California/LACUSC Medical Center
   </p>
   <p class="Paragraph-sc-1iyax29-0 hwNctc" color="#515767" size="5" spacing="0">
    Fellowship
    <span class="Span-sc-19wk4id-0 fgWqyH" size="5">
     ,
     <!-- -->
     University of Southern California/LAC+USC Medical Center
    </span>
   </p>
  </div>
  <div class="EducationAndExperience__Item-dbww3o-0 eUTnkN" spacing="4">
   <p class="Paragraph-sc-1iyax29-0 eRvRyE" size="5" spacing="0">
    Cedars-Sinai Medical Center
   </p>
   <p class="Paragraph-sc-1iyax29-0 hwNctc" color="#515767" size="5" spacing="0">

In [10]:
# Find all HTML relating to Education & Experience (includes med school & residency, licenses, etc.)
ed_ex_elements = results.find_all("div", class_= "EducationAndExperience__Item-dbww3o-0 eUTnkN")

# Pick out the child element I'm interested in
for ed_ex_element in ed_ex_elements:
    organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
    try:
        education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 hwNctc").text
    except:
        pass
    print(organization_element)
    print(education_element)
    print() # v-space between elements

University of Southern California/LACUSC Medical Center
Fellowship, University of Southern California/LAC+USC Medical Center

Cedars-Sinai Medical Center
Residency, Cedars-Sinai Medical Center

Tucson Hospitals Medical Education Program Inc
Internship, Tucson Hospitals Medical Education

CA State Medical License
Active through 2023

MD State Medical License
Active through 2019

KS State Medical License
Active through 2020

FL State Medical License
Active through 2021

MI State Medical License
Active through 2021

NJ State Medical License
Active through 2021

NY State Medical License
Active through 2021

PA State Medical License
Active through 2021

SC State Medical License
Active through 2021

TX State Medical License
Active through 2021

WA State Medical License
Active through 2021

MO State Medical License
Active through 2022

VA State Medical License
Active through 2022

IN State Medical License
Active through 2023

OR State Medical License
Active through 2023

AZ State Medical Lice

In [26]:
## Trying to feed the previous block into a DF vs. print

# Gather the radiologists' data
radiologist_name = "Richard Duszak"
radiologist_id = 1
# create a list to store the data
scraped_data = [] 
for ed_ex_element in ed_ex_elements:
    # initialize the dictionary
    radiologist_details = {}
    
    # parse out the exact text we want
    # organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
    try:
        organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
        education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 hwNctc").text
    except:
        continue
   
    # add data to the dictionary
    radiologist_details['radiologist_id'] = radiologist_id
    radiologist_details['radiologist_name'] = radiologist_name
    radiologist_details['organization_element'] = organization_element
    radiologist_details['education_element'] = education_element
    
    # append the dictionaries to the list
    scraped_data.append(radiologist_details)
    
    
print(scraped_data)

[{'radiologist_name': 'Richard Duszak', 'organization_element': 'University of Pennsylvania Health System', 'education_element': 'Fellowship, University of Pennsylvania Health System'}, {'radiologist_name': 'Richard Duszak', 'organization_element': 'Duke University Hospital', 'education_element': 'Residency, Duke University Hospital'}, {'radiologist_name': 'Richard Duszak', 'organization_element': 'York Hospital', 'education_element': 'Internship, York Hospital'}, {'radiologist_name': 'Richard Duszak', 'organization_element': 'Pennsylvania State University College of Medicine', 'education_element': 'Medical School'}, {'radiologist_name': 'Richard Duszak', 'organization_element': 'NC State Medical License', 'education_element': 'Active through 1998'}, {'radiologist_name': 'Richard Duszak', 'organization_element': 'PA State Medical License', 'education_element': 'Active through 2014'}, {'radiologist_name': 'Richard Duszak', 'organization_element': 'MS State Medical License', 'education_e

In [94]:
## Create and format a DF from the list of dictionaries
df_scraped = pd.DataFrame.from_dict(scraped_data)

# Remove rows where License is recorded
df_scraped = df_scraped[df_scraped['organization_element'].str.contains("License") == False]

# Remove redundant organization data after comma in education_element
counter = 0
for x in df_scraped['education_element']:
    if "," in x:
        x = x.split(",")[:-1]
        df_scraped["education_element"][counter] = x[0] #remove the annoying brackets
    counter += 1
    
# View DF
df_scraped

Unnamed: 0,radiologist_name,organization_element,education_element
0,Richard Duszak,University of Pennsylvania Health System,Fellowship
1,Richard Duszak,Duke University Hospital,Residency
2,Richard Duszak,York Hospital,Internship
3,Richard Duszak,Pennsylvania State University College of Medicine,Medical School


# Testing how to also get verification data (e.g., name, city, etc.)

In [124]:
# Find all HTML from profile page header (includes name, address, etc.)
verif_elements = soup.find_all("div", class_= "Hero__ContentWrapper-sc-1lw4wit-0 eZBhPz mt4")

# Pick out the child elements I'm interested in
for verif_element in verif_elements:
    address_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ysuVA").text
    name_element = verif_element.find("h1", class_="Heading__HeadingStyled-sc-1w5xk2o-0 kYBDwy Heading-sc-1w5xk2o-1 Hero__Name-sc-1lw4wit-3 cRrhAX iZgYrY").text
    degree_element = verif_element.find("p", class_="Heading__HeadingStyled-sc-1w5xk2o-0-p kzoSbH Heading-sc-1w5xk2o-1 cRrhAX").text
    city_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 Hero__Location-sc-1lw4wit-6 iOniyG jTBKbU").text
    gender_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ewEPVS flex").text
    gndr_xp_lang_element = verif_element.find("div", class_="Hero__MoreInfo-sc-1lw4wit-11 fLhfWJ").text
    print(address_element)
    print(name_element)
    print(degree_element)
    print(city_element)
    print(gender_element)
    print(gndr_xp_lang_element)

5665 Peachtree Dunwoody Rd, Atlanta, GA
Dr. Richard L. Duszak
MD
Atlanta, GA
Male
Male21+ Yrs ExperienceEnglish


In [120]:
# Gather the radiologists' data
radiologist_id = child_url
# Create lists to store the data
scraped_ed_ex_data = [] 
scraped_verif_data = []

# Source education and experience data
for ed_ex_element in ed_ex_elements:
    # Initialize the dictionary
    radiologist_details = {}
    
    # Parse out the exact text we want
    try:
        organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
        education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 hwNctc").text
    except:
        continue
   
    # Add data to the dictionary
    radiologist_details['radiologist_id'] = radiologist_id
    radiologist_details['organization_element'] = organization_element
    radiologist_details['education_element'] = education_element
    
    # Append the dictionaries to the list
    scraped_ed_ex_data.append(radiologist_details)

# Source verification data
for verif_element in verif_elements:
    # Initialize the dictionary
    radiologist_verif_info = {}
    
    # Parse out the exact text we want
    try:
        address_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ysuVA").text
        name_element = verif_element.find("h1", class_="Heading__HeadingStyled-sc-1w5xk2o-0 kYBDwy Heading-sc-1w5xk2o-1 Hero__Name-sc-1lw4wit-3 cRrhAX iZgYrY").text
        degree_element = verif_element.find("p", class_="Heading__HeadingStyled-sc-1w5xk2o-0-p kzoSbH Heading-sc-1w5xk2o-1 cRrhAX").text
        city_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 Hero__Location-sc-1lw4wit-6 iOniyG jTBKbU").text
        gender_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ewEPVS flex").text
        gndr_xp_lang_element = verif_element.find("div", class_="Hero__MoreInfo-sc-1lw4wit-11 fLhfWJ").text
    except:
        continue
        
    # Add data to the dictionary
    radiologist_verif_info['radiologist_id'] = radiologist_id
    radiologist_verif_info['address'] = address_element
    radiologist_verif_info['name'] = name_element
    radiologist_verif_info['degree'] = degree_element
    radiologist_verif_info['city'] = city_element
    radiologist_verif_info['gender'] = gender_element
    radiologist_verif_info['years_xp'] = gndr_xp_lang_element
    
    # Append the dictionaries to the list
    scraped_verif_data.append(radiologist_verif_info)


In [125]:
scraped_ed_ex_data[0:3]

[{'radiologist_id': 'doctors/richard-duszak-130940',
  'organization_element': 'University of Pennsylvania Health System',
  'education_element': 'Fellowship, University of Pennsylvania Health System'},
 {'radiologist_id': 'doctors/richard-duszak-130940',
  'organization_element': 'Duke University Hospital',
  'education_element': 'Residency, Duke University Hospital'},
 {'radiologist_id': 'doctors/richard-duszak-130940',
  'organization_element': 'York Hospital',
  'education_element': 'Internship, York Hospital'}]

In [123]:
scraped_verif_data

[{'radiologist_id': 'doctors/richard-duszak-130940',
  'address': '5665 Peachtree Dunwoody Rd, Atlanta, GA',
  'name': 'Dr. Richard L. Duszak',
  'degree': 'MD',
  'city': 'Atlanta, GA',
  'gender': 'Male',
  'years_xp': 'Male21+ Yrs ExperienceEnglish'}]

# Testing with two radiologists

In [10]:
# Testing with two radiologists
radiologists = [
    {"name": "richard duszack", 
     "child_url": "doctors/richard-duszak-130940"},
    {"name": "lisa abramson",
     "child_url": "doctors/lisa-abramson-863145"}
]

radiologists[1]["child_url"]

[0, 1]

In [20]:
# Create lists to store the data
scraped_ed_ex_data = [] 
scraped_verif_data = []

for rad in range(0,len(radiologists)):
    child_url = radiologists[rad]["child_url"]  #part of request url + radiologist ID for relational DFs
    prefix = 'https://health.usnews.com/'
    user_agent = {'User-agent' : 'Mozilla/5.0'}
    page = requests.get(prefix+child_url, headers=user_agent)
    soup = BeautifulSoup(page.text, "html.parser")
    
    # Find all HTML relating to Education & Experience (includes med school & residency, licenses, etc.)
    ed_ex_results = soup.find(id="experience")
    ed_ex_elements = ed_ex_results.find_all("div", class_= "EducationAndExperience__Item-dbww3o-0 eUTnkN")
    
    # Find all HTML from profile page header (includes name, address, etc.)
    verif_elements = soup.find_all("div", class_= "Hero__ContentWrapper-sc-1lw4wit-0 eZBhPz mt4")
    
    # Source education and experience data
    for ed_ex_element in ed_ex_elements:
        # Initialize the dictionary
        radiologist_details = {}

        # Parse out the exact text we want
        try:
            organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
            radiologist_details['organization_element'] = organization_element
        except:
            continue
        
        try:
            education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 hwNctc").text
            radiologist_details['education_element'] = education_element
        except:
            continue
            
        # Add data to the dictionary
        radiologist_details['radiologist_id'] = child_url
#         radiologist_details['organization_element'] = organization_element
#         radiologist_details['education_element'] = education_element

        # Append the dictionaries to the list
        scraped_ed_ex_data.append(radiologist_details)

    # Source verification data
    for verif_element in verif_elements:
        # Initialize the dictionary
        radiologist_verif_info = {}

        # Parse out the exact text we want & Add data to the dictionary
        try:
            address_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ysuVA").text
            radiologist_verif_info['address'] = address_element
        except:
            continue
        
        try:
            name_element = verif_element.find("h1", class_="Heading__HeadingStyled-sc-1w5xk2o-0 kYBDwy Heading-sc-1w5xk2o-1 Hero__Name-sc-1lw4wit-3 cRrhAX iZgYrY").text
            radiologist_verif_info['name'] = name_element
        except:
            continue
        
        try:
            degree_element = verif_element.find("p", class_="Heading__HeadingStyled-sc-1w5xk2o-0-p kzoSbH Heading-sc-1w5xk2o-1 cRrhAX").text
            radiologist_verif_info['degree'] = degree_element
        except:
            continue
        
        try:
            city_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 Hero__Location-sc-1lw4wit-6 iOniyG jTBKbU").text
            radiologist_verif_info['city'] = city_element
        except:
            continue
        
        try:
            gender_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ewEPVS flex").text
            radiologist_verif_info['gender'] = gender_element
        except:
            continue

        # Add data to the dictionary
        radiologist_verif_info['radiologist_id'] = child_url
#         radiologist_verif_info['address'] = address_element
#         radiologist_verif_info['name'] = name_element
#         radiologist_verif_info['degree'] = degree_element
#         radiologist_verif_info['city'] = city_element
#         radiologist_verif_info['gender'] = gender_element

        # Append the dictionaries to the list
        scraped_verif_data.append(radiologist_verif_info)

In [21]:
scraped_ed_ex_data

[{'organization_element': 'University of Pennsylvania Health System',
  'education_element': 'Fellowship, University of Pennsylvania Health System',
  'radiologist_id': 'doctors/richard-duszak-130940'},
 {'organization_element': 'Duke University Hospital',
  'education_element': 'Residency, Duke University Hospital',
  'radiologist_id': 'doctors/richard-duszak-130940'},
 {'organization_element': 'York Hospital',
  'education_element': 'Internship, York Hospital',
  'radiologist_id': 'doctors/richard-duszak-130940'},
 {'organization_element': 'Pennsylvania State University College of Medicine',
  'education_element': 'Medical School',
  'radiologist_id': 'doctors/richard-duszak-130940'},
 {'organization_element': 'NC State Medical License',
  'education_element': 'Active through 1998',
  'radiologist_id': 'doctors/richard-duszak-130940'},
 {'organization_element': 'PA State Medical License',
  'education_element': 'Active through 2014',
  'radiologist_id': 'doctors/richard-duszak-130940

In [22]:
scraped_verif_data

[{'address': '5665 Peachtree Dunwoody Rd, Atlanta, GA',
  'name': 'Dr. Richard L. Duszak',
  'degree': 'MD',
  'city': 'Atlanta, GA',
  'gender': 'Male',
  'radiologist_id': 'doctors/richard-duszak-130940'},
 {'address': '325 West 15th Street, New York, NY',
  'name': 'Dr. Lisa L. Abramson (Pitlor)',
  'degree': 'MD',
  'city': 'New York, NY',
  'gender': 'Female',
  'radiologist_id': 'doctors/lisa-abramson-863145'}]

# Try the full loop with all radiologists

In [37]:
# Read in the Part 1 data (child urls)
child_url_list_pre = pd.read_csv(r"C:\Users\ssantavicca3\Documents\Work Files & Folders\RadiologyTrainees_NotABot\datadump\first stage\url_list.csv")
child_url_list_pre

Unnamed: 0,zipcode,url
0,90067,/doctors/ivan-rosen-472665
1,90067,/doctors/omid-bendavid-706939
2,90067,/doctors/joshua-friedlander-845573
3,90067,/doctors/joshua-friedlander-845573
4,90067,/physician-assistants/maria-palmer-2168490
...,...,...
892458,33916,/doctors/gregory-baran-314251
892459,33916,/doctors/gregory-baran-314251
892460,33916,/doctors/paul-wozney-364234
892461,33916,/doctors/bruce-macdonald-59727


In [38]:
child_url_list_pre["url"][0]

'/doctors/ivan-rosen-472665'

In [40]:
# Sort out duplicate rows by url
import numpy as np

child_url_list = []
for i in range(0, len(child_url_list_pre)):
    child_url_list.append(child_url_list_pre["url"][i])
    
def unique(lst):
    x = np.array(lst)
    print(len(np.unique(x)))

unique(child_url_list) # count of unique child_urls or providers to be iterated over

39025


In [45]:
child_url_list[0]

'/doctors/ivan-rosen-472665'

In [47]:
# Create lists to store the data
scraped_ed_ex_data = [] 
scraped_verif_data = []

with elapsed_timer() as elapsed:
    n_iter = 0
    for i in range(0,len(child_url_list)):
        child_url = child_url_list[i]  #part of request url + radiologist ID for relational DFs
        prefix = 'https://health.usnews.com'
        user_agent = {'User-agent' : 'Mozilla/5.0'}
        page = requests.get(prefix+child_url, headers=user_agent)
        soup = BeautifulSoup(page.text, "html.parser")

        # Find all HTML relating to education & experience (includes med school & residency, licenses, etc.)
        ed_ex_results = soup.find(id="experience")
        ed_ex_elements = ed_ex_results.find_all("div", class_= "EducationAndExperience__Item-dbww3o-0 eUTnkN")

        # Find all HTML from profile page header (includes name, address, etc.)
        verif_elements = soup.find_all("div", class_= "Hero__ContentWrapper-sc-1lw4wit-0 eZBhPz mt4")

        # Source education and experience data
        for ed_ex_element in ed_ex_elements:
            # Initialize the dictionary
            radiologist_details = {}

            # Parse out the exact text we want
            try:
                organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
                radiologist_details['organization_element'] = organization_element
            except:
                pass

            try:
                education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 hwNctc").text
                radiologist_details['education_element'] = education_element
            except:
                pass

            # Add data to the dictionary
            radiologist_details['radiologist_id'] = child_url

            # Append the dictionaries to the list
            scraped_ed_ex_data.append(radiologist_details)

        # Source verification data
        for verif_element in verif_elements:
            # Initialize the dictionary
            radiologist_verif_info = {}

            # Parse out the exact text we want & Add data to the dictionary
            try:
                address_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ysuVA").text
                radiologist_verif_info['address'] = address_element
            except:
                pass

            try:
                name_element = verif_element.find("h1", class_="Heading__HeadingStyled-sc-1w5xk2o-0 kYBDwy Heading-sc-1w5xk2o-1 Hero__Name-sc-1lw4wit-3 cRrhAX iZgYrY").text
                radiologist_verif_info['name'] = name_element
            except:
                pass

            try:
                degree_element = verif_element.find("p", class_="Heading__HeadingStyled-sc-1w5xk2o-0-p kzoSbH Heading-sc-1w5xk2o-1 cRrhAX").text
                radiologist_verif_info['degree'] = degree_element
            except:
                pass

            try:
                city_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 Hero__Location-sc-1lw4wit-6 iOniyG jTBKbU").text
                radiologist_verif_info['city'] = city_element
            except:
                pass

            try:
                gender_element = verif_element.find("p", class_="Paragraph-sc-1iyax29-0 ewEPVS flex").text
                radiologist_verif_info['gender'] = gender_element
            except:
                pass

            # Add data to the dictionary
            radiologist_verif_info['radiologist_id'] = child_url

            # Append the dictionaries to the list
            scraped_verif_data.append(radiologist_verif_info)
            
        # Counter and timer for progress checks  
        n_iter += 1
        if n_iter % 1000 == 0:
            print("Iteration (radiologists): "+str(n_iter)+" ----- Time Elapsed: "+str(timedelta(seconds=round(elapsed()))))
            sys.stdout.flush()

            # Save intermediate output incase of crash or timeout
            filename1 = "datadump/second stage/saved_ed_ex_list_iter"+str(n_iter)+".txt"
            with open(filename1, 'w') as f:
                for item in scraped_ed_ex_data:
                    f.write(f'{item}\n')
                    sys.stdout.flush()
            filename2 = "datadump/second stage/saved_verif_list_iter"+str(n_iter)+".txt"
            with open(filename2, 'w') as f:
                for item in scraped_verif_data:
                    f.write(f'{item}\n')
                    sys.stdout.flush()

        # Iteration delay
        sleep(randint(1,3))

Iteration (radiologists): 1000 ----- Time Elapsed: 0:45:04
Iteration (radiologists): 2000 ----- Time Elapsed: 1:31:15


ConnectionError: HTTPSConnectionPool(host='health.usnews.com', port=443): Max retries exceeded with url: //doctors/chelsea-pyle-1116030 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000200810717F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [51]:
child_url_list_pre[child_url_list_pre['url'] == "/doctors/chelsea-pyle-1116030"]

Unnamed: 0,zipcode,url
2240,2180,/doctors/chelsea-pyle-1116030
45916,2115,/doctors/chelsea-pyle-1116030
92961,1854,/doctors/chelsea-pyle-1116030
92962,1854,/doctors/chelsea-pyle-1116030
127758,1830,/doctors/chelsea-pyle-1116030
129130,2467,/doctors/chelsea-pyle-1116030
129131,2467,/doctors/chelsea-pyle-1116030
144998,2144,/doctors/chelsea-pyle-1116030
169395,2141,/doctors/chelsea-pyle-1116030
171913,1842,/doctors/chelsea-pyle-1116030


# Progress Tracker

#### Started program @ 4:59PM on Tuesday (3/7)

Iteration (radiologists): 1000 ----- Time Elapsed: 0:45:04
Iteration (radiologists): 2000 ----- Time Elapsed: 1:31:15

ConnectionError: HTTPSConnectionPool(host='health.usnews.com', port=443): Max retries exceeded with url: //doctors/chelsea-pyle-1116030 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000200810717F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))


## KEY UPDATE: 3/8/22 - I REALIZED THE ACTUALLY TRAINING SPECIALTY DATA IS NO LONGER PRESENT ON THE SITE AND CONTACTED US.HEALTH.NEWS. IF ABLE TO CONTINUE, I LEFT OFF THIS PROGRAM HAVING TRIED TO RUN THE FULL PRESUMABLY READY LOOP ONLY TO SEE THAT:

### (A) FOR SOME ED_EX_DATA INFO WAS MISSING OR JUMBLED, SO I FOUND THAT THERE WAS AN ADDITIONAL <\P> CLASS THAT CAPTURED MED SCHOOL AND OTHER TRAINING IN MANY CASES. (BELOW; SOME OF THIS NEEDS TO BE TRANSLATED TO THE FULL LOOP; I HAVEN'T DONE YET)

### (B) BARELY ANY OF THE VERIF_DATA MADE IT THROUGH - I HAVE NOT GOTTEN AROUND TO FIXING THIS UP YET, BUT I THINK I'LL NEED TO INCLUDE SOME MORE TRY/EXCEPT BLOCKS OR LOOK FOR MISSING TAG CLASSES.

- When trying to fix everything up run the loop through 100 people and try to produce effectively the end results (or at least what I'd send to SAS for further processing).

### SEPARATELY, I FIXED UP THE POST-LOOP DATA PROCESSING STEP TO CONVERT TO DATA FRAME (BELOW).

- Although, it may be best to entirely save the preprocessing until after all data is pulled to avoid loosing stuff I don't want to lose.

In [92]:
#scraped_ed_ex_data

## Create and format a DF from the list of dictionaries
scraped_ed_ex_data = pd.DataFrame.from_dict(scraped_ed_ex_data)

# Remove rows where License is recorded
scraped_ed_ex_data = scraped_ed_ex_data[scraped_ed_ex_data['organization_element'].str.contains("License") == False]

# Remove redundant organization data after comma in education_element
counter = 0
for x in scraped_ed_ex_data['education_element']:
    if "," in x:
        x = x.split(",")[:-1]
        scraped_ed_ex_data["education_element"][counter] = x[0] #remove the annoying brackets
    counter += 1
    
# View DF
scraped_ed_ex_data
scraped_ed_ex_data.to_csv("scraped_ed_ex_data.csv", encoding='utf-8', index=True)

In [67]:
scraped_verif_data

[{'address': '4650 Sunset Blvd, Los Angeles, CA',
  'name': 'Dr. Paul J. Iskander',
  'degree': 'MD',
  'city': 'Los Angeles, CA',
  'gender': 'Male',
  'radiologist_id': '/doctors/paul-iskander-690045'},
 {'address': 'Renaissance Imaging Medical Associates, Northridge, CA',
  'name': 'Dr. Ryan P. Cramer',
  'degree': 'MD',
  'city': 'Northridge, CA',
  'gender': 'Male',
  'radiologist_id': '/doctors/ryan-cramer-706967'},
 {'address': 'Renaissance Imaging Medical Associates, Northridge, CA',
  'name': 'Dr. Ryan P. Cramer',
  'degree': 'MD',
  'city': 'Northridge, CA',
  'gender': 'Male',
  'radiologist_id': '/doctors/ryan-cramer-706967'},
 {'address': '1325 Eastmoreland Avenue Suite 545, Memphis, TN',
  'name': 'Dr. George S. Flinn',
  'degree': 'MD',
  'city': 'Memphis, TN',
  'gender': 'Male',
  'radiologist_id': '/doctors/george-flinn-348765'},
 {'address': "St. Jude Children's Research Hospital, Memphis, TN",
  'name': 'Dr. David E. Buechner',
  'degree': 'MD',
  'city': 'Memphis, 

In [173]:
child_url = "/doctors/richard-duszak-130940"
prefix = 'https://health.usnews.com'
user_agent = {'User-agent' : 'Mozilla/5.0'}
page = requests.get(prefix+child_url, headers=user_agent)
soup = BeautifulSoup(page.text, "html.parser")

results = soup.find(id="experience")

# Find all HTML relating to Education & Experience (includes med school & residency, licenses, etc.)
ed_ex_elements = results.find_all("div", class_= "EducationAndExperience__Item-dbww3o-0 eUTnkN")
ed_ex_elements2 = results.find_all("div", class_= "EducationAndExperience__Item-dbww3o-0 bMIddY")
for add_element in ed_ex_elements2:
    ed_ex_elements.append(add_element)

# Pick out the child element I'm interested in
for ed_ex_element in ed_ex_elements:
    organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
    try:
        education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 hwNctc").text
    except:
        pass
    print(organization_element)
    print(education_element)
    print() # v-space between elements

University of Pennsylvania Health System
Fellowship, University of Pennsylvania Health System

Duke University Hospital
Residency, Duke University Hospital

York Hospital
Internship, York Hospital

Pennsylvania State University College of Medicine
Medical School

NC State Medical License
Active through 1998

PA State Medical License
Active through 2014

MS State Medical License
Active through 2015

TN State Medical License
Active through 2015

SIR Fellow
Active through 2015

Alpha Omega Alpha Honor Medical Society, 1987
Active through 2015

Andrew E. Yeates Memorial Award for Excellence in Neuroradiology, 1993
Active through 2015

Annual AuntMinnie.com  Minnies Awards for Excellence in Radiology
Active through 2015

Best Doctors in America, 2001
Active through 2015

Best Doctors in Memphis, 2011
Active through 2015

Best Doctors in Memphis, 2012
Active through 2015

Biology Department Outstanding Graduate Award, 1985
Active through 2015

Calhoun Award for Outstanding Contributions to R

In [188]:
## Trying to feed the previous block into a DF vs. print

# Gather the radiologists' data
radiologist_name = "richard-duszak"
radiologist_id = 1
# create a list to store the data
scraped_data_test = [] 

for ed_ex_element in ed_ex_elements:
    # initialize the dictionary
    radiologist_details = {}
    
    # parse out the exact text we want
    try:
        organization_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 eRvRyE").text
    except:
        pass
    try:
        education_element = ed_ex_element.find("p", class_="Paragraph-sc-1iyax29-0 hwNctc").text
    except:
        pass
   
    # add data to the dictionary
    radiologist_details['radiologist_id'] = radiologist_id
    radiologist_details['radiologist_name'] = radiologist_name
    radiologist_details['organization_element'] = organization_element
    radiologist_details['education_element'] = education_element
    
    # append the dictionaries to the list
    scraped_data_test.append(radiologist_details)
    
    
scraped_data_test

[{'radiologist_id': 1,
  'radiologist_name': 'richard-duszak',
  'organization_element': 'University of Pennsylvania Health System',
  'education_element': 'Fellowship, University of Pennsylvania Health System'},
 {'radiologist_id': 1,
  'radiologist_name': 'richard-duszak',
  'organization_element': 'Duke University Hospital',
  'education_element': 'Residency, Duke University Hospital'},
 {'radiologist_id': 1,
  'radiologist_name': 'richard-duszak',
  'organization_element': 'York Hospital',
  'education_element': 'Internship, York Hospital'},
 {'radiologist_id': 1,
  'radiologist_name': 'richard-duszak',
  'organization_element': 'Pennsylvania State University College of Medicine',
  'education_element': 'Medical School'},
 {'radiologist_id': 1,
  'radiologist_name': 'richard-duszak',
  'organization_element': 'NC State Medical License',
  'education_element': 'Active through 1998'},
 {'radiologist_id': 1,
  'radiologist_name': 'richard-duszak',
  'organization_element': 'PA State M

In [199]:
## Create and format a DF from the list of dictionaries
scraped_data = pd.DataFrame.from_dict(scraped_data_test)

# Remove rows with  is recorded
import numpy as np
idx = np.where(
    (scraped_data['organization_element'].str.contains("License") == False) &
    (scraped_data['education_element'].str.contains("Active through") == False) &
    (scraped_data['organization_element'].str.contains("Scholarship") == False) &
    (scraped_data['organization_element'].str.contains("List") == False)
)

scraped_data = scraped_data.loc[idx]
scraped_data = pd.concat([scraped_data, scraped_data["education_element"].str.split(', ', expand=True)], axis=1)
 
# Remove redundant data after commas
# scraped_data["organization_element2"] = ["" for i in scraped_data["organization_element"]]
# counter = 0
# for x in scraped_data['education_element']:
#     if "," in x:
#         if "Other Training" not in x:
#             scraped_data["organization_element2"][counter] = 
#             x = x.split(",")[:-1]
#             scraped_data["education_element"][counter] = x[0] #[0] --> remove the annoying brackets
#     counter += 1
    
# View DF
scraped_data.reset_index()

Unnamed: 0,index,radiologist_id,radiologist_name,organization_element,education_element,0,1
0,0,1,richard-duszak,University of Pennsylvania Health System,"Fellowship, University of Pennsylvania Health ...",Fellowship,University of Pennsylvania Health System
1,1,1,richard-duszak,Duke University Hospital,"Residency, Duke University Hospital",Residency,Duke University Hospital
2,2,1,richard-duszak,York Hospital,"Internship, York Hospital",Internship,York Hospital
3,3,1,richard-duszak,Pennsylvania State University College of Medicine,Medical School,Medical School,
4,36,1,richard-duszak,,"Other Training, Biology",Other Training,Biology
