In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import pandas as pd
from selenium.webdriver.common.keys import Keys
import numpy as np
import time
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
# Filesystem location of the chromedriver executable handed to selenium's Chrome(...).
# NOTE(review): this is an absolute, machine-specific path; it must be edited before
# the notebook can run on another machine.
#path = r'C:/Users/Tinatin/.wdm/drivers/chromedriver/win32/83.0.4103.39/chromedriver.exe'
path = '/Users/laurendarinzo/Desktop/Personal/webscrape-Psychology-today/chromedriver'

In [3]:
# Empty results table; scraper() adds one row per therapist profile to this global.
final_df = pd.DataFrame(
    columns=[
        'full_address',
        'header_info',
        'phone',
        'cost_per_ses',
        'sliding_scale',
        'pay_by',
        'accepted_insurance',
        'quals',
        'specs',
        'issues',
        'mental_health',
        'sexuality',
        'client_focus_age',
        'client_focus_communities',
        'treatment_approach_types_of_ther',
        'treatment_approach_modality',
        'additional_creds',
    ]
)

In [4]:
def _clean_lines(text):
    """Split ``text`` on newlines and return the stripped, non-empty lines."""
    return [line for line in (raw.strip() for raw in text.split('\n')) if line]


def _finance_value(finances, label):
    """Return the remainder of the last finance line containing ``label``, or None."""
    value = None
    for line in finances:
        if label in line:
            value = line.replace(label, '')
    return value


def _optional_text(soup, css_class, description):
    """Return the text of the first element with ``css_class``, or 'missing' if absent."""
    node = soup.find(class_=css_class)
    if node is None:
        print(description + ' field missing')
        return 'missing'
    return node.text


def _parse_profile(soup):
    """Build one result row (a dict keyed by final_df column names) from a parsed profile page.

    Raises AttributeError when one of the mandatory sections (address, header,
    phone, finances, qualifications, specialties, issues) is not in the page.
    """
    row_results = {}
    row_results['full_address'] = ', '.join(_clean_lines(soup.find(class_="address-data").text))
    row_results['header_info'] = ', '.join(
        line.strip(',') for line in _clean_lines(soup.find(id='profHdr').text)
    )
    row_results['phone'] = soup.find(class_="phone-number").text

    # Drop the trailing line of the finances tab by position. The previous
    # list.remove(last_value) deleted the first *equal* entry instead, which is
    # the wrong element whenever the last line's text also occurs earlier.
    finances = _clean_lines(soup.find(id='tabs-finances-office').text)[:-1]
    print(finances)

    cost = _finance_value(finances, 'Cost per Session: ')
    if cost is None:
        print('Cost per Session field missing')
        cost = 'missing cost_per_ses'
    row_results['cost_per_ses'] = cost

    sliding = _finance_value(finances, 'Sliding Scale: ')
    if sliding is None:
        print('Sliding Scale field missing')
        sliding = 'missing sliding_scale'
    row_results['sliding_scale'] = sliding

    # The payment methods are on the line after the 'Pay By' heading.
    pay_by = None
    for idx, line in enumerate(finances):
        if 'Pay By' in line and idx + 1 < len(finances):
            pay_by = finances[idx + 1]
    if pay_by is None:
        print('missing pay_by')
        pay_by = 'missing pay by'
    row_results['pay_by'] = pay_by

    # Everything after the insurance heading is treated as the list of plans.
    # The heading is matched by substring, like the other finance fields.
    insurance = None
    for idx, line in enumerate(finances):
        if 'Accepted Insurance Plans' in line:
            insurance = finances[idx + 1:]
    if insurance is None:
        print('insurance field missing')
        insurance = 'missing insurance'
    row_results['accepted_insurance'] = insurance

    row_results['quals'] = soup.find(class_='profile-qualifications details-section top-border').text
    row_results['specs'] = soup.find(class_='spec-list attributes-top').text
    # NOTE(review): unlike the other fields this stores the <li> Tag objects, not
    # their text; left unchanged because downstream consumers are not visible here.
    row_results['issues'] = soup.find(class_='spec-list attributes-issues').find_all('li')

    optional_sections = [
        ('mental_health', 'spec-list attributes-mental-health', 'mental health'),
        ('sexuality', 'spec-list attributes-sexuality', 'sexuality'),
        ('client_focus_age', 'spec-list attributes-age-focus', 'client focus age'),
        ('client_focus_communities', 'spec-list attributes-categories', 'client focus communities'),
        ('treatment_approach_types_of_ther', 'spec-list attributes-treatment-orientation',
         'treatment approach types of therapy'),
        ('treatment_approach_modality', 'spec-list attributes-modality', 'treatment approach modality'),
        ('additional_creds', 'profile-additional-credentials details-section top-border',
         'additional credentials'),
    ]
    for column, css_class, description in optional_sections:
        row_results[column] = _optional_text(soup, css_class, description)
    return row_results


def _fetch_profile_soup(url, path, wait_seconds):
    """Load ``url`` in a fresh Chrome session and return the parsed page, or None on load failure."""
    driver = Chrome(executable_path=path)
    try:
        try:
            driver.get(url)         # open the url in selenium
        except Exception:
            print('bad url!')
            return None
        # Wait *before* reading page_source so late-rendered content is included.
        time.sleep(wait_seconds)
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always close the browser, including when loading or parsing fails.
        driver.quit()


def scraper(urls, path, wait_seconds=10):
    """Scrape therapist profile pages and add one row per profile to the global ``final_df``.

    Parameters:
        urls: iterable of profile page URLs.
        path: filesystem path of the chromedriver executable.
        wait_seconds: seconds to wait after loading a page before reading its HTML.

    Returns None. URLs that fail to load are reported with 'bad url!' and skipped.
    Optional profile sections that are absent are recorded as 'missing...' strings.
    Raises AttributeError if a mandatory section is missing from a page; rows
    scraped before the failure are still written to ``final_df``.
    """
    global final_df
    rows = []
    try:
        for url in urls:
            soup = _fetch_profile_soup(url, path, wait_seconds)
            if soup is not None:
                rows.append(_parse_profile(soup))
            print('---'*10)
    finally:
        # One concat for the whole batch: DataFrame.append no longer exists in
        # pandas 2.x and growing the frame row by row is quadratic.
        if rows:
            final_df = pd.concat([final_df, pd.DataFrame(rows)], ignore_index=True)
        

In [5]:
def get_state_urls(homepage='https://www.psychologytoday.com/us/therapists', 
             path='/Users/laurendarinzo/Desktop/Personal/webscrape-Psychology-today/chromedriver'):
    '''
    Collect the state-level listing URL for each US state linked from ``homepage``.

    input:
        homepage: starting website that links to every state's listing page
        path: filesystem path of the chromedriver executable
    returns a list with one URL per state found on the homepage, in the
    alphabetical order of the built-in state list. A state with no matching link
    is reported and left out of the result.
    '''
    states=["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New-Hampshire","New-Jersey","New-Mexico","New-York",
  "North-Carolina","North-Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode-Island","South-Carolina","South-Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West-Virginia","Wisconsin","Wyoming"]
    states=[x.lower() for x in states]

    # Read every link on the homepage exactly once, then release the browser.
    driver=Chrome(executable_path=path)
    try:
        driver.get(homepage)
        homepage_hrefs = []
        for elem in driver.find_elements_by_tag_name('a'):
            href = elem.get_attribute('href')
            if href is not None:
                homepage_hrefs.append(href)
    finally:
        driver.quit()

    # Index links by their last path segment (query string / fragment ignored).
    # Comparing whole segments keeps 'kansas' from matching 'arkansas' and
    # 'virginia' from matching 'west-virginia', so no special cases are needed.
    first_url_by_segment = {}
    for href in homepage_hrefs:
        segment = href.split('#')[0].split('?')[0].rstrip('/').rsplit('/', 1)[-1].lower()
        first_url_by_segment.setdefault(segment, href)

    all_state_urls=[]
    for i in states:
        print(i)
        single_state_url = first_url_by_segment.get(i)
        if single_state_url is None:
            # Skip rather than append a stale URL left over from the previous state.
            print('no link found for state: ' + i)
            continue
        all_state_urls.append(single_state_url)
    return all_state_urls

In [6]:
# Collect the state-level listing URLs using the default homepage and chromedriver path.
all_state_urls=get_state_urls()

alabama
alaska
arizona
arkansas
california
colorado
connecticut
delaware
florida
georgia
hawaii
idaho
illinois
indiana
iowa
kansas
kentucky
louisiana
maine
maryland
massachusetts
michigan
minnesota
mississippi
missouri
montana
nebraska
nevada
new-hampshire
new-jersey
new-mexico
new-york
north-carolina
north-dakota
ohio
oklahoma
oregon
pennsylvania
rhode-island
south-carolina
south-dakota
tennessee
texas
utah
vermont
virginia
washington
west-virginia
wisconsin
wyoming


In [7]:
# Show the collected state listing URLs.
all_state_urls

['https://www.psychologytoday.com/us/therapists/alabama',
 'https://www.psychologytoday.com/us/therapists/alaska',
 'https://www.psychologytoday.com/us/therapists/arizona',
 'https://www.psychologytoday.com/us/therapists/arkansas',
 'https://www.psychologytoday.com/us/therapists/california',
 'https://www.psychologytoday.com/us/therapists/colorado',
 'https://www.psychologytoday.com/us/therapists/connecticut',
 'https://www.psychologytoday.com/us/therapists/delaware',
 'https://www.psychologytoday.com/us/therapists/florida',
 'https://www.psychologytoday.com/us/therapists/georgia',
 'https://www.psychologytoday.com/us/therapists/hawaii',
 'https://www.psychologytoday.com/us/therapists/idaho',
 'https://www.psychologytoday.com/us/therapists/illinois',
 'https://www.psychologytoday.com/us/therapists/indiana',
 'https://www.psychologytoday.com/us/therapists/iowa',
 'https://www.psychologytoday.com/us/therapists/kansas',
 'https://www.psychologytoday.com/us/therapists/kentucky',
 'https://

In [8]:
def get_profiles_from_each_state(all_state_urls,
                                 path='/Users/laurendarinzo/Desktop/Personal/webscrape-Psychology-today/chromedriver'):
    '''
    Collect therapist profile links from each state listing page.

    input:
        all_state_urls: iterable of state listing page URLs
        path: filesystem path of the chromedriver executable
    returns a flat list of every link on those pages whose href contains
    'ResultsProfileBtn', in page order, state by state. Only the page loaded for
    each given URL is inspected.
    '''
    all_therapists=[]
    for state in all_state_urls:
        print(state)
        driver=Chrome(executable_path=path)
        try:
            driver.get(state)
            hrefs = [elem.get_attribute('href')
                     for elem in driver.find_elements_by_tag_name('a')]
        finally:
            # Close each browser; otherwise one Chrome process per state stays open.
            driver.quit()
        all_therapists.extend(
            href for href in hrefs
            if href is not None and 'ResultsProfileBtn' in href
        )
    return all_therapists

In [9]:
# Gather the profile links from every state listing page collected above.
all_therapists=get_profiles_from_each_state(all_state_urls)

https://www.psychologytoday.com/us/therapists/alabama
https://www.psychologytoday.com/us/therapists/alaska
https://www.psychologytoday.com/us/therapists/arizona
https://www.psychologytoday.com/us/therapists/arkansas
https://www.psychologytoday.com/us/therapists/california
https://www.psychologytoday.com/us/therapists/colorado
https://www.psychologytoday.com/us/therapists/connecticut
https://www.psychologytoday.com/us/therapists/delaware
https://www.psychologytoday.com/us/therapists/florida
https://www.psychologytoday.com/us/therapists/georgia
https://www.psychologytoday.com/us/therapists/hawaii
https://www.psychologytoday.com/us/therapists/idaho
https://www.psychologytoday.com/us/therapists/illinois
https://www.psychologytoday.com/us/therapists/indiana
https://www.psychologytoday.com/us/therapists/iowa
https://www.psychologytoday.com/us/therapists/kansas
https://www.psychologytoday.com/us/therapists/kentucky
https://www.psychologytoday.com/us/therapists/louisiana
https://www.psychology

In [10]:
def checkIfDuplicates_1(listOfElems):
    ''' Return True when at least one element occurs more than once in the list, else False '''
    distinct_count = len(set(listOfElems))
    return distinct_count != len(listOfElems)

In [11]:
# Report whether any profile URL was collected more than once.
result = checkIfDuplicates_1(all_therapists)
print('Yes, list contains duplicates' if result else 'No duplicates found in list')

No duplicates found in list


In [12]:
# Show the collected profile URLs.
all_therapists

['https://www.psychologytoday.com/us/therapists/alabama/484654?sid=5f02139e5041f&ref=1&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/252149?sid=5f02139e5041f&ref=2&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/74096?sid=5f02139e5041f&ref=3&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/108701?sid=5f02139e5041f&ref=4&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/165083?sid=5f02139e5041f&ref=5&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/430960?sid=5f02139e5041f&ref=6&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/415456?sid=5f02139e5041f&ref=7&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/324377?sid=5f02139e5041f&ref=8&tr=ResultsProfileBtn',
 'https://www.psychologytoday.com/us/therapists/alabama/397747?sid=5f02139e5041f&ref=9&tr=ResultsProfileBtn',
 'https://w

In [13]:
# Number of profile URLs collected.
len(all_therapists)

996

In [None]:
import pickle
# Save the collected profile URLs so later sessions can reload them without re-scraping.
# The original dumped `list_1`, a name never defined in this notebook; the file
# name indicates the intended object is `all_therapists`.
with open('all_therapists1000', 'wb') as fp:
    pickle.dump(all_therapists, fp)

## Testing

In [None]:
# Scratch check: collect the profile links from the Alabama listing page only.
# NOTE(review): this overwrites the full `all_therapists` list built earlier.
all_therapists=[]
all_profiles=[]  # start from an empty list instead of relying on leftover kernel state
# Use the notebook-wide chromedriver `path`; the literal previously used here
# pointed at a different directory than every executed cell.
driver=Chrome(executable_path=path)
try:
    driver.get('https://www.psychologytoday.com/us/therapists/alabama') 
    elems = driver.find_elements_by_tag_name('a')
    for elem in elems:
        href = elem.get_attribute('href')
        if href is not None:
            all_profiles.append(href)
finally:
    # Close this browser; a later cell opens its own driver.
    driver.quit()
urls = [word for word in all_profiles if 'ResultsProfileBtn' in word]
for i in urls:
    all_therapists.append(i)

In [None]:
# Show the profile URLs collected by the scratch cell above.
all_therapists

In [None]:
# NOTE(review): `get_urls` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — confirm where it was meant to come from.
all_profiles=get_urls()

In [None]:
# Number of links currently held in all_profiles.
len(all_profiles)

In [None]:
# Show the links currently held in all_profiles.
all_profiles

In [None]:
# Homepage URL and chromedriver location used by the scratch cells below.
# NOTE(review): `path` repeats the machine-specific value assigned near the top of the notebook.
url = 'https://www.psychologytoday.com/us/therapists'
path='/Users/laurendarinzo/Desktop/Personal/webscrape-Psychology-today/chromedriver'

In [None]:
# Scratch version of the state-URL lookup: print the first homepage link containing each state name.
# The driver is left open on purpose; the next cell reads from it.
driver=Chrome(executable_path=path)
driver.get(url)
elems = driver.find_elements_by_tag_name('a')
# Collect the homepage links once, into a fresh list. Previously they were
# re-appended on every state iteration on top of whatever all_profiles already held.
all_profiles = []
for elem in elems:
    href = elem.get_attribute('href')
    if href is not None:
        all_profiles.append(href)
# NOTE(review): `states` is only assigned inside get_state_urls in the visible
# notebook, so it must be defined at top level before this cell can run.
for i in states:
    single_state_urls = [word for word in all_profiles if i in word]
    if len(single_state_urls)==0:
        # Report the gap instead of printing the previous state's URL again.
        print('no link found for state: ' + i)
        continue
    single_state_url=single_state_urls[0]
    print(single_state_url)

In [None]:
# Re-read the links from the page currently open in `driver` and keep those mentioning 'alaska'.
elems = driver.find_elements_by_tag_name('a')
all_profiles = [
    link
    for link in (anchor.get_attribute('href') for anchor in elems)
    if link is not None
]
single_state_url = [link for link in all_profiles if 'alaska' in link]

In [None]:
# Show the links that matched 'alaska'.
single_state_url