In [1]:
import json
import time

from bs4 import BeautifulSoup
import requests
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from tqdm import tqdm_notebook

In [2]:
# Site root; category and listing paths are appended to it when building URLs.
base_url = "https://trustpilot.com"

In [3]:
def get_soup(url, timeout=10):
    """Download ``url`` and parse the response body with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` document built from the raw response bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never parsed as if it were real content.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')

In [4]:
# Maps each parent category name to {sub-category name: relative URI}.
data = {}
# Category index page
soup = get_soup(base_url + '/categories')
# Full class string carried by the sub-category links.
sub_category_link_class = 'internal___1jK0Z typography___lxzyt weight-inherit___229vl navigation___2n5P8'
# Every parent category lives in a ('div', {'class': 'subCategory___BRUDy'})
for category in soup.find_all('div', {'class': 'subCategory___BRUDy'}):
    header = category.find('h3', {'class': 'subCategoryHeader___36ykD'})
    sub_categories = category.find('div', {'class': 'subCategoryList___r67Qj'})
    # Skip blocks whose markup is incomplete instead of failing on None.
    if header is None or sub_categories is None:
        continue
    name = header.text.strip()
    data[name] = {}
    # Sub-categories
    for sub_category in sub_categories.find_all('div', {'class': 'subCategoryItem___3ksKz'}):
        # Look the link up once; it provides both the name and the URI.
        link = sub_category.find('a', {'class': sub_category_link_class})
        if link is None or not link.get('href'):
            continue
        # Sub-category name
        sub_category_name = link.text
        # Sub-category URI
        sub_category_uri = link['href']
        data[name][sub_category_name] = sub_category_uri

In [5]:
# Show the scraped {category: {sub-category: relative URI}} mapping.
data

{&#39;Animals &amp; Pets&#39;: {&#39;Animal Health&#39;: &#39;/categories/animal_health&#39;,
  &#39;Animal Parks &amp; Zoo&#39;: &#39;/categories/animal_parks_zoo&#39;,
  &#39;Cats &amp; Dogs&#39;: &#39;/categories/cats_dogs&#39;,
  &#39;Horses &amp; Riding&#39;: &#39;/categories/horses_riding&#39;,
  &#39;Pet Services&#39;: &#39;/categories/pet_services&#39;,
  &#39;Pet Stores&#39;: &#39;/categories/pet_stores&#39;},
 &#39;Beauty &amp; Well-being&#39;: {&#39;Cosmetics &amp; Makeup&#39;: &#39;/categories/cosmetics_makeup&#39;,
  &#39;Hair Care &amp; Styling&#39;: &#39;/categories/hair_care_styling&#39;,
  &#39;Personal Care&#39;: &#39;/categories/personal_care&#39;,
  &#39;Salons &amp; Clinics&#39;: &#39;/categories/salons_clinics&#39;,
  &#39;Tattoos &amp; Piercings&#39;: &#39;/categories/tattoos_piercings&#39;,
  &#39;Wellness &amp; Spa&#39;: &#39;/categories/wellness_spa&#39;,
  &#39;Yoga &amp; Meditation&#39;: &#39;/categories/yoga_meditation&#39;},
 &#39;Business Services&#39;: {

In [6]:
def extract_company_urls_form_page():
    """Collect the review-page links shown on the page currently loaded in ``driver``.

    Reads the module-level ``driver``. Anchors are located by their exact
    class string, and only links whose first path segment is ``review`` are
    kept (assumes ``href`` values are absolute URLs such as
    ``https://host/review/...`` -- confirm against the driver in use).

    Returns:
        A list of unique URLs in the order they first appear on the page.
        Empty when the page has no matching links.
    """
    # find_elements(By.XPATH, ...) replaces the find_elements_by_xpath helper,
    # which newer Selenium releases no longer provide.
    anchors = driver.find_elements(
        By.XPATH, '//a[@class="internal___1jK0Z wrapper___26yB4"]')

    review_urls = []
    for anchor in anchors:
        href = anchor.get_attribute('href')
        # An anchor without an href yields None; skip it rather than crash.
        if not href:
            continue
        # Drop every link that is not a "review" URL; the length check
        # protects against URLs too short to have a path segment.
        parts = href.split('/')
        if len(parts) > 3 and parts[3] == 'review':
            review_urls.append(href)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is stable between runs (a set would shuffle it).
    return list(dict.fromkeys(review_urls))

In [7]:
def go_next_page():
    """Report whether the page loaded in ``driver`` has a "next page" link.

    Reads the module-level ``driver``. Despite the name, this does not
    navigate; it only looks the link up.

    Returns:
        ``(True, element)`` when the next-page anchor exists, otherwise
        ``(False, None)``.
    """
    try:
        # find_element(By.XPATH, ...) replaces the find_element_by_xpath
        # helper, which newer Selenium releases no longer provide.
        button = driver.find_element(
            By.XPATH,
            '//a[@class="paginationLinkNormalize___scOgG paginationLinkNext___1LQ14"]')
    except NoSuchElementException:
        return False, None
    return True, button

In [8]:
'''
Launch ChromeDriver in headless mode (no visible Chrome window) to speed up crawling
'''

&#39;\n執行Chrome Driver以不用開chrome進行加速爬蟲\n&#39;

In [9]:
# Build the Chrome options for the crawl session; flags are applied in order.
options = Options()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    'start-maximized',
    'disable-infobars',
    '--disable-extensions',
):
    options.add_argument(chrome_flag)

# Image content setting 2 -- presumably "block", to skip image downloads;
# confirm against Chrome's preference documentation.
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# Start Chrome through the chromedriver binary in the working directory.
driver = webdriver.Chrome('./chromedriver', options=options)

# Wait budget, in seconds, used with WebDriverWait after each page load.
timeout = 3

In [10]:
# Query string appended to every sub-category listing URL.
LISTING_QUERY = "?numberofreviews=0&timeperiod=0&status=all"
# Company links carry both classes. By.CLASS_NAME only accepts a single class
# name, so a CSS selector is needed for the wait to ever match.
COMPANY_LINK_LOCATOR = (By.CSS_SELECTOR, '.internal___1jK0Z.wrapper___26yB4')
# Pagination is followed while the current page number is <= this value,
# i.e. at most MAX_PAGE_TO_FOLLOW + 1 pages are read per sub-category.
MAX_PAGE_TO_FOLLOW = 10


def _load_listing_page(url):
    """Open ``url`` in ``driver`` and wait up to ``timeout`` seconds for company links."""
    driver.get(url)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(COMPANY_LINK_LOCATOR))
    except TimeoutException:
        # No company link showed up in time; carry on and let the extraction
        # step return whatever is present (possibly nothing).
        pass


# Maps each sub-category name to the list of company review URLs found for it.
company_urls = {}
for category in tqdm_notebook(data):
    for sub_category in tqdm_notebook(data[category], leave=False):
        company_urls[sub_category] = []
        # Listing URL for this sub-category
        listing_url = base_url + data[category][sub_category] + LISTING_QUERY
        _load_listing_page(listing_url)

        page = 1
        while True:
            company_urls[sub_category] += extract_company_urls_form_page()
            has_next_page, _next_button = go_next_page()
            if not has_next_page or page > MAX_PAGE_TO_FOLLOW:
                break
            page += 1
            _load_listing_page(f'{listing_url}&page={page}')

HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value=&#39;&#39;)))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value=&#39;&#39;)))




In [11]:
# Persist the {sub-category: [company URLs]} mapping as JSON text.
with open('./company_urls_en', 'w') as out_file:
    out_file.write(json.dumps(company_urls))

In [12]:
# Flatten the nested mappings into one (category, sub_category, company_url)
# row per collected URL.
consolidated_data = [
    (category_name, sub_category_name, company_url)
    for category_name, sub_category_map in data.items()
    for sub_category_name in sub_category_map
    for company_url in company_urls[sub_category_name]
]

df_consolidated_data = pd.DataFrame(
    consolidated_data,
    columns=['category', 'sub_category', 'company_url'],
)

# Write the table without the index column.
df_consolidated_data.to_csv('./consolidate_company_urls.csv', index=False)