# Scraping Medium with Selenium

While using Selenium, we need to install a webdriver based on what your browser (Chrome, Firefox, etc.) and selenium python package.

In [1]:
import os

# change working directory
os.chdir('/Users/syahrulhamdani/Documents/Projects/unbotxing/')
print(os.getcwd())

/Users/syahrulhamdani/Documents/Projects/unbotxing


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
from ubtools import get_html

import time
from contextlib import closing

## Why is BeautifulSoup not enough?

In [8]:
bl_review_url = 'https://www.bukalapak.com/reviews/komputer/laptop/1gkarrl-jual-laptop-asus-x441ua-i3-6006u-1tb-4gb'
review_html, _ = get_html(bl_review_url)
review_soup = BeautifulSoup(review_html, 'lxml')

In [21]:
review_body = review_soup.find('div', attrs={'class': 'c-panel__body'})
review_body

<div class="c-panel__body">
<div class="u-pad-left--7 u-pad-right--7 u-pad-top--4">
<div class="o-flag c-rating-presentation c-rating-presentation--custom">
<div class="o-flag__body c-rating-presentation__head u-pad-right--4 u-border--right">
<div class="u-txt--hero u-txt--bold">
4.9
</div>
<div class="c-rating u-mrgn-bottom--1 u-mrgn-top--2" title="4.9">
<div class="c-rating__bg"><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span></div>
<div class="c-rating__fg" style="width: 98.0%;"><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span><span class="c-rating__unit c-icon c-icon--star c-icon--large"></span><

From above, we can't find any html element **div** with attribute class `contain-user-reviews` that contains all user reviews. Hence, BeautifulSoup has limitation here in accessing data that is bounded by some javascripts. In order to access the data we want, we then need a library that can "manipulate" javascript or act like us to navigate through the web page and get those data. **Selenium** come to the rescue.

In [69]:
url_medium = 'https://towardsdatascience.com/data-science/home'
driver = webdriver.Chrome('../chromedriver')
driver.get(url_medium)

In [80]:
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

In [84]:
SCROLL_PAUSE = 1.5

last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    
    time.sleep(SCROLL_PAUSE)
    
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    last_height = new_height

In [85]:
driver.close()

## Scraping Bukalapak with Selenium

In [86]:
# define the base URL
URL = "https://www.bukalapak.com/p/handphone/hp-smartphone/mgx5ns-jual-oppo-f9?from=list-product&product_owner=normal_seller&search%5Bkeywords%5D=oppo%20f11"

In [98]:
browser = webdriver.Chrome('../chromedriver')
browser.get(URL)

In [99]:
browser.find_element_by_id('product-detail-product-review').click()
link = browser.find_element_by_id('loadmore_link')

In [100]:
href = link.get_attribute('href')

In [101]:
browser.close()

In [96]:
def get_html(url, param=None, time_out=None):
    """Attempts to get the html at `url` via HTTP GET Requests.
    
    Parameters
    ----------
    url : str
        URL or API URI
    param : dict
        key-value pair to be attached to url
    timeout : float or int or tuple of both
        time limit to establish a connection. If a tuple (2,5) is given, then
        2 is time limit to estalbish a connection and 5 is time limit to wait
        on a response.
    
    Returns
    -------
    str
        Raw HTML
    str
        Complete URL
    """
    try:
        with closing(requests.get(url, params=param, timeout=time_out, stream=True)) as response:
            response.raise_for_status()
            if is_good_response(response):
                return response.text, response.url
            else:
                return None
    except RequestException as request_error:
        error_log(url, params=param, msg=request_error)
        return None


def is_good_response(response):
    """Evaluate response.
    
    If response seems to be HTML with status 200, return True.
    else, return False
    
    Parameters
    ----------
    response
        Requests response
    
    Returns
    -------
    bool
        Response quality
    """
    content_type = response.headers['Content-Type'].lower()
    return (
        response.status_code == 200
        and content_type is not None
        and content_type.find('html') > -1
    )


def error_log(url, params=None, msg=None):
    """Print error message and log them if exist.
    
    Parameters
    ----------
    url : str
        URL string
    params : dict
        key-value pair attached to url
    msg : str
        Error message based on `RequestException`
    """
    print(
        f'Error occured during request to {url}',
        f'with paremeter {params}',
        f'Error Message {msg}',
        sep='\n'
    )

def bl_make_soup(bl_url, bl_param=None):
    """Get HTML string using `requests.get()` method.
    
    Parameters
    ----------
    url: str
        URL target to requests a HTTP GET method
    param: dict
        `param` to pass into `requests.get()` method
        
    Returns
    -------
    bs4.BeautifulSoup
        BeautifulSoup object
    """
    html, url = get_html(url=bl_url, param=bl_param)
    
    return BeautifulSoup(html), url

In [145]:
chrome_opt = Options()
chrome_opt.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_opt, executable_path='../chromedriver')
driver.get(href)

In [146]:
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
pagination = driver.find_element_by_class_name('c-pagination')

In [147]:
pagination.find_elements_by_tag_name('li')

[<selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-2")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-3")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-4")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-5")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-6")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-7")>]

In [148]:
pagination.find_elements_by_class_name('c-pagination__link')

[<selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-8")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-9")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-10")>,
 <selenium.webdriver.remote.webelement.WebElement (session="169699dc6876f45166c33e80d0ad565e", element="0.23720821702976913-11")>]

In [154]:
review_item = driver.find_elements_by_class_name('contain-user-review-item')
for p in review_item[0].find_elements_by_tag_name('p'):
    print(p.text)

Wah, Puas Banget!
Original dan Segel Masih utuh.. Mantabb




In [161]:
pagination.find_elements_by_tag_name('a')[-2].click()

WebDriverException: Message: unknown error: Element <a class="c-pagination__link">...</a> is not clickable at point (676, 15). Other element would receive the click: <li class="c-nav-menu__item c-dropdown c-dropdown--medium c-dropdown--click-trigger">...</li>
  (Session info: headless chrome=73.0.3683.103)
  (Driver info: chromedriver=73.0.3683.68 (47787ec04b6e38e22703e856e101e840b65afe72),platform=Mac OS X 10.14.3 x86_64)


In [144]:
driver.close()

In [125]:
# create function to read 

[]