# POC Bukalapak Scraping

In [1]:
# import things
import os

# Switch to the project root before the local imports below.
# NOTE(review): presumably needed so the project-local `boxing` package
# resolves from the working directory -- confirm.
# The location can be overridden with the UNBOTXING_DIR environment variable
# so the notebook is not tied to one machine; the default is the original path.
PROJECT_DIR = os.environ.get(
    'UNBOTXING_DIR',
    '/Users/syahrulhamdani/Documents/Projects/unbotxing/',
)
os.chdir(PROJECT_DIR)
print(f"Working directory -> {os.getcwd()}")

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from boxing import get_html, bl_make_soup

Working directory -> /Users/syahrulhamdani/Documents/Projects/unbotxing


In [2]:
# define site properties
url = "https://www.bukalapak.com"
print('url:', url)

# Each category maps to its subcategories; both the category key and the
# subcategory entries are later joined into the listing URL path.
sub_categories = dict(
    handphone=[
        'hp-smartphone',
        'tablet',
        'headset-earphone',
        'power-bank',
        'aksesori-handphone',
    ],
    kamera=[
        'kamera-digital',
        'kamera-analog',
        'tas-case',
        'aksesoris-kamera',
    ],
    elektronik=['headphone', 'televisi', 'speaker'],
    komputer=['desktop', 'laptop', 'monitor', 'aksesoris'],
)
print('category-subcategory:', sub_categories)

url: https://www.bukalapak.com
category-subcategory: {'handphone': ['hp-smartphone', 'tablet', 'headset-earphone', 'power-bank', 'aksesori-handphone'], 'kamera': ['kamera-digital', 'kamera-analog', 'tas-case', 'aksesoris-kamera'], 'elektronik': ['headphone', 'televisi', 'speaker'], 'komputer': ['desktop', 'laptop', 'monitor', 'aksesoris']}


For this POC, and for my own business needs, I will focus on scraping products that match the query *iphone x* in the category `handphone`, subcategory `hp-smartphone`.

In [3]:
# make the soup
# Name the category explicitly rather than taking the first key of
# `sub_categories`: the subcategory is looked up under this same key, so the
# two URL segments stay paired even if the dict is reordered or extended.
target_category = 'handphone'
target_subcategory = sub_categories[target_category][0]  # 'hp-smartphone'

# Listing URL has the form <url>/c/<category>/<subcategory>
scrape_url = '/'.join([url, 'c', target_category, target_subcategory])
param = {'search[keywords]': 'iphone x'}

# NOTE(review): `raw_html` / `site_url` are not used in the cells visible here;
# kept in case later cells rely on them -- confirm whether this extra call is needed.
raw_html, site_url = get_html(scrape_url, param)
soup, soup_url = bl_make_soup(scrape_url, param)

In [4]:
# Show the URL returned together with the soup, to check how the search
# parameters were encoded into the request.
soup_url

'https://www.bukalapak.com/c/handphone/hp-smartphone?search%5Bkeywords%5D=iphone+x'

In [5]:
# scraping
def _anchor_attr(card, css_class, attr, prefix=''):
    """Return ``prefix + value`` of `attr` on the first matching anchor in `card`.

    Parameters
    ----------
    card : tag-like object exposing ``find`` (a product card from the soup)
    css_class : CSS class of the ``<a>`` element to look for
    attr : attribute to read from that anchor (e.g. ``'title'`` or ``'href'``)
    prefix : string prepended to the attribute value (e.g. the site root)

    Returns
    -------
    str or None
        None when the card has no such anchor or the anchor lacks `attr`, so a
        single incomplete card yields a missing value instead of aborting the
        whole scrape.
    """
    anchor = card.find('a', class_=css_class)
    if anchor is None:
        return None
    value = anchor.get(attr)
    return None if value is None else prefix + value


basic_products = soup.find(name='div', class_='basic-products')
if basic_products is None:
    # Fail with an explicit message rather than an AttributeError on None.
    raise ValueError(
        "No <div class='basic-products'> found in the page; "
        "the page layout may have changed or the request returned no listing."
    )

# Walk the product cards once and collect title, product URL and review URL
# per card; hrefs are prefixed with the site root `url`.
product_cards = basic_products.find_all('article', class_='product-display')
product_records = [
    {
        'product_title': _anchor_attr(card, 'product__name', 'title'),
        'product_url': _anchor_attr(card, 'product__name', 'href', prefix=url),
        'review_url': _anchor_attr(card, 'review__aggregate', 'href', prefix=url),
    }
    for card in product_cards
]

# whole scraped bukalapak data, one row per product card
bl_data_product = pd.DataFrame(
    product_records,
    columns=['product_title', 'product_url', 'review_url'],
)

# Keep the per-column Series under their original names for any later use.
product_title = bl_data_product['product_title']
product_href = bl_data_product['product_url']
product_review_url = bl_data_product['review_url']

In [6]:
# Display the combined table of scraped product titles, product URLs and review URLs.
bl_data_product

Unnamed: 0,product_title,product_url,review_url
0,Xiaomi Mi A2 Lite,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
1,Iphone X 256gb seken likenew,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
2,Apple Iphone X 256GIGA,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
3,Iphone XS 256GB,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
4,Iphone X,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
5,iphone X 256gb seken fullset,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
6,iPhone X 256GB mulus,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
7,Iphone XS 64GB,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
8,Iphone XS MAX 64GB,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
9,IPHONE X 256GIGA,https://www.bukalapak.com/p/handphone/hp-smart...,https://www.bukalapak.com/reviews/handphone/hp...
