In [1]:
import re
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from appium import webdriver
from appium.options.android import UiAutomator2Options
from appium.webdriver.common.appiumby import AppiumBy
from appium.webdriver.common.touch_action import TouchAction
from bs4 import BeautifulSoup as bs
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains, ActionBuilder
from selenium.webdriver.common.actions.pointer_input import PointerInput
from selenium.webdriver.common.actions import interaction

# Pause lengths in seconds; each one is handed to time.sleep() after a UI step.
swipe_down_delay = swipe_up_delay = 1  # after a scroll gesture in either direction
action_delay, launch_delay = 2, 3      # after a tap / after starting the app

# define swipe
def swipe(driver, start, end):
    """Drag one touch pointer across the screen from `start` to `end`.

    Args:
        driver: session object the gesture is built for and performed on.
        start: indexable pair; start[0], start[1] is where the pointer goes down.
        end: indexable pair; end[0], end[1] is where the pointer is released.
    """
    gesture = ActionChains(driver)
    # Swap the default input for a touch pointer so the device sees a finger.
    finger = PointerInput(interaction.POINTER_TOUCH, "touch")
    gesture.w3c_actions = ActionBuilder(driver, mouse=finger)

    pointer = gesture.w3c_actions.pointer_action
    pointer.move_to_location(start[0], start[1])
    pointer.pointer_down()
    pointer.move_to_location(end[0], end[1])
    pointer.release()

    # Nothing is sent to the device until the queued steps are performed.
    gesture.perform()

def process_bukalapak(rows, category, brand):
    """Turn scraped text tuples into a product table.

    Args:
        rows: sequence of tuples; positions 0, 1 and 2 become the columns
            'SKU', 'price' and 'note'. May be empty.
        category: value written to every row of the 'category' column.
        brand: value written to every row of the 'brand' column (may be None).

    Returns:
        A DataFrame with the renamed columns followed by 'category' and
        'brand'. 'price' stays a string, with every 'R', 'p' and '.'
        character removed. When no price column could be built (for example
        `rows` is empty), 'SKU', 'price' and 'note' are still present so the
        frame can be concatenated with the others.
    """
    frame = pd.DataFrame(rows).rename(columns={0: 'SKU', 1: 'price', 2: 'note'})

    if 'price' in frame.columns:
        # Raw string: '\.' inside a normal string literal is an invalid escape.
        frame = frame.assign(
            price=frame['price'].str.replace(r'[Rp.]', '', regex=True),
        )
    else:
        # Nothing to clean; just guarantee the expected columns exist instead
        # of failing on the missing 'price' attribute.
        frame = frame.reindex(columns=['SKU', 'price', 'note'])

    return frame.assign(category=category, brand=brand)

In [2]:
# Don't forget to run "appium --allow-cors" in a terminal first.

# Register the capabilities through load_capabilities(), the documented way to
# feed capability names to an options object. Plain camelCase attribute
# assignment (options.platformVersion = ...) is not part of that API.
# NOTE(review): confirm against the installed Appium-Python-Client version.
options = UiAutomator2Options()
options.load_capabilities({
    'automationName': 'UiAutomator2',
    'udid': 'emulator-5554',
    'platformName': 'Android',
    'platformVersion': '12',
    'deviceName': 'bwphone',
})

# Open the session against the local Appium server.
driver = webdriver.Remote('http://127.0.0.1:4723', options=options)

In [3]:
# Cut down warning noise: with the 'once' action a matching warning is printed
# the first time only (warnings are not silenced entirely).
import warnings
warnings.filterwarnings('once')

In [4]:
def tap(driver, xpath):
    """Click the first element matching `xpath`, then wait `action_delay` seconds."""
    driver.find_element(by=AppiumBy.XPATH, value=xpath).click()
    time.sleep(action_delay)


def visible_price_rows(page_source):
    """Extract one tuple of texts per priced item from a single UI dump.

    An item is located through any element whose `text` attribute matches
    ``^Rp.+$``. Its row is made of the non-empty `text` values of that
    element's parent followed by those of the parent's descendants, in
    document order.

    The values are read from the parsed tree. A greedy ``text="(.+)"`` regex
    over the serialized markup runs on to the last double quote of the line,
    so anything serialized after the `text` attribute would leak into the
    captured value.
    """
    soup = bs(page_source, 'html.parser')
    found = []
    for price_node in soup.find_all(attrs={'text': re.compile(r'^Rp.+$')}):
        card = price_node.parent
        texts = [card.get('text')]
        texts.extend(node.get('text') for node in card.find_all(attrs={'text': True}))
        found.append(tuple(text for text in texts if text))
    return found


def scrape_price_rows(driver):
    """Scroll the current list downwards and gather the rows seen on each screen.

    Scrolling stops at the first screen on which every row has already been
    recorded (an empty screen counts as such). Rows visible on two consecutive
    screens are recorded twice; the final drop_duplicates() removes them.
    """
    collected = []
    while True:
        visible = visible_price_rows(driver.page_source)
        if set(visible).issubset(collected):
            break
        swipe(driver, (500, 1400), (500, 800))
        time.sleep(swipe_down_delay)
        collected.extend(visible)
    return collected


# keep pressing back (key code 4) until the home screen is reached
while driver.current_activity != '.NexusLauncherActivity':
    driver.press_keycode(4)
    time.sleep(action_delay)

# open the bukalapak app then wait
driver.activate_app('com.bukalapak.mitra')
time.sleep(launch_delay)

# dismiss login
tap(driver, '//android.widget.TextView[@text="Gabung sekarang!"]')
tap(driver, '//android.widget.TextView[@text="Lewati"]')

# press pulsa -> access to pulsa, paket, token listrik
tap(driver, '//android.widget.TextView[@text="Pulsa"]')

results = []
for category in ['Pulsa', 'Paket Data']:
    # Wait after the tap (done inside tap) so the brand list is read from the
    # refreshed screen rather than from the previous tab.
    tap(driver, f'//android.widget.TextView[@text="{category}"]')

    brand_labels = driver.find_elements(
        by=AppiumBy.XPATH,
        value='//android.widget.LinearLayout/android.widget.TextView',
    )
    brands = [label.text for label in brand_labels]
    for brand in brands:
        print(f'scrap {brand}')
        brand_xpath = f'//android.widget.LinearLayout/android.widget.TextView[@text="{brand}"]'
        tap(driver, brand_xpath)

        rows = scrape_price_rows(driver)
        results.append(process_bukalapak(rows, category, brand))

        # Scroll back up until the brand entry is on screen again and tap it a
        # second time (presumably this closes its product list — confirm).
        while True:
            try:
                el = driver.find_element(by=AppiumBy.XPATH, value=brand_xpath)
                time.sleep(action_delay)
                el.click()
                break
            except WebDriverException:
                # Only driver-side failures trigger another swipe; a bare
                # except would also swallow KeyboardInterrupt and make a
                # stuck loop impossible to stop.
                swipe(driver, (500, 800), (500, 1400))
                time.sleep(swipe_up_delay)
        time.sleep(action_delay)

tap(driver, '//android.widget.TextView[@text="Token Listrik"]')
rows = scrape_price_rows(driver)
results.append(process_bukalapak(rows, category='Token Listrik', brand=None))

results_df = (
    pd.concat(results)
    .drop_duplicates()
    .assign(
        platform = 'mitra bukalapak',
    )
)

scrap Telkomsel




scrap Xl




scrap Axis




scrap Indosat




scrap Tri




scrap Smartfren




scrap Telkomsel




scrap Xl




scrap Axis




scrap Indosat




scrap Tri




scrap Smartfren




scrap Telkomsel




scrap Xl




scrap Axis




scrap Indosat




scrap Tri




scrap Smartfren




In [5]:
# Write today's scrape to result/bukalapak_scrape_<YYYY_MM_DD>.csv
from datetime import date

today = date.today().strftime("%Y_%m_%d")
output_dir = Path('result')
# to_csv() does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_dir / f'bukalapak_scrape_{today}.csv')