In [17]:
from appium import webdriver
from appium.options.android import UiAutomator2Options
from appium.webdriver.common.appiumby import AppiumBy
from appium.webdriver.common.touch_action import TouchAction
from selenium.webdriver.common.action_chains import ActionChains, ActionBuilder
from selenium.webdriver.common.actions.pointer_input import PointerInput
from selenium.webdriver.common.actions import interaction

from bs4 import BeautifulSoup as bs
import re
import numpy as np
import pandas as pd
import time
from datetime import datetime

swipe_down_delay = 1
swipe_up_delay = 1
action_delay = 10
launch_delay = 10

# define swipe
def swipe(driver, start, end):
    """Perform a single-finger drag from ``start`` to ``end``.

    Args:
        driver: The Appium driver session the gesture is sent to.
        start: Indexable ``(x, y)`` where the touch pointer goes down.
        end: Indexable ``(x, y)`` where the touch pointer is released.
    """
    touch_input = PointerInput(interaction.POINTER_TOUCH, "touch")
    chain = ActionChains(driver)
    # Swap the default builder for one whose pointer device is a touch pointer.
    chain.w3c_actions = ActionBuilder(driver, mouse=touch_input)

    finger = chain.w3c_actions.pointer_action
    finger.move_to_location(start[0], start[1])
    finger.pointer_down()
    finger.move_to_location(end[0], end[1])
    finger.release()

    chain.perform()

def click(driver, x, y):
    """Tap the screen at the given coordinates with a touch pointer.

    The pointer is moved to ``(x, y)``, pressed, held for a 0.1 pause and
    then released.

    Args:
        driver: The Appium driver session the gesture is sent to.
        x: Horizontal coordinate of the tap.
        y: Vertical coordinate of the tap.
    """
    touch_input = PointerInput(interaction.POINTER_TOUCH, "touch")
    chain = ActionChains(driver)
    # Swap the default builder for one whose pointer device is a touch pointer.
    chain.w3c_actions = ActionBuilder(driver, mouse=touch_input)

    finger = chain.w3c_actions.pointer_action
    finger.move_to_location(x, y)
    finger.pointer_down()
    finger.pause(0.1)
    finger.release()

    chain.perform()

def process_toko_pulsa(df, telco, category):
    """Clean the scraped "Pulsa" rows for one telco.

    The scraped frames have positional columns 0-4. A row is treated as a
    promo row when column 3 (``note``) is not null; for promo rows the SKU is
    in column 1 and the base price in column 0, otherwise the SKU is in
    column 0 and the base price in column 1. ``disc_price`` is column 2 for
    promo rows and NaN otherwise.

    Args:
        df: List (or other iterable) of DataFrames with positional integer
            column labels, as collected by the scraping loop.
        telco: Value written to the ``brand`` column.
        category: Value written to the ``category`` column.

    Returns:
        De-duplicated DataFrame with columns ``note``, ``promo``,
        ``has_promo``, ``SKU``, ``base_price``, ``disc_price``, ``brand`` and
        ``category``, restricted to rows whose ``base_price`` contains 'Rp'.
    """
    # Use the argument, not a notebook-level variable, so the function does
    # not depend on whatever `rows` happens to hold in the kernel.
    combined = pd.concat(df)

    # When no scraped row carried a promo, columns 2-4 are absent from the
    # concatenated frame; add them empty so the column logic below still works.
    for position in range(5):
        if position not in combined.columns:
            combined[position] = None

    cleaned = (
        combined
        .rename(columns={
            0: 'base_price_temp',
            1: 'product_value_temp',
            2: 'disc_price_temp',
            3: 'note',
            4: 'promo',
        })
        .assign(
            has_promo=lambda x: np.where(x.note.isnull(), 0, 1),
            SKU=lambda x: np.where(x.has_promo == 1, x.product_value_temp, x.base_price_temp),
            base_price=lambda x: np.where(x.has_promo == 1, x.base_price_temp, x.product_value_temp),
            disc_price=lambda x: np.where(x.has_promo == 1, x.disc_price_temp, np.nan),
            brand=telco,
            category=category,
        )
        .drop(columns=['base_price_temp', 'product_value_temp', 'disc_price_temp'])
        .drop_duplicates()
    )
    # astype(str) keeps the mask strictly boolean even when a price is missing,
    # matching how the other process_toko_* helpers build their filters.
    return cleaned[cleaned['base_price'].astype(str).str.contains('Rp')]

def process_toko_paket(df, telco, category):
    """Clean the scraped "Paket Data" rows for one telco.

    Positional columns 0-3 are renamed to ``SKU``, ``disc_price``,
    ``disc_rate`` and ``base_price``; any further columns are left as they
    are. Cells equal to 'Lihat Detail' are passed through
    ``DataFrame.replace`` with ``None`` as the replacement, and only rows
    whose ``base_price`` starts with 'Rp' are kept.

    Args:
        df: List (or other iterable) of DataFrames with positional integer
            column labels, as collected by the scraping loop.
        telco: Value written to the ``brand`` column.
        category: Value written to the ``category`` column.

    Returns:
        De-duplicated DataFrame with ``brand`` and ``category`` added.
    """
    column_names = {
        0: 'SKU',
        1: 'disc_price',
        2: 'disc_rate',
        3: 'base_price',
    }
    combined = pd.concat(df).rename(columns=column_names)
    combined = combined.replace('Lihat Detail', None)

    # Keep only rows where the base price is an actual "Rp..." amount.
    has_price = combined.base_price.astype(str).str.contains('^Rp')
    priced = combined.loc[has_price]

    labelled = priced.assign(brand=telco, category=category)
    return labelled.drop_duplicates()

def process_toko_pln(df):
    """Clean the scraped PLN rows.

    Positional column 0 becomes ``SKU`` and column 1 becomes ``base_price``.
    Only rows whose ``base_price`` starts with 'Rp' are kept, positional
    columns 2 and 3 are discarded when present, and ``brand`` / ``category``
    are both set to 'PLN'. Prices are left as the scraped strings.

    Args:
        df: List (or other iterable) of DataFrames with positional integer
            column labels, as collected by the scraping loop.

    Returns:
        De-duplicated DataFrame with ``SKU``, ``base_price``, ``brand`` and
        ``category`` (plus any positional columns beyond 3).
    """
    return (
        pd.concat(df)
        .rename(columns={
            0: 'SKU',
            1: 'base_price',
        })
        .loc[lambda x: x.base_price.astype(str).str.contains('^Rp')]
        # errors='ignore': a scrape where every row has only two text fields
        # produces no columns 2/3, which previously raised KeyError here.
        .drop(columns=[2, 3], errors='ignore')
        .assign(
            brand='PLN',
            category='PLN',
        )
        .drop_duplicates()
    )

def process_toko_ewallet(df, brand):
    """Clean the scraped e-wallet top-up rows for one brand.

    Positional column 0 becomes ``SKU`` and column 1 becomes ``base_price``.
    Only rows whose ``base_price`` starts with 'Rp' are kept, positional
    columns 2 and 3 are discarded when present, ``brand`` is set from the
    argument and ``category`` is set to 'E-Wallet'. Prices are left as the
    scraped strings.

    Args:
        df: List (or other iterable) of DataFrames with positional integer
            column labels, as collected by the scraping loop.
        brand: Value written to the ``brand`` column.

    Returns:
        De-duplicated DataFrame with ``SKU``, ``base_price``, ``brand`` and
        ``category`` (plus any positional columns beyond 3).
    """
    return (
        pd.concat(df)
        .rename(columns={
            0: 'SKU',
            1: 'base_price',
        })
        .loc[lambda x: x.base_price.astype(str).str.contains('^Rp')]
        # errors='ignore': a scrape where every row has only two text fields
        # produces no columns 2/3, which previously raised KeyError here.
        .drop(columns=[2, 3], errors='ignore')
        .assign(
            brand=brand,
            category='E-Wallet',
        )
        .drop_duplicates()
    )

def process_toko_voucher(df, brand):
    """Clean the scraped game-voucher rows for one brand.

    Positional column 0 becomes ``SKU``. ``cnt`` is the number of non-null
    cells in each row and decides where the prices are read from:

    * ``cnt == 2``: ``base_price`` is column 1.
    * otherwise: ``base_price`` is column 2.
    * ``cnt == 4``: ``disc_price`` is column 3 and ``disc_rate`` is column 1;
      for every other row both are the empty string.

    The positional columns 1-3 and ``cnt`` are kept in the output.

    Args:
        df: List (or other iterable) of DataFrames with positional integer
            column labels, as collected by the scraping loop.
        brand: Value written to the ``brand`` column.

    Returns:
        De-duplicated DataFrame with ``base_price``, ``disc_price``,
        ``disc_rate``, ``brand`` and ``category`` ('Voucher Game') added.
    """
    combined = pd.concat(df)

    # np.where evaluates both branches, so columns 1-3 must exist even when no
    # scraped row had a discount; add the missing ones empty (they do not
    # affect `cnt`, which only counts non-null cells).
    for position in (1, 2, 3):
        if position not in combined.columns:
            combined[position] = None

    return (
        combined
        .rename(columns={
            0: 'SKU',
        })
        .assign(
            cnt=lambda x: x.count(axis=1),
            base_price=lambda x: np.where(x.cnt == 2, x[1], x[2]),
            disc_price=lambda x: np.where(x.cnt == 4, x[3], ''),
            disc_rate=lambda x: np.where(x.cnt == 4, x[1], ''),
            brand=brand,
            category='Voucher Game',
        )
        .drop_duplicates()
    )

In [18]:
# Don't forget to start the Appium server first: run "appium --allow-cors" in a terminal.

# Capabilities for the UiAutomator2 session used by every cell below.
options = UiAutomator2Options()
options.automationName = 'UiAutomator2'
options.udid = 'emulator-5554'
options.platformName = 'Android'
options.platformVersion = '12'
options.deviceName = 'bwphone'

# Set up the driver: opens a remote session against the Appium server
# listening on 127.0.0.1:4723 with the capabilities above.
driver = webdriver.Remote('http://127.0.0.1:4723', options=options)

In [19]:
# [i for i in driver.find_elements(by=AppiumBy.CLASS_NAME, value='android.view.View') if 'Pulsa' in i.text][0].click()

In [20]:
# Telco name -> number prefix. The prefix is what gets typed into the app's
# number field for that telco; the key is used as the `brand` of the results.
telcos_prefix = {
    'smartfren':'0881',
    'telkomsel':'0812',
    'im3':'0814',
    'xl':'0818',
    'axis':'0831',
    '3':'0894',
}

# # keeps clicking back until reach home screen
# while driver.current_activity != '.NexusLauncherActivity':
#     driver.press_keycode(5)
#     time.sleep(action_delay)

# # open the app then wait
# driver.activate_app('com.tokopedia.kelontongapp')
# time.sleep(launch_delay)

# # popup will appear. we will click back and if we are out then will just go back in
# driver.press_keycode(4)
# time.sleep(launch_delay)
# if driver.current_package != 'com.tokopedia.kelontongapp':
#     driver.activate_app('com.tokopedia.kelontongapp')
#     time.sleep(launch_delay)


# Accumulates one cleaned DataFrame per scraped telco/product; concatenated
# into the final table in a later cell.
results = []
# --- PULSA
p = 'Pulsa'
print('processing Pulsa')
for telco in list(telcos_prefix.keys()):
    # Go into pulsa
    # notes: easier to just mention the coordinate
    # [i for i in driver.find_elements(by=AppiumBy.CLASS_NAME, value='android.view.View') if 'Pulsa' in i.text][0].click()
    click(driver, 117, 770)
    time.sleep(action_delay)

    # Type this telco's prefix into the first EditText on the screen.
    el3 = driver.find_element(by=AppiumBy.CLASS_NAME, value='android.widget.EditText')
    el3.send_keys(telcos_prefix[telco])
    time.sleep(action_delay)
    # Keycode 4 is the back key; presumably this dismisses the on-screen
    # keyboard so the product list is visible -- TODO confirm.
    driver.press_keycode(4)
    time.sleep(action_delay)

    # One DataFrame per captured screen; handed to process_toko_pulsa below.
    rows = []
    # NOTE(review): the loop only ends once the page source contains
    # 'Mau catat'; if that text never appears it keeps swiping indefinitely.
    while True:
        source = driver.page_source
        soup = bs(source, 'html')
        # For every element whose `text` attribute starts with 'Rp', collect
        # all text="..." matches found in the string form of its parent; each
        # parent becomes one row with positional columns 0, 1, 2, ...
        new_rows = pd.concat([pd.Series(re.findall(r'text="(.+)"', str(i.parent))) for i in list(soup.find_all(attrs={'text': re.compile(r'^Rp.+$')}))], axis=1).T
        rows.append(new_rows)
        # 'Mau catat' is treated as the end-of-list marker (presumably text
        # shown below the last product -- verify against the app).
        if bool(re.search('Mau catat', source)):
            break
        # Drag upwards to reveal the next part of the list.
        swipe(driver, (500, 1400), (500, 800))
        time.sleep(swipe_down_delay)
    results.append(process_toko_pulsa(rows, telco, p))

    # go back
    driver.press_keycode(4)
    time.sleep(action_delay)

# --- PAKET DATA
p = 'Paket Data'
print('processing Paket Data')
for telco in list(telcos_prefix.keys()):
    print(p, telco)
    
    # go to the coordinate
    # [i for i in driver.find_elements(by=AppiumBy.CLASS_NAME, value='android.view.View') if 'Paket Data' in i.text][0].click()
    click(driver, 330, 770)
    time.sleep(action_delay)

    # # dismiss interstitial banner
    # try:
    #     e = driver.find_element(by=AppiumBy.XPATH, value='/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.RelativeLayout/android.webkit.WebView/android.webkit.WebView/android.view.View[2]/android.app.Dialog/android.view.View/android.view.View[1]/android.widget.Button')
    #     e.click()
    #     time.sleep(action_delay)
    # except:
    #     pass

    # Type this telco's prefix into the first EditText on the screen.
    el3 = driver.find_element(by=AppiumBy.CLASS_NAME, value='android.widget.EditText')
    el3.send_keys(telcos_prefix[telco])
    time.sleep(action_delay)
    # Keycode 4 is the back key; presumably this dismisses the on-screen
    # keyboard so the product list is visible -- TODO confirm.
    driver.press_keycode(4)
    time.sleep(action_delay)

    # One DataFrame per captured screen; handed to process_toko_paket below.
    rows = []
    # NOTE(review): the loop only ends once the page source contains
    # 'Mau catat'; if that text never appears it keeps swiping indefinitely.
    while True:
        source = driver.page_source
        soup = bs(source, 'html')
        # For every element whose `text` attribute starts with 'Rp', collect
        # all text="..." matches found in the string form of its parent; each
        # parent becomes one row with positional columns 0, 1, 2, ...
        new_rows = pd.concat([pd.Series(re.findall(r'text="(.+)"', str(i.parent))) for i in list(soup.find_all(attrs={'text': re.compile(r'^Rp.+$')}))], axis=1).T
        rows.append(new_rows)
        # 'Mau catat' is treated as the end-of-list marker (presumably text
        # shown below the last product -- verify against the app).
        if bool(re.search('Mau catat', source)):
            break
        # Drag upwards to reveal the next part of the list.
        swipe(driver, (500, 1400), (500, 800))
        time.sleep(swipe_down_delay)
    results.append(process_toko_paket(rows, telco, p))
        
    # Back key (keycode 4) to leave the Paket Data screen before the next telco.
    driver.press_keycode(4)
    time.sleep(action_delay)


# --- PLN
p = 'PLN'
print('processing PLN')
# Open the PLN screen by tapping its fixed screen coordinate.
# [i for i in driver.find_elements(by=AppiumBy.CLASS_NAME, value='android.view.View') if 'PLN' in i.text][0].click()
click(driver, 550, 770)
time.sleep(action_delay)
# Keycode 4 is the back key; presumably this dismisses the keyboard or a
# popup shown on entry -- TODO confirm.
driver.press_keycode(4)
time.sleep(action_delay)

# One DataFrame per captured screen; handed to process_toko_pln below.
rows = []
# NOTE(review): the loop only ends once the page source contains 'Mau catat';
# if that text never appears it keeps swiping indefinitely.
while True:
    source = driver.page_source
    soup = bs(source, 'html')
    # For every element whose `text` attribute starts with 'Rp', collect all
    # text="..." matches found in the string form of its parent; each parent
    # becomes one row with positional columns 0, 1, 2, ...
    new_rows = pd.concat([pd.Series(re.findall(r'text="(.+)"', str(i.parent))) for i in list(soup.find_all(attrs={'text': re.compile(r'^Rp.+$')}))], axis=1).T
    rows.append(new_rows)
    # 'Mau catat' is treated as the end-of-list marker (presumably text shown
    # below the last product -- verify against the app).
    if bool(re.search('Mau catat', source)):
        break
    # Drag upwards to reveal the next part of the list.
    swipe(driver, (500, 1400), (500, 800))
    time.sleep(swipe_down_delay)
results.append(process_toko_pln(rows))
# Back key (keycode 4) to leave the PLN screen.
driver.press_keycode(4)
time.sleep(action_delay)


# -- EWALLET
# Key = 1-based position of the brand in the brand picker's list (used as the
# XPath index below); value = brand name written to the results.
ewallet_dict = {1:'GoPay Dompet Digital', 2:'Saldo Driver Gojek', 3:'LinkAja', 4:'DANA Dompet Digital', 5:'OVO Dompet Digital'}
print('processing EWALLET')
for i, ewallet_brand in ewallet_dict.items():
    # Start every brand with an empty buffer. Previously `rows` was created
    # once before the loop, so each brand was processed together with all
    # earlier brands' rows and those rows were relabelled with the wrong brand.
    rows = []

    # click "Lihat Semua"
    click(driver, 117, 1050)
    time.sleep(action_delay)

    # Keycode 4 is the back key; presumably this dismisses the keyboard or a
    # popup shown on entry -- TODO confirm.
    driver.press_keycode(4)
    time.sleep(action_delay)

    # open the brand list
    driver.find_element(by=AppiumBy.XPATH, value="/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.RelativeLayout/android.webkit.WebView/android.webkit.WebView/android.view.View/android.view.View[2]/android.view.View/android.view.View[1]/android.widget.EditText").click()
    time.sleep(action_delay)
    # select the brand (the i-th entry of the list)
    driver.find_element(by=AppiumBy.XPATH, value=f"/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.RelativeLayout/android.webkit.WebView/android.webkit.WebView/android.view.View[2]/android.app.Dialog/android.view.View[2]/android.widget.ListView/android.view.View[{i}]/android.view.View/android.view.View/android.widget.TextView").click()
    time.sleep(action_delay)
    # click the sku list
    driver.find_element(by=AppiumBy.XPATH, value="/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.RelativeLayout/android.webkit.WebView/android.webkit.WebView/android.view.View/android.view.View[3]/android.view.View/android.view.View[1]/android.widget.EditText").click()
    time.sleep(action_delay)

    # scrape
    # There is no end-of-list detection here: capture the page three times,
    # swiping after each capture, then go back.
    for capture_number in range(3):
        source = driver.page_source
        soup = bs(source, 'html')
        # For every element whose `text` attribute starts with 'Rp', collect
        # all text="..." matches found in the string form of its parent; each
        # parent becomes one row with positional columns 0, 1, 2, ...
        new_rows = pd.concat([pd.Series(re.findall(r'text="(.+)"', str(el.parent))) for el in list(soup.find_all(attrs={'text': re.compile(r'^Rp.+$')}))], axis=1).T
        rows.append(new_rows)
        swipe(driver, (500, 1400), (500, 800))
        time.sleep(swipe_down_delay)
    results.append(process_toko_ewallet(rows, ewallet_brand))

    # Press back (keycode 4) three times to unwind the screens opened above
    # before handling the next brand.
    driver.press_keycode(4)
    time.sleep(action_delay)
    driver.press_keycode(4)
    time.sleep(action_delay)
    driver.press_keycode(4)
    time.sleep(action_delay)
    


processing Pulsa




processing Paket Data
Paket Data smartfren




Paket Data telkomsel




Paket Data im3




Paket Data xl




Paket Data axis




Paket Data 3




processing PLN




processing EWALLET




In [21]:
# Columns exported to the final table, in output order.
output_columns = ['SKU', 'base_price', 'disc_price', 'brand', 'category', 'note', 'platform']

# Stack every per-product result, drop exact duplicate rows, tag the source
# platform and keep only the exported columns.
results_df = pd.concat(results).drop_duplicates().assign(platform='tokopedia')[output_columns]

In [22]:
from datetime import date
from pathlib import Path

# Date stamp used in the output file name, e.g. 2024_01_31.
today = date.today().strftime("%Y_%m_%d")
# to_csv does not create missing directories, so make sure the relative
# `result/` folder exists before writing.
Path('result').mkdir(parents=True, exist_ok=True)
results_df.to_csv(f'result/tokopedia_scrape_{today}.csv')