# In [16]:
import os
import platform
import pandas as pd
import numpy as np
import time
import threading
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()

# for logging
import sys
import logging
import datetime
from logging.handlers import TimedRotatingFileHandler

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException

#logging
class PrintLogger:
    """Tee-style replacement for ``sys.stdout``.

    Everything written to an instance is forwarded both to the stream that
    was ``sys.stdout`` when the instance was created and to ``log``.
    """

    def __init__(self, log):
        """Remember the current ``sys.stdout`` and the additional log stream.

        Args:
            log: Object exposing ``write(str)`` (and optionally ``flush()``),
                e.g. the open stream of a logging file handler.
        """
        self.terminal = sys.stdout
        self.log = log

    def write(self, message):
        """Write ``message`` to the terminal stream and to the log stream."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams.

        A no-op here would leave buffered text unwritten whenever a caller
        flushes ``sys.stdout`` (``print(..., flush=True)``, interpreter
        shutdown). Streams without ``flush`` or already closed are skipped so
        flushing never raises on their account.
        """
        for stream in (self.terminal, self.log):
            if getattr(stream, "closed", False):
                continue
            flush = getattr(stream, "flush", None)
            if flush is not None:
                flush()

# Date stamp used in the log file name; evaluated once, when this line runs.
current_date = datetime.datetime.now().strftime("%Y-%m-%d")


def setup_logging():
    """Attach a rotating file handler to the root logger and tee ``print`` output.

    The handler writes to ``./loggings/verval_scrape_<current_date>.log``,
    rotates at midnight and keeps 30 backups. ``sys.stdout`` is replaced by a
    ``PrintLogger`` so everything printed also lands in the handler's stream.

    The log directory is created when missing: the handler opens its file
    immediately and would otherwise fail with ``FileNotFoundError``.
    """
    log_formatter = logging.Formatter("%(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S")
    log_file = f'./loggings/verval_scrape_{current_date}.log'
    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    log_handler = TimedRotatingFileHandler(log_file, when="midnight", interval=1, backupCount=30, utc=False)
    log_handler.setFormatter(log_formatter)
    log_handler.setLevel(logging.DEBUG)
    logger = logging.getLogger()
    logger.addHandler(log_handler)

    # NOTE(review): PrintLogger keeps a reference to the stream that is open
    # right now; if the handler ever rolls over it opens a new stream and this
    # reference goes stale - confirm whether that matters for long runs.
    sys.stdout = PrintLogger(log_handler.stream)


setup_logging()


# Timeout limit for WebDriverWait.
timeout_limit = 5

# Maximum number of attempts for a single scrape before giving up.
max_retries = 3

os_system = platform.system()
print('OS SYSTEM:   ', os_system)

# Number of worker threads: one per CPU core. os.cpu_count() returns None when
# the count cannot be determined, which would break the batching and thread
# creation further down, so fall back to a single worker in that case.
num_threads = os.cpu_count() or 1
print(f'CPU CORES:    {num_threads}')

# Path to the chromedriver binary used to operate the Chrome browser.
# Previously used driver version: 'v114_0_5735_90'
chrome_version = 'v118.0.5993.70'
if os_system == 'Windows':
    chrome_path = os.path.join('webdriver', 'chrome', os_system, chrome_version, 'chromedriver.exe')
elif os_system == 'Linux':
    chrome_path = os.path.join('webdriver', 'chrome', os_system, chrome_version, 'chromedriver')
else:
    # Every other platform is treated as macOS.
    chrome_path = os.path.join('webdriver', 'chrome', 'MacOS', chrome_version, 'chromedriver')

print('CHROME PATH:    ', chrome_path)
# Chrome command-line switches for the webdriver, added in the order listed.
chrome_options = Options()
for chrome_flag in (
    '--headless',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-setuid-sandbox',
    # '--disable-dev-shm-usage' (overcomes limited resource problems) stays disabled
    "start-maximized",  # open the browser in maximized mode
    "--disable-extensions",  # disable extensions
    "--disable-blink-features=AutomationControlled",
):
    chrome_options.add_argument(chrome_flag)

def driversetup():
    """Start a Chrome WebDriver session.

    Returns:
        A ``webdriver.Chrome`` instance launched through the chromedriver
        executable at ``chrome_path`` with the module-level ``chrome_options``.
    """
    return webdriver.Chrome(
        service=Service(executable_path=chrome_path),
        options=chrome_options,
    )


# Browser session used by the scraping code below.
driver = driversetup()


"""## get provinsi"""

# NOTE: the province scrape below is commented out, so this section only
# prints a message and records a start time. The kabupaten/kota section reads
# the province URLs from test_province_list.csv in the verval_yayasan folder,
# which is the file the disabled code would write.
#get provinsi
print('getting provices')
start_province = time.time()
# driver.get('https://vervalyayasan.data.kemdikbud.go.id/index.php/Chome/rekapitulasi?kode_wilayah=000000')

# #to show all link (dropdown)
# dropdown_container = driver.find_element(By.CLASS_NAME, 'dataTables_length')
# select = dropdown_container.find_element(By.TAG_NAME, 'select')
# dropdown = Select(select)
# dropdown.select_by_value('-1')

# province_list = []
# province_urls = []

# urls_elements = driver.find_element(By.TAG_NAME, 'tbody').find_elements(By.XPATH, "tr/td/a")

# for url in urls_elements:
#   province_list.append(url.get_attribute('innerHTML'))
#   province_urls.append(url.get_attribute('href'))

# df_provinces = pd.DataFrame({'province': province_list, 'urls': province_urls})
# df_provinces.to_csv('./verval_yayasan/test_province_list.csv', index=False)
# print(f'province done in: {time.time() - start_province} seconds')


"""## kabupaten/kota links multithreading"""
# Start time of this section; the elapsed time is printed once the
# kabupaten/kota CSV has been written.
start_kabupaten = time.time()
print('getting kabupaten/kota')


# The threads started by main_kab_kota all use the single module-level
# `driver` and append to the same result lists. This lock lets only one thread
# at a time drive the browser and publish its rows, so page loads are not
# interleaved and kab_kota_names / kab_kota_urls stay aligned index by index.
_kab_kota_lock = threading.Lock()


def get_kab_kota(url):
    """Collect the kabupaten/kota links listed on one or more province pages.

    For each page the DataTables length dropdown is switched to the option
    with value ``-1`` (show all links), then the ``innerHTML`` and ``href`` of
    every link in the table body are appended to ``kab_kota_names`` and
    ``kab_kota_urls``. The number of links found on the page is appended to
    ``kabupaten_len``.

    A page is attempted up to ``max_retries`` times with a 3 second pause
    between attempts; failures are printed, never raised.

    Args:
        url: A province page URL, or an iterable of such URLs (main_kab_kota
            hands every thread a whole batch).

    Returns:
        None. Results are published through the module-level lists.
    """
    # A batch of URLs: process its members one by one.
    if not isinstance(url, str):
        for single_url in url:
            get_kab_kota(single_url)
        return None

    for attempt in range(max_retries):
        try:
            with _kab_kota_lock:
                driver.get(url)

                # Navigate the dropdown so the table shows all links.
                dropdown_container = driver.find_element(By.CLASS_NAME, 'dataTables_length')
                select = dropdown_container.find_element(By.TAG_NAME, 'select')
                Select(select).select_by_value('-1')

                links = driver.find_element(By.TAG_NAME, 'tbody').find_elements(By.XPATH, "tr/td/a")

                # Read everything before touching the shared lists: a failure
                # half-way through must not leave partial rows behind that the
                # next attempt would then duplicate.
                names = [link.get_attribute('innerHTML') for link in links]
                hrefs = [link.get_attribute('href') for link in links]

                kabupaten_len.append(len(links))
                kab_kota_names.extend(names)
                kab_kota_urls.extend(hrefs)
            return None

        except Exception as e:
            print(f'province url: {url}, error: {e}, retry: {attempt}')
            if attempt == max_retries - 1:
                print(f'province with url {url} was failed to be get')
            else:
                time.sleep(3)
    return None


def main_kab_kota(province_urls):
    """Scrape all province pages with a group of worker threads.

    The URLs are split into contiguous batches, one per thread; every thread
    calls ``get_kab_kota`` once for each URL in its batch. The function
    returns after all threads have finished.

    Args:
        province_urls: Sequence of province page URLs (list, array or Series).

    Returns:
        None. ``get_kab_kota`` stores what it finds in module-level lists.
    """
    print('in main_kab_kota')

    def scrape_batch(batch):
        # get_kab_kota is written for a single page, so feed it one URL at a time.
        for province_url in batch:
            get_kab_kota(province_url)

    # A plain list keeps np.array_split independent of the container type.
    url_list = list(province_urls)

    # Never start more threads than there are URLs, and always at least one
    # (num_threads is None when os.cpu_count() cannot determine the core count).
    worker_count = max(1, min(num_threads or 1, len(url_list)))
    province_batches = np.array_split(url_list, worker_count)

    threads = [
        threading.Thread(target=scrape_batch, args=(batch,))
        for batch in province_batches
    ]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()
    return None

# Result accumulators filled in by get_kab_kota while the threads run.
kab_kota_names = []
kab_kota_urls = []
kabupaten_len = []

# Build the input path with os.path.join: a hard-coded backslash separator
# only resolves on Windows, while this script also targets Linux and macOS.
province_urls = pd.read_csv(os.path.join('verval_yayasan', 'test_province_list.csv'))
province_urls = province_urls['urls']

print(len(province_urls))
main_kab_kota(province_urls)


# Persist the collected kabupaten/kota names and links, then report the timing.
df_kab_kota = pd.DataFrame({'kab/kota': kab_kota_names, 'urls': kab_kota_urls})
df_kab_kota.to_csv('./verval_yayasan/kab_kota_list.csv', index=False)
print(f'kabupaten done in: {time.time() - start_kabupaten}')

# Notebook output: KeyboardInterrupt

# In [15]:
kabupaten_len

# Notebook output: []