<a href="https://colab.research.google.com/github/socialx-analytics/bi-11-sept-25/blob/main/002_scraping_inflasi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install & import library
# !pip -q install beautifulsoup4 lxml

import re, time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from google.colab import files


In [None]:
# Set the URL of the inflation-data page to scrape (www.bi.go.id).
url = "https://www.bi.go.id/id/statistik/indikator/data-inflasi.aspx"
# Echo the value as the cell output.
url

'https://www.bi.go.id/id/statistik/indikator/data-inflasi.aspx'

In [None]:
# Shared HTTP session with automatic retries and backoff (more robust than one-shot requests).
session = requests.Session()
# Do not pick up proxy settings from the environment; per the original note, the Colab
# proxy environment can make requests flaky.
session.trust_env = False

# Retry policy: up to 6 attempts overall (and per connect/read error), retrying GET and
# POST on the listed HTTP status codes and honouring any Retry-After header.
retry_options = {
    "total": 6,
    "connect": 6,
    "read": 6,
    "backoff_factor": 1.5,
    "status_forcelist": [429, 500, 502, 503, 504, 521, 522, 524],
    "allowed_methods": frozenset(["GET", "POST"]),
    "respect_retry_after_header": True,
}
retry = Retry(**retry_options)

# One adapter instance serves both schemes so they share the same retry policy and pool sizes.
adapter = HTTPAdapter(pool_connections=10, pool_maxsize=10, max_retries=retry)
for scheme in ("http://", "https://"):
    session.mount(scheme, adapter)

# Header profiles tried in order by fetch(); later entries are fallbacks for when the
# server rejects or drops a request sent with an earlier profile.
HEADER_PROFILES = [
    {
        # Primary profile: Linux Chrome, no caching, connection closed after each request.
        # No Accept-Encoding here, so requests supplies its own default.
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "close",
    },
    {
        # Fallback profile: keep-alive + explicit Accept-Encoding + Referer.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "id,en-US;q=0.9,en;q=0.8",
        # "br" is intentionally NOT advertised: Brotli responses can only be decoded when
        # an optional brotli package is installed. Without it the body would arrive still
        # compressed and the HTML parser would silently find no table.
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Referer": "https://www.bi.go.id/id/statistik/indikator/default.aspx",
    },
]

# Timeout tuple handed to requests as (connect, read); the read timeout is deliberately long.
TIMEOUT = (30, 180)

def fetch(method, url, *, data=None):
    """
    Send a GET or POST through the shared ``session``, falling back across header profiles.

    Each entry of ``HEADER_PROFILES`` is tried in order until one request succeeds
    (i.e. ``raise_for_status()`` does not raise). The original intent, per the author's
    note, is to reduce 'RemoteDisconnected' failures from the ASP.NET site / load balancer.

    Parameters
    ----------
    method : str
        "GET" (case-insensitive) issues a GET; any other value issues a POST.
    url : str
        Target URL.
    data : optional, keyword-only
        Form payload forwarded to ``session.post``; ignored for GET.

    Returns
    -------
    The first successful response object returned by the session.

    Raises
    ------
    ValueError
        If ``HEADER_PROFILES`` is empty (previously this surfaced as a confusing
        ``TypeError`` from ``raise None``).
    requests.RequestException
        The error from the last profile when every profile fails. Non-network errors
        (programming mistakes) are no longer swallowed and propagate immediately.
    """
    if not HEADER_PROFILES:
        raise ValueError("HEADER_PROFILES is empty; at least one header profile is required")

    is_get = str(method).upper() == "GET"
    total = len(HEADER_PROFILES)
    last_exc = None
    for i, h in enumerate(HEADER_PROFILES, 1):
        try:
            if is_get:
                resp = session.get(url, headers=h, timeout=TIMEOUT, allow_redirects=True)
            else:
                resp = session.post(url, headers=h, data=data, timeout=TIMEOUT, allow_redirects=True)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            last_exc = e
            if i == total:
                # Nothing left to fall back to: do not claim a retry and do not sleep.
                print(f"[fetch] Profil header {i} gagal: {type(e).__name__} -> semua profil gagal")
                break
            print(f"[fetch] Profil header {i} gagal: {type(e).__name__} -> retry profil berikutnya ...")
            # Short, slightly growing pause before trying the next profile.
            time.sleep(1.0 + 0.5 * i)
    # Every profile failed: re-raise the most recent request error.
    raise last_exc


In [None]:
# Fetch the page content (first page) with a GET request through the fetch() wrapper.
response = fetch("GET", url)
# Echo the response object as the cell output.
response


<Response [200]>

In [None]:
# Parse the HTML of the first page.
soup = BeautifulSoup(response.content, "lxml")
# Show only the <title> as a sanity check; echoing the whole document floods the cell
# output and bloats the saved notebook.
soup.title


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html dir="ltr" lang="en-US">
<head><meta content="Bank Indonesia, BI, Rupiah, kurs BI, kurs tengah BI, kalkulator kurs, inflasi, kurs transaksi, JISDOR, PINTAR BI, BI Fast, bank sentral, QRIS, uang Rupiah, SIMODIS, Sobat Rupiah, Cadangan Devisa, Gubernur Bank Indonesia, uang, bank, CBP, Cinta Bangga Rupiah, Cinta Rupiah, JIBOR, IndONIA" name="keywords"/><meta content="index, follow" name="robots"/><meta content="Bank Indonesia" name="Author"/><meta content="Microsoft SharePoint" name="GENERATOR"/><meta content="text/html; charset=utf-8" http-equiv="Content-type"/><meta content="IE=11; IE=10" http-equiv="X-UA-Compatible"/><meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/><meta content="width=device-width, initial-scale=1" name="viewport"/><meta content="IE=10" http-equiv="X-UA-Compatible"/><script type="text/javascript">var _browserisFlight = true;</script><meta co

In [None]:
# Ambil elemen data tabel & helper untuk pagination ASP.NET

def parse_table(soup):
    """
    Extract the data rows of the inflation table from a parsed page.

    Looks up the <table> whose class attribute is
    "table table-striped table-no-bordered table-lg", skips its first <tr> (header),
    and for every remaining row with at least two <td> cells returns a dict with the
    stripped text of the first cell under "Tanggal" and of the second under
    "Data Inflasi". Returns an empty list when the table is not found.
    """
    table = soup.find("table", {"class": "table table-striped table-no-bordered table-lg"})
    if not table:
        return []

    # One list of <td> cells per body row (the first <tr> is the header).
    cells_per_row = (tr.find_all("td") for tr in table.find_all("tr")[1:])
    return [
        {
            "Tanggal": cells[0].get_text(strip=True),
            "Data Inflasi": cells[1].get_text(strip=True),
        }
        for cells in cells_per_row
        if len(cells) >= 2
    ]

def get_hidden(soup, name):
    """
    Return the ``value`` attribute of the <input> element whose ``name`` equals `name`.

    Returns an empty string when no such input exists or it has no ``value`` attribute.
    """
    field = soup.find("input", {"name": name})
    if not field or not field.has_attr("value"):
        return ""
    return field["value"]

def next_page_target(soup):
    """
    Find the __doPostBack event target that leads to the next page.

    Steps: locate the pager <span> whose id matches "DataPagerDataInflasi", read the
    current page number from the <span> with class "page-link--custom active", then
    take the first "pagination-list" anchor labelled current+1 and return the first
    argument of the __doPostBack('...') call in its href.

    Returns None when the pager, the active page, a numeric page label, the next-page
    anchor, or the __doPostBack call cannot be found.
    """
    pager = soup.find("span", id=re.compile("DataPagerDataInflasi"))
    if not pager:
        return None

    current = pager.find("span", {"class": "page-link--custom active"})
    if not current:
        return None

    try:
        wanted_label = str(int(current.get_text(strip=True)) + 1)
    except ValueError:
        return None

    # Only the first anchor carrying the wanted label is considered.
    next_link = next(
        (
            anchor
            for anchor in pager.find_all("a", class_="pagination-list")
            if anchor.get_text(strip=True) == wanted_label
        ),
        None,
    )
    if next_link is None:
        return None

    found = re.search(r"__doPostBack\('([^']+)'", next_link.get("href", ""))
    if found:
        return found.group(1)
    return None


In [None]:
# Scrape every page by replaying the ASP.NET pager postbacks.
all_rows = []

# Page 1 comes from the already-parsed `soup`. A separate cursor (`page_soup`) is used
# so the page-1 `soup` / `response` are never overwritten; that keeps this cell
# idempotent (re-running it always starts again from page 1).
print("Scraping page 1 ...")
page_soup = soup
page_rows = parse_table(page_soup)
all_rows.extend(page_rows)

# Following pages
page = 1
while True:
    target = next_page_target(page_soup)
    if not target:
        break

    # Postback form fields; the hidden state values must come from the page currently shown.
    payload = {
        "__EVENTTARGET": target,
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__VIEWSTATE": get_hidden(page_soup, "__VIEWSTATE"),
        "__VIEWSTATEGENERATOR": get_hidden(page_soup, "__VIEWSTATEGENERATOR"),
        "__EVENTVALIDATION": get_hidden(page_soup, "__EVENTVALIDATION"),
    }

    page += 1
    print(f"Scraping page {page} ...")
    page_response = fetch("POST", url, data=payload)
    page_soup = BeautifulSoup(page_response.content, "lxml")
    new_rows = parse_table(page_soup)

    # Stall guard: if the server answered with the same rows again (e.g. the postback
    # was ignored), the pager would keep pointing at the same "next" page forever.
    if new_rows == page_rows:
        print(f"Page {page} returned the same rows as the previous page; stopping.")
        break

    page_rows = new_rows
    all_rows.extend(page_rows)
    time.sleep(0.5)  # small pause to be polite to the server


Scraping page 1 ...
Scraping page 2 ...
Scraping page 3 ...
Scraping page 4 ...
Scraping page 5 ...
Scraping page 6 ...
Scraping page 7 ...
Scraping page 8 ...
Scraping page 9 ...
Scraping page 10 ...


In [None]:
# Build a DataFrame from the scraped rows (one row per dict collected in all_rows).
df = pd.DataFrame(all_rows)
# Echo the frame as the cell output.
df

Unnamed: 0,Tanggal,Data Inflasi
0,Agustus 2025,2.31 %
1,Juli 2025,2.37 %
2,Juni 2025,1.87 %
3,Mei 2025,1.6 %
4,April 2025,1.95 %
...,...,...
95,September 2017,3.72 %
96,Agustus 2017,3.82 %
97,Juli 2017,3.88 %
98,Juni 2017,4.37 %


In [None]:
# Save the DataFrame to CSV and trigger an automatic download via google.colab's `files` helper.
filename = "data_inflasi_bi.csv"
# index=False leaves the DataFrame index out of the file; "utf-8-sig" writes a UTF-8 BOM
# at the start of the file (presumably so spreadsheet tools detect the encoding).
df.to_csv(filename, index=False, encoding="utf-8-sig")
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>