In [1]:
pip install pandas requests tqdm

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import sys
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from time import sleep

import pandas as pd
import requests

# garak.co.kr public-data endpoints: the JSON variant and the XML variant.
BASE_JSON = "http://www.garak.co.kr/homepage/publicdata/dataJsonOpen.do"
BASE_XML  = "http://www.garak.co.kr/homepage/publicdata/dataOpen.do"

# Query parameters sent with every request.
# NOTE(security): the API id/password used to be hard-coded only. They can now
# be supplied through the GARAK_ID / GARAK_PASSWD environment variables; the
# literals below remain purely as backward-compatible defaults and should be
# rotated and removed from source control.
COMMON = {
    "id": os.environ.get("GARAK_ID", "4655"),
    "passwd": os.environ.get("GARAK_PASSWD", "wanna1102!"),
    "dataid": "data22",           # post-settlement (inbound volume)
    "pagesize": "1000",
    "portal.templet": "false",
}
# Request headers sent with every call.
HEADERS = {"User-Agent": "Mozilla/5.0", "Accept-Charset": "euc-kr"}

# Inclusive collection window, as YYYYMMDD strings.
START, END = "20210101", "20231231"

def daterange(a, b):
    """Yield each calendar day from ``a`` through ``b`` (inclusive) as ``YYYYMMDD``.

    Both bounds are ``YYYYMMDD`` strings. Nothing is yielded when ``a`` is
    later than ``b``.
    """
    fmt = "%Y%m%d"
    first = datetime.strptime(a, fmt).date()
    last = datetime.strptime(b, fmt).date()
    day_count = (last - first).days + 1
    for offset in range(day_count):
        yield (first + timedelta(days=offset)).strftime(fmt)

def parse_json_payload(obj):
    """Extract the row dicts from a decoded JSON payload.

    Rows are read from the ``resultData`` key; when that key is missing or
    empty, the ``list`` key is used instead. String values are stripped of
    surrounding whitespace; other values are passed through unchanged.

    Args:
        obj: The decoded JSON response. Anything other than a dict yields
            no rows instead of raising.

    Returns:
        A list of new dicts, one per row. Entries that are not dicts are
        skipped. Returns an empty list when no rows are found.
    """
    if not isinstance(obj, dict):
        return []
    # Fall back to "list" when "resultData" is absent *or* empty.
    rows = obj.get("resultData") or obj.get("list")
    if not isinstance(rows, (list, tuple)):
        return []
    return [
        {k: (v.strip() if isinstance(v, str) else v) for k, v in row.items()}
        for row in rows
        if isinstance(row, dict)
    ]

def _warn_fetch_failure(date_str, page, source, exc):
    """Report a failed page request on stderr instead of dropping it silently."""
    print(f"[fetch_day] {date_str} page {page}: {source} request failed: {exc!r}",
          file=sys.stderr)


def _fetch_json_rows(params, date_str, page):
    """Return the rows from the JSON endpoint, or [] if the request or parse fails."""
    try:
        resp = requests.get(BASE_JSON, params=params, headers=HEADERS, timeout=20)
        resp.encoding = "euc-kr"
        return parse_json_payload(resp.json())
    except Exception as exc:  # best-effort: the caller falls back to XML
        _warn_fetch_failure(date_str, page, "JSON", exc)
        return []


def _fetch_xml_rows(params, date_str, page):
    """Return the rows from the XML endpoint, or [] if the request or parse fails."""
    try:
        resp = requests.get(BASE_XML, params=params, headers=HEADERS, timeout=20)
        resp.encoding = "euc-kr"
        root = ET.fromstring(resp.text)
        # Each <list> element is one record; its children are the fields.
        return [
            {child.tag: (child.text or "").strip() for child in item}
            for item in root.findall(".//list")
        ]
    except Exception as exc:  # best-effort: treated as "no rows" by the caller
        _warn_fetch_failure(date_str, page, "XML", exc)
        return []


def fetch_day(date_str, max_pages=50):
    """Collect every row for one day, paging until an empty page is returned.

    Each page is requested from the JSON endpoint first; if that yields no
    rows, the XML endpoint is tried with the same parameters. Paging stops at
    the first page for which both endpoints yield nothing. Request and parse
    failures are reported on stderr and treated as "no rows", so a failure
    ends the day early rather than raising.

    Args:
        date_str: The day to query, passed to the API as the ``date`` parameter.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        A list of row dicts, each tagged with a ``"DATE"`` key set to
        ``date_str``.
    """
    all_rows = []
    for page in range(1, max_pages + 1):
        params = dict(COMMON, date=date_str, pageidx=str(page))

        # 1) Try JSON; 2) fall back to XML when JSON produced nothing.
        rows = _fetch_json_rows(params, date_str, page)
        if not rows:
            rows = _fetch_xml_rows(params, date_str, page)

        if not rows:  # nothing more on this page
            break

        for row in rows:
            row["DATE"] = date_str
            all_rows.append(row)

        sleep(0.2)  # small pause between page requests
    else:
        # The loop ran out of pages without ever seeing an empty one, so more
        # data may exist beyond max_pages.
        if all_rows:
            print(f"[fetch_day] {date_str}: reached max_pages={max_pages}; "
                  "results may be truncated", file=sys.stderr)
    return all_rows

def main():
    """Fetch every day in the START..END window and write the result to CSV.

    Prints a notice and returns without writing anything when no rows were
    collected. Otherwise the known numeric columns are converted to numbers,
    the preferred columns are moved to the front, and the frame is saved as
    a UTF-8 (with BOM) CSV file.
    """
    records = [row for day in daterange(START, END) for row in fetch_day(day)]

    if not records:
        print("수집된 레코드가 없습니다. (resultData/list 키 및 엔드포인트 재확인 요망)")
        return

    df = pd.DataFrame(records)

    # Numeric conversion: drop thousands separators and non-breaking spaces,
    # then coerce anything unparsable to NaN.
    numeric_candidates = ("SUM_TOT", "CORP_CD_1", "CORP_CD_2", "CORP_CD_3",
                          "CORP_CD_4", "CORP_CD_5", "CORP_CD_6")
    for col in numeric_candidates:
        if col not in df.columns:
            continue
        cleaned = df[col].astype(str)
        cleaned = cleaned.str.replace(",", "", regex=False)
        cleaned = cleaned.str.replace("\u00a0", "", regex=False)
        df[col] = pd.to_numeric(cleaned.str.strip(), errors="coerce")

    # Column ordering: preferred columns first (when present), the rest after.
    preferred = ["DATE", "BURYU", "SORT_CD", "ITM_CD", "ITM_NM", "PUM_CD", "PUM_NM",
                 "CORP_CD_1", "CORP_CD_2", "CORP_CD_3", "CORP_CD_4", "CORP_CD_5",
                 "CORP_CD_6", "SUM_TOT"]
    leading = [c for c in preferred if c in df.columns]
    trailing = [c for c in df.columns if c not in preferred]
    df = df[leading + trailing]

    out = "garak_inbound_2021_2023.csv"
    df.to_csv(out, index=False, encoding="utf-8-sig")
    print(f"완료: {out} (rows={len(df):,}, cols={df.shape[1]})")

# Run the full collection only when executed directly, not when imported.
if __name__ == "__main__":
    main()
