In [22]:
import requests

# Probe cell: check that a single monthly CSV can be fetched with a
# browser-like session before attempting the bulk download.
session = requests.Session()

# Browser-like headers (fairly minimal, but effective)
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,text/csv,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.weather.gov.sg/climate-historical-daily/",
    "Connection": "keep-alive",
})

# 1) Prime cookies by visiting the landing page first (this is often the key);
#    the response itself is not used.
session.get("https://www.weather.gov.sg/climate-historical-daily/", timeout=15)

# 2) Only then download the CSV
url = "https://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_200901.csv"
r = session.get(url, timeout=15)

# Single source of truth for the preview length, so the printed label can
# never disagree with the slice (it used to say 200 while slicing 500).
PREVIEW_CHARS = 500

print("status:", r.status_code)
print("content-type:", r.headers.get("Content-Type"))
print(f"first {PREVIEW_CHARS} chars:\n", r.text[:PREVIEW_CHARS])

status: 200
content-type: text/csv
first 200 chars:
 Station,Year,Month,Day,Daily Rainfall Total (mm),Highest 30 Min Rainfall (mm),Highest 60 Min Rainfall (mm),Highest 120 Min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
Admiralty,2009,1,1,,,,,,,,,
Admiralty,2009,1,2,,,,,,,,,
Admiralty,2009,1,3,,,,,,,,,
Admiralty,2009,1,4,,,,,,,,,
Admiralty,2009,1,5,,,,,,,,,
Admiralty,2009,1,6,,,,,,,,,
Admiralty,2009,1,


In [9]:
import sys
sys.path.append("..")

import pandas as pd
import requests
import time
from pathlib import Path
from io import StringIO

from src import dataset
import importlib

importlib.reload(dataset)

from src.config import DATA_DIR

# Download configuration: source URL prefix and the local output root.
BASE_URL = "https://www.weather.gov.sg/files/dailydata/"
OUT_BASE = DATA_DIR/'ground_truth'
# parents=True: also create DATA_DIR if it does not exist yet, instead of
# failing with FileNotFoundError on a fresh checkout.
OUT_BASE.mkdir(parents=True, exist_ok=True)

YEARS = range(1980, 2026)  # 1980..2025 inclusive; shrink this first for a sanity check
MONTHS = range(1, 13)

# One shared session so headers and cookies are reused across requests.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Accept": "text/csv,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.weather.gov.sg/climate-historical-daily/",
    "Connection": "keep-alive",
})

# Prime cookies once by visiting the landing page; the response is not used.
session.get("https://www.weather.gov.sg/climate-historical-daily/", timeout=15)

def normalize_df(df: pd.DataFrame, station_name: str) -> pd.DataFrame:
    """Reshape one raw monthly station frame into the ground-truth column layout.

    The frame is first passed through ``dataset.clean_column_names``
    (presumably yielding the snake_case names used below — confirm against
    ``src.dataset``). A ``date`` column is then assembled from the ``year``,
    ``month`` and ``day`` columns, and ``location`` is set to ``station_name``
    on every row.

    Args:
        df: Raw frame as parsed from a downloaded CSV.
        station_name: Value written to the ``location`` column.

    Returns:
        A new frame holding exactly the columns of ``final_cols``, in that order.

    Raises:
        ValueError: If any required source column is missing after name
            cleaning. Errors raised by ``pd.to_datetime`` propagate unchanged.
    """
    df = dataset.clean_column_names(df)

    date_parts = ["year", "month", "day"]
    final_cols = [
        "date",
        "highest_30_min_rainfall_mm",
        "highest_60_min_rainfall_mm",
        "highest_120_min_rainfall_mm",
        "mean_temperature_c",
        "maximum_temperature_c",
        "minimum_temperature_c",
        "mean_wind_speed_kmh",
        "max_wind_speed_kmh",
        "location",
        "daily_rainfall_total_mm",
    ]

    # "date" and "location" are produced here; everything else must already
    # exist. Validate before building the date so that a missing year/month/day
    # is reported through the same descriptive ValueError as any other column,
    # rather than surfacing as a bare KeyError from the column lookup.
    derived = {"date", "location"}
    required = date_parts + [c for c in final_cols if c not in derived]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}. Got: {list(df.columns)}")

    # assign() returns a new frame, so the object handed back by
    # clean_column_names is not given extra columns as a side effect.
    normalized = df.assign(
        date=pd.to_datetime(df[date_parts]),
        location=station_name,
    )
    return normalized[final_cols]

In [12]:
# Download configuration for the bulk fetch below: source URL prefix and the
# local output root.
BASE_URL = "https://www.weather.gov.sg/files/dailydata/"
OUT_BASE = DATA_DIR/"ground_truth"
# parents=True: also create DATA_DIR if it does not exist yet, instead of
# failing with FileNotFoundError on a fresh checkout.
OUT_BASE.mkdir(parents=True, exist_ok=True)

YEARS = range(1980, 2026)   # 1980..2025 inclusive; shrink this first for a sanity check
MONTHS = range(1, 13)

# One shared session so headers and cookies are reused across requests.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Accept": "text/csv,*/*;q=0.8",
    "Referer": "https://www.weather.gov.sg/climate-historical-daily/",
})

# Prime cookies by visiting the landing page; the response is not used.
session.get("https://www.weather.gov.sg/climate-historical-daily/", timeout=15)

# Bulk download: for every station and every (year, month), fetch the monthly
# CSV, normalise it and write it to OUT_BASE/<station_code>/<code>_<YYYYMM>.csv.
#
# NOTE(review): `stations` is not defined in the cells visible here; it is
# assumed to be an iterable of dicts with "station_code" and "station_name"
# keys — confirm it is defined above this cell on a fresh kernel.

# Pause applied after EVERY request. Previously the sleep ran only after a
# successful save, so the many months with no file for a station were
# requested back-to-back with no delay at all.
REQUEST_DELAY_S = 0.25

for st in stations:
    code = st["station_code"]
    name = st["station_name"]

    station_dir = OUT_BASE / code
    # parents=True keeps this step working even if OUT_BASE is not there yet.
    station_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n=== {code} | {name} ===")

    for y in YEARS:
        for m in MONTHS:
            ym = f"{y}{m:02d}"
            url = f"{BASE_URL}DAILYDATA_{code}_{ym}.csv"

            try:
                r = session.get(url, timeout=15)
                # Skip months with no usable file: a non-200 status or a body
                # shorter than 50 characters.
                if r.status_code != 200 or len(r.text) < 50:
                    continue

                # NOTE(review): r.text is already decoded by requests, so the
                # encoding argument most likely has no effect on this
                # in-memory text buffer — confirm before relying on it.
                df_raw = pd.read_csv(StringIO(r.text), encoding="utf-8-sig")
                df_final = normalize_df(df_raw, name)

                out_file = station_dir / f"{code}_{ym}.csv"
                df_final.to_csv(out_file, index=False)

                print("OK", out_file.name)

            except Exception as e:
                # Best effort: report the failing station/month and carry on.
                print("ERR", code, ym, e)

            finally:
                # Runs on success, on `continue` and on error alike, so the
                # request rate stays bounded in every case.
                time.sleep(REQUEST_DELAY_S)


=== S104 | Admiralty ===
OK S104_200901.csv
OK S104_200902.csv
OK S104_200903.csv
OK S104_200904.csv
OK S104_200905.csv
OK S104_200906.csv
OK S104_200907.csv
OK S104_200908.csv
OK S104_200909.csv
OK S104_200910.csv
OK S104_200911.csv
OK S104_200912.csv
OK S104_201001.csv
OK S104_201002.csv
OK S104_201003.csv
OK S104_201004.csv
OK S104_201005.csv
OK S104_201006.csv
OK S104_201007.csv
OK S104_201008.csv
OK S104_201009.csv
OK S104_201010.csv
OK S104_201011.csv
OK S104_201012.csv
OK S104_201101.csv
OK S104_201102.csv
OK S104_201103.csv
OK S104_201104.csv
OK S104_201105.csv
OK S104_201106.csv
OK S104_201107.csv
OK S104_201108.csv
OK S104_201109.csv
OK S104_201110.csv
OK S104_201111.csv
OK S104_201112.csv
OK S104_201201.csv
OK S104_201202.csv
OK S104_201203.csv
OK S104_201204.csv
OK S104_201205.csv
OK S104_201206.csv
OK S104_201207.csv
OK S104_201208.csv
OK S104_201209.csv
OK S104_201210.csv
OK S104_201211.csv
OK S104_201212.csv
OK S104_201301.csv
OK S104_201302.csv
OK S104_201303.csv
OK S1