In [103]:
import os
import sys
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Optional

import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError


In [104]:
df_yellow = pd.read_parquet(r"C:\Users\Robert\Documents\bootcamp\yellow_tripdata_2025-09.parquet")
print(df_yellow.columns.tolist())

df_green = pd.read_parquet(r"C:\Users\Robert\Documents\bootcamp\green_tripdata_2025-09.parquet")
print(df_green.columns.tolist())

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee', 'cbd_congestion_fee']
['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge', 'cbd_congestion_fee']


In [105]:
LOG_LEVEL = logging.INFO
logging.basicConfig(
    level=LOG_LEVEL,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)]
)

logger = logging.getLogger("data_extraction_pipeline")

In [None]:
DATA_SOURCES = [
    {
        "name": "yellow_trip",
        "path": r"C:\Users\Robert\Documents\bootcamp\yellow_tripdata_2025-09.parquet",  # ganti dengan path sebenarnya
        "format": "parquet",                        # atau "csv"
        "date_column": "tpep_pickup_datetime",           # ganti sesuai kolom di file
        "target_table": "public.yellow_trip"
    },
    {
        "name": "green_trip",
        "path": r"C:\Users\Robert\Documents\bootcamp\green_tripdata_2025-09.parquet",   # ganti dengan path sebenarnya
        "format": "parquet",
        "date_column": "lpep_pickup_datetime",           # ganti sesuai kolom di file
        "target_table": "public.green_trip"
    }
]

# --- Konfigurasi koneksi PostgreSQL (contoh) ---
# Format: postgresql://username:password@host:port/database
POSTGRES_CONN_STR = os.getenv(
    "POSTGRES_CONN_STR",
    "postgresql://postgres:obet@127.0.0.1:5432/my_database"  # ganti
)

USE_BIGQUERY = False


In [None]:
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

def get_postgres_engine(conn_str: str):
    try:
        engine = create_engine(conn_str)
        # Tes koneksi ringan
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
        logger.info("Koneksi ke PostgreSQL berhasil.")
        return engine
    except SQLAlchemyError as e:
        logger.error(f"Gagal konek ke PostgreSQL: {e}")
        raise




def read_source_file(path: str, file_format: str) -> Optional[pd.DataFrame]:
    if not os.path.exists(path):
        logger.error(f"File tidak ditemukan: {path}")
        return None

    try:
        if file_format.lower() == "parquet":
            df = pd.read_parquet(path)
        elif file_format.lower() == "csv":
            df = pd.read_csv(path)
        else:
            logger.error(f"Format file tidak didukung: {file_format}")
            return None

        if df is None or df.empty:
            logger.warning(f"Dataframe kosong dari file: {path}")
            return None

        logger.info(f"Berhasil baca file {path} dengan {len(df)} baris.")
        return df
    except Exception as e:
        logger.error(f"Error saat membaca file {path}: {e}")
        return None


In [None]:
def validate_dataframe(
    df: Optional[pd.DataFrame],
    required_columns: List[str],
    source_name: str
) -> Optional[pd.DataFrame]:
    """Validasi dasar dataframe."""
    if df is None:
        logger.error(f"[{source_name}] Dataframe adalah None.")
        return None

    if df.empty:
        logger.error(f"[{source_name}] Dataframe kosong.")
        return None

    missing_cols = [c for c in required_columns if c not in df.columns]
    if missing_cols:
        logger.error(f"[{source_name}] Kolom hilang: {missing_cols}")
        return None

    return df


def filter_by_period(
    df: pd.DataFrame,
    date_col: str,
    run_date: datetime,
    mode: str = "monthly_first_day"
) -> pd.DataFrame:

    if date_col not in df.columns:
        raise ValueError(f"Kolom tanggal '{date_col}' tidak ditemukan di dataframe.")

    df = df.copy()

    # Pastikan kolom bertipe datetime
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

    df = df.dropna(subset=[date_col])

    if mode == "all":
        return df

    if mode == "daily":
        mask = df[date_col].dt.date == run_date.date()
    elif mode == "weekly":
        mask = (
            (df[date_col].dt.isocalendar().year == run_date.isocalendar().year) &
            (df[date_col].dt.isocalendar().week == run_date.isocalendar().week)
        )
    elif mode == "monthly":
        mask = (
            (df[date_col].dt.year == run_date.year) &
            (df[date_col].dt.month == run_date.month)
        )
    elif mode == "monthly_first_day":
        # Hanya baris yang tanggalnya = 1 di bulan yang sama
        mask = (
            (df[date_col].dt.year == run_date.year) &
            (df[date_col].dt.month == run_date.month) &
            (df[date_col].dt.day == 1)
        )
    else:
        raise ValueError(f"Mode filter tidak dikenal: {mode}")

    filtered = df[mask]

    logger.info(
        f"Filter mode='{mode}' untuk tanggal run {run_date.date()} "
        f"menghasilkan {len(filtered)} baris dari {len(df)} baris."
    )

    return filtered


In [None]:
def save_to_postgres(
    df: pd.DataFrame,
    engine,
    table_name: str,
    if_exists: str = "append",
    chunksize: int = 10_000
):
    if df is None or df.empty:
        logger.warning(f"Tidak ada data yang disimpan ke PostgreSQL untuk tabel {table_name}.")
        return

    try:
        df.to_sql(
            name=table_name.split(".")[-1],    # jika "schema.table"
            schema=table_name.split(".")[0] if "." in table_name else None,
            con=engine,
            if_exists=if_exists,
            index=False,
            chunksize=chunksize
        )
        logger.info(f"Berhasil simpan {len(df)} baris ke PostgreSQL tabel {table_name}.")
    except SQLAlchemyError as e:
        logger.error(f"Error saat menyimpan ke PostgreSQL ({table_name}): {e}")
    except Exception as e:
        logger.error(f"Error umum saat menyimpan ke PostgreSQL ({table_name}): {e}")


def save_to_bigquery(
    df: pd.DataFrame,
    table_name: str,
    project_id: str,
    if_exists: str = "append"
):
    if df is None or df.empty:
        logger.warning(f"Tidak ada data yang disimpan ke BigQuery untuk tabel {table_name}.")
        return

    if not USE_BIGQUERY:
        logger.info("USE_BIGQUERY=False, skip upload ke BigQuery.")
        return

    if "." not in table_name:
        logger.error(
            f"Nama tabel BigQuery harus dalam format 'dataset.table', "
            f"didapat: {table_name}"
        )
        return

    dataset, table = table_name.split(".", 1)
    try:
        from pandas_gbq import to_gbq
        to_gbq(
            df,
            destination_table=f"{dataset}.{table}",
            project_id=project_id,
            if_exists=if_exists
        )
        logger.info(f"Berhasil simpan {len(df)} baris ke BigQuery tabel {dataset}.{table}.")
    except Exception as e:
        logger.error(f"Error saat menyimpan ke BigQuery ({table_name}): {e}")


In [None]:
def process_single_run(
    run_date: datetime,
    data_sources: List[Dict],
    filter_mode: str = "monthly_first_day"
):

    logger.info("=" * 80)
    logger.info(f"Mulai proses running untuk tanggal: {run_date.date()} "
                f"dengan mode filter='{filter_mode}'")
    logger.info("=" * 80)

    engine = get_postgres_engine(POSTGRES_CONN_STR)

    for src in data_sources:
        name = src.get("name")
        path = src.get("path")
        fmt = src.get("format", "parquet")
        date_col = src.get("date_column")
        target_table_pg = src.get("target_table")


        logger.info(f"--- Proses source: {name} ---")

        # 1. Baca file
        df_raw = read_source_file(path, fmt)
        if df_raw is None:
            logger.error(f"[{name}] Gagal baca file, skip source ini.")
            continue

        # 2. Validasi dataframe
        df_valid = validate_dataframe(df_raw, required_columns=[date_col], source_name=name)
        if df_valid is None:
            logger.error(f"[{name}] Validasi gagal, skip source ini.")
            continue

        # 3. Filter berdasarkan periode
        try:
            df_filtered = filter_by_period(df_valid, date_col=date_col, run_date=run_date, mode=filter_mode)
        except Exception as e:
            logger.error(f"[{name}] Error saat filter_by_period: {e}")
            continue

        if df_filtered is None or df_filtered.empty:
            logger.warning(f"[{name}] Tidak ada data setelah filter, tidak akan disimpan.")
            continue

        # 4. Simpan ke PostgreSQL
        save_to_postgres(df_filtered, engine=engine, table_name=target_table_pg)

    logger.info(f"Selesai proses running untuk tanggal: {run_date.date()}")


In [None]:
def generate_first_day_months(
    start_year: int,
    start_month: int,
    num_months: int
) -> List[datetime]:
    dates = []
    year = start_year
    month = start_month

    for _ in range(num_months):
        dates.append(datetime(year, month, 1))

        # Increment bulan
        month += 1
        if month > 12:
            month = 1
            year += 1

    return dates


# Contoh: 7x run dari 1 Jan 2024 s/d 1 Jul 2024
run_dates = generate_first_day_months(start_year=2024, start_month=1, num_months=7)
for d in run_dates:
    logger.info(f"Scheduled run date: {d.date()}")


2025-11-16 13:41:42,354 [INFO] data_extraction_pipeline - Scheduled run date: 2024-01-01
2025-11-16 13:41:42,357 [INFO] data_extraction_pipeline - Scheduled run date: 2024-02-01
2025-11-16 13:41:42,358 [INFO] data_extraction_pipeline - Scheduled run date: 2024-03-01
2025-11-16 13:41:42,363 [INFO] data_extraction_pipeline - Scheduled run date: 2024-04-01
2025-11-16 13:41:42,365 [INFO] data_extraction_pipeline - Scheduled run date: 2024-05-01
2025-11-16 13:41:42,367 [INFO] data_extraction_pipeline - Scheduled run date: 2024-06-01
2025-11-16 13:41:42,368 [INFO] data_extraction_pipeline - Scheduled run date: 2024-07-01


In [None]:
if __name__ == "__main__":
    # Mode filter bisa kamu ganti: 'daily', 'weekly', 'monthly', 'monthly_first_day', 'all'
    FILTER_MODE = "monthly_first_day"

    for run_date in run_dates:
        try:
            process_single_run(
                run_date=run_date,
                data_sources=DATA_SOURCES,
                filter_mode=FILTER_MODE
            )
        except Exception as e:
            # Error handling global per running
            logger.error(f"Terjadi error fatal pada run tanggal {run_date.date()}: {e}")


2025-11-16 13:41:42,390 [INFO] data_extraction_pipeline - Mulai proses running untuk tanggal: 2024-01-01 dengan mode filter='monthly_first_day'
2025-11-16 13:41:42,674 [INFO] data_extraction_pipeline - Koneksi ke PostgreSQL berhasil.
2025-11-16 13:41:42,675 [INFO] data_extraction_pipeline - --- Proses source: yellow_trip ---
2025-11-16 13:41:43,546 [INFO] data_extraction_pipeline - Berhasil baca file C:\Users\Robert\Documents\bootcamp\yellow_tripdata_2025-09.parquet dengan 4251015 baris.
2025-11-16 13:41:47,529 [INFO] data_extraction_pipeline - Filter mode='monthly_first_day' untuk tanggal run 2024-01-01 menghasilkan 0 baris dari 4251015 baris.
2025-11-16 13:41:47,625 [INFO] data_extraction_pipeline - --- Proses source: green_trip ---
2025-11-16 13:41:47,647 [INFO] data_extraction_pipeline - Berhasil baca file C:\Users\Robert\Documents\bootcamp\green_tripdata_2025-09.parquet dengan 48893 baris.
2025-11-16 13:41:47,723 [INFO] data_extraction_pipeline - Filter mode='monthly_first_day' un

In [None]:

import pytz

def prepare_single_trip_df(df: pd.DataFrame, service_type: str) -> pd.DataFrame:
    """
    - Validasi dasar dataframe
    - Menyatukan nama kolom pickup/dropoff
    - Konversi ke datetime
    - Standardisasi timezone (America/New_York)
    - Data cleansing dasar
    """
    if df is None:
        logger.error(f"[{service_type}] Dataframe adalah None.")
        return pd.DataFrame()
    if df.empty:
        logger.error(f"[{service_type}] Dataframe kosong.")
        return pd.DataFrame()

    df = df.copy()
    df["service_type"] = service_type

    # Cari kolom pickup/dropoff
    pickup_candidates = ["tpep_pickup_datetime", "lpep_pickup_datetime", "pickup_datetime"]
    dropoff_candidates = ["tpep_dropoff_datetime", "lpep_dropoff_datetime", "dropoff_datetime"]

    pickup_col = next((c for c in pickup_candidates if c in df.columns), None)
    dropoff_col = next((c for c in dropoff_candidates if c in df.columns), None)

    if pickup_col is None or dropoff_col is None:
        logger.error(
            f"[{service_type}] Tidak menemukan kolom pickup/dropoff yang valid. "
            f"Pickup candidates: {pickup_candidates}, dropoff candidates: {dropoff_candidates}"
        )
        return pd.DataFrame()

    # Standarisasi nama kolom
    df = df.rename(columns={pickup_col: "pickup_datetime", dropoff_col: "dropoff_datetime"})

    # Konversi ke datetime
    for col in ["pickup_datetime", "dropoff_datetime"]:
        if not pd.api.types.is_datetime64_any_dtype(df[col]):
            df[col] = pd.to_datetime(df[col], errors="coerce")

    before = len(df)
    df = df.dropna(subset=["pickup_datetime", "dropoff_datetime"])
    after = len(df)
    if after < before:
        logger.warning(f"[{service_type}] {before - after} baris dihapus karena gagal konversi datetime.")

    # --- Standardisasi timezone ---
    # Asumsi: data taxi NYC, berada pada timezone New York.
    # Kita standardisasikan ke 'America/New_York' agar konsisten.
    ny_tz = pytz.timezone("America/New_York")

    for col in ["pickup_datetime", "dropoff_datetime"]:
        if df[col].dt.tz is None:
            df[col] = df[col].dt.tz_localize(ny_tz, ambiguous="NaT", nonexistent="NaT")
        else:
            df[col] = df[col].dt.tz_convert(ny_tz)

    # Drop baris yang gagal karena isu DST
    before = len(df)
    df = df.dropna(subset=["pickup_datetime", "dropoff_datetime"])
    after = len(df)
    if after < before:
        logger.warning(f"[{service_type}] {before - after} baris dihapus karena isu DST.")

    # Kolom turunan tanggal & waktu
    df["pickup_date"] = df["pickup_datetime"].dt.date

    # Format tanggal diseragamkan ke string "YYYY-MM-DD"
    df["pickup_date_str"] = df["pickup_datetime"].dt.strftime("%Y-%m-%d")

    df["pickup_hour"] = df["pickup_datetime"].dt.hour
    df["pickup_dow"] = df["pickup_datetime"].dt.dayofweek  # 0=Senin, 6=Minggu

    # --- Data cleansing dasar ---
    before_clean = len(df)

    if "trip_distance" in df.columns:
        df = df[df["trip_distance"].fillna(0) >= 0]
        df = df[df["trip_distance"] <= 200]  # buang outlier sangat jauh

    if "passenger_count" in df.columns:
        df = df[df["passenger_count"].fillna(0) >= 0]
        df = df[df["passenger_count"] <= 8]

    if "fare_amount" in df.columns:
        df = df[df["fare_amount"].fillna(0) >= 0]

    # Drop trip yang waktunya terbalik
    df = df[df["dropoff_datetime"] >= df["pickup_datetime"]]

    after_clean = len(df)
    if after_clean < before_clean:
        logger.info(
            f"[{service_type}] {before_clean - after_clean} baris dihapus "
            "karena tidak lolos aturan data cleansing."
        )

    logger.info(f"[{service_type}] Data siap pakai: {len(df)} baris.")
    return df


# Terapkan ke df_yellow dan df_green (hasil cell sebelumnya)
df_yellow_clean = prepare_single_trip_df(df_yellow, "yellow")
df_green_clean  = prepare_single_trip_df(df_green, "green")

# Gabungkan
frames = [df for df in [df_yellow_clean, df_green_clean] if not df.empty]
if not frames:
    logger.error("Tidak ada data valid setelah proses cleansing. df_trips akan kosong.")
    df_trips = pd.DataFrame()
else:
    df_trips = pd.concat(frames, ignore_index=True)
    logger.info(f"Total gabungan df_trips: {len(df_trips)} baris dengan kolom: {list(df_trips.columns)}")


2025-11-16 13:44:14,932 [INFO] data_extraction_pipeline - [yellow] 1145305 baris dihapus karena tidak lolos aturan data cleansing.
2025-11-16 13:44:14,935 [INFO] data_extraction_pipeline - [yellow] Data siap pakai: 3105710 baris.
2025-11-16 13:44:16,662 [INFO] data_extraction_pipeline - [green] 5531 baris dihapus karena tidak lolos aturan data cleansing.
2025-11-16 13:44:16,664 [INFO] data_extraction_pipeline - [green] Data siap pakai: 43362 baris.
2025-11-16 13:44:18,730 [INFO] data_extraction_pipeline - Total gabungan df_trips: 3149072 baris dengan kolom: ['VendorID', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee', 'cbd_congestion_fee', 'service_type', 'pickup_date', 'pickup_date_str', 'pickup_hour', 'pickup_dow', 'ehail_fee', 'trip

In [None]:
from typing import Dict

def build_aggregations(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """
    Membangun minimal 5 agregasi dari df_trips.
    """
    if df is None or df.empty:
        logger.error("Dataframe untuk agregasi kosong.")
        return {}

    aggs: Dict[str, pd.DataFrame] = {}

    # 1) Total trip per hari
    try:
        agg1 = (
            df.groupby("pickup_date_str")
              .size()
              .reset_index(name="total_trip")
              .sort_values("pickup_date_str")
        )
        aggs["agg1_total_trip_per_hari"] = agg1
    except Exception as e:
        logger.error(f"Gagal membuat agg1_total_trip_per_hari: {e}")

    # 2) Total trip per hari per service_type
    try:
        agg2 = (
            df.groupby(["pickup_date_str", "service_type"])
              .size()
              .reset_index(name="total_trip")
              .sort_values(["pickup_date_str", "service_type"])
        )
        aggs["agg2_total_trip_per_hari_per_service"] = agg2
    except Exception as e:
        logger.error(f"Gagal membuat agg2_total_trip_per_hari_per_service: {e}")

    # 3) Rata-rata jarak perjalanan per hari
    if "trip_distance" in df.columns:
        try:
            agg3 = (
                df.groupby("pickup_date_str")["trip_distance"]
                  .agg(avg_trip_distance_km="mean")
                  .reset_index()
                  .sort_values("pickup_date_str")
            )
            aggs["agg3_rata2_jarak_per_hari"] = agg3
        except Exception as e:
            logger.error(f"Gagal membuat agg3_rata2_jarak_per_hari: {e}")
    else:
        logger.warning("Kolom trip_distance tidak tersedia, agg3 dilewati.")

    # 4) Total revenue per hari
    revenue_cols = [
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
        "total_amount"
    ]
    actual_rev_cols = [c for c in revenue_cols if c in df.columns]

    if actual_rev_cols:
        try:
            df_rev = df.copy()
            if "total_amount" in actual_rev_cols:
                df_rev["revenue_total"] = df_rev["total_amount"]
            else:
                df_rev["revenue_total"] = df_rev[actual_rev_cols].sum(axis=1)

            agg4 = (
                df_rev.groupby("pickup_date_str")["revenue_total"]
                      .agg(total_revenue="sum", avg_revenue_per_trip="mean")
                      .reset_index()
                      .sort_values("pickup_date_str")
            )
            aggs["agg4_revenue_per_hari"] = agg4
        except Exception as e:
            logger.error(f"Gagal membuat agg4_revenue_per_hari: {e}")
    else:
        logger.warning("Kolom revenue tidak ditemukan, agg4 dilewati.")

    # 5) Rata-rata jumlah penumpang per hari
    if "passenger_count" in df.columns:
        try:
            agg5 = (
                df.groupby("pickup_date_str")["passenger_count"]
                  .agg(avg_passenger_per_trip="mean")
                  .reset_index()
                  .sort_values("pickup_date_str")
            )
            aggs["agg5_rata2_penumpang_per_hari"] = agg5
        except Exception as e:
            logger.error(f"Gagal membuat agg5_rata2_penumpang_per_hari: {e}")
    else:
        logger.warning("Kolom passenger_count tidak tersedia, agg5 dilewati.")

    if len(aggs) < 5:
        logger.warning(f"Hanya {len(aggs)} agregasi yang berhasil dibuat. Cek kembali kolom pada df_trips.")

    logger.info(f"Berhasil membuat {len(aggs)} agregasi.")
    return aggs


aggregations = build_aggregations(df_trips)

aggregation_descriptions = {
    "agg1_total_trip_per_hari": (
        "Agregasi ini menunjukkan jumlah perjalanan taksi per tanggal penjemputan "
        "(pickup_date). Data ini digunakan untuk melihat tren volume perjalanan "
        "harian secara keseluruhan."
    ),
    "agg2_total_trip_per_hari_per_service": (
        "Agregasi ini membagi jumlah perjalanan per hari berdasarkan jenis layanan "
        "(yellow/green) sehingga memudahkan analisis kontribusi masing-masing layanan "
        "pada setiap hari."
    ),
    "agg3_rata2_jarak_per_hari": (
        "Agregasi ini menampilkan rata-rata jarak tempuh perjalanan per hari. "
        "Informasi ini bermanfaat untuk memahami karakteristik jarak perjalanan "
        "pada masing-masing tanggal."
    ),
    "agg4_revenue_per_hari": (
        "Agregasi ini merangkum total pendapatan dan rata-rata pendapatan per trip "
        "pada setiap hari, berdasarkan kolom tarif yang tersedia di dataset."
    ),
    "agg5_rata2_penumpang_per_hari": (
        "Agregasi ini menunjukkan rata-rata jumlah penumpang per perjalanan "
        "pada setiap hari, sehingga dapat menggambarkan kepadatan penumpang per trip."
    ),
}


2025-11-16 13:44:22,625 [INFO] data_extraction_pipeline - Berhasil membuat 5 agregasi.


In [116]:
# BLOCK 3: Simpan hasil agregasi ke file CSV

from pathlib import Path

def save_aggregations_to_csv(
    aggs: Dict[str, pd.DataFrame],
    output_dir: str = "output_aggregations"
) -> Dict[str, str]:
    """
    Simpan setiap dataframe agregasi ke file CSV.
    Return: {nama_agregasi: path_csv}
    """
    output_paths: Dict[str, str] = {}

    if not aggs:
        logger.error("Tidak ada agregasi yang akan disimpan ke CSV.")
        return output_paths

    try:
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Gagal membuat direktori '{output_dir}': {e}")
        return output_paths

    for name, df in aggs.items():
        if df is None or df.empty:
            logger.warning(f"Agregasi {name} kosong, dilewati.")
            continue

        try:
            file_path = out_dir / f"{name}.csv"
            df.to_csv(file_path, index=False)
            output_paths[name] = str(file_path)
            logger.info(f"Berhasil menyimpan {name} ke {file_path}.")
        except Exception as e:
            logger.error(f"Gagal menyimpan agregasi {name} ke CSV: {e}")

    return output_paths


csv_paths = save_aggregations_to_csv(aggregations)


2025-11-16 13:44:22,849 [ERROR] data_extraction_pipeline - Gagal menyimpan agregasi agg1_total_trip_per_hari ke CSV: [Errno 13] Permission denied: 'output_aggregations\\agg1_total_trip_per_hari.csv'
2025-11-16 13:44:22,866 [INFO] data_extraction_pipeline - Berhasil menyimpan agg2_total_trip_per_hari_per_service ke output_aggregations\agg2_total_trip_per_hari_per_service.csv.
2025-11-16 13:44:22,881 [INFO] data_extraction_pipeline - Berhasil menyimpan agg3_rata2_jarak_per_hari ke output_aggregations\agg3_rata2_jarak_per_hari.csv.
2025-11-16 13:44:22,886 [INFO] data_extraction_pipeline - Berhasil menyimpan agg4_revenue_per_hari ke output_aggregations\agg4_revenue_per_hari.csv.
2025-11-16 13:44:22,892 [INFO] data_extraction_pipeline - Berhasil menyimpan agg5_rata2_penumpang_per_hari ke output_aggregations\agg5_rata2_penumpang_per_hari.csv.
