In [None]:
"""
Entry point for the ETL pipeline:
- Chunked extraction
- Parallel transform + write to parquet
- Merge outputs
- Benchmark vs. sequential
"""

import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

import config
from etl import run_pipeline
from benchmark import run_sequential_benchmark, compare_results


def ensure_dirs():
    """Create the pipeline's working directories if they do not exist yet.

    Covers the data, output, parquet, and log directories configured in
    ``config``. Missing parent directories are created as well, and
    directories that already exist are left untouched.
    """
    required = (
        config.DATA_DIR,
        config.OUTPUT_DIR,
        config.PARQUET_DIR,
        config.LOG_DIR,
    )
    for directory in map(Path, required):
        directory.mkdir(parents=True, exist_ok=True)


def plot_benchmark(par_metrics, seq_metrics, out_path: Path):
    """Save a bar chart comparing parallel and sequential elapsed times.

    Args:
        par_metrics: Mapping with an ``"elapsed_seconds"`` entry for the
            parallel run.
        seq_metrics: Mapping with an ``"elapsed_seconds"`` entry for the
            sequential run.
        out_path: Destination image file. Its parent directory is created
            when missing. A plain string path is accepted as well.

    Raises:
        KeyError: If either metrics mapping lacks ``"elapsed_seconds"``.
    """
    out_path = Path(out_path)
    modes = ["Parallel", "Sequential"]
    times = [par_metrics["elapsed_seconds"], seq_metrics["elapsed_seconds"]]

    # No sample dataset is loaded here: sns.load_dataset() fetches data over
    # the network, so it could fail offline while adding nothing to the chart.
    fig = plt.figure(figsize=(6, 4))
    try:
        sns.barplot(x=modes, y=times, palette="deep")
        plt.title("ETL Execution Time Comparison")
        plt.ylabel("Seconds")
        plt.xlabel("Mode")
        # Label each bar with its value, anchored just above the bar top.
        for i, t in enumerate(times):
            plt.text(i, t, f"{t:.2f}s", ha="center", va="bottom")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        plt.tight_layout()
        plt.savefig(out_path)
    finally:
        # Always release the figure, even if plotting or saving fails, so it
        # does not linger in pyplot's global figure registry.
        plt.close(fig)


def main():
    """Run the parallel pipeline and the sequential benchmark, then report.

    Steps, in order:
      1. Create the working directories.
      2. Run the parallel pipeline, then the sequential benchmark.
      3. Write both metric sets and their comparison to ``metrics.json``
         in the log directory.
      4. Save the timing bar chart to the output directory.
      5. If ``config.WRITE_CSV_EXPORT`` is set and the merged parquet file
         exists, export it as ``merged_output.csv``.
      6. Print the comparison summary to the console.
    """
    ensure_dirs()

    # Run parallel pipeline
    par_metrics = run_pipeline(config)

    # Run sequential benchmark for comparison
    seq_metrics = run_sequential_benchmark(config.INPUT_FILE, config.CHUNK_SIZE, config)

    # Compare
    summary = compare_results(par_metrics, seq_metrics)

    # Persist metrics
    logs_json = {
        "parallel": par_metrics,
        "sequential": seq_metrics,
        "summary": summary,
    }
    log_file = Path(config.LOG_DIR) / "metrics.json"
    log_file.write_text(json.dumps(logs_json, indent=2), encoding="utf-8")
    print(f"Saved metrics to {log_file}")

    # Plot benchmark chart
    plot_file = Path(config.OUTPUT_DIR) / "benchmark_time_comparison.png"
    plot_benchmark(par_metrics, seq_metrics, plot_file)
    print(f"Saved benchmark chart to {plot_file}")

    # Optional CSV export of merged output
    if config.WRITE_CSV_EXPORT:
        # Wrap in Path like the other config paths so a plain string
        # setting works here too instead of failing on ``.exists()``.
        merged_parquet = Path(config.MERGED_PARQUET)
        if merged_parquet.exists():
            # Imported lazily: pandas is only needed when exporting.
            import pandas as pd

            df = pd.read_parquet(merged_parquet)
            csv_path = Path(config.OUTPUT_DIR) / "merged_output.csv"
            df.to_csv(csv_path, index=False)
            print(f"Saved CSV export to {csv_path}")

    # Console summary
    print("\n=== ETL Summary ===")
    for k, v in summary.items():
        print(f"{k}: {v}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()