# In [None]:
# ════════════════════════════════════════════════════════════════════════
# 📦 DEPENDENCIES
# ════════════════════════════════════════════════════════════════════════
# Purpose: Import all necessary libraries for data handling, plotting, statistics, and console styling.

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from colorama import Fore, Style, init
from hydroeval import kge, kgeprime, nse, pbias
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.metrics import r2_score
from tqdm import tqdm

# Initialize colorama for colored console output.
# autoreset=True makes colorama reset the style after each print, so a
# Fore/Style prefix only affects the message it is attached to.
init(autoreset=True)


# ════════════════════════════════════════════════════════════════════════
# ⚙️ USER CONFIGURATION
# ════════════════════════════════════════════════════════════════════════
# Purpose: This section contains all editable options. Modify them to match your project.

class Configuration:
    """
    Central collection of user-editable settings consumed by the analysis.

    Every option is a plain class attribute; edit the values in place.
    """

    # ── 1. Output control ────────────────────────────────────────────────
    # True  -> write one PDF report per subbasin.
    # False -> run the analysis without creating any files.
    SAVE_PLOTS_TO_PDF = True

    # ── 2. Input / output locations ──────────────────────────────────────
    BASE_DIR = Path("Your_project_folder")
    # Merged observed/simulated CSV files are read from this folder.
    COMPARISON_DIR = BASE_DIR.joinpath("Discharge_comparison")
    # PDF reports are written to this folder.
    PDF_OUTPUT_DIR = BASE_DIR.joinpath("PDF_Plot_Reports")

    # ── 3. Analysis periods (inclusive date ranges) ──────────────────────
    CALIBRATION_START = "2000-01-01"
    CALIBRATION_END = "2009-12-31"
    VALIDATION_START = "2010-01-01"
    VALIDATION_END = "2015-12-31"

    # ── 4. Subbasin selection ────────────────────────────────────────────
    # Empty list -> process every CSV found in COMPARISON_DIR.
    # Otherwise list the file stems to process, e.g.
    # ["Subbasin_1", "Subbasin_42"].
    SUBBASINS_TO_RUN = []

    # ── 5. Column names in the source CSV files ──────────────────────────
    # These must match the CSV headers exactly.
    DATE_COL = "Year_Month"
    DATE_FORMAT = "%Y_%b"
    OBS_COL = "OBSERVED"
    SIM_COL = "SIMULATED"

    # ── 6. Season definitions (calendar month numbers) ───────────────────
    SEASONS = {
        'Wet Season': list(range(6, 11)),              # June - October
        'Dry Season': [11, 12] + list(range(1, 6)),    # November - May
    }

    # ── 7. Plot appearance ───────────────────────────────────────────────
    PLOT_STYLES = dict(
        observed_color='black',
        simulated_color='crimson',
        calibration_fill='mediumseagreen',
        validation_fill='cornflowerblue',
        fill_opacity=0.25,
        line_width=1.5,
        scatter_marker_size=10,
        one_to_one_line_color='navy',
    )

    # ── 8. Statistics ────────────────────────────────────────────────────
    # ddof passed to the standard deviation:
    # 1 = sample std, 0 = population std.
    STD_TYPE = 1
    # Fraction (not a percentage) of the simulated value used as the
    # uncertainty half-width for the P-factor and R-factor.
    UNCERTAINTY_PERCENT = 0.5

    # Metrics shown in the summary table, in display order.
    STATISTICS_TO_DISPLAY = [
        "NSE",
        "KGE (2012)",
        "R²",
        "Pearson r",
        "RSR",
        "PBIAS (%)",
        "RMSE",
        "MAE",
        "MAPE (%)",
        "Bias",
        "SDR",
        "P-factor",
        "R-factor",
    ]


# Instantiate the configuration object.
# This module-level instance is the one the helper, plotting, and main
# functions below read their settings from.
cfg = Configuration()


# ════════════════════════════════════════════════════════════════════════
# 🛠️ HELPER & STATISTICAL FUNCTIONS
# ════════════════════════════════════════════════════════════════════════

def safe_divide(num: float, den: float, default: float = np.nan) -> float:
    """Return ``num / den``, or ``default`` when the denominator is unusable.

    A denominator is unusable when it is falsy (zero, ``None``) or not
    finite (NaN, +inf, -inf).
    """
    if not den:
        return default
    if not np.isfinite(den):
        return default
    return num / den

def compute_metrics(obs: np.ndarray, sim: np.ndarray) -> dict:
    """Calculate goodness-of-fit metrics between observed and simulated series.

    Parameters
    ----------
    obs, sim:
        Array-likes of equal length. They are flattened, converted to float,
        and every pair in which either value is NaN or infinite is dropped
        before any metric is computed.

    Returns
    -------
    dict
        Metric name -> scalar value. When fewer than two valid pairs remain,
        a dict with NaN for every name in ``cfg.STATISTICS_TO_DISPLAY`` is
        returned instead.

    Notes
    -----
    * "KGE (2012)" uses hydroeval's ``kgeprime`` (modified KGE, Kling et al.
      2012); hydroeval's ``kge`` is the original 2009 formulation.
    * The P-factor / R-factor use a symmetric band of
      ``|sim| * cfg.UNCERTAINTY_PERCENT`` around each simulated value.
    * NOTE(review): sklearn's ``r2_score`` is the coefficient of
      determination (1 - SS_res / SS_tot), not the squared Pearson r —
      confirm this is the intended "R²".
    """
    obs = np.asarray(obs, dtype=float).ravel()
    sim = np.asarray(sim, dtype=float).ravel()
    valid = np.isfinite(obs) & np.isfinite(sim)
    obs, sim = obs[valid], sim[valid]

    if obs.size < 2:
        return {metric: np.nan for metric in cfg.STATISTICS_TO_DISPLAY}

    def get_value(result) -> float:
        """Return the first element of a metric result as a plain float.

        hydroeval's KGE-family functions return the score stacked with its
        components as a 2-D array, so indexing ``[0]`` alone would still
        yield a 1-element array. Flattening first guarantees a scalar, which
        keeps the downstream statistics table numeric.
        """
        values = np.asarray(result, dtype=float).ravel()
        return float(values[0]) if values.size else np.nan

    std_obs = obs.std(ddof=cfg.STD_TYPE)
    std_sim = sim.std(ddof=cfg.STD_TYPE)
    diff = sim - obs
    rmse_val = np.sqrt(np.mean(diff ** 2))
    pearson_r = np.corrcoef(obs, sim)[0, 1]

    # Uncertainty band around each simulated value.
    delta = np.abs(sim) * cfg.UNCERTAINTY_PERCENT
    within_band = (obs >= (sim - delta)) & (obs <= (sim + delta))
    p_factor = np.mean(within_band)
    r_factor = safe_divide(np.mean(2 * delta), std_obs)

    # MAPE is undefined where obs == 0; average over the remaining points
    # only, and report NaN (without a runtime warning) when none remain.
    nonzero = obs != 0
    if nonzero.any():
        mape = np.mean(np.abs(diff[nonzero] / obs[nonzero])) * 100
    else:
        mape = np.nan

    return {
        "NSE": get_value(nse(sim, obs)),
        "KGE (2012)": get_value(kgeprime(sim, obs)),
        "R²": r2_score(obs, sim),
        "Pearson r": pearson_r,
        "RSR": safe_divide(rmse_val, std_obs),
        "PBIAS (%)": get_value(pbias(sim, obs)),
        "RMSE": rmse_val,
        "MAE": np.mean(np.abs(diff)),
        "MAPE (%)": mape,
        "Bias": diff.mean(),
        "SDR": safe_divide(std_sim, std_obs),
        "P-factor": p_factor,
        "R-factor": r_factor
    }

def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` enriched with Year, Month and Season columns.

    The input must already contain a datetime ``'Date'`` column. Rows with a
    missing observed or simulated value are removed from the result; the
    input frame itself is left untouched.
    """
    # Invert cfg.SEASONS (season -> months) into a month -> season lookup.
    month_to_season = {}
    for season_name, season_months in cfg.SEASONS.items():
        for month_number in season_months:
            month_to_season[month_number] = season_name

    enriched = df.copy()
    date_parts = enriched['Date'].dt
    enriched['Year'] = date_parts.year
    enriched['Month'] = date_parts.month
    enriched['Season'] = enriched['Month'].map(month_to_season)

    value_columns = [cfg.OBS_COL, cfg.SIM_COL]
    return enriched.dropna(subset=value_columns)


# ════════════════════════════════════════════════════════════════════════
# 📊 PLOTTING FUNCTIONS
# ════════════════════════════════════════════════════════════════════════

def plot_statistics_table(stats: dict, title: str) -> plt.Figure:
    """Render the metric summary as a table figure.

    ``stats`` maps each metric name to three values, which are shown under
    the "Calibration", "Validation" and "Full Period" columns.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.axis('off')
    ax.set_title(title, fontsize=16, weight='bold')

    metrics_frame = pd.DataFrame.from_dict(stats, orient='index').round(5)
    metrics_frame.columns = ["Calibration", "Validation", "Full Period"]

    table = ax.table(
        cellText=metrics_frame.values,
        rowLabels=metrics_frame.index,
        colLabels=metrics_frame.columns,
        cellLoc='center',
        loc='center',
    )
    table.set_fontsize(10)
    table.scale(1, 1.8)

    # Row 0 holds the column headers and column -1 the row labels.
    for (row_idx, col_idx), cell in table.get_celld().items():
        is_header_cell = row_idx == 0 or col_idx == -1
        if is_header_cell:
            cell.set_text_props(weight='bold')

    fig.tight_layout()
    return fig

def plot_combined_hydrograph(df: pd.DataFrame, title: str) -> plt.Figure:
    """Plot observed and simulated flow over time with shaded analysis periods.

    The calibration and validation windows from the configuration are drawn
    as translucent vertical spans behind the two series.
    """
    styles = cfg.PLOT_STYLES
    fig, ax = plt.subplots(figsize=(15, 5))
    dates = df['Date']

    ax.plot(dates, df[cfg.OBS_COL], label='Observed',
            color=styles['observed_color'], lw=styles['line_width'])
    ax.plot(dates, df[cfg.SIM_COL], label='Simulated',
            color=styles['simulated_color'], lw=styles['line_width'], alpha=0.9)

    # (legend label, period start, period end, key of the fill color)
    periods = (
        ('Calibration', cfg.CALIBRATION_START, cfg.CALIBRATION_END, 'calibration_fill'),
        ('Validation', cfg.VALIDATION_START, cfg.VALIDATION_END, 'validation_fill'),
    )
    for period_label, period_start, period_end, fill_key in periods:
        ax.axvspan(pd.to_datetime(period_start), pd.to_datetime(period_end),
                   color=styles[fill_key], alpha=styles['fill_opacity'],
                   label=period_label)

    ax.set_title(title, fontsize=16, weight='bold')
    ax.set_xlabel("Date")
    ax.set_ylabel("Flow ($m^3/s$)")
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    return fig

def plot_scatter(df: pd.DataFrame, title: str) -> plt.Figure:
    """Scatter simulated against observed flow with a 1:1 reference line.

    Both axes run from 0 to 5 % above the largest value in either series,
    and the R² reported by ``r2_score`` is annotated in the top-left corner.
    """
    styles = cfg.PLOT_STYLES
    fig, ax = plt.subplots(figsize=(7, 7))

    observed = df[cfg.OBS_COL]
    simulated = df[cfg.SIM_COL]
    axis_limit = max(observed.max(), simulated.max()) * 1.05

    ax.scatter(observed, simulated, color=styles['simulated_color'],
               s=styles['scatter_marker_size'], alpha=0.7)
    diagonal = [0, axis_limit]
    ax.plot(diagonal, diagonal, color=styles['one_to_one_line_color'],
            linestyle='--', label='1:1 Line')

    r2 = r2_score(observed, simulated)
    annotation_box = dict(boxstyle='round,pad=0.3', fc='wheat', alpha=0.5)
    ax.text(0.05, 0.95, f'$R^2 = {r2:.3f}$', transform=ax.transAxes,
            fontsize=12, verticalalignment='top', bbox=annotation_box)

    ax.set_title(title, weight='bold')
    ax.set_xlabel("Observed Flow ($m^3/s$)")
    ax.set_ylabel("Simulated Flow ($m^3/s$)")
    for set_limit in (ax.set_xlim, ax.set_ylim):
        set_limit(0, axis_limit)
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    return fig

def plot_flow_duration_curve(df: pd.DataFrame, title: str) -> plt.Figure:
    """Plot observed and simulated flow duration curves on a log flow axis.

    Each series is sorted in descending order independently and plotted
    against the exceedance probability ``rank / n * 100``.
    """
    styles = cfg.PLOT_STYLES
    curves = {
        'Observed': (np.flip(np.sort(df[cfg.OBS_COL])), styles['observed_color']),
        'Simulated': (np.flip(np.sort(df[cfg.SIM_COL])), styles['simulated_color']),
    }
    n_points = len(curves['Observed'][0])
    ranks = np.arange(1, n_points + 1)
    exceedance = ranks / n_points * 100

    fig, ax = plt.subplots(figsize=(8, 5))
    for curve_label, (flows_descending, line_color) in curves.items():
        ax.plot(exceedance, flows_descending, label=curve_label, color=line_color)

    ax.set_yscale('log')
    ax.set_title(title, weight='bold')
    ax.set_xlabel("Exceedance Probability (%)")
    ax.set_ylabel("Flow ($m^3/s$)")
    ax.legend()
    ax.grid(True, which="both", linestyle='--', alpha=0.6)
    fig.tight_layout()
    return fig

def plot_seasonal_barchart(df: pd.DataFrame, title: str) -> plt.Figure:
    """Draw grouped bars of mean observed and simulated flow per season.

    Seasons appear in the order in which they are declared in
    ``cfg.SEASONS``; ``df`` must carry a ``'Season'`` column.
    """
    season_order = list(cfg.SEASONS)
    value_columns = [cfg.OBS_COL, cfg.SIM_COL]
    seasonal_means = df.groupby('Season')[value_columns].mean().reindex(season_order)

    positions = np.arange(len(season_order))
    bar_width = 0.35
    half_width = bar_width / 2

    fig, ax = plt.subplots(figsize=(7, 5))
    # (column, legend label, color key, horizontal offset from the tick)
    bar_series = (
        (cfg.OBS_COL, 'Observed', 'observed_color', -half_width),
        (cfg.SIM_COL, 'Simulated', 'simulated_color', half_width),
    )
    for column, series_label, color_key, offset in bar_series:
        ax.bar(positions + offset, seasonal_means[column], bar_width,
               label=series_label, color=cfg.PLOT_STYLES[color_key])

    ax.set_xticks(positions, season_order)
    ax.set_ylabel("Average Flow ($m^3/s$)")
    ax.set_title(title, weight='bold')
    ax.legend()
    ax.grid(True, axis='y', linestyle='--', alpha=0.6)
    fig.tight_layout()
    return fig


# ════════════════════════════════════════════════════════════════════════
# 🔁 MAIN SCRIPT EXECUTION
# ════════════════════════════════════════════════════════════════════════
def _subbasin_sort_key(path: Path) -> tuple:
    """Return a sort key that orders numeric-suffixed stems numerically.

    Stems ending in ``_<number>`` sort first, by that number; all other stems
    follow alphabetically. Returning a tuple of a fixed shape keeps keys
    comparable when both kinds of file name are present (comparing a bare
    ``int`` with a bare ``str`` raises ``TypeError``).
    """
    suffix = path.stem.split('_')[-1]
    # isdecimal (not isdigit) so that int() cannot fail on characters such as
    # superscript digits.
    if suffix.isdecimal():
        return (0, int(suffix), path.stem)
    return (1, 0, path.stem)


def _discover_input_files() -> list:
    """Return the sorted CSV paths to process, honoring cfg.SUBBASINS_TO_RUN."""
    input_files = list(cfg.COMPARISON_DIR.glob("*.csv"))
    if cfg.SUBBASINS_TO_RUN:
        requested = set(cfg.SUBBASINS_TO_RUN)
        input_files = [f for f in input_files if f.stem in requested]
        missing = sorted(requested - {f.stem for f in input_files})
        if missing:
            print(Fore.YELLOW + f"WARNING: No CSV file found for requested subbasin(s): {missing}")
    return sorted(input_files, key=_subbasin_sort_key)


def _process_subbasin(file_path: Path, min_year: int, max_year: int) -> bool:
    """Compute metrics and (optionally) write the PDF report for one subbasin.

    Returns True when the subbasin was processed and False when it was
    skipped (missing columns or no data in the year range). Exceptions are
    left to the caller.
    """
    subbasin_name = file_path.stem
    df_full = pd.read_csv(file_path)

    required_cols = {cfg.DATE_COL, cfg.OBS_COL, cfg.SIM_COL}
    if not required_cols.issubset(df_full.columns):
        tqdm.write(Fore.YELLOW + f"WARNING: Skipping {subbasin_name}. Missing required columns. Found: {list(df_full.columns)}")
        return False

    df_full['Date'] = pd.to_datetime(df_full[cfg.DATE_COL], format=cfg.DATE_FORMAT)
    df_filtered = df_full[df_full['Date'].dt.year.between(min_year, max_year)]
    df_processed = process_dataframe(df_filtered)

    if df_processed.empty:
        tqdm.write(Fore.YELLOW + f"WARNING: Skipping {subbasin_name} (no data in {min_year}-{max_year} range).")
        return False

    df_cal = df_processed[df_processed['Date'].between(cfg.CALIBRATION_START, cfg.CALIBRATION_END)]
    df_val = df_processed[df_processed['Date'].between(cfg.VALIDATION_START, cfg.VALIDATION_END)]
    stats_cal = compute_metrics(df_cal[cfg.OBS_COL].values, df_cal[cfg.SIM_COL].values)
    stats_val = compute_metrics(df_val[cfg.OBS_COL].values, df_val[cfg.SIM_COL].values)
    stats_full = compute_metrics(df_processed[cfg.OBS_COL].values, df_processed[cfg.SIM_COL].values)
    # One row per displayed metric: [calibration, validation, full period].
    combined_stats = {
        metric: [stats_cal.get(metric, np.nan), stats_val.get(metric, np.nan), stats_full.get(metric, np.nan)]
        for metric in cfg.STATISTICS_TO_DISPLAY
    }

    if cfg.SAVE_PLOTS_TO_PDF:
        pdf_path = cfg.PDF_OUTPUT_DIR / f"{subbasin_name}_Performance_Report.pdf"
        with PdfPages(pdf_path) as pdf:
            pdf.savefig(plot_statistics_table(combined_stats, f"Performance Metrics: {subbasin_name}"))
            pdf.savefig(plot_combined_hydrograph(df_processed, f"Hydrograph: {subbasin_name}"))
            pdf.savefig(plot_scatter(df_processed, f"Observed vs. Simulated (Full Period): {subbasin_name}"))
            pdf.savefig(plot_flow_duration_curve(df_processed, f"Flow Duration Curve (Full Period): {subbasin_name}"))
            pdf.savefig(plot_seasonal_barchart(df_processed, f"Seasonal Average Flow (Full Period): {subbasin_name}"))

    return True


def main():
    """Run the whole workflow: discover CSVs, compute metrics, write reports.

    Each subbasin is processed independently; an error in one file is
    reported and does not stop the remaining files.
    """
    print(Fore.CYAN + Style.BRIGHT + "--- SCRIPT INITIALIZATION ---")
    if cfg.SAVE_PLOTS_TO_PDF:
        print(f"- Output Mode: PDF reports will be saved to: {cfg.PDF_OUTPUT_DIR}")
    else:
        print("- Output Mode: No files will be saved.")
    print("-" * 50 + "\n")

    # --- Step 1: Discover, Validate, and Sort Input Files ---
    print(Fore.CYAN + Style.BRIGHT + "--- STEP 1: DISCOVERING AND SORTING INPUT FILES ---\n")
    input_files = _discover_input_files()
    if not input_files:
        print(Fore.RED + "CRITICAL ERROR: No files found to process. Exiting.")
        return
    print(f"-> Found and sorted {len(input_files)} subbasin files for processing.\n")

    if cfg.SAVE_PLOTS_TO_PDF:
        # Created only once there is something to write; parents=True so a
        # missing intermediate folder does not abort the run.
        cfg.PDF_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # The analysed year window spans both periods regardless of which of
    # calibration/validation comes first chronologically.
    boundary_years = [
        pd.to_datetime(boundary).year
        for boundary in (cfg.CALIBRATION_START, cfg.CALIBRATION_END,
                         cfg.VALIDATION_START, cfg.VALIDATION_END)
    ]
    min_year, max_year = min(boundary_years), max(boundary_years)

    # --- Step 2: Process Each Subbasin Sequentially ---
    print(Fore.CYAN + Style.BRIGHT + "--- STEP 2: GENERATING REPORTS SEQUENTIALLY ---\n")

    success_count = 0
    for file_path in tqdm(input_files, desc="Generating Reports"):
        try:
            if _process_subbasin(file_path, min_year, max_year):
                success_count += 1
        except Exception as e:
            # Deliberate best-effort boundary: report and move on to the next file.
            tqdm.write(Fore.RED + f"\nERROR processing {file_path.stem}: {e}")
        finally:
            # Release every figure created for this subbasin, success or not.
            plt.close('all')

    # --- Final Notification ---
    print("\n" + Fore.CYAN + Style.BRIGHT + "--- PROCESSING COMPLETE ---")
    if cfg.SAVE_PLOTS_TO_PDF:
        print(Fore.GREEN + f"Successfully generated reports for {success_count}/{len(input_files)} subbasins in: {cfg.PDF_OUTPUT_DIR}")
    else:
        print(Fore.GREEN + f"Script finished. Processed {success_count}/{len(input_files)} subbasins.")


# Run the workflow only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()