In [None]:
# ==============================================================================
#                       MONTHLY DISCHARGE DATA PROCESSOR
# ==============================================================================
#
# Description:
#   This script executes a complete data analysis workflow: loading specific
#   columns from an Excel file, validating and transforming the data, calculating
#   annual aggregate statistics (Min, Max, Mean) for each subbasin, and finally
#   exporting both the detailed monthly data and the summary statistics to CSV.
#
# Constraints:
#   Designed to be memory-efficient by only reading necessary columns.
#   Suitable for datasets manageable within a 16 GB RAM environment.
#
# ==============================================================================

# ------------------------------------------------------------------------------
#   1. DEPENDENCY IMPORTS
# ------------------------------------------------------------------------------

import pandas as pd
import os
from tqdm import tqdm
import sys # Used for clean exit after error


# ------------------------------------------------------------------------------
#   2. USER-DEFINABLE OPTIONS
# ------------------------------------------------------------------------------
# Modify the variables in this section to match your file paths, column names,
# and desired output settings.

# --- I. FILE PATHS & LOCATIONS ---
# Path to your source Excel file. The default is a bare file name, i.e. a
# relative path; replace it with a full path if the workbook lives elsewhere.
input_excel_path = "Simulated_Monthly_discharge.xlsx"

# Name of the sheet to read. None or an empty string selects the first sheet.
input_sheet_name = None

# Folder that receives every exported CSV (the per-subbasin files and the
# annual summary). The script creates it if it does not exist yet.
output_directory_path = "Simulated_monthly_discharge"


# --- II. DATA FILTERING & OUTPUT SETTINGS ---
# List of subbasin IDs to export. The subbasin column is converted to integers
# before the filter is applied, so integer IDs are the reliable choice.
# To export ALL subbasins, set this variable to None or an empty list (e.g., []).
# Example: subbasins_to_extract = [1, 5, 12, 18]
subbasins_to_extract = None

# Text placed in front of the subbasin ID in each per-subbasin file name
# (with the default prefix, subbasin 1 is written to 'Subbasin_1.csv').
output_filename_prefix = 'Subbasin_'

# Name for the CSV file containing the aggregated annual summary statistics.
output_analysis_filename = 'Annual_Simulated_Discharge_Summary.csv'

# Set to True to generate and export the annual summary analysis file.
export_summary_analysis = True

# Set to False to skip writing any output file (summary or per-subbasin) that
# already exists. Set to True to always overwrite existing files.
overwrite_existing_files = True


# --- III. COLUMN MAPPING (Source Data) ---
# Names of the columns to read from the source Excel file.
# NOTE: The script upper-cases these names, and strips whitespace from and
# upper-cases the headers it loads, before it looks the columns up.
column_year = 'YEAR' # Year of each record
column_month = 'MON' # Month number (1-12, translated to Jan..Dec in the output)
column_subbasin = 'SUB' # Subbasin identifier used for filtering and grouping
column_discharge = 'SIMULATED' # Discharge values that are aggregated and exported


# --- IV. COLUMN MAPPING (Output Data) ---
# Column names used in the exported per-subbasin CSV files.
# output_date_column holds the combined year/month label (e.g., '1995_Dec');
# output_data_column is the header given to the discharge values.
output_date_column = 'Year_Month'
output_data_column = 'SIMULATED'


# ------------------------------------------------------------------------------
#   3. SCRIPT EXECUTION
# ------------------------------------------------------------------------------

print("\n" + "="*50)
print("--- Starting Discharge Data Processing Script ---")
print("="*50 + "\n")

# Source columns the workflow needs, exactly as configured by the user.
# Only these columns are loaded, which keeps memory usage down.
REQUIRED_COLS = [column_year, column_month, column_subbasin, column_discharge]
df = pd.DataFrame()  # Placeholder so `df` exists even when loading fails

try:
    # ##########################################################################
    #   STEP 1: DATA LOADING AND VALIDATION
    # ##########################################################################

    print("STEP 1: DATA LOADING AND VALIDATION\n")

    print(f"Loading data from: '{os.path.basename(input_excel_path)}'...")

    # Use index 0 (first sheet) if input_sheet_name is None or empty,
    # otherwise use the provided name/index.
    sheet_identifier = 0 if input_sheet_name is None or input_sheet_name == "" else input_sheet_name

    if input_sheet_name:
        print(f"   (Reading sheet: '{input_sheet_name}')")

    # Normalise the configured names first so they can be compared against the
    # normalised file headers (whitespace stripped, upper-cased).
    column_year = column_year.strip().upper()
    column_month = column_month.strip().upper()
    column_subbasin = column_subbasin.strip().upper()
    column_discharge = column_discharge.strip().upper()
    normalized_required = [column_year, column_month, column_subbasin, column_discharge]
    wanted_headers = set(normalized_required)

    # A *list* passed to `usecols` is matched against the raw headers, so a
    # header such as ' YEAR' or 'Simulated' would abort the load before any
    # cleanup could run. A callable is evaluated per header instead, which lets
    # the comparison happen on the normalised form while still loading only the
    # columns that are needed.
    df = pd.read_excel(
        input_excel_path,
        sheet_name=sheet_identifier,
        usecols=lambda header: str(header).strip().upper() in wanted_headers,
    )
    print("Data loaded successfully.")

    # --- Data Header Cleanup ---
    df.columns = df.columns.astype(str).str.strip().str.upper()
    print("Column headers cleaned (whitespace removed, converted to uppercase).")

    # Report every missing column at once; handled by the KeyError branch below.
    missing_columns = [name for name in normalized_required if name not in df.columns]
    if missing_columns:
        raise KeyError(", ".join(missing_columns))

    # Two raw headers that differ only by case/whitespace would now share one
    # name, making column selection ambiguous; stop rather than guess.
    if df.columns.duplicated().any():
        clashing = sorted(set(df.columns[df.columns.duplicated()]))
        raise ValueError(
            "Several source headers collapse to the same name after cleaning: "
            + ", ".join(clashing)
        )

    # --- Data Type Validation ---
    # Key columns become nullable integers; values that cannot be parsed turn
    # into <NA> and are removed a few lines below.
    for key_column in (column_year, column_month, column_subbasin):
        df[key_column] = pd.to_numeric(df[key_column], errors='coerce').astype('Int64')

    # Discharge must be numeric for the min/max/mean aggregation in STEP 3.
    raw_discharge = df[column_discharge]
    numeric_discharge = pd.to_numeric(raw_discharge, errors='coerce')
    unparseable_values = int((numeric_discharge.isna() & raw_discharge.notna()).sum())
    df[column_discharge] = numeric_discharge
    print("Core column data types enforced (Int64/numeric).")
    if unparseable_values:
        print(f"[WARNING] {unparseable_values} non-numeric discharge values were converted to NaN.")

    # --- Remove rows that cannot be placed in a subbasin/year/month ---
    # Without this, such rows would yield labels like '<NA>_Jan' or NaN.
    month_is_valid = df[column_month].between(1, 12).fillna(False).astype(bool)
    rows_usable = df[column_year].notna() & df[column_subbasin].notna() & month_is_valid
    unusable_rows = int((~rows_usable).sum())
    if unusable_rows:
        df = df[rows_usable].copy()
        print(f"[WARNING] Dropped {unusable_rows} rows with a missing or invalid year, month (1-12) or subbasin.")

    # --- Data Filtering by Subbasin (if configured) ---
    if subbasins_to_extract:
        # The subbasin column is Int64, so IDs supplied as strings ('5') or
        # floats (5.0) are converted to integers before matching.
        parsed_ids = pd.to_numeric(
            pd.Series(list(subbasins_to_extract), dtype=object), errors='coerce'
        )
        requested_ids = sorted(
            {int(value) for value in parsed_ids.dropna() if float(value).is_integer()}
        )
        ignored_ids = len(parsed_ids) - int(
            sum(1 for value in parsed_ids.dropna() if float(value).is_integer())
        )
        if ignored_ids:
            print(f"[WARNING] Ignored {ignored_ids} subbasin filter entries that are not whole numbers.")

        original_rows = len(df)
        # .copy() detaches the result so a column can be added to it safely.
        df = df[df[column_subbasin].isin(requested_ids)].copy()
        filtered_rows = len(df)
        print(f"Data filtered: Retained {filtered_rows} rows for {len(subbasins_to_extract)} specified subbasins (removed {original_rows - filtered_rows} rows).")
    else:
        print("No subbasin filter applied (processing all available subbasins).")


    # ##########################################################################
    #   STEP 2: DATA TRANSFORMATION
    # ##########################################################################

    print("\nSTEP 2: DATA TRANSFORMATION\n")

    # Month number -> three-letter abbreviation used in the date label.
    month_map = {
        1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
        7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
    }

    # Combine year and month into a readable label (e.g., '1995_Dec').
    df[output_date_column] = df[column_year].astype(str) + '_' + df[column_month].map(month_map)
    print(f"'{output_date_column}' column created successfully.")


    # ##########################################################################
    #   STEP 3: COMPREHENSIVE DATA ANALYSIS (AGGREGATION)
    # ##########################################################################

    print("\nSTEP 3: COMPREHENSIVE DATA ANALYSIS (AGGREGATION)\n")

    # One row per (subbasin, year) with the mean, min and max discharge and the
    # number of non-missing monthly values behind them.
    if export_summary_analysis:
        analysis_df = df.groupby([column_subbasin, column_year])[column_discharge].agg(
            Annual_Mean_Discharge='mean',
            Annual_Min_Discharge='min',
            Annual_Max_Discharge='max',
            Monthly_Count='count'
        ).reset_index()

        print(f"Annual summary statistics calculated for {len(analysis_df)} Subbasin-Year combinations.")
        print(f"Head of summary data:\n{analysis_df.head(5).to_string()}")


    # ##########################################################################
    #   STEP 4: EXPORTING DATA ARTIFACTS
    # ##########################################################################

    print("\nSTEP 4: EXPORTING DATA ARTIFACTS\n")

    # --- Ensure the output directory exists ---
    os.makedirs(output_directory_path, exist_ok=True)
    print(f"Output directory confirmed: '{output_directory_path}'")

    # --- Sub-step 4a: Exporting Aggregated Analysis (if enabled) ---
    if export_summary_analysis:
        output_path = os.path.join(output_directory_path, output_analysis_filename)

        # Respect the overwrite switch for the summary file as well.
        if not overwrite_existing_files and os.path.exists(output_path):
            print(f"[SKIP] Summary file already exists and overwrite is False: {output_analysis_filename}")
        else:
            analysis_df.to_csv(output_path, index=False)
            print(f"Annual Summary Analysis exported to: '{output_analysis_filename}'")


    # --- Sub-step 4b: Exporting Detailed Monthly Data per Subbasin ---
    grouped_by_subbasin = df.groupby(column_subbasin)
    number_of_subbasins = grouped_by_subbasin.ngroups

    if number_of_subbasins == 0:
        print("\n[WARNING] No data remains for export (check filters or input data).")
    else:
        print(f"\nPreparing to export {number_of_subbasins} individual subbasin files...")

        # One CSV per subbasin, with a progress bar over the groups.
        for subbasin_id, subbasin_df in tqdm(grouped_by_subbasin, total=number_of_subbasins, desc="Subbasins Processed"):

            # Keep only the date label and the discharge values, renamed to the
            # configured output header.
            output_df = subbasin_df[[output_date_column, column_discharge]].rename(
                columns={column_discharge: output_data_column}
            )

            output_filename = os.path.join(output_directory_path, f"{output_filename_prefix}{subbasin_id}.csv")

            if not overwrite_existing_files and os.path.exists(output_filename):
                # tqdm.write() keeps the message from corrupting the progress bar.
                tqdm.write(f"[SKIP] File already exists and overwrite is set to False: {os.path.basename(output_filename)}")
                continue

            output_df.to_csv(output_filename, index=False)

        print("\nDetailed data export process completed.")


    # ##########################################################################
    #   STEP 5: FINAL REPORTING
    # ##########################################################################

    print("\n" + "="*50)
    print("STEP 5: FINAL REPORTING")
    print("="*50 + "\n")

    print(f"Total Subbasins Processed: {number_of_subbasins}")
    print(f"Total Monthly Records Retained: {len(df)}")
    print(f"Output Directory: {output_directory_path}")
    print(f"Overwrite Files Enabled: {overwrite_existing_files}")

    print("\nScript execution successful.")


except FileNotFoundError:
    print("\n[ERROR] The input file could not be found at the specified path:")
    print(f"        '{input_excel_path}'")
    print("        Please check the path in Section 2 (I. FILE PATHS) and try again.")
    sys.exit(1)

except KeyError as e:
    # Raised by the missing-column check above when a configured column is
    # absent even after header stripping and upper-casing.
    print(f"\n[ERROR] A required column was not found in the Excel file: {e}")
    print("        Please verify the column mappings in Section 2 (III. COLUMN MAPPING (Source Data)).")
    print("        The script expected the uppercase column name as defined in the user options.")
    sys.exit(1)

except Exception as e:
    # Top-level boundary: report anything unexpected and stop with a non-zero
    # exit status.
    print(f"\n[CRITICAL ERROR] An unexpected error occurred: {e}")
    print("        Execution halted. Please review the error message to diagnose the issue.")
    sys.exit(1)

finally:
    # Runs on success and on every error path, including the sys.exit() calls.
    print("\n--- Script has finished execution. ---\n")
