# SA1 scPDSI Processing - Testing Notebook

This notebook lets you test the scPDSI processor with your own sample data.
Fill in the parameters below, then run the cells in order from top to bottom; later cells rely on the configuration and imports set up by earlier ones.

## 1. Configuration Parameters

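**Please fill in your specific paths and parameters below.** `basic_path` and `project_name` are required: the configuration cell in section 3 makes no changes while either of them is empty.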

In [None]:
# --- Basic parameters ---
# basic_path and project_name are mandatory: the configuration cell
# (section 3) only updates Config when both are non-empty.
basic_path = r''  # Your base directory path
project_name = ''  # Your project folder name

# Optional overrides: an empty string keeps the value already held by Config.
variable_name = ''  # NetCDF variable name (e.g., 'scpdsi')
x_dimension = ''  # Longitude dimension name
y_dimension = ''  # Latitude dimension name
time_dimension = ''  # Time dimension name
zone_field = ""  # SA1 code field name

# --- Paths ---
# Plain strings. If a path should be derived from the values above, write it
# as an f-string, e.g. f"{basic_path}/{project_name}/netcdf".
nc_path = ""  # NetCDF files directory
work_path = ""  # Temporary work directory
save_excel_path = ""  # Excel output directory
shp_path = ""  # Shapefile directory
shp_filename = ""  # Shapefile name

## 2. Import Required Libraries

In [None]:
import os
import sys
import logging
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import our custom processor
from scpdsi_processor import (
    Config, ScPDSIProcessor, Logger, ProcessingError,
    SA1DataManager, NetCDFManager, GridManager,
    AnnualAggregator, ReprojectionManager, ConsistencyChecker, ExcelExporter
)

print("✅ All libraries imported successfully")

## 3. Update Configuration with Your Parameters

In [None]:
# Push the notebook parameters into the shared Config class.
# Nothing is modified unless both basic_path and project_name are set.
if not (basic_path and project_name):
    print("⚠️  Please fill in the basic_path and project_name parameters above")
else:
    # Directory and file locations
    Config.NETCDF_DIR = nc_path
    Config.OUTPUT_DIR = save_excel_path
    Config.TEMP_DIR = work_path
    Config.LOG_DIR = os.path.join(basic_path, project_name, "logs")
    Config.SA1_SHAPEFILE = os.path.join(shp_path, shp_filename)

    # Name overrides: an empty value leaves the existing Config attribute alone
    for _attr_name, _user_value in (
        ("VARIABLE_NAME", variable_name),
        ("LON_DIM", x_dimension),
        ("LAT_DIM", y_dimension),
        ("TIME_DIM", time_dimension),
        ("SA1_KEY", zone_field),
    ):
        if _user_value:
            setattr(Config, _attr_name, _user_value)

    print("✅ Configuration updated with your parameters")

    # Echo the effective settings so they can be checked at a glance
    print("\n📋 Current Configuration:")
    print(f"  NetCDF Directory: {Config.NETCDF_DIR}")
    print(f"  Output Directory: {Config.OUTPUT_DIR}")
    print(f"  SA1 Shapefile: {Config.SA1_SHAPEFILE}")
    print(f"  Variable Name: {Config.VARIABLE_NAME}")
    print(f"  Dimensions: {Config.LON_DIM}, {Config.LAT_DIM}, {Config.TIME_DIM}")
    print(f"  SA1 Key Field: {Config.SA1_KEY}")

## 4. Check Input Files and Directories

In [None]:
# Check if directories and files exist
def check_inputs():
    """Report on the configured inputs and make sure the output folders exist.

    Reads ``Config.NETCDF_DIR``, ``Config.SA1_SHAPEFILE``, ``Config.OUTPUT_DIR``,
    ``Config.TEMP_DIR`` and ``Config.LOG_DIR``. Missing output, temp and log
    directories are created. All findings are printed; nothing is returned.
    """
    print("🔍 Checking input files and directories...\n")

    # Check NetCDF directory.
    # is_dir() rather than exists(): a regular file at this path cannot be globbed.
    nc_dir = Path(Config.NETCDF_DIR)
    if nc_dir.is_dir():
        # glob() yields entries in arbitrary order; sort so the preview is stable
        nc_files = sorted(nc_dir.glob("*.nc"))
        print(f"✅ NetCDF directory found: {nc_dir}")
        print(f"   📁 Contains {len(nc_files)} NetCDF files")
        if nc_files:
            print(f"   📄 First few files: {[f.name for f in nc_files[:3]]}")
    else:
        print(f"❌ NetCDF directory not found: {nc_dir}")

    # Check SA1 shapefile
    shp_file = Path(Config.SA1_SHAPEFILE)
    if shp_file.exists():
        print(f"✅ SA1 shapefile found: {shp_file}")
    else:
        print(f"❌ SA1 shapefile not found: {shp_file}")

    # Check/create output directories
    writable_dirs = [
        ("Output", Config.OUTPUT_DIR),
        ("Temp", Config.TEMP_DIR),
        ("Logs", Config.LOG_DIR),
    ]
    for dir_name, dir_path in writable_dirs:
        dir_obj = Path(dir_path)
        if dir_obj.is_dir():
            print(f"✅ {dir_name} directory exists: {dir_path}")
            continue
        try:
            dir_obj.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            # Report and carry on so the remaining directories are still checked
            # (covers e.g. a regular file occupying the path, or no permission).
            print(f"❌ Could not create {dir_name} directory: {dir_path} ({exc})")
        else:
            print(f"📁 Created {dir_name} directory: {dir_path}")

check_inputs()

## 5. Quick Data Inspection

In [None]:
# Quick inspection of the first NetCDF file
import xarray as xr
import geopandas as gpd

def inspect_data():
    """Print a quick overview of the first NetCDF file and of the SA1 shapefile.

    Reads ``Config.NETCDF_DIR``, ``Config.TIME_DIM``, ``Config.VARIABLE_NAME``,
    ``Config.SA1_SHAPEFILE`` and ``Config.SA1_KEY``. Read errors are caught and
    printed so that one unreadable input does not hide the other. Returns nothing.
    """
    print("🔎 Inspecting sample data...\n")

    # Inspect NetCDF file.
    # Sorted so that "first" means the same file on every run (glob order is arbitrary).
    nc_files = sorted(Path(Config.NETCDF_DIR).glob("*.nc"))
    if nc_files:
        print(f"📊 Inspecting first NetCDF file: {nc_files[0].name}")
        try:
            with xr.open_dataset(nc_files[0]) as ds:
                # ds.sizes is the name -> length mapping; relying on ds.dims for
                # lengths is deprecated in recent xarray releases.
                print(f"   📐 Dimensions: {dict(ds.sizes)}")
                print(f"   📊 Variables: {list(ds.data_vars.keys())}")
                time_values = ds[Config.TIME_DIM].values
                print(f"   🕐 Time range: {time_values[0]} to {time_values[-1]}")

                if Config.VARIABLE_NAME in ds.data_vars:
                    var_data = ds[Config.VARIABLE_NAME]
                    print(f"   ✅ Target variable '{Config.VARIABLE_NAME}' found")
                    print(f"   📊 Shape: {var_data.shape}")
                    print(f"   🎯 Data range: {float(var_data.min()):.3f} to {float(var_data.max()):.3f}")
                else:
                    print(f"   ❌ Target variable '{Config.VARIABLE_NAME}' not found")
                    print(f"   📊 Available variables: {list(ds.data_vars.keys())}")

        except Exception as e:
            print(f"   ❌ Error reading NetCDF: {str(e)}")
    else:
        # Previously silent, which made an empty or wrong directory easy to miss
        print(f"❌ No NetCDF files found in: {Config.NETCDF_DIR}")

    print("\n" + "-"*50)

    # Inspect shapefile
    shp_file = Path(Config.SA1_SHAPEFILE)
    if shp_file.exists():
        print(f"🗺️  Inspecting shapefile: {shp_file.name}")
        try:
            gdf = gpd.read_file(shp_file)
            print(f"   📊 Number of features: {len(gdf)}")
            print(f"   📊 Columns: {list(gdf.columns)}")
            print(f"   🌍 CRS: {gdf.crs}")

            if Config.SA1_KEY in gdf.columns:
                print(f"   ✅ Target field '{Config.SA1_KEY}' found")
                print(f"   📊 Sample values: {gdf[Config.SA1_KEY].head(3).tolist()}")
                print(f"   🔢 Unique values: {gdf[Config.SA1_KEY].nunique()}")
            else:
                print(f"   ❌ Target field '{Config.SA1_KEY}' not found")

        except Exception as e:
            print(f"   ❌ Error reading shapefile: {str(e)}")
    else:
        # Previously silent as well
        print(f"❌ SA1 shapefile not found: {shp_file}")

inspect_data()

## 6. Test Individual Components

In [None]:
# Smoke test for SA1DataManager: load, reproject, then build centroids.
# Each step prints its own success line, so on failure the last ✅ shows
# how far the sequence got.
print("🧪 Testing SA1 data loading...")

try:
    sa1_manager = SA1DataManager()

    # Step 1: load
    sa1_gdf = sa1_manager.load_sa1_data()
    print(f"✅ SA1 data loaded successfully: {len(sa1_gdf)} regions")

    # Step 2: reproject
    sa1_reprojected = sa1_manager.reproject_to_target_crs()
    print(f"✅ SA1 data reprojected to {Config.TARGET_CRS}")

    # Step 3: centroids
    centroids = sa1_manager.create_centroids()
    print(f"✅ SA1 centroids created: {len(centroids)} points")

except Exception as exc:
    # Any failure is reported in one line; the notebook keeps running
    print(f"❌ SA1 testing failed: {exc}")

In [None]:
# Smoke test for NetCDFManager: discover files, read the monthly data,
# then group it into calendar and financial years.
print("🧪 Testing NetCDF data loading...")

try:
    netcdf_manager = NetCDFManager()

    # Step 1: file discovery
    files = netcdf_manager.discover_netcdf_files()
    print(f"✅ NetCDF files discovered: {len(files)} files")

    # Step 2: monthly data
    monthly_data = netcdf_manager.read_monthly_data()
    print(f"✅ Monthly data loaded: {len(monthly_data)} time slices")

    # Step 3: year grouping (returns the calendar and financial groupings as a pair)
    calendar_years, financial_years = netcdf_manager.group_by_years()
    print("✅ Year grouping completed:")
    print(f"   📅 Calendar years: {len(calendar_years)} ({list(calendar_years.keys())})")
    print(f"   💰 Financial years: {len(financial_years)} ({list(financial_years.keys())})")

except Exception as exc:
    # Any failure is reported in one line; the notebook keeps running
    print(f"❌ NetCDF testing failed: {exc}")

## 7. Run Complete Processing (Small Test)

In [None]:
# Run the complete workflow end to end and list the Excel files it produced.
# NOTE: nothing in this cell restricts the input; ScPDSIProcessor.run() is
# called without arguments, so it works on whatever Config points at.
import traceback

print("🚀 Running complete processing test...\n")

# Logging for this run, at INFO level
logger_setup = Logger(logging.INFO)

try:
    processor = ScPDSIProcessor()
    processor.run()

    print("\n🎉 Processing completed successfully!")

    # List the workbooks now present in the output directory, with their sizes
    output_dir = Path(Config.OUTPUT_DIR)
    excel_files = list(output_dir.glob("*.xlsx"))
    print(f"\n📄 Generated Excel files: {len(excel_files)}")
    for excel_file in excel_files:
        file_size_kb = excel_file.stat().st_size / 1024
        print(f"   📊 {excel_file.name} ({file_size_kb:.1f} KB)")

except ProcessingError as exc:
    # The processor's own error type: reported without a traceback
    print(f"\n⚠️  Processing error (expected in some cases): {exc}")
    print("This might be due to consistency check failures or data alignment issues.")

except Exception as exc:
    # Anything else is unexpected, so show the full traceback
    print(f"\n❌ Unexpected error: {exc}")
    traceback.print_exc()

## 8. Inspect Results

In [None]:
# Inspect generated Excel files
import pandas as pd

def inspect_results():
    """Print a short preview of up to two Excel files from ``Config.OUTPUT_DIR``.

    For each file the shape, the column names and the first three rows are
    shown. When a ``scpdsi_mean`` column is present, the share of non-missing
    values and their min/max/mean are printed too. Read errors are caught and
    printed. Returns nothing.
    """
    print("📊 Inspecting generated results...\n")

    output_dir = Path(Config.OUTPUT_DIR)
    # Sorted so the two inspected files are the same on every run
    # (glob order is arbitrary).
    excel_files = sorted(output_dir.glob("*.xlsx"))

    if not excel_files:
        print("❌ No Excel files found in output directory")
        return

    for excel_file in excel_files[:2]:  # Inspect first 2 files
        print(f"📄 Inspecting: {excel_file.name}")
        try:
            df = pd.read_excel(excel_file)
            print(f"   📊 Shape: {df.shape}")
            print(f"   📊 Columns: {list(df.columns)}")
            print("   📊 Sample data:")
            print(df.head(3).to_string(index=False))

            # Check data quality
            if 'scpdsi_mean' in df.columns:
                scpdsi = df['scpdsi_mean']
                total_rows = len(df)
                valid_values = scpdsi.notna().sum()
                # Avoid dividing by zero when the sheet has no rows
                valid_pct = 100 * valid_values / total_rows if total_rows else 0.0
                print(f"   ✅ Valid scPDSI values: {valid_values}/{total_rows} ({valid_pct:.1f}%)")

                if valid_values > 0:
                    mean_val = scpdsi.mean()
                    min_val = scpdsi.min()
                    max_val = scpdsi.max()
                    print(f"   📊 scPDSI range: {min_val:.3f} to {max_val:.3f} (mean: {mean_val:.3f})")

            print("\n" + "-"*60 + "\n")

        except Exception as e:
            print(f"   ❌ Error reading Excel file: {str(e)}")

inspect_results()

## 9. Check Log Files

In [None]:
# Display recent log entries
def show_recent_logs():
    """Print the last 20 lines of the newest ``*.log`` file in ``Config.LOG_DIR``.

    "Newest" is decided by file modification time. Problems reading the file
    are printed rather than raised. Returns nothing.
    """
    from collections import deque

    print("📋 Recent log entries...\n")

    log_dir = Path(Config.LOG_DIR)
    log_files = list(log_dir.glob("*.log"))

    if not log_files:
        print("❌ No log files found")
        return

    try:
        # Modification time, not file name: names are not guaranteed to sort
        # chronologically.
        latest_log = max(log_files, key=lambda path: path.stat().st_mtime)
        print(f"📄 Latest log file: {latest_log.name}\n")

        with open(latest_log, 'r', encoding='utf-8') as f:
            # A bounded deque keeps only the tail while streaming the file,
            # so a large log is never held in memory in full.
            tail = deque(f, maxlen=20)
    except (OSError, UnicodeError) as e:
        print(f"❌ Error reading log file: {str(e)}")
        return

    # Show last 20 lines
    print("📄 Last 20 log entries:")
    print("-" * 80)
    for line in tail:
        print(line.rstrip())

show_recent_logs()

## 10. Summary and Next Steps

In [None]:
# Final summary
def show_summary():
    """Print counts of the generated outputs and a numbered list of next steps.

    Counts the ``*.xlsx`` files in ``Config.OUTPUT_DIR`` (split by whether the
    file name contains ``calendar`` or ``financial``) and the ``*.log`` and
    ``validation_report_*.json`` files in ``Config.LOG_DIR``. Returns nothing.
    """
    print("📋 PROCESSING SUMMARY")
    print("=" * 50)

    # Count outputs
    output_dir = Path(Config.OUTPUT_DIR)
    excel_files = list(output_dir.glob("*.xlsx"))
    calendar_files = [f for f in excel_files if 'calendar' in f.name]
    financial_files = [f for f in excel_files if 'financial' in f.name]

    print(f"📊 Total Excel files generated: {len(excel_files)}")
    print(f"   📅 Calendar years: {len(calendar_files)}")
    print(f"   💰 Financial years: {len(financial_files)}")

    # Check log files
    log_dir = Path(Config.LOG_DIR)
    log_files = list(log_dir.glob("*.log"))
    validation_reports = list(log_dir.glob("validation_report_*.json"))

    print(f"📋 Log files: {len(log_files)}")
    if validation_reports:
        print(f"⚠️  Validation reports: {len(validation_reports)} (consistency check failures)")

    # Build the steps first and number them afterwards, so the list stays
    # consecutive whether or not the validation-report step is included.
    next_steps = [
        f"Review the generated Excel files in: {Config.OUTPUT_DIR}",
        f"Check log files for any warnings or errors in: {Config.LOG_DIR}",
    ]
    if validation_reports:
        next_steps.append("Review validation reports for consistency check failures")
    next_steps.append("Adjust configuration parameters if needed")
    next_steps.append("Run with full dataset when satisfied with test results")

    print("\n🎯 NEXT STEPS:")
    for number, step in enumerate(next_steps, start=1):
        print(f"{number}. {step}")

show_summary()