# POA Irradiance and Power Correlation Analysis

This notebook analyzes the correlation between POA (plane-of-array) irradiance and power output
for multiple PV sites, and examines how that correlation varies with each site's normalized distance to the nearest weather station.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np

# Set style for better-looking plots.
# The 'seaborn-v0_8-*' style names only exist in matplotlib >= 3.6; older
# releases ship the same sheet as 'seaborn-darkgrid'. Use whichever name is
# installed so the notebook still runs, keeping matplotlib's default style
# as a last resort instead of raising on an unknown style name.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

## Configuration

Define the PV sites with their normalized distances to the nearest weather station, the data location and column names, and the optional weather-code and wind-speed filters.

In [None]:
# PV Site configurations with normalized distances
# Original distances normalized so max = 1.0
# Maps PV site ID -> normalized distance to the nearest weather station.
# Each site ID is also used to build that site's CSV file name under DATA_DIR
# (f'{site_id}.csv').
pv_sites = {
    79336: 0.308,
    61272: 0.632,
    4708: 0.700,
    89665: 0.716,
    82517: 0.774,
    36019: 0.777,
    24667: 0.816,
    56874: 0.906,
    25724: 0.972,
    42248: 1.000
}

# Data configuration
# DATA_DIR is a relative path, so it is resolved against the kernel's current
# working directory when the CSV files are read.
DATA_DIR = Path('../../data/data-3/timeseries')
TIME_COLUMN = 'time'  # column parsed with pd.to_datetime after loading
POA_COLUMN = 'plane_of_array_irradiance'  # first series of the per-site correlation
POWER_COLUMN = 'power'  # second series of the per-site correlation

# Filtering options (set to None to disable filtering)
# Bounds are inclusive (rows are kept with >= minimum and <= maximum).
# The weather-code bounds apply to the 'weather_code' column and the wind-speed
# bounds to the 'wind_speed_180m' column; a site whose CSV lacks the column is
# left unfiltered and a warning is printed.
FILTER_WEATHER_CODE_MIN = None  # e.g., 0 or None for no minimum
FILTER_WEATHER_CODE_MAX = None  # e.g., 3 or None for no maximum
FILTER_WIND_SPEED_MIN = None  # e.g., 0.0 or None for no minimum
FILTER_WIND_SPEED_MAX = None  # e.g., 10.0 or None for no maximum

## Load Data and Calculate Correlations

For each PV site, load the timeseries data and calculate the correlation coefficient
between POA irradiance and power output.

In [None]:
def _format_range(lower, upper):
    """Describe an optional (lower, upper) bound pair, writing 'any' for an open end."""
    low_text = 'any' if lower is None else lower
    high_text = 'any' if upper is None else upper
    return f"{low_text} to {high_text}"


def _apply_range_filter(frame, column, lower, upper):
    """Return the rows of ``frame`` whose ``column`` value lies in [lower, upper].

    Either bound may be None to leave that side open; when both are None the
    frame is returned untouched. If the column is absent, a warning is printed
    and the frame is returned unfiltered.
    """
    if lower is None and upper is None:
        return frame
    if column not in frame.columns:
        print(f"  WARNING: '{column}' column not found in data")
        return frame
    if lower is not None:
        frame = frame[frame[column] >= lower]
    if upper is not None:
        frame = frame[frame[column] <= upper]
    return frame


# Per-site result records; rebuilt from scratch each time this cell runs
results = []

# Build filter description
filter_desc = []
weather_code_range = ""  # Initialize for later use
wind_range = ""  # Initialize for later use

if FILTER_WEATHER_CODE_MIN is not None or FILTER_WEATHER_CODE_MAX is not None:
    weather_code_range = _format_range(FILTER_WEATHER_CODE_MIN, FILTER_WEATHER_CODE_MAX)
    filter_desc.append(f"weather_code: {weather_code_range}")

if FILTER_WIND_SPEED_MIN is not None or FILTER_WIND_SPEED_MAX is not None:
    wind_range = _format_range(FILTER_WIND_SPEED_MIN, FILTER_WIND_SPEED_MAX)
    filter_desc.append(f"wind_speed_180m: {wind_range}")

if filter_desc:
    print(f"Applying filters: {', '.join(filter_desc)}\n")
else:
    print("No filters applied\n")

for site_id, distance in pv_sites.items():
    csv_file = DATA_DIR / f'{site_id}.csv'

    try:
        # Read the CSV file
        print(f"Processing Site {site_id}...")
        df = pd.read_csv(csv_file)

        # Parse time column (also fails fast on a missing/invalid time column)
        df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN])

        # Store original record count
        original_count = len(df)

        # Apply the optional range filters (no-ops when both bounds are None)
        df = _apply_range_filter(df, 'weather_code',
                                 FILTER_WEATHER_CODE_MIN, FILTER_WEATHER_CODE_MAX)
        df = _apply_range_filter(df, 'wind_speed_180m',
                                 FILTER_WIND_SPEED_MIN, FILTER_WIND_SPEED_MAX)

        # Series.corr ignores rows where either value is missing, so keep only
        # complete (POA, power) pairs. This makes the reported record count and
        # the "enough data" check reflect what the correlation actually uses.
        df = df.dropna(subset=[POA_COLUMN, POWER_COLUMN])
        num_records = len(df)
        retained_pct = (num_records / original_count * 100) if original_count > 0 else 0

        # Calculate correlation if we have enough data
        if num_records > 1:
            correlation = df[POA_COLUMN].corr(df[POWER_COLUMN])
        else:
            correlation = np.nan
            print("  WARNING: Insufficient data after filtering")

        # Store results
        results.append({
            'site_id': site_id,
            'distance': distance,
            'correlation': correlation,
            'num_records': num_records,
            'original_records': original_count,
            'filtered_pct': retained_pct
        })

        if pd.notna(correlation):
            print(f"  Site {site_id}: Correlation = {correlation:.4f}, Records = {num_records}/{original_count} ({retained_pct:.1f}%)")
        else:
            print(f"  Site {site_id}: Correlation = N/A, Records = {num_records}/{original_count}")

    except Exception as e:
        # Best-effort: a site that cannot be loaded or processed is reported and
        # recorded with a NaN correlation so the remaining sites still run.
        print(f"  ERROR processing Site {site_id}: {e}")
        results.append({
            'site_id': site_id,
            'distance': distance,
            'correlation': np.nan,
            'num_records': 0,
            'original_records': 0,
            'filtered_pct': 0
        })

# Create results dataframe, ordered by distance for the table and plot below
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('distance').reset_index(drop=True)

print("\n" + "="*60)
print("Analysis Complete!")
print("="*60)

## Results Summary

Summary table showing PV Site ID, normalized distance to nearest weather station,
and the correlation coefficient between POA irradiance and power output.

In [None]:
# Display results table
print("\nPV Site Analysis Results:")
print(results_df[['site_id', 'distance', 'correlation', 'num_records', 'original_records', 'filtered_pct']].to_string(index=False))

# Generate markdown table
print("\n\nMarkdown Table:")
print("| PV Site ID | Distance (Normalized) | Correlation | Records Used | Original Records | Filtered % |")
print("|------------|-----------------------|-------------|--------------|------------------|------------|")
# itertuples() keeps each column's own dtype. iterrows() would pack every row
# into a single float64 Series (all columns are numeric), which renders site
# IDs and record counts as floats such as "79336.0".
for row in results_df.itertuples(index=False):
    if pd.notna(row.correlation):
        correlation_cell = f"{row.correlation:<11.4f}"
    else:
        correlation_cell = f"{'N/A':<11}"
    print(f"| {row.site_id:<10} | {row.distance:<21.3f} | {correlation_cell} | {row.num_records:<12} | {row.original_records:<16} | {row.filtered_pct:<10.1f} |")

## Visualization: Distance vs Correlation

Scatter plot showing the relationship between distance to nearest weather station
and the correlation coefficient between POA irradiance and power output.

In [None]:
# Create scatter plot
fig, ax = plt.subplots(figsize=(12, 7))

# Plot scatter points (sites with a NaN correlation are simply not drawn)
scatter = ax.scatter(results_df['distance'], results_df['correlation'],
                     s=100, alpha=0.6, c='tab:blue', edgecolors='black', linewidth=1.5)

# Add site ID labels to each point.
# itertuples() preserves the integer dtype of site_id; iterrows() would upcast
# the all-numeric row to float and label the points like "79336.0".
for row in results_df.itertuples(index=False):
    if pd.isna(row.correlation):
        continue  # no marker exists for this site, so there is nothing to label
    ax.annotate(str(row.site_id),
                (row.distance, row.correlation),
                xytext=(5, 5), textcoords='offset points',
                fontsize=9, alpha=0.7)

# Labels and title
ax.set_xlabel('Normalized Distance to Nearest Weather Station', fontsize=13, fontweight='bold')
ax.set_ylabel('Correlation Coefficient (POA Irradiance vs Power)', fontsize=13, fontweight='bold')
ax.set_title('Relationship Between Weather Station Distance and POA-Power Correlation',
             fontsize=15, fontweight='bold', pad=20)

# Grid
ax.grid(True, alpha=0.3, linestyle='--')

# Set y-axis limits with some padding
valid_correlations = results_df['correlation'].dropna()
if not valid_correlations.empty:
    y_min = valid_correlations.min()
    y_max = valid_correlations.max()
    y_range = y_max - y_min
    # With a single valid site (or identical correlations) the range is zero,
    # which would ask matplotlib for identical lower/upper limits; use a small
    # fixed pad in that case.
    y_pad = 0.05 * y_range if y_range > 0 else 0.05
    ax.set_ylim([y_min - y_pad, y_max + y_pad])

# Set x-axis limits with some padding (distances are normalized to max = 1.0)
ax.set_xlim([0, 1.1])

plt.tight_layout()
plt.show()

## Statistical Analysis

Calculate basic statistics about the relationship between distance and correlation.

In [None]:
# Remove any rows with NaN correlations for statistical analysis
valid_results = results_df.dropna(subset=['correlation'])

# |r| at or below this value is reported as a weak relationship
WEAK_RELATIONSHIP_THRESHOLD = 0.3

if len(valid_results) > 0:
    print("\nStatistical Summary:")
    print(f"Number of sites analyzed: {len(valid_results)}")
    print("\nCorrelation Coefficient Statistics:")
    print(f"  Mean: {valid_results['correlation'].mean():.4f}")
    print(f"  Std Dev: {valid_results['correlation'].std():.4f}")
    print(f"  Min: {valid_results['correlation'].min():.4f}")
    print(f"  Max: {valid_results['correlation'].max():.4f}")

    # Calculate correlation between distance and correlation coefficient
    distance_corr = valid_results['distance'].corr(valid_results['correlation'])

    if pd.isna(distance_corr):
        # Happens with fewer than two valid sites or when either series has no
        # variation. A NaN compares False against the threshold, so without
        # this branch the result would be misreported as a weak relationship.
        print("\nCorrelation between distance and POA-power correlation: N/A")
        print("  → Undetermined: at least two sites with differing distances and correlations are needed")
    else:
        print(f"\nCorrelation between distance and POA-power correlation: {distance_corr:.4f}")

        if abs(distance_corr) > WEAK_RELATIONSHIP_THRESHOLD:
            if distance_corr > 0:
                print("  → Positive relationship: Sites farther from weather stations tend to have higher POA-power correlations")
            else:
                print("  → Negative relationship: Sites farther from weather stations tend to have lower POA-power correlations")
        else:
            print("  → Weak relationship: Distance to weather station has minimal impact on POA-power correlation")
else:
    print("No valid data available for statistical analysis")

print("\n" + "="*60)
print("Analysis Complete!")
print("="*60)