# Phase 2: Feature Engineering

## Objectives
1. Compute shape-based features (yield/GHI ratio, solar noon ratio, morning/afternoon asymmetry)
2. Compute MC-derived features (percentile, z-score, uncertainty width)
3. Visualize feature separation between healthy and unhealthy days (proxy labels derived from the yield/GHI ratio)

In [None]:
import sys
sys.path.insert(0, '/workspaces/O-M-Monte-Carlo')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from src.phase2_features.feature_engineering import FeatureEngineer
from src.phase1_physics.monte_carlo import MonteCarloSimulator

# Use seaborn's 'darkgrid' style for the figures drawn later in this notebook.
sns.set_style('darkgrid')
# Reaching this line means every import above resolved.
print("Imports successful!")

## Load Phase 1 Results

In [None]:
# Read the Phase 1 outputs, indexing each table by its parsed date column.
df_daily = pd.read_csv(
    'data/daily_data.csv',
    index_col='timestamp',
    parse_dates=['timestamp'],
)
df_mc = pd.read_csv(
    'data/mc_results.csv',
    index_col='date',
    parse_dates=['date'],
)

# Quick sanity report: row count and the span covered by the daily index.
n_days = len(df_daily)
first_day = df_daily.index.min()
last_day = df_daily.index.max()
print(f"Loaded {n_days} days of data")
print(f"Date range: {first_day} to {last_day}")

## Compute Features for All Days

In [None]:
# Feature extractor from the project's Phase 2 module.
fe = FeatureEngineer()

# Proxy "healthy" label: days whose yield/GHI ratio lies above the 75th
# percentile. The 1e-6 term keeps the division finite when GHI is zero.
efficiency_ratio = df_daily['yield'] / (df_daily['ghi'] + 1e-6)
efficiency_threshold = efficiency_ratio.quantile(0.75)
healthy_mask = efficiency_ratio > efficiency_threshold

# Rebuild the Monte Carlo simulator and calibrate it on the proxy-healthy days.
mc = MonteCarloSimulator(n_simulations=1000)
mc.calibrate_from_healthy_days(df_daily, healthy_mask.values)

# One feature record per day, collected in the order of df_daily's index.
all_features = []
for date, row in df_daily.iterrows():
    # Wrap the day's scalar readings in length-1 arrays before passing them on.
    observed_ghi = np.array([row['ghi']])
    observed_yield = np.array([row['yield']])

    # Simulate the day, then derive features from the observation and the draws.
    simulated = mc.simulate_day(observed_ghi)
    day_record = fe.compute_day_features(observed_yield, observed_ghi, simulated)

    # Tag the record with its date and proxy label.
    day_record['date'] = date
    day_record['healthy'] = healthy_mask[date]
    all_features.append(day_record)

df_features = pd.DataFrame(all_features).set_index('date')

# 'healthy' is a label rather than a feature, hence the "- 1".
n_feature_cols = len(df_features.columns) - 1
print(f"Computed {n_feature_cols} features for {len(df_features)} days")
print("\nFeatures:")
print(df_features.columns.tolist())
print("\nFirst few rows:")
print(df_features.head())

## Feature Statistics

In [None]:
# Overall summary statistics for every engineered column.
banner = "=" * 70
print("Feature Summary Statistics")
print(banner)
print(df_features.describe())

# Split the rows by the proxy label and drop the label column from each half.
is_healthy = df_features['healthy'].eq(True)
is_unhealthy = df_features['healthy'].eq(False)
healthy_features = df_features.loc[is_healthy].drop(columns='healthy')
unhealthy_features = df_features.loc[is_unhealthy].drop(columns='healthy')

# Per-group column means for a side-by-side comparison.
print("\n\nHealthy Days - Feature Means:")
print(healthy_features.mean())

print("\n\nUnhealthy Days - Feature Means:")
print(unhealthy_features.mean())

## Visualization 1: Feature Distributions

In [None]:
# Overlay healthy vs unhealthy histograms for the key features, one per panel.
feature_cols = ['yield_to_ghi_ratio', 'solar_noon_ratio', 'morning_afternoon_asymmetry',
                 'yield_variability', 'mc_percentile', 'mc_z_score']

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for ax, feat in zip(axes, feature_cols):
    hist_healthy = healthy_features[feat].dropna().to_numpy()
    hist_unhealthy = unhealthy_features[feat].dropna().to_numpy()

    # Use one set of bin edges for both groups. Binning each group on its own
    # range gives bars of different widths, so overlaid heights would not be
    # comparable.
    shared_edges = np.histogram_bin_edges(
        np.concatenate([hist_healthy, hist_unhealthy]), bins=20
    )

    ax.hist(hist_healthy, bins=shared_edges, alpha=0.6, label='Healthy', color='green')
    ax.hist(hist_unhealthy, bins=shared_edges, alpha=0.6, label='Unhealthy', color='red')

    ax.set_xlabel(feat)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution: {feat}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Feature distributions plotted.")

## Visualization 2: Feature Separation (Pairplot)

In [None]:
# Select key features for visualization, plus the label used to colour points.
key_features = ['yield_to_ghi_ratio', 'mc_percentile', 'morning_afternoon_asymmetry', 'healthy']
plot_data = df_features[key_features].copy()
plot_data['condition'] = plot_data['healthy'].map({True: 'Healthy', False: 'Unhealthy'})

# Plot only the real features. Without `vars`, pairplot uses every numeric
# column, and the boolean 'healthy' column appears to count as numeric, so the
# label would show up as an extra row/column that just duplicates the hue.
pair_vars = [col for col in key_features if col != 'healthy']

sns.pairplot(plot_data, vars=pair_vars, hue='condition', diag_kind='kde',
             palette={'Healthy': 'green', 'Unhealthy': 'red'},
             plot_kws={'alpha': 0.6})

plt.suptitle('Feature Relationships: Healthy vs Unhealthy Days', y=1.00)
plt.tight_layout()
plt.show()

print("Pairplot created.")

## Visualization 3: Feature Importance via Separation

In [None]:
# Rank features by how well they separate healthy from unhealthy days:
# absolute Cohen's d as the effect size and a two-sample t-test p-value.
from scipy import stats as sp_stats


def _pooled_std(group_a, group_b):
    """Return the pooled standard deviation of two samples.

    Each group's sum of squared deviations is weighted by its degrees of
    freedom. The plain average of the two variances is only correct when the
    groups have equal size, which is not the case for a 25%/75% split.
    Returns NaN when there are too few observations to estimate a variance.
    """
    dof = len(group_a) + len(group_b) - 2
    if dof <= 0:
        return np.nan
    ss_a = ((group_a - group_a.mean()) ** 2).sum()
    ss_b = ((group_b - group_b.mean()) ** 2).sum()
    return np.sqrt((ss_a + ss_b) / dof)


separation_scores = {}
feature_list = [col for col in df_features.columns if col != 'healthy']

for feat in feature_list:
    healthy_vals = healthy_features[feat].dropna()
    unhealthy_vals = unhealthy_features[feat].dropna()

    # Cohen's d; the 1e-6 guards against a zero pooled SD on constant features.
    mean_diff = healthy_vals.mean() - unhealthy_vals.mean()
    pooled_std = _pooled_std(healthy_vals, unhealthy_vals)
    cohens_d = mean_diff / (pooled_std + 1e-6)

    # Welch's t-test: the groups differ in size and possibly in variance, so
    # the equal-variance assumption of the default test is not justified.
    _, p_val = sp_stats.ttest_ind(healthy_vals, unhealthy_vals, equal_var=False)

    separation_scores[feat] = {
        'cohens_d': abs(cohens_d),
        'p_value': p_val,
        'mean_healthy': healthy_vals.mean(),
        'mean_unhealthy': unhealthy_vals.mean()
    }

sep_df = pd.DataFrame(separation_scores).T.sort_values('cohens_d', ascending=False)

print("Feature Separation Quality (Cohen's d):")
print(sep_df)

# Visualization: one horizontal bar per feature, green when p < 0.05.
fig, ax = plt.subplots(figsize=(10, 8))

colors = ['green' if p < 0.05 else 'gray' for p in sep_df['p_value']]
ax.barh(range(len(sep_df)), sep_df['cohens_d'], color=colors)
ax.set_yticks(range(len(sep_df)))
ax.set_yticklabels(sep_df.index)
ax.set_xlabel("Cohen's d (absolute effect size)")
ax.set_title('Feature Separation Quality\n(Green = statistically significant p<0.05)')
# 0.8 is the conventional threshold for a "large" effect size.
ax.axvline(0.8, color='red', linestyle='--', alpha=0.5, label='Large effect')
ax.grid(True, alpha=0.3, axis='x')
ax.legend()

plt.tight_layout()
plt.show()

## Visualization 4: Scatter Plots - Key Features

## Summary

Phase 2 is complete. We have:

1. ✓ Computed shape-based features
2. ✓ Computed MC-derived features  
3. ✓ Analyzed feature separation between healthy and unhealthy days

### Key Findings:
- Most discriminative features are:
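  - `yield_to_ghi_ratio`: Overall efficiency (note: the healthy/unhealthy label is itself thresholded on the yield/GHI ratio, so strong separation on this feature is expected by construction)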
  - `mc_percentile`: How the day ranks against the MC expectation
  - `mc_z_score`: Deviation from expected in sigma units

### Next Steps (Phase 3):
- Manually label 50-100 days spanning different conditions
- Build gold standard validation set with labels: Healthy, Cloudy, Curtailment, Soiling, Investigate

In [None]:
# Write the engineered feature table to disk for the later phases.
features_csv = 'data/engineered_features.csv'
df_features.to_csv(features_csv)
print("Features saved for Phase 3 and 4.")