In [None]:

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Location of the time-series CSV, relative to the notebook's working directory.
DATASET_PATH = Path('../data/data-3/timeseries/61272.csv')

data = pd.read_csv(DATASET_PATH)

# NOTE: despite their names, `X` is the 80% training partition and `y` is the
# 20% hold-out partition of the full frame (both still hold every column here).
# The fixed random_state keeps the partition identical across kernel restarts.
X, y = train_test_split(data, test_size=0.2, random_state=42)

# Order the training rows by index and discard the columns that are not used
# as model inputs. `X` is rebound rather than modified with inplace=True so the
# object returned by train_test_split is never mutated (in-place edits on such
# a selection can raise pandas' SettingWithCopyWarning).
X = X.sort_index().drop(columns=['time', 'direct_normal_irradiance', 'diffuse_radiation'])
X.head()

# corr_mat = X.corr()
# print(corr_mat)


## Linear Regression with Feature Scaling

Perform linear regression with 'power' as the output variable, applying StandardScaler to features.


In [None]:
# Split the training frame into the regression target and its predictors
y_target = X['power']
X_features = X.drop(columns=['power'])

n_samples, n_features = X_features.shape

print("Features used for regression:")
print(X_features.columns.tolist())
print(f"\nNumber of samples: {n_samples}")
print(f"Number of features: {n_features}")


In [None]:
# Standardize the predictors with StandardScaler (fit and applied on the same rows)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)

# Wrap the scaled array in a DataFrame that keeps the original row/column labels
X_scaled_df = pd.DataFrame(
    data=X_scaled,
    index=X_features.index,
    columns=X_features.columns,
)

rule = "=" * 70
print(rule)
print("FEATURE SCALING APPLIED")
print(rule)
print("\nOriginal feature statistics:")
print(X_features.describe())
print(f"\n{rule}")
print("Scaled feature statistics (mean≈0, std≈1):")
print(X_scaled_df.describe())


In [None]:
# Fit ordinary least squares on the standardized predictors
lr_model = LinearRegression().fit(X_scaled, y_target)

# In-sample predictions: the same rows the model was fitted on
y_pred = lr_model.predict(X_scaled)

# Goodness-of-fit metrics for those predictions
r2 = r2_score(y_target, y_pred)
mae = mean_absolute_error(y_target, y_pred)
mse = mean_squared_error(y_target, y_pred)
rmse = np.sqrt(mse)

# Coefficient table, ordered by absolute magnitude (largest first)
coef_df = pd.DataFrame(
    {'Feature': X_features.columns, 'Coefficient': lr_model.coef_}
).sort_values('Coefficient', key=abs, ascending=False)

divider = "=" * 70
print(divider)
print("LINEAR REGRESSION RESULTS")
print(divider)
print("\nModel Performance Metrics:")
print(f"  R² Score:                 {r2:.6f}")
print(f"  Mean Squared Error (MSE): {mse:.2f}")
print(f"  Root Mean Squared Error:  {rmse:.2f}")
print(f"  Mean Absolute Error:      {mae:.2f}")
print(f"\nModel Intercept: {lr_model.intercept_:.2f}")
print("\nFeature Coefficients:")
print(coef_df.to_string(index=False))


In [None]:
# Diagnostic plots for the linear model: fit quality and residual structure
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fit_ax, resid_ax = axes
axis_font = {'fontsize': 12, 'fontweight': 'bold'}
title_font = {'fontsize': 14, 'fontweight': 'bold'}

# Left panel: actual vs predicted, with the identity line as reference
power_lo, power_hi = y_target.min(), y_target.max()
fit_ax.scatter(y_target, y_pred, alpha=0.5, s=10)
fit_ax.plot([power_lo, power_hi], [power_lo, power_hi],
            'r--', lw=2, label='Perfect prediction')
fit_ax.set_xlabel('Actual Power', **axis_font)
fit_ax.set_ylabel('Predicted Power', **axis_font)
fit_ax.set_title(f'Actual vs Predicted Power\n(R² = {r2:.4f})', **title_font)
fit_ax.legend()
fit_ax.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction
residuals = y_target - y_pred
resid_ax.scatter(y_pred, residuals, alpha=0.5, s=10)
resid_ax.axhline(y=0, color='r', linestyle='--', lw=2)
resid_ax.set_xlabel('Predicted Power', **axis_font)
resid_ax.set_ylabel('Residuals', **axis_font)
resid_ax.set_title('Residual Plot', **title_font)
resid_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Horizontal bar chart of the linear model's coefficients
fig, ax = plt.subplots(figsize=(10, 6))

# Green marks a strictly positive coefficient, red everything else
colors = np.where(coef_df['Coefficient'] > 0, 'green', 'red').tolist()
bars = ax.barh(
    coef_df['Feature'],
    coef_df['Coefficient'],
    color=colors,
    alpha=0.7,
    edgecolor='black',
)

bold_label = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Coefficient Value', **bold_label)
ax.set_ylabel('Feature', **bold_label)
ax.set_title('Feature Coefficients (Linear Regression)', fontsize=14, fontweight='bold')

# Zero line separates positive from negative coefficients
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()


## Quadratic Regression (Quadratic in POA, Linear in Other Features)

Add a squared `plane_of_array_irradiance` (POA, plane-of-array irradiance) term so the model can capture non-linear effects of irradiance, while every other feature still enters linearly.


In [None]:
# Extend the linear feature set with the square of plane_of_array_irradiance
poa_squared = X_features['plane_of_array_irradiance'] ** 2
X_features_quad = X_features.assign(plane_of_array_irradiance_squared=poa_squared)

quad_columns = X_features_quad.columns.tolist()
print("Features for quadratic regression:")
print(quad_columns)
print(f"\nNumber of features: {len(quad_columns)}")


In [None]:
# Standardize the quadratic feature set with its own, separately fitted scaler
scaler_quad = StandardScaler()
X_scaled_quad = scaler_quad.fit_transform(X_features_quad)

# Labelled view of the scaled values for inspection
X_scaled_quad_df = pd.DataFrame(
    data=X_scaled_quad,
    index=X_features_quad.index,
    columns=X_features_quad.columns,
)

separator = "=" * 70
print(separator)
print("QUADRATIC FEATURE SCALING APPLIED")
print(separator)
print("\nScaled quadratic features (first 5 rows):")
print(X_scaled_quad_df.head())


In [None]:
# Fit ordinary least squares on the standardized quadratic feature set
lr_quad_model = LinearRegression()
lr_quad_model.fit(X_scaled_quad, y_target)

# In-sample predictions: the same rows that were used for fitting
y_pred_quad = lr_quad_model.predict(X_scaled_quad)

# Calculate performance metrics
mse_quad = mean_squared_error(y_target, y_pred_quad)
rmse_quad = np.sqrt(mse_quad)
mae_quad = mean_absolute_error(y_target, y_pred_quad)
r2_quad = r2_score(y_target, y_pred_quad)

print("="*70)
print("QUADRATIC REGRESSION RESULTS")
print("="*70)
print("\nModel Performance Metrics:")
print(f"  R² Score:                 {r2_quad:.6f}")
print(f"  Mean Squared Error (MSE): {mse_quad:.2f}")
print(f"  Root Mean Squared Error:  {rmse_quad:.2f}")
print(f"  Mean Absolute Error:      {mae_quad:.2f}")
print(f"\nModel Intercept: {lr_quad_model.intercept_:.2f}")
print("\nFeature Coefficients:")

# Coefficient table, ordered by absolute magnitude (largest first)
coef_quad_df = pd.DataFrame({
    'Feature': X_features_quad.columns,
    'Coefficient': lr_quad_model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

print(coef_quad_df.to_string(index=False))

# Improvement of the quadratic model over the linear baseline.
# The relative R² change is taken against |r2| with a tiny floor: R² can be
# zero or negative, and dividing by the raw value would either fail on a zero
# baseline or flip the sign of the percentage on a negative one.
r2_gain = r2_quad - r2
r2_gain_pct = r2_gain / max(abs(r2), 1e-10) * 100
rmse_gain = rmse - rmse_quad
rmse_gain_pct = rmse_gain / rmse * 100

print("\n" + "="*70)
print("COMPARISON: LINEAR vs QUADRATIC")
print("="*70)
print(f"Linear Model R²:    {r2:.6f}")
print(f"Quadratic Model R²: {r2_quad:.6f}")
print(f"R² Improvement:     {r2_gain:.6f} ({r2_gain_pct:.2f}%)")
print(f"\nLinear Model RMSE:    {rmse:.2f}")
print(f"Quadratic Model RMSE: {rmse_quad:.2f}")
print(f"RMSE Improvement:     {rmse_gain:.2f} ({rmse_gain_pct:.2f}%)")


In [None]:
# Diagnostic plots for the quadratic model: fit quality and residual structure
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
quad_fit_ax, quad_resid_ax = axes
label_style = {'fontsize': 12, 'fontweight': 'bold'}
heading_style = {'fontsize': 14, 'fontweight': 'bold'}

# Left panel: actual vs predicted, with the identity line as reference
identity_ends = [y_target.min(), y_target.max()]
quad_fit_ax.scatter(y_target, y_pred_quad, alpha=0.5, s=10, label='Quadratic model')
quad_fit_ax.plot(identity_ends, identity_ends, 'r--', lw=2, label='Perfect prediction')
quad_fit_ax.set_xlabel('Actual Power', **label_style)
quad_fit_ax.set_ylabel('Predicted Power', **label_style)
quad_fit_ax.set_title(
    f'Quadratic Model: Actual vs Predicted Power\n(R² = {r2_quad:.4f})',
    **heading_style,
)
quad_fit_ax.legend()
quad_fit_ax.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction
residuals_quad = y_target - y_pred_quad
quad_resid_ax.scatter(y_pred_quad, residuals_quad, alpha=0.5, s=10)
quad_resid_ax.axhline(y=0, color='r', linestyle='--', lw=2)
quad_resid_ax.set_xlabel('Predicted Power', **label_style)
quad_resid_ax.set_ylabel('Residuals', **label_style)
quad_resid_ax.set_title('Quadratic Model: Residual Plot', **heading_style)
quad_resid_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Side-by-side residual plots: linear model (left) vs quadratic model (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel: axis, predictions, residuals, marker colour, title
residual_panels = [
    (axes[0], y_pred, residuals, 'blue',
     f'Linear Model Residuals\n(RMSE = {rmse:.2f})'),
    (axes[1], y_pred_quad, residuals_quad, 'green',
     f'Quadratic Model Residuals\n(RMSE = {rmse_quad:.2f})'),
]

for panel_ax, panel_pred, panel_resid, panel_color, panel_title in residual_panels:
    panel_ax.scatter(panel_pred, panel_resid, alpha=0.5, s=10, c=panel_color)
    panel_ax.axhline(y=0, color='r', linestyle='--', lw=2)
    panel_ax.set_xlabel('Predicted Power', fontsize=12, fontweight='bold')
    panel_ax.set_ylabel('Residuals', fontsize=12, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Compare both fits along the POA irradiance axis.
# Rows are ordered by irradiance so the prediction curves are drawn left to right.
poa_sorted_idx = X_features['plane_of_array_irradiance'].sort_values().index
poa_sorted = X_features.loc[poa_sorted_idx, 'plane_of_array_irradiance']
y_sorted = y_target.loc[poa_sorted_idx]

# Linear model predictions in POA order (reusing the already-fitted scaler)
X_sorted_linear = X_features.loc[poa_sorted_idx]
X_sorted_linear_scaled = scaler.transform(X_sorted_linear)
y_pred_sorted_linear = lr_model.predict(X_sorted_linear_scaled)

# Quadratic model predictions in the same order
X_sorted_quad = X_features_quad.loc[poa_sorted_idx]
X_sorted_quad_scaled = scaler_quad.transform(X_sorted_quad)
y_pred_sorted_quad = lr_quad_model.predict(X_sorted_quad_scaled)

fig, ax = plt.subplots(figsize=(12, 6))

ax.scatter(poa_sorted, y_sorted, alpha=0.3, s=10, label='Actual data', c='gray')

# One entry per fitted curve: predictions, line format, legend label
fitted_curves = (
    (y_pred_sorted_linear, 'b-', f'Linear model (R²={r2:.4f})'),
    (y_pred_sorted_quad, 'g-', f'Quadratic model (R²={r2_quad:.4f})'),
)
for curve_values, curve_fmt, curve_label in fitted_curves:
    ax.plot(poa_sorted, curve_values, curve_fmt, linewidth=2, alpha=0.7, label=curve_label)

bold_axis = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Plane of Array Irradiance', **bold_axis)
ax.set_ylabel('Power', **bold_axis)
ax.set_title('Power vs POA Irradiance: Linear vs Quadratic Fit', fontsize=14, fontweight='bold')
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Neural Network Regression

Use a feedforward neural network to capture complex non-linear relationships between features and power output.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


In [None]:
# Feedforward regressor: three hidden ReLU layers narrowing to a single output
class SolarPowerNet(nn.Module):
    """Fully connected network mapping a feature vector to one value.

    Layer widths are input_size -> 64 -> 32 -> 16 -> 1, with a ReLU after each
    hidden layer and Dropout(0.2) after the first two.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        layers = [
            # Hidden block 1
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            # Hidden block 2
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            # Hidden block 3 (no dropout)
            nn.Linear(32, 16),
            nn.ReLU(),
            # Output layer
            nn.Linear(16, 1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Pass `x` through the layer stack and return its output."""
        return self.network(x)

# Seed PyTorch's global random number generator before any weights are created
# so that a Restart & Run All reproduces the same initial parameters. Later
# consumers of the default generator are seeded by this call as well when the
# notebook is run top to bottom.
torch.manual_seed(42)

# Initialize model: one input unit per scaled feature column
input_size = X_scaled.shape[1]
model = SolarPowerNet(input_size).to(device)

print("="*70)
print("NEURAL NETWORK ARCHITECTURE")
print("="*70)
print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Input features: {input_size}")
print("Output: 1 (power)")


In [None]:
# Training setup: mean-squared-error objective optimised with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert to PyTorch float tensors on the selected device. The target is
# reshaped to a single column so it lines up with the network's one-unit output.
X_tensor = torch.FloatTensor(X_scaled).to(device)
y_tensor = torch.FloatTensor(y_target.values).reshape(-1, 1).to(device)

# Single source of truth for the mini-batch size, so the value handed to the
# DataLoader and the value reported below cannot drift apart.
BATCH_SIZE = 32

# Create DataLoader (rows are reshuffled every epoch)
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

print(f"Dataset size: {len(dataset)}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Number of batches: {len(train_loader)}")


In [None]:
# Mini-batch training: one full pass over train_loader per epoch
num_epochs = 100
train_losses = []

banner = "=" * 70
print(banner)
print("TRAINING NEURAL NETWORK")
print(banner)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass, then backpropagate the batch loss and update the weights
        loss = criterion(model(batch_X), batch_y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Report every 10th epoch
    epoch_number = epoch + 1
    if epoch_number % 10 == 0:
        print(f"Epoch [{epoch_number}/{num_epochs}], Loss: {avg_loss:.4f}")

print("\nTraining complete!")


In [None]:
# Loss curve: mean per-batch MSE recorded at the end of every epoch
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(train_losses, color='blue', linewidth=2)

axis_text = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_title('Neural Network Training Loss', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', **axis_text)
ax.set_ylabel('Loss (MSE)', **axis_text)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Score the trained network on the same tensors it was fitted on
model.eval()  # switch the dropout layers to inference behaviour
with torch.no_grad():
    y_pred_nn_tensor = model(X_tensor)
    # Detach, move to host memory, then go through a plain Python list.
    # Using tolist() to avoid numpy compatibility issues with older PyTorch versions
    y_pred_nn = np.array(y_pred_nn_tensor.detach().cpu().tolist()).flatten()

# Plain numpy view of the target for the metric functions
y_target_np = y_target.values

# Performance metrics for the network's predictions
r2_nn = r2_score(y_target_np, y_pred_nn)
mae_nn = mean_absolute_error(y_target_np, y_pred_nn)
mse_nn = mean_squared_error(y_target_np, y_pred_nn)
rmse_nn = np.sqrt(mse_nn)

heavy_rule = "=" * 70
print(heavy_rule)
print("NEURAL NETWORK RESULTS")
print(heavy_rule)
print("\nModel Performance Metrics:")
print(f"  R² Score:                 {r2_nn:.6f}")
print(f"  Mean Squared Error (MSE): {mse_nn:.2f}")
print(f"  Root Mean Squared Error:  {rmse_nn:.2f}")
print(f"  Mean Absolute Error:      {mae_nn:.2f}")

# Summary table: one row per model
print("\n" + heavy_rule)
print("MODEL COMPARISON")
print(heavy_rule)
print(f"{'Model':<20} {'R²':<12} {'RMSE':<12} {'MAE':<12}")
print("-" * 70)
summary_rows = (
    ('Linear', r2, rmse, mae),
    ('Quadratic', r2_quad, rmse_quad, mae_quad),
    ('Neural Network', r2_nn, rmse_nn, mae_nn),
)
for row_label, row_r2, row_rmse, row_mae in summary_rows:
    print(f"{row_label:<20} {row_r2:<12.6f} {row_rmse:<12.2f} {row_mae:<12.2f}")
print(heavy_rule)

# Improvement of the network over each baseline; the relative R² change is
# measured against |baseline R²| with a small floor to avoid dividing by zero
baselines = (
    ('Linear', r2, rmse),
    ('Quadratic', r2_quad, rmse_quad),
)
for baseline_name, baseline_r2, baseline_rmse in baselines:
    print(f"\nNeural Network vs {baseline_name}:")
    print(f"  R² Improvement:   {r2_nn - baseline_r2:.6f} ({((r2_nn - baseline_r2) / max(abs(baseline_r2), 1e-10) * 100):.2f}%)")
    print(f"  RMSE Improvement: {baseline_rmse - rmse_nn:.2f} ({((baseline_rmse - rmse_nn) / baseline_rmse * 100):.2f}%)")


In [None]:
# Diagnostic plots for the neural network: fit quality and residual structure
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
nn_fit_ax, nn_resid_ax = axes
nn_label_font = {'fontsize': 12, 'fontweight': 'bold'}
nn_title_font = {'fontsize': 14, 'fontweight': 'bold'}

# Left panel: actual vs predicted, with the identity line as reference
nn_identity = [y_target_np.min(), y_target_np.max()]
nn_fit_ax.scatter(y_target_np, y_pred_nn, alpha=0.5, s=10, label='Neural Network', c='purple')
nn_fit_ax.plot(nn_identity, nn_identity, 'r--', lw=2, label='Perfect prediction')
nn_fit_ax.set_xlabel('Actual Power', **nn_label_font)
nn_fit_ax.set_ylabel('Predicted Power', **nn_label_font)
nn_fit_ax.set_title(
    f'Neural Network: Actual vs Predicted Power\n(R² = {r2_nn:.4f})',
    **nn_title_font,
)
nn_fit_ax.legend()
nn_fit_ax.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction
residuals_nn = y_target_np - y_pred_nn
nn_resid_ax.scatter(y_pred_nn, residuals_nn, alpha=0.5, s=10, c='purple')
nn_resid_ax.axhline(y=0, color='r', linestyle='--', lw=2)
nn_resid_ax.set_xlabel('Predicted Power', **nn_label_font)
nn_resid_ax.set_ylabel('Residuals', **nn_label_font)
nn_resid_ax.set_title(
    f'Neural Network: Residual Plot\n(RMSE = {rmse_nn:.2f})',
    **nn_title_font,
)
nn_resid_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Compare all three models' predictions: one actual-vs-predicted panel per
# model plus a bar chart of their R² scores
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# End points of the identity ("perfect prediction") line shared by the panels
power_range = [y_target_np.min(), y_target_np.max()]

# One entry per scatter panel: axis, predictions, marker colour, title
scatter_panels = [
    (axes[0, 0], y_pred, 'blue', f'Linear Model (R² = {r2:.4f})'),
    (axes[0, 1], y_pred_quad, 'green', f'Quadratic Model (R² = {r2_quad:.4f})'),
    (axes[1, 0], y_pred_nn, 'purple', f'Neural Network (R² = {r2_nn:.4f})'),
]

for panel_ax, panel_pred, panel_color, panel_title in scatter_panels:
    panel_ax.scatter(y_target_np, panel_pred, alpha=0.5, s=10, c=panel_color)
    panel_ax.plot(power_range, power_range, 'r--', lw=2)
    panel_ax.set_xlabel('Actual Power', fontsize=11, fontweight='bold')
    panel_ax.set_ylabel('Predicted Power', fontsize=11, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.grid(True, alpha=0.3)

# R² comparison bar chart
models = ['Linear', 'Quadratic', 'Neural Net']
r2_scores = [r2, r2_quad, r2_nn]
colors_bar = ['blue', 'green', 'purple']

axes[1, 1].bar(models, r2_scores, color=colors_bar, alpha=0.7, edgecolor='black', linewidth=1.5)
axes[1, 1].set_ylabel('R² Score', fontsize=11, fontweight='bold')
axes[1, 1].set_title('Model Comparison (R² Score)', fontsize=12, fontweight='bold')
axes[1, 1].set_ylim([0, 1])
axes[1, 1].grid(True, alpha=0.3, axis='y')

# Add value labels on bars.
# The loop variable is deliberately NOT called `model`: that name is bound to
# the trained PyTorch network, and rebinding it to a label string here would
# break any later (re-)run of the cells that call model.eval() / model(...).
for bar_pos, (model_name, score) in enumerate(zip(models, r2_scores)):
    axes[1, 1].text(bar_pos, score + 0.02, f'{score:.4f}',
                    ha='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

