# Power Output Predictor

Example use case for the PV dataset: predicting a module's maximum power output (Pmpp) from environmental measurements.

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

## Import datasets

In [2]:
# Measurement records for all modules (filtered by `module_name` further down).
pv_dataset = pd.read_csv(filepath_or_buffer='../datasets/PV_dataset.csv')
# One datasheet row per module (STC ratings, temperature coefficients, ...).
module_metadata = pd.read_csv(filepath_or_buffer='../datasets/modules_metadata.csv')

### Get necessary columns for this example:
**Target**: Pmpp

**Features**: module_temperature_center, module_temperature_lateral, G_tilt20_start, G_tilt20_end, wind_speed_ms, relative_humidity

In [3]:
# Count rows per module (most frequent first) and show the top five.
rows_per_module = pv_dataset['module_name'].value_counts()
rows_per_module.head(5)

module_name
CDF1150A1    174105
VBHN330      173922
QPEAK315     173238
LG345N1C     172862
LG370Q1C     172686
Name: count, dtype: int64

For this example, we'll use the VBHN330 PV module. 

In [4]:
# Name of the PV module analysed in this notebook; it is matched against the
# `module_name` column of both the measurement table and the metadata table.
module = 'VBHN330'

In [5]:
# Datasheet row(s) for the selected module.
# NOTE(review): despite its name, `CDF_metadata` holds the metadata of whichever
# module `module` names (VBHN330 here); the "CDF" prefix looks like a leftover
# from an earlier module choice. The name is kept because later cells read it.
CDF_metadata = module_metadata.loc[module_metadata['module_name'] == module]

# Fail here with a clear message instead of an IndexError further down when the
# module name is misspelled or appears more than once in the metadata.
assert len(CDF_metadata) == 1, (
    f"expected exactly one metadata row for module {module!r}, found {len(CDF_metadata)}"
)
CDF_metadata

Unnamed: 0,module_name,commercial_name,technology,bifacial,bifaciality_factor,area_m2,installation_tilt,Voc_stc,Isc_stc,Vmpp_stc,Impp_stc,Pmpp_stc,NOCT_c,temp_coeff_Isc_pct,temp_coeff_Voc_pct,temp_coeff_Pmpp_pct,datasheet_url
7,VBHN330,VBHN330SJ47,HIT,no,0.0,1.67427,20,69.7,6.07,58.0,5.7,330,44.0,0.03,-0.25,-0.29,https://cdn.enfsolar.com/Product/pdf/Crystalli...


In [6]:
# Scalar datasheet values taken from the first (only) metadata row.
# Rated maximum power at standard test conditions (presumably in W).
P_stc = CDF_metadata['Pmpp_stc'].iloc[0]
# Temperature coefficient of Pmpp, expressed as a percentage (divided by 100
# where it is used) -- presumably % per degree Celsius.
temp_coeff_Pmpp = CDF_metadata['temp_coeff_Pmpp_pct'].iloc[0]

In [7]:
# Keep only the rows of the selected module and the columns this example uses.
# Row and column selection happen in a single .loc call, and the explicit
# .copy() makes `df` an independent frame, so adding columns to it does not
# raise pandas' SettingWithCopyWarning.
example_columns = [
    'timestamp',
    'module_temperature_center',
    'module_temperature_lateral',
    'relative_humidity',
    'wind_speed_ms',
    'G_tilt20_start',
    'G_tilt20_end',
    'Pmpp',
]
df = pv_dataset.loc[pv_dataset['module_name'] == module, example_columns].copy()

In [8]:
# Derive the model inputs. `assign` returns a new frame instead of writing into
# the filtered slice, which avoids SettingWithCopyWarning; new columns are
# appended in the order given.
# Note: mean(axis=1) skips NaN by default, so a row with only one of the two
# readings still gets a value.
df = df.assign(
    # Average irradiance on the Plane of Array (POA) from the start/end readings
    G_POA=df[['G_tilt20_start', 'G_tilt20_end']].mean(axis=1),
    # Average temperature from the center and lateral sensors
    module_temperature=df[['module_temperature_center', 'module_temperature_lateral']].mean(axis=1),
    # Calendar year of each measurement, used later for the train/test split
    year=pd.to_datetime(df['timestamp']).dt.year,
)

In [9]:
# Summary statistics of the numeric columns; the per-column `count` row also
# shows how many values are missing in each column.
df.describe()

Unnamed: 0,module_temperature_center,module_temperature_lateral,relative_humidity,wind_speed_ms,G_tilt20_start,G_tilt20_end,Pmpp,G_POA,module_temperature,year
count,173378.0,172349.0,173427.0,173523.0,173920.0,173850.0,173650.0,173922.0,173741.0,173922.0
mean,33.067606,34.806715,84.154446,1.248218,399.730582,390.740061,109.840005,395.167877,33.976111,2023.528093
std,10.428003,10.991339,6.626214,0.891694,303.198363,302.678077,83.971192,302.781334,10.627682,1.117164
min,2.24,4.73,64.72,0.0,23.22,13.27,8.3e-05,20.12,2.49,2022.0
25%,24.68,25.76,79.37,0.62,141.35,132.42,37.766402,136.81,25.33,2023.0
50%,31.06,32.74,84.0,1.03,309.59,300.675,86.586517,305.075,31.88,2024.0
75%,41.27,43.49,89.05,1.72,626.6625,617.55,173.675336,622.17875,42.32,2025.0
max,64.53,62.99,100.0,4.97,1960.54,1396.89,425.034334,1499.815,64.53,2025.0


## Define features and target and split

In [10]:
# Quantity to predict.
target = 'Pmpp'
# Model inputs: the two averaged columns plus the raw weather readings.
features = [
    'module_temperature',
    'relative_humidity',
    'wind_speed_ms',
    'G_POA',
]

In [11]:
# Restrict the frame to the model inputs, the target, and the year used for the split.
model_columns = [*features, target, 'year']
df = df[model_columns]

In [12]:
# Number of missing values per column.
df.isna().sum()

module_temperature    181
relative_humidity     495
wind_speed_ms         399
G_POA                   0
Pmpp                  272
year                    0
dtype: int64

We'll train on data from 2022-2024 and test on 2025

In [13]:
# Rows per calendar year (counted before missing values are dropped), to gauge
# the relative size of the training years and the test year.
df['year'].value_counts()

year
2025    44859
2024    44130
2023    42932
2022    42001
Name: count, dtype: int64

NaN values represent a small portion of the dataset. In this case, we'll discard them.

In [14]:
# Drop every row with a missing feature or target value.
# Rebinding (instead of inplace=True) avoids SettingWithCopyWarning on the
# column-subset frame and keeps the cell free of in-place mutation.
df = df.dropna()

In [15]:
# Time-based split: fit on every year before 2025, evaluate on 2025 only.
train_rows = df['year'] < 2025
test_rows = df['year'] == 2025

X_train = df.loc[train_rows, features]
y_train = df.loc[train_rows, target]
X_test = df.loc[test_rows, features]
y_test = df.loc[test_rows, target]

# AI Models

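# Baseline - Physics Model

The baseline scales the rated power by irradiance and applies a linear temperature correction: $P = P_{stc} \cdot \frac{G_{POA}}{1000} \cdot \left(1 + \frac{\gamma_{Pmpp}}{100}\,(T_{module} - 25)\right)$, where $\gamma_{Pmpp}$ is the datasheet temperature coefficient of Pmpp in percent.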

In [16]:
# Datasheet-based estimate on the test set, split into its two factors.
# Irradiance relative to 1000 (the reference irradiance; assumes G_POA is in W/m^2).
irradiance_ratio = X_test['G_POA'] / 1000
# Linear temperature correction around 25 (assumes module_temperature is in deg C).
temperature_factor = 1 + (temp_coeff_Pmpp / 100) * (X_test['module_temperature'] - 25)
P_baseline = P_stc * irradiance_ratio * temperature_factor

In [17]:
# Score the physics baseline on the test year: RMSE (root of the MSE) and R^2.
mse_baseline = mean_squared_error(y_test, P_baseline)
rmse_baseline = mse_baseline ** 0.5
baseline_r2 = r2_score(y_test, P_baseline)
print(f"Baseline RMSE using formula: {rmse_baseline:.2f} W")

Baseline RMSE using formula: 20.47 W


## Linear Regression

In [18]:
# Fit ordinary least squares on the training years and predict the test year.
# The former `reg.score(X_test, y_test)` call was removed: its result was
# discarded (it was not the cell's last expression), and R^2 is computed
# explicitly with r2_score when the model is evaluated.
reg = LinearRegression()
reg.fit(X_train, y_train)
pred_reg = reg.predict(X_test)

In [19]:
# Score the linear regression on the test year: RMSE (root of the MSE) and R^2.
mse_reg = mean_squared_error(y_test, pred_reg)
rmse_reg = mse_reg ** 0.5
reg_r2 = r2_score(y_test, pred_reg)
print(f"RMSE using Linear Regression: {rmse_reg:.2f} W")

RMSE using Linear Regression: 7.26 W


## Random Forest Regressor

In [20]:
# Random forest with 100 trees; the fixed random_state makes the fit repeatable.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(n_estimators=100, random_state=18).fit(X_train, y_train)
pred_rf = rf.predict(X_test)

In [21]:
# Score the random forest on the test year: RMSE (root of the MSE) and R^2.
mse_rf = mean_squared_error(y_test, pred_rf)
rmse_rf = mse_rf ** 0.5
rf_r2 = r2_score(y_test, pred_rf)
print(f"RMSE using Random Forest: {rmse_rf:.2f} W")

RMSE using Random Forest: 7.08 W


## Gradient Boosting

In [22]:
# Gradient boosting: 500 shallow trees (depth 3) with a 0.05 learning rate;
# the fixed random_state makes the fit repeatable.
gbr = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    random_state=18,
)
gbr.fit(X_train, y_train)
pred_gbr = gbr.predict(X_test)

In [23]:
# Score gradient boosting on the test year: RMSE (root of the MSE) and R^2.
mse_gbr = mean_squared_error(y_test, pred_gbr)
rmse_gbr = mse_gbr ** 0.5
gbr_r2 = r2_score(y_test, pred_gbr)
print(f"RMSE using Gradient Boosting: {rmse_gbr:.2f} W")

RMSE using Gradient Boosting: 6.87 W


# Summary

In [24]:
# Collect each model's test-set metrics as [RMSE, R^2], keyed by display name.
results_dict = {
    'Baseline': [rmse_baseline, baseline_r2],
    'Linear Regression': [rmse_reg, reg_r2],
    'Random Forest': [rmse_rf, rf_r2],
    'Gradient Boosting': [rmse_gbr, gbr_r2],
}
# One row per model, one column per metric.
metric_columns = ['RMSE (W)', 'R² Score']
results_df = pd.DataFrame.from_dict(results_dict, orient='index', columns=metric_columns)
results_df

Unnamed: 0,RMSE (W),R² Score
Baseline,20.471259,0.942681
Linear Regression,7.25541,0.9928
Random Forest,7.082531,0.993139
Gradient Boosting,6.868579,0.993547
