# Module Temperature Predictor

Example use case for the PV dataset: predicting module temperature from environmental measurements.

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

## Import datasets

In [2]:
# Per-module metadata table (read for its NOCT_c column further down).
module_metadata = pd.read_csv('../datasets/modules_metadata.csv')
# Measurement table holding the environmental and module-temperature columns.
pv_dataset = pd.read_csv('../datasets/PV_dataset.csv')

### Get necessary columns for this example:
**Target**: module_temperature_center

**Features**: air_temperature, relative_humidity, wind_speed_ms, G_tilt20_start, G_tilt20_end

In [3]:
# Count the rows available per module and show the five most frequent ones.
module_sample_counts = pv_dataset['module_name'].value_counts()
module_sample_counts.head()

module_name
CDF1150A1    174105
VBHN330      173922
QPEAK315     173238
LG345N1C     172862
LG370Q1C     172686
Name: count, dtype: int64

For this example, we'll use the CDF1150A1 PV module. 

In [4]:
# Metadata row(s) describing the CDF1150A1 module.
is_cdf_module = module_metadata['module_name'] == 'CDF1150A1'
CDF_metadata = module_metadata.loc[is_cdf_module]
# NOCT value of the first matching row; used by the baseline model below.
# NOTE(review): presumably nominal operating cell temperature in °C — confirm against the metadata docs.
noct_column = CDF_metadata['NOCT_c']
NOCT = noct_column.values[0]

In [5]:
# Select the CDF1150A1 rows and only the columns this example needs.
# Row and column selection happen in a single .loc call and the result is
# copied, so df_temperature is an independent frame: adding columns to it in
# the following cells does not trigger SettingWithCopyWarning and cannot
# affect pv_dataset.
df_temperature = pv_dataset.loc[
    pv_dataset['module_name'] == 'CDF1150A1',
    ['timestamp', 'module_temperature_center', 'air_temperature', 'relative_humidity',
     'wind_speed_ms', 'G_tilt20_start', 'G_tilt20_end'],
].copy()

In [6]:
# Plane-of-array (POA) irradiance: row-wise mean of the start and end readings.
irradiance_readings = df_temperature[['G_tilt20_start', 'G_tilt20_end']]
df_temperature['G_POA'] = irradiance_readings.mean(axis=1)
# Calendar year of each sample, used later for the train/test split.
parsed_timestamps = pd.to_datetime(df_temperature['timestamp'])
df_temperature['year'] = parsed_timestamps.dt.year

In [7]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly).
df_temperature.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,module_temperature_center,air_temperature,relative_humidity,wind_speed_ms,G_tilt20_start,G_tilt20_end,G_POA,year
count,173587.0,174044.0,173589.0,173656.0,174105.0,174072.0,174105.0,174105.0
mean,33.5955,19.850767,84.110287,1.248791,402.041427,392.926126,397.450418,2023.529353
std,10.843084,3.073996,6.631206,0.888443,303.340342,302.4867,302.647951,1.114974
min,1.96,12.95,64.73,0.0,25.01,13.21,19.345,2022.0
25%,24.88,17.3,79.32,0.62,143.21,134.57,138.87,2023.0
50%,31.42,19.79,83.95,1.03,312.68,303.355,308.065,2024.0
75%,42.12,22.05,89.01,1.72,630.71,621.44,626.585,2025.0
max,64.61,29.24,100.0,4.97,2118.03,1380.95,1555.275,2025.0


## Define features and target and split

In [8]:
# Missing values per column, ignoring the two raw irradiance readings that
# were already combined into G_POA.
raw_irradiance_columns = ['G_tilt20_start', 'G_tilt20_end']
df_temperature.drop(columns=raw_irradiance_columns).isna().sum()

timestamp                      0
module_temperature_center    518
air_temperature               61
relative_humidity            516
wind_speed_ms                449
G_POA                          0
year                           0
dtype: int64

We'll train on data from 2022 to 2024 and test on data from 2025.

In [9]:
# Number of samples available in each calendar year.
samples_per_year = df_temperature['year'].value_counts()
samples_per_year

year
2025    44696
2024    44607
2023    42966
2022    41836
Name: count, dtype: int64

In [10]:
# Input columns fed to every model in this notebook.
features = [
    'air_temperature',
    'relative_humidity',
    'wind_speed_ms',
    'G_POA',
]

NaN values represent a small portion of the dataset. In this case, we'll discard them.

In [11]:
# Drop only the rows that lack the target or one of the model features.
# A bare dropna() would also discard rows whose sole gap is in a raw
# G_tilt20_* column, even though G_POA (a NaN-skipping row mean) is still
# defined for them and those raw columns are not used by any model.
# Reassigning instead of using inplace=True avoids mutating the frame in place.
df_temperature = df_temperature.dropna(subset=features + ['module_temperature_center'])

In [12]:
# Time-based split: every year before 2025 trains the models, 2025 is held out.
is_train_year = df_temperature['year'] < 2025
is_test_year = df_temperature['year'] == 2025

X_train = df_temperature.loc[is_train_year, features]
X_test = df_temperature.loc[is_test_year, features]
y_train = df_temperature.loc[is_train_year, 'module_temperature_center']
y_test = df_temperature.loc[is_test_year, 'module_temperature_center']

# AI Models

## Baseline - NOCT model

In [13]:
# NOCT baseline: module temperature = air temperature plus a heating term
# that scales linearly with irradiance, (NOCT - 20) * G_POA / 800.
# NOTE(review): assumes G_POA is in W/m² and NOCT in °C (20 and 800 being the
# usual NOCT reference conditions) — confirm against the dataset documentation.
irradiance_heating = X_test['G_POA'] * (NOCT - 20) / 800
T_module_baseline = X_test['air_temperature'] + irradiance_heating

In [14]:
# Score the NOCT baseline on the held-out year: R² and root of the mean squared error.
baseline_r2 = r2_score(y_test, T_module_baseline)
mse_baseline = mean_squared_error(y_test, T_module_baseline)
rmse_baseline = mse_baseline ** 0.5
print(f"Baseline RMSE using NOCT formula: {rmse_baseline:.2f} °C" )

Baseline RMSE using NOCT formula: 4.90 °C


## Linear Regression

In [15]:
# Fit ordinary least squares on the training split and predict the test split.
# R² on the test set is computed with r2_score in the next cell, so no
# separate (and otherwise discarded) reg.score call is needed here.
reg = LinearRegression().fit(X_train, y_train)
pred_reg = reg.predict(X_test)

In [16]:
# Score the linear-regression predictions on the held-out year.
reg_r2 = r2_score(y_test, pred_reg)
mse_reg = mean_squared_error(y_test, pred_reg)
rmse_reg = mse_reg ** 0.5
print(f"RMSE using Linear Regression: {rmse_reg:.2f} °C" )

RMSE using Linear Regression: 4.75 °C


## Random Forest Regressor

In [17]:
# Random forest of 100 trees; the fixed seed keeps the fit reproducible.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=18,
).fit(X_train, y_train)
# Predictions for the held-out year.
pred_rf = rf.predict(X_test)

In [18]:
# Score the random-forest predictions on the held-out year.
rf_r2 = r2_score(y_test, pred_rf)
mse_rf = mean_squared_error(y_test, pred_rf)
rmse_rf = mse_rf ** 0.5
print(f"RMSE using Random Forest: {rmse_rf:.2f} °C" )

RMSE using Random Forest: 4.70 °C


## Gradient Boosting

In [19]:
# Gradient boosting: 500 shallow (depth-3) trees with a 0.05 learning rate;
# the fixed seed keeps the fit reproducible.
gbr = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    random_state=18,
).fit(X_train, y_train)
# Predictions for the held-out year.
pred_gbr = gbr.predict(X_test)

In [20]:
# Score the gradient-boosting predictions on the held-out year.
gbr_r2 = r2_score(y_test, pred_gbr)
mse_gbr = mean_squared_error(y_test, pred_gbr)
rmse_gbr = mse_gbr ** 0.5
print(f"RMSE using Gradient Boosting: {rmse_gbr:.2f} °C" )

RMSE using Gradient Boosting: 4.61 °C


# Summary

In [21]:
# One entry per model: [RMSE, R²] measured on the held-out year.
results_dict = {
    'Baseline': [rmse_baseline, baseline_r2],
    'Linear Regression': [rmse_reg, reg_r2],
    'Random Forest': [rmse_rf, rf_r2],
    'Gradient Boosting': [rmse_gbr, gbr_r2],
}
# Lay the metrics out as a table with one row per model.
results_df = pd.DataFrame(
    list(results_dict.values()),
    index=list(results_dict.keys()),
    columns=['RMSE (°C)', 'R² Score'],
)
results_df

Unnamed: 0,RMSE (°C),R² Score
Baseline,4.902111,0.813371
Linear Regression,4.748922,0.824852
Random Forest,4.699221,0.828499
Gradient Boosting,4.614104,0.834656
