In [1]:
from sys import path
if '..' not in path:
    path.insert(0, '..')

In [19]:
import numpy as np
import pandas as pd
from _library.utils import SYSTEM_NAMES, load_datasets, load_amb_cond
from os import path, makedirs
from sklearn.model_selection import train_test_split
import time
from datetime import timedelta
from _library.uc2_interpolation import compute_metrics
from scipy.interpolate import UnivariateSpline

In [3]:
# Select the main folder 
%cd /mnt/data/vieri/projects/SAMPLE/

# Visualize names of PV systems
print(SYSTEM_NAMES)
# --- 0 ---------- 1 --------- 2 ------ 3 ------ 4 --------- 5 --------- 6 -------- 7 ---

/mnt/data/vieri/projects/SAMPLE
['Binetto 1', 'Binetto 2', 'Soleto 1', 'Soleto 2', 'Galatina']


# Load the PV system and the solar irradiance values

In [4]:
system_name = SYSTEM_NAMES[2]
print("PV System:", system_name)

PV System: Soleto 1


In [5]:
dataset_name = "Cleaned"

In [6]:
system_path, inv_data, inv_names, raw_irr_data, *_ = load_datasets(system_name, subfolder = dataset_name, verbose=True)

-------------------------------------------------------------------------------- 
				PV SYSTEM --> SOLETO 1 
--------------------------------------------------------------------------------

Loading inverter data...
SOLETO 1: OK, component data loaded (4) --> INV1, INV2, INV3, INV4

Loading irradiance values...
SOLETO 1: OK, raw irradiance data (234226 observations) have been loaded

-------------------------------------------------------------------------------- 
FINISHED!: All datasets have been loaded. (SYS: 4 - IRR FILE: 1)
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 
EXAMPLE --> Soleto 1: INV1 (FROM '2018-08-08' TO '2021-06-30': 1057 days).
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146481 entries, 0 to 146480
Data columns (total 13 columns):
 #   Column               Non-Null 

# Load ambiental conditions from the external source

In [7]:
system_name_amb_cond = "Galatina"
amb_cond = load_amb_cond(system_name_amb_cond)
print(f"Ambiental conditions ({system_name_amb_cond})\n" + "-" * 60)
amb_cond.info()

Ambiental conditions (Galatina)
------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41806 entries, 0 to 41805
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Date/Time                   41806 non-null  datetime64[ns]
 1   Amb. Temp (°C)              41806 non-null  float64       
 2   Humidity (%)                40964 non-null  float64       
 3   Atmospheric Pressure (hPa)  41758 non-null  float64       
 4   Rainfall (mm)               41806 non-null  float64       
 5   Wind speed (m/s)            25573 non-null  float64       
 6   Wind direction (°)          25585 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 2.2 MB


# Interpolation: A) Ambiental temperature

## A.1) Select only the ambiental temperature

In [8]:
amb_cond.index = amb_cond['Date/Time']
amb_temp = amb_cond['Amb. Temp (°C)']
display(amb_temp)

Date/Time
2017-01-04 16:00:00    11.00
2017-01-04 17:00:00    10.51
2017-01-11 18:00:00     2.38
2017-01-11 19:00:00     2.25
2017-01-11 20:00:00     1.58
                       ...  
2021-10-31 19:00:00    15.47
2021-10-31 20:00:00    13.82
2021-10-31 21:00:00    12.34
2021-10-31 22:00:00    11.55
2021-10-31 23:00:00    10.74
Name: Amb. Temp (°C), Length: 41806, dtype: float64

## A.2) Split data into train, validation, test

In [9]:
test_dim = 0.2
valid_dim = 0.2

In [10]:
all_train_data, test_temp = train_test_split(amb_temp, test_size = test_dim, random_state = 101)
train_temp, valid_temp = train_test_split(all_train_data, test_size = valid_dim, random_state = 101)

# Sort the timestamps
train_temp.sort_index(inplace = True)
valid_temp.sort_index(inplace = True)
test_temp.sort_index(inplace = True)

print(f"DATA: {len(amb_temp)} obs.")
print(f"--> TRAIN ({int((1 - test_dim)*100)} %): {len(train_temp)} obs.")
print(f"    --> VALID ({int(valid_dim*100)} %): {len(valid_temp)} obs.")
print(f"-->  TEST ({int(test_dim*100)} %):  {len(test_temp)} obs.")

DATA: 41806 obs.
--> TRAIN (80 %): 26755 obs.
    --> VALID (20 %): 6689 obs.
-->  TEST (20 %):  8362 obs.


# A3) Interpolation

In [16]:
loss_function = 'rmse'

In [28]:
spline_order = range(3, 4) 
  
best_metrics_value = 99999
for order in spline_order:
    print("-"*25,f"SPLINE (order: {order})", "-"*25)
    
    # Retrieve the data
    train_df = train_temp
    target_df = valid_temp
  
    # Add the gaps in the train data 
    df = pd.concat([train_df, target_df])
    df.loc[target_df.index] = np.nan
    print("\nTRAIN DATA with gaps (included in the validation set)")
    print(f"--> TRAIN DATA: {len(train_df)} obs. filled with {len(target_df)} gaps (i.e., validation set)\n" + "-" * 60)
    
    display(df)
    
    # Start time
    start_time = time.time()
    
    # Interpolation 
    print(f"\n--> Interpolating {len(df[df.isnull()])} observations... ")
    #interpolated_data = df.interpolate(method='spline', order = order)
    spl = UnivariateSpline(x = df.index, y = df.values, k = order)
    
    interpolated_data2 = spl(target_df.values)
    display(spl)
    print(interpolated_data2)
    
    # Ending the temporal counter
    time_elapsed = str(timedelta(seconds = (time.time() - start_time))).split(":")
    print(f"--> Finisched! Time elapsed: {time_elapsed[0]} h, {time_elapsed[1]} min, {time_elapsed[2].split('.')[0]} sec\n") 
    
    # Compute the perfomance
    predicted_values = interpolated_data[target_df.index]
    #display(predicted_values)
    
    # Compute metrics 
    print("-"*25,f"METRICS", "-"*25)
    mae, rmse, wape, pos_wape, neg_wape = compute_metrics(actual = target_df.values, predicted = predicted_values.values)
    
    if loss_function == 'rmse':
        if rmse < best_metrics_value:
            best_metrics_value = rmse
            best_spline = ""
            best_spline_order = order
            
print("-" * 100 + f"\n\t\t\t\t\tBEST SPLINE: {best_spline_order}\n" + "-" * 100)

------------------------- SPLINE (order: 3) -------------------------

TRAIN DATA with gaps (included in the validation set)
--> TRAIN DATA: 26755 obs. filled with 6689 gaps (i.e., validation set)
------------------------------------------------------------


Date/Time
2017-01-04 17:00:00    10.51
2017-01-11 18:00:00     2.38
2017-01-11 19:00:00     2.25
2017-01-11 20:00:00     1.58
2017-01-11 22:00:00     0.47
                       ...  
2021-10-30 03:00:00      NaN
2021-10-30 11:00:00      NaN
2021-10-31 13:00:00      NaN
2021-10-31 15:00:00      NaN
2021-10-31 17:00:00      NaN
Name: Amb. Temp (°C), Length: 33444, dtype: float64


--> Interpolating 6689 observations... 


<scipy.interpolate.fitpack2.InterpolatedUnivariateSpline at 0x7f9b30342670>

[nan nan nan ... nan nan nan]
--> Finisched! Time elapsed: 0 h, 00 min, 01 sec

------------------------- METRICS -------------------------
--> MAE: 1.0777  || RMSE: 1.4212 
--> WAPE: 5.91 %  --> [Pos] WAPE: 5.63 %  || [Neg] WAPE: 6.22 %
----------------------------------------------------------------------------------------------------
					BEST SPLINE: 3
----------------------------------------------------------------------------------------------------
