In [1]:
# Make the parent folder importable (presumably where the project-local `_library`
# package lives — confirm against the repository layout).
import sys

# `sys` is imported as a module rather than via `from sys import path`: the imports
# cell binds the bare name `path` to `os.path`, so re-running this cell on its own
# would otherwise rebind `path` to the module search list and break the later
# `path.join(...)` / `path.exists(...)` calls.
if '..' not in sys.path:
    sys.path.insert(0, '..')

In [2]:
import numpy as np
import pandas as pd
from _library.utils import SYSTEM_NAMES_FULL, load_datasets, load_amb_cond
from os import path,makedirs
from IPython.display import clear_output

In [3]:
# Change the working directory to the SAMPLE project folder (IPython `%cd` magic).
# NOTE(review): the path is absolute and machine-specific, so this cell only works
# as-is on a host where this folder exists.
%cd /mnt/data/vieri/projects/SAMPLE/

# Print the names of the PV systems; the ruler in the next comment marks the list
# index of each name as it appears in the printed output.
print(SYSTEM_NAMES_FULL)
# --- 0 ---------- 1 --------- 2 ------ 3 ------ 4 --------- 5 --------- 6 -------- 7 ---

/mnt/data/vieri/projects/SAMPLE
['Binetto 1', 'Binetto 2', 'Cantore', 'Emi', 'Soleto 1', 'Soleto 2', 'Galatina', 'Verone']


In [4]:
# Pick the PV system to process by its position in SYSTEM_NAMES_FULL
# (index 3 is 'Emi' in the list printed by the previous cell).
system_name = SYSTEM_NAMES_FULL[3]

# Load the datasets stored in the "Cleaned" subfolder. Only the first four returned
# values are used in this notebook; any further ones are discarded through `*_`.
system_path, inv_data, inv_names, raw_irr_data, *_ = load_datasets(
    system_name,
    subfolder="Cleaned",
    verbose=True,
)

-------------------------------------------------------------------------------- 
				PV SYSTEM --> EMI 
--------------------------------------------------------------------------------

Loading inverter data...
EMI: OK, component data loaded (4) --> INV1, INV2, INV3, INV4

Loading irradiance values...
EMI: OK, raw irradiance data (238796 observations) have been loaded

-------------------------------------------------------------------------------- 
FINISHED!: All datasets have been loaded. (SYS: 4 - IRR FILE: 1)
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 
EXAMPLE --> Emi: INV1 (FROM '2018-07-27' TO '2021-06-30': 1069 days).
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149861 entries, 0 to 149860
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype       

# Merge the irradiance values with the main inverter data

In [5]:
# Attach the irradiance readings to every inverter dataframe (inner join on "Date/Time").
if len(raw_irr_data) > 0:
    # Discard useless irradiance observations (i.e., NaN values)
    raw_irr_data.dropna(inplace=True)
    raw_irr_data.reset_index(inplace=True, drop=True)

    # Columns contributed by the irradiance dataset (everything but the join key).
    irr_value_columns = [col for col in raw_irr_data.columns if col != "Date/Time"]

    for inv_name in inv_names:
        # Re-run guard: this cell overwrites `inv_data[inv_name]` with the merged frame,
        # so running it a second time would merge the irradiance again and pandas would
        # emit duplicated, suffixed columns ("..._x" / "..._y"), shifting the column
        # layout for the following cells. The merge is therefore skipped when every
        # irradiance column is already present.
        # NOTE(review): this assumes the inverter frames never carry a column with the
        # same name as an irradiance column before the first merge — confirm.
        already_merged = bool(irr_value_columns) and all(
            col in inv_data[inv_name].columns for col in irr_value_columns
        )
        if not already_merged:
            inv_data[inv_name] = inv_data[inv_name].merge(raw_irr_data, on="Date/Time", how="inner")
        inv_data[inv_name].info()
else:
    print("Data not available")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128754 entries, 0 to 128753
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Date/Time            128754 non-null  datetime64[ns]
 1   Iac R (A)            128754 non-null  int64         
 2   Iac S (A)            128754 non-null  int64         
 3   Iac T (A)            128754 non-null  int64         
 4   Vac R (V)            128754 non-null  int64         
 5   Vac S (V)            128754 non-null  int64         
 6   Vac T (V)            128754 non-null  int64         
 7   Pac R (kW)           128754 non-null  int64         
 8   E. totale (kWh)      128754 non-null  float64       
 9   Cc 1 (A)             128754 non-null  int64         
 10  Vcc 1 (V)            128754 non-null  int64         
 11  Allarme              128754 non-null  string        
 12  Inverter temp. (°C)  128754 non-null  int64         
 13  Irradiance (W/

# STRAT 2: Compute, for each hourly timestamp, the average of the observations in the preceding 55-minute window

In [6]:
# For every inverter, build a 1-hour sampled dataframe: one row per hourly timestamp t,
# obtained by averaging the observations that fall in the window [t - 55 min, t].
# Hourly timestamps with no observation in their window are skipped (and counted).
averaged_hourly_inv_data = dict()
for inv_name in inv_names:

    # Clean the console output
    clear_output(wait = True)

    print("\n",30*"-", inv_name, 30*"-")
    df = inv_data[inv_name]

    # The column layout does not change inside the loop: read it once.
    column_order = df.columns

    # The first and the last observation taken at minute 0 delimit the hourly grid.
    # The minute is read through the vectorised `.dt` accessor, so no temporary
    # "Time"/"Minute" column has to be added to (and dropped from) the shared frame.
    on_the_hour = df.loc[df["Date/Time"].dt.minute == 0, "Date/Time"]
    start_timestamp = on_the_hour.iloc[0]
    end_timestamp = on_the_hour.iloc[-1] + pd.Timedelta(1, unit="hour")
    hourly_ts = pd.date_range(start_timestamp, end_timestamp, freq="1H")
    print(f"START: {start_timestamp}\nEND:   {end_timestamp}\nTOTAL TIMESTAMPS: {len(hourly_ts)} observations")

    hourly_avg_df = []
    missing_ts_counter = 0
    # Names of the averaged columns that are rounded and finally cast to integer
    # (every numeric column except 'E. totale (kWh)'); filled inside the loop.
    integer_columns = []
    # With long grids the progress line is rewritten in place instead of appended.
    rewrite_progress = len(hourly_ts) >= 1000

    for idx, timestamp in enumerate(hourly_ts):

        if rewrite_progress:
            clear_output(wait = True)

        print(f"\n[{inv_name}] TARGET ({idx+1}/{len(hourly_ts)}):", timestamp)

        # Create a time window --> [t - 55 min, t] --> e.g., [9:05, 10:00] (both ends included)
        window_df = df[df["Date/Time"].between(timestamp - pd.Timedelta(55, unit="minutes"), timestamp)]

        if len(window_df) == 0:
            missing_ts_counter += 1

            # Look back up to 12 hours only to report when the last observation was seen.
            time_window = df[df["Date/Time"].between(timestamp - pd.Timedelta(12, unit="hours"), timestamp)]

            print(f"WARNING: No observations available for this time.")

            if len(time_window) > 0:
                last_timestamp = time_window["Date/Time"].iloc[-1]
                print(f"         Last available: {last_timestamp.time().strftime('%H:%M')} "
                      f"({(timestamp - last_timestamp).components.hours} hours ago)")
            continue

        # Compute the average values
        averaged_observation = window_df.mean(numeric_only=True)

        # Round integer values (apart from the column 'E. totale (kWh)')
        integer_columns = [col for col in averaged_observation.index if col != 'E. totale (kWh)']
        averaged_observation.loc[integer_columns] = averaged_observation.loc[integer_columns].round(decimals = 0)

        # Add the timestamp
        averaged_observation["Date/Time"] = timestamp

        # Add the 'Allarme' string: the one recorded exactly at the target timestamp when
        # available, otherwise the one of the first observation in the window.
        alarm_at_timestamp = window_df.loc[window_df["Date/Time"] == timestamp, "Allarme"]
        alarm_source = alarm_at_timestamp if len(alarm_at_timestamp) > 0 else window_df["Allarme"]
        averaged_observation["Allarme"] = alarm_source.iloc[0]

        # Reorder columns
        averaged_observation = averaged_observation.reindex(index = column_order)

        # Report the window limits as first row, then last row. The previous message
        # listed them swapped (e.g. "between: 21:30 and 21:05").
        # NOTE(review): assumes the rows are in chronological order, as the captured
        # output suggests.
        window_start = window_df["Date/Time"].iloc[0]
        window_end = window_df["Date/Time"].iloc[-1]
        print(f"An averaged observation (out of {len(window_df)}) has been created between: "
              f"{window_start.time().strftime('%H:%M')} and {window_end.time().strftime('%H:%M')}")

        # Add the averaged observation to the new dataframe
        hourly_avg_df.append(averaged_observation)

    # Cast the rounded columns to int, selecting them by name. The previous version used
    # hard-coded positions with `.iloc[:, positions] = ....astype("int64")`: that breaks
    # as soon as the column layout changes and, with pandas >= 2.0 (where such an
    # assignment writes into the existing columns in place), leaves the columns as float.
    hourly_df = pd.DataFrame(hourly_avg_df)
    averaged_hourly_inv_data[inv_name] = hourly_df.astype({col: "int64" for col in integer_columns})

    print("\n", "-"*80, f"\nFinished building the new 1-hour sampled dataframe with {len(hourly_avg_df)} unique timestamps\n"
          f"({missing_ts_counter} timestamp have been skipped since missing values)")


[INV4] TARGET (25662/25662): 2021-06-30 22:00:00
An averaged observation (out of 4) has been created between: 21:30 and 21:05

 -------------------------------------------------------------------------------- 
Finished building the new 1-hour sampled dataframe with 16465 unique timestamps
(9197 timestamp have been skipped since missing values)


# Merge the dataset with the irradiance values and the environmental values

## Flag to decide whether to merge the environmental temperature retrieved from another data source

In [8]:
# Environmental data from a second source are merged only for the systems listed at
# positions 4, 5 and 6 of SYSTEM_NAMES_FULL.
use_amb_temp = system_name in SYSTEM_NAMES_FULL[4:7]
print(f"Using ambiental conditions from a second source: {str(use_amb_temp).upper()}")

Using ambiental conditions from a second source: FALSE


In [10]:
if not use_amb_temp:
    print("This step has been skipped. ")
else:
    # Load the ambient conditions from the second data source.
    # NOTE(review): the source is always "Galatina", whatever `system_name` is —
    # confirm that this is intended for the other systems that enable this step.
    amb_cond = load_amb_cond(system_name = "Galatina")

    # Datasets to enrich and their labels (only the averaged sampling is built here).
    hourly_datasets = [averaged_hourly_inv_data]
    names = ["1-hour averaged sampling"]

    for position, (label, dataset) in enumerate(zip(names, hourly_datasets), start=1):
        print(f"{position}) [{label}] Merging enviromental data.")

        # Inner join on the timestamp: rows without ambient data are dropped.
        for inv_name in inv_names:
            dataset[inv_name] = dataset[inv_name].merge(amb_cond, on="Date/Time", how="inner")

This step has been skipped. 


# Save dataframes

In [11]:
print("PV System --> ", system_name.upper())

# Destination folder for the averaged (strategy 2) output: a sibling of the folder
# the datasets were loaded from (system_path/..).
saving_folder_name_strat2 = "1-hour averaged sampling"
saving_folder_path_strat2 = path.join(system_path, "..", saving_folder_name_strat2)

# Create the saving folder the first time it is needed
folder_is_missing = not path.exists(saving_folder_path_strat2)
if folder_is_missing:
    makedirs(saving_folder_path_strat2)
    print(f"A new saving folder has been created: {saving_folder_path_strat2}\n")

# Write one CSV file per inverter (without the dataframe index)
for inv_name in inv_names:
    file_name_strat2 = f"hourlyAveragedSampling_{inv_name.upper()}_data.csv"
    destination = path.join(saving_folder_path_strat2, file_name_strat2)
    averaged_hourly_inv_data[inv_name].to_csv(destination, index=False)
    print(f"The 1-hour averaged sampling data for '{inv_name}' has been saved.\n")

PV System -->  EMI
The 1-hour averaged sampling data for 'INV1' has been saved.

The 1-hour averaged sampling data for 'INV2' has been saved.

The 1-hour averaged sampling data for 'INV3' has been saved.

The 1-hour averaged sampling data for 'INV4' has been saved.

