In [21]:
import pandas as pd
from collections import OrderedDict, defaultdict
from os import listdir, path, makedirs
from datetime import datetime
import random

In [22]:
# Folder path
%cd /mnt/data/vieri/projects/SAMPLE/

/mnt/data/vieri/projects/SAMPLE


In [23]:
system_name = "Binetto " + "2"
# (Binetto 1): 2021-06 --> "Sheet: Binetto 1" is missed
# (Binetto 2) EMPTY DAYS: 2/07/2020 --> 24/07/2020

# TASK: Get file names for the pv system (one for each month)
folder_path = "./data/" + system_name.upper() + "/Inverter/"
filepaths = [file for file in listdir(folder_path) if file.endswith('.xlsx')]

# Reorder the paths according to their date (ascending order)
filepaths.sort(key = lambda date : datetime.strptime(date[-12:-5],'%Y-%m'))

# ONLY FOR DEBUGING!
#filepaths = random.sample(filepaths, 2) 

print("There're {0} files (i.e., months) available for {1}".format(len(filepaths), system_name))
print("PERIOD: From {0} to {1}".format(filepaths[0][-12:-5], filepaths[-1][-12:-5]))

There're 18 files (i.e., months) available for Binetto 2
PERIOD: From 2020-02 to 2021-07


In [24]:
# TASK: Load all data from the months available
months = []
print("Loading {0} files...".format(len(filepaths)))
for file_path in filepaths:
    print("Loading the data from " + file_path[-12:-5] + " ...")
    months.append(pd.read_excel(path.join(folder_path, file_path), 
                                sheet_name = None, 
                                header = [3,4], 
                                skiprows = [5]))
print("\n--> Loading completed: {0} files (i.e., months)".format(len(months)))

Loading 18 files...
Loading the data from 2020-02 ...
Loading the data from 2020-03 ...
Loading the data from 2020-04 ...
Loading the data from 2020-05 ...
Loading the data from 2020-06 ...
Loading the data from 2020-07 ...
Loading the data from 2020-08 ...
Loading the data from 2020-09 ...
Loading the data from 2020-10 ...
Loading the data from 2020-11 ...
Loading the data from 2020-12 ...
Loading the data from 2021-01 ...
Loading the data from 2021-02 ...
Loading the data from 2021-03 ...
Loading the data from 2021-04 ...
Loading the data from 2021-05 ...
Loading the data from 2021-06 ...
Loading the data from 2021-07 ...

--> Loading completed: 18 files (i.e., months)


In [25]:
# Define some counters for missed sheets
missed_amb_sheet = 0
missed_inv_sheets = 0
missed_system_sheet = 0

# FUNCTION: Get the multiline header flatted
def flat_header(multiheader_df):
    
    # Merginig the column name & unit measure
    flatted_header = [multi_col[0] + " [" + multi_col[1] + "]" 
                      for multi_col in multiheader_df.columns.to_flat_index()]
    multiheader_df.columns = flatted_header
    
    # Rename column of datetime
    renamed_column = {"Date/Time (Europe/Rome) [Unnamed: 0_level_1]":"Date/Time [Europe/Rome]"}
    multiheader_df.rename(columns = renamed_column, inplace=True)
    return multiheader_df

# FUNCTION: Extracting and transforming the different sheets (PV system, Ambiental condition and string inverter sheets)
def extractSheets(month_sheets): 
    
    # Find full names of specific sheets 
    name_ambiental_sheet = ""
    name_system_sheet = ""
    full_inv_names = []
    for key in month_sheets.keys():
        if "VSN800" in key: # Ambiental sheet
            name_ambiental_sheet = key
        if "INV" in key.upper(): # String inverter sheets
            full_inv_names.append(key)
        if system_name.upper() in key.upper():
            name_system_sheet = key 

    # ---------- SHEET: PV system ----------
    if(name_system_sheet):
        month_system = month_sheets[name_system_sheet]
        month_system = flat_header(month_system) # Flatted the multiline header
        
        # Discard this column (as it's present only in one old file (2019-10))
        system_column_to_delate = "3 Phase Line Neutral V [Average(volts)]"
        if system_column_to_delate in month_system.columns: 
            month_system.drop(columns = [system_column_to_delate], inplace=True)
    else:
        month_system = None
        global missed_system_sheet
        missed_system_sheet = missed_system_sheet + 1

    # ---------- SHEET: Ambiental sheet ----------
    if(name_ambiental_sheet):
        month_amb = month[name_ambiental_sheet]
        month_amb.dropna(axis = 1, how= "all", inplace=True) # Drop empty columns
        month_amb = flat_header(month_amb) # flatted the multiline header 
        
        # Discard this column (as it's present only in one old file (2019-10))
        column_to_delate = "Horizontal Insolation [Interval Sum(kilowatt-hours-per-meter-sq)]"
        if column_to_delate in month_amb.columns: 
            month_amb.drop(columns = [column_to_delate], inplace=True)
    else:
        month_amb = None
        global missed_amb_sheet
        missed_amb_sheet = missed_amb_sheet + 1

    # ---------- MULTIPLE SHEETS: String inverters sheets ----------
    # Select the subset containing the string inverter data
    if(len(full_inv_names) != 0):
        month_inverters = OrderedDict([(inv_name[0:6].replace(" ", "_"), month[inv_name]) 
                                       for inv_name in full_inv_names if inv_name in month])
        # Clean the sheets
        for name, sheet in month_inverters.items() : 
            sheet.dropna(axis = 1, how= "all", inplace=True) # Drop empty columns
            sheet = flat_header(sheet)  # flatted the multiline header 
    else:
        month_inverters = None
        global missed_inv_sheets
        missed_inv_sheets = missed_inv_sheets + 1
        
    return month_system, month_amb, month_inverters

In [26]:
# TASK: Group all the monthtly data according to the sheet types (system, ambiental and string inverter data)
system_dfs = []
amb_dfs = []
stringInv_dfs = []
for month in months: 
    month_system, month_amb, month_inverters = extractSheets(month)
    system_dfs.append(month_system)
    amb_dfs.append(month_amb)
    stringInv_dfs.append(month_inverters)
    
# Get the names of the inverters
inverter_names = list(stringInv_dfs[0].keys())

print("Months that have been merged: ", len(system_dfs))
print("Number of string inverter:", len(inverter_names))
print("\nMissed system sheets: ", missed_system_sheet, "(", round(missed_system_sheet/len(system_dfs)*100, 2), "%)")
print("Missed ambiental condition sheets: ", missed_amb_sheet, "(", round(missed_amb_sheet/len(system_dfs)*100,2), "%)")
print("Missed string inverter sheets: ", missed_inv_sheets, "(", round(missed_inv_sheets/len(system_dfs)*100,2), "%)")

Months that have been merged:  18
Number of string inverter: 20

Missed system sheets:  0 ( 0.0 %)
Missed ambiental condition sheets:  0 ( 0.0 %)
Missed string inverter sheets:  0 ( 0.0 %)


In [27]:
# TASK: Merge all the monthly data into a single dataframe 
# A. System and ambiental sheets
system_data = pd.concat(system_dfs, ignore_index=True, sort=False) #sort=True
amb_data = pd.concat(amb_dfs, ignore_index=True, sort=False)

# B1: Group month data of all string inverter data
stringInv_data = defaultdict(list)
for month_data in stringInv_dfs:
    if month_data != None:
        for inv_name, inv_data in month_data.items():
            stringInv_data[inv_name].append(inv_data)

# B2: Merge all the string inverter data into one dataframe
for inv_name, inv_data in stringInv_data.items():
    stringInv_data[inv_name] = pd.concat(inv_data, ignore_index=True, sort=False)
    stringInv_data[inv_name].sort_values(by=['Date/Time [Europe/Rome]'], inplace=True)
    
# C: Sort the dataframes by timestamps 
system_data.sort_values(by=['Date/Time [Europe/Rome]'], inplace=True)
amb_data.sort_values(by=['Date/Time [Europe/Rome]'], inplace=True)

# D: Left outer join
merged_system_data = system_data.merge(amb_data, how= "left", on="Date/Time [Europe/Rome]", suffixes=('_SYS', '_AMB'))

print("---------------- SYSTEM DATA -----------------")
display(system_data.info())
print("---------------- AMBIENTAL DATA -----------------")
display(amb_data.info())
print("---------- STRING INVERTERS DATA ("+ inverter_names[0] + ")-----------")
display(stringInv_data[inverter_names[0]].info())

print("---------------- MERGED SYSTEM DATA -----------------")
display(merged_system_data.info())

---------------- SYSTEM DATA -----------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 157524 entries, 0 to 157523
Data columns (total 14 columns):
Date/Time [Europe/Rome]                            157524 non-null datetime64[ns]
Voltage AN [Average(volts)]                        80833 non-null float64
PV Energy [Interval Sum(kilowatt-hours)]           80455 non-null float64
PV Power [Average(watts)]                          80833 non-null float64
DC Gen. Power [Average(watts)]                     80833 non-null float64
Voltage BN [Average(volts)]                        80833 non-null float64
Voltage [Average(volts)]                           80833 non-null float64
Generated Energy [Interval Sum(kilowatt-hours)]    80455 non-null float64
Current [Average(amps)]                            80833 non-null float64
Cell Temp. [Average(celsius)]                      132758 non-null float64
Generated Power [Average(watts)]                   80833 non-null float64
DC Voltage [Average

None

---------------- AMBIENTAL DATA -----------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 157524 entries, 0 to 157523
Data columns (total 4 columns):
Date/Time [Europe/Rome]                     157524 non-null datetime64[ns]
Cell Temp. [Average(celsius)]               132758 non-null float64
Irradiance [Average(watts-per-meter-sq)]    132758 non-null float64
Ambient Temp. [Average(celsius)]            132758 non-null float64
dtypes: datetime64[ns](1), float64(3)
memory usage: 6.0 MB


None

---------- STRING INVERTERS DATA (INV_53)-----------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 157524 entries, 0 to 157523
Data columns (total 6 columns):
Date/Time [Europe/Rome]                            157524 non-null datetime64[ns]
Generated Power [Average(watts)]                   77447 non-null float64
DC Voltage [Average(volts)]                        77447 non-null float64
Generated Energy [Interval Sum(kilowatt-hours)]    77439 non-null float64
DC Current [Average(amps)]                         77447 non-null float64
DC Current [Average(amps).1]                       77447 non-null float64
dtypes: datetime64[ns](1), float64(5)
memory usage: 8.4 MB


None

---------------- MERGED SYSTEM DATA -----------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 157548 entries, 0 to 157547
Data columns (total 17 columns):
Date/Time [Europe/Rome]                            157548 non-null datetime64[ns]
Voltage AN [Average(volts)]                        80833 non-null float64
PV Energy [Interval Sum(kilowatt-hours)]           80455 non-null float64
PV Power [Average(watts)]                          80833 non-null float64
DC Gen. Power [Average(watts)]                     80833 non-null float64
Voltage BN [Average(volts)]                        80833 non-null float64
Voltage [Average(volts)]                           80833 non-null float64
Generated Energy [Interval Sum(kilowatt-hours)]    80455 non-null float64
Current [Average(amps)]                            80833 non-null float64
Cell Temp. [Average(celsius)]_SYS                  132782 non-null float64
Generated Power [Average(watts)]                   80833 non-null float64
DC Voltage [

None

In [28]:
# Quick look at the pv system dataframe
print("EXAMPLE: PV System [Observations:", len(system_data), "]")
display(system_data.sample(5))

# Quick look at the enviromental data
print("EXAMPLE: Environmental conditions [Observations:", len(amb_data), "]")
display(amb_data.sample(5))

# Quick look at the enviromental data
print("EXAMPLE: Merged PV System[Observations:", len(merged_system_data), "]")
display(merged_system_data.sample(5))

# Quick look at the string inverter dataframe
print("EXAMPLE: String inverter data: ", inverter_names[0], "[Observations:", len(stringInv_data[inverter_names[0]]), "]")
display(stringInv_data[inverter_names[0]].sample(5))

EXAMPLE: PV System [Observations: 157524 ]


Unnamed: 0,Date/Time [Europe/Rome],Voltage AN [Average(volts)],PV Energy [Interval Sum(kilowatt-hours)],PV Power [Average(watts)],DC Gen. Power [Average(watts)],Voltage BN [Average(volts)],Voltage [Average(volts)],Generated Energy [Interval Sum(kilowatt-hours)],Current [Average(amps)],Cell Temp. [Average(celsius)],Generated Power [Average(watts)],DC Voltage [Average(volts)],Voltage CN [Average(volts)],Ambient Temp. [Average(celsius)]
39125,2020-06-15 21:25:00,228.74118,0.0,314.0,9.0,229.152944,228.429415,0.0,0.454118,-40.0,314.0,238.402945,227.39412,18.0
106280,2021-02-04 00:40:00,,,,,,,,,-40.0,,,,8.8
107971,2021-02-09 21:35:00,,,,,,,,,-40.0,,,,11.900001
124937,2021-04-09 20:25:00,230.270002,0.0,424.0,18.0,230.855002,229.990002,0.0,0.513,-40.0,424.0,233.942503,228.845001,6.8
98807,2021-01-09 01:55:00,,,,,,,,,-40.0,,,,6.4


EXAMPLE: Environmental conditions [Observations: 157524 ]


Unnamed: 0,Date/Time [Europe/Rome],Cell Temp. [Average(celsius)],Irradiance [Average(watts-per-meter-sq)],Ambient Temp. [Average(celsius)]
83320,2020-11-16 07:20:00,,,
128573,2021-04-22 11:25:00,-40.0,205.0,14.1
24486,2020-04-26 01:30:00,-40.0,0.0,10.8
90210,2020-12-10 05:30:00,-40.0,0.0,6.1
60267,2020-08-28 07:15:00,-40.0,110.0,23.1


EXAMPLE: Merged PV System[Observations: 157548 ]


Unnamed: 0,Date/Time [Europe/Rome],Voltage AN [Average(volts)],PV Energy [Interval Sum(kilowatt-hours)],PV Power [Average(watts)],DC Gen. Power [Average(watts)],Voltage BN [Average(volts)],Voltage [Average(volts)],Generated Energy [Interval Sum(kilowatt-hours)],Current [Average(amps)],Cell Temp. [Average(celsius)]_SYS,Generated Power [Average(watts)],DC Voltage [Average(volts)],Voltage CN [Average(volts)],Ambient Temp. [Average(celsius)]_SYS,Cell Temp. [Average(celsius)]_AMB,Irradiance [Average(watts-per-meter-sq)],Ambient Temp. [Average(celsius)]_AMB
107463,2021-02-08 01:15:00,,,,,,,,,-40.0,,,,11.8,-40.0,0.0,11.8
13082,2020-03-17 10:10:00,235.288239,14.12,170207.0,173809.0,235.982355,235.484316,14.12,13.820588,-40.0,170207.0,590.952962,235.182354,16.5,-40.0,907.0,16.5
20188,2020-04-11 03:20:00,,,,,,,,,-40.0,,,,13.2,-40.0,0.0,13.2
70991,2020-10-04 12:55:00,237.386669,11.895,142883.0,145642.0,238.273335,237.433335,11.895,13.082666,-40.0,142883.0,542.613346,236.640001,27.700001,-40.0,913.0,27.700001
69659,2020-09-29 21:55:00,,,,,,,,,-40.0,,,,12.3,-40.0,0.0,12.3


EXAMPLE: String inverter data:  INV_53 [Observations: 157524 ]


Unnamed: 0,Date/Time [Europe/Rome],Generated Power [Average(watts)],DC Voltage [Average(volts)],Generated Energy [Interval Sum(kilowatt-hours)],DC Current [Average(amps)],DC Current [Average(amps).1]
95062,2020-12-27 01:50:00,,,,,
140634,2021-06-03 08:30:00,4508.0,607.500031,0.368,3.74,3.88
97935,2021-01-06 01:15:00,,,,,
154670,2021-07-22 02:10:00,,,,,
47881,2020-07-16 07:05:00,,,,,


In [29]:
# TASK: Save the dataframes
save_folder_name =  "Imported data"
stringInv_folder = "String Inverters"
save_folder_path = path.join("./data/", system_name.upper(), save_folder_name)

print("Saving data for '"+ system_name.upper()+"'")

# 0: Create the folders
if not path.exists(save_folder_path):
    makedirs(save_folder_path)
    print("A new saving folder has been created")
if not path.exists(path.join(save_folder_path, stringInv_folder)):
    makedirs(path.join(save_folder_path, stringInv_folder))
    print("A new sub-folder has been created\n")

# 1: Save the merged dataframe of PV system
#system_data.to_csv(save_folder_path + "/System_data.csv", index=False)
merged_system_data.to_csv(save_folder_path + "/pv_system_data.csv", index=False)
print("OK: System data has been saved as CSV (Entries: ", len(merged_system_data), ")")

# 2: Save the raw environmental data
amb_data.to_csv(save_folder_path + "/raw_ambiental_data.csv", index=False)
print("OK: Ambiental data has been saved as CSV (Entries: ", len(amb_data), ")")

# 3: Save all the string inverter data
for inv_name in inverter_names:
    strInv_data = stringInv_data[inv_name]
    #print(strInv_data)
    path_to_save = path.join(save_folder_path, stringInv_folder, ("String_" + inv_name.upper() + "_data.csv"))
    strInv_data.to_csv(path_to_save, index=False)
print("OK: The string inverter data ({0} tables) has been saved".format(len(inverter_names)))

Saving data for 'BINETTO 2'
A new saving folder has been created
A new sub-folder has been created

OK: System data has been saved as CSV (Entries:  157548 )
OK: Ambiental data has been saved as CSV (Entries:  157524 )
OK: The string inverter data (20 tables) has been saved


In [31]:
# Check saved data
system_name = "Binetto " + "2"
save_folder_path = path.join("./data/", system_name.upper(), "Imported data")

# Load files
loaded_system_data = pd.read_csv(path.join(save_folder_path, "pv_system_data.csv"))
loaded_amb_data = pd.read_csv(path.join(save_folder_path, "raw_ambiental_data.csv"))

# Load string inverters files 
inverter_names = []
loaded_strInv_data = dict()                            
filepaths_stringInv = [file for file in listdir(path.join(save_folder_path,"String Inverters")) if file.endswith('.csv')]            
for file in filepaths_stringInv:
    inv_name = "_".join(file.split("_")[1:3])
    loaded_strInv_data[inv_name] = pd.read_csv(path.join(save_folder_path,"String Inverters", file))
    inverter_names.append(inv_name)
                              
# Show loaded data                             
# Quick look at the pv system dataframe
print("-" * 50 + system_name.upper() + "-" * 50)
print("EXAMPLE: PV System merged [Observations:", len(loaded_system_data), "]")
display(loaded_system_data.sample(5))

# Quick look at the enviromental data
print("EXAMPLE: Environmental conditions [Observations:", len(loaded_amb_data), "]")
display(loaded_amb_data.sample(5))

# Quick look at the string inverter dataframe
print("EXAMPLE: String inverter data: ", inverter_names[0], "[Observations:", len(loaded_strInv_data[inverter_names[0]]), "]")
display(loaded_strInv_data[inverter_names[0]].sample(5))

--------------------------------------------------BINETTO 2--------------------------------------------------
EXAMPLE: PV System merged [Observations: 157548 ]


Unnamed: 0,Date/Time [Europe/Rome],Voltage AN [Average(volts)],PV Energy [Interval Sum(kilowatt-hours)],PV Power [Average(watts)],DC Gen. Power [Average(watts)],Voltage BN [Average(volts)],Voltage [Average(volts)],Generated Energy [Interval Sum(kilowatt-hours)],Current [Average(amps)],Cell Temp. [Average(celsius)]_SYS,Generated Power [Average(watts)],DC Voltage [Average(volts)],Voltage CN [Average(volts)],Ambient Temp. [Average(celsius)]_SYS,Cell Temp. [Average(celsius)]_AMB,Irradiance [Average(watts-per-meter-sq)],Ambient Temp. [Average(celsius)]_AMB
4126,2020-02-15 07:50:00,,,,,,,,,,,,,,,,
124683,2021-04-08 21:15:00,228.60667,0.0,235.0,28.0,228.773337,227.971114,0.0,0.518,-40.0,235.0,240.573335,226.533336,2.5,-40.0,0.0,2.5
128651,2021-04-22 15:55:00,229.985003,0.819,8788.0,9365.0,230.850004,230.070003,0.819,0.6035,-40.0,8788.0,595.457512,229.375003,13.2,-40.0,46.0,13.2
116997,2021-03-13 03:45:00,,,,,,,,,-40.0,,,,12.6,-40.0,0.0,12.6
103771,2021-01-26 05:35:00,,,,,,,,,-40.0,,,,5.1,-40.0,0.0,5.1


EXAMPLE: Environmental conditions [Observations: 157524 ]


Unnamed: 0,Date/Time [Europe/Rome],Cell Temp. [Average(celsius)],Irradiance [Average(watts-per-meter-sq)],Ambient Temp. [Average(celsius)]
29793,2020-05-14 11:45:00,-40.0,674.0,26.4
110208,2021-02-17 16:00:00,-40.0,40.0,10.2
78203,2020-10-29 12:55:00,-40.0,630.0,19.5
136647,2021-05-20 12:15:00,-40.0,1018.0,18.9
130730,2021-04-29 23:10:00,-40.0,0.0,18.6


EXAMPLE: String inverter data:  INV_23 [Observations: 157524 ]


Unnamed: 0,Date/Time [Europe/Rome],Generated Power [Average(watts)],DC Voltage [Average(volts)],Generated Energy [Interval Sum(kilowatt-hours)],DC Current [Average(amps)],DC Current [Average(amps).1]
31327,2020-05-19 19:35:00,275.0,544.500031,0.016,0.21,0.22
149969,2021-07-05 18:25:00,1719.0,602.500031,0.144,1.5,1.41
7393,2020-02-26 16:05:00,,,,,
53799,2020-08-05 20:15:00,27.0,251.950005,0.016,0.0,0.0
56725,2020-08-16 00:05:00,,,,,
