In [1]:
# Make the parent folder importable so that the project package `_library` can be found.
# `sys` is imported as a module (instead of `from sys import path`) because the name
# `path` is bound to `os.path` further down in this notebook: sharing one name for both
# objects breaks the later `path.join(...)`/`path.exists(...)` calls whenever this cell
# is re-executed after the imports cell.
import sys

if '..' not in sys.path:
    sys.path.insert(0, '..')

In [2]:
import pandas as pd
import re

In [3]:
from _library.utils import SYSTEM_NAMES, SUBFOLDERS
from _library.fault_utils import load_faults
from dateutil.parser import ParserError
from os import path, makedirs

In [4]:
# NOTE(review): hard-coded absolute path. The relative paths used further down
# (e.g. the 'data' folder where the output is saved) are resolved from this directory.
%cd /mnt/data/vieri/projects/SAMPLE/

/mnt/data/vieri/projects/SAMPLE


# A) The photovoltaic systems

In [5]:
# List the available PV systems and the data sub-folders.
# The ruler below gives the list index of each PV system name printed in the output.
print(f"{SYSTEM_NAMES} \nSUBFOLDERS: --> {SUBFOLDERS}")
# --- 0 ---------- 1 ---------- 2 --------- 3 ---------- 4 -------

['Binetto 1', 'Binetto 2', 'Soleto 1', 'Soleto 2', 'Galatina'] 
SUBFOLDERS: --> ['Cleaned', '1-hour sampling', '1-hour averaged sampling', 'Residuals', 'Residuals_analytical', 'Failure events', None]


## A.1) Selecting the PV system

In [6]:
# Select the PV system through its position in SYSTEM_NAMES.
system_name = SYSTEM_NAMES[2]
print("PV SYSTEM -->", system_name)

PV SYSTEM --> Soleto 1


# B) Retrieve *failure events* 
Possible priority values for the *alarm logs*: 
- "High"
- "Medium"
- "Low"

## B.1) Alarm priorities selection

In [7]:
# Priorities of the alarm logs to load; this list is passed to `load_faults` and is
# also used to build the name of the saved CSV file.
# Alternative selection, kept for reference:
#fault_priorities = ["High","Medium"]
fault_priorities = ["Medium"]

## B.2) Select which data to load
- **Alarm** logs (True/False)
- **String Box alarm** logs (True/False)
- **Included anonymous faults**: (True/False)

In [8]:
# Switches selecting which sources are passed to `load_faults`.
to_load = dict(
    faults = False,
    inv_alarms = False,
    stringBox_alarms = True,
)

# Forwarded to `load_faults` as well (second positional argument).
include_faults_notRelatedToInverters = False

## B.3) Load the failure events

In [9]:
# Load the fault dataset: Storico guasti.xlsx (a.k.a., 'General faults') & PV SYSTEM - Storico Allarme.xlsx (a.k.a., Log - X)
# The arguments are passed positionally, in the order expected by `load_faults`.
fault_df = load_faults(
    system_name,
    include_faults_notRelatedToInverters,
    to_load['inv_alarms'],
    fault_priorities,
    to_load['stringBox_alarms'],
    to_load['faults'],
    verbose = False,
)
fault_df.info()

# Short summary of what has been loaded
print(f"{'-' * 40} DATA AVAILABLE {'-' * 40}")
print("TOTAL:", len(fault_df), "failure events")

----------------------------------------------------------------------------------------------------
					FAULTS: Soleto 1
				PRIORITIES: Medium 
----------------------------------------------------------------------------------------------------
--> [A) General faults skipped]
--> [B) Inverter logs have been skipped (0)]
--> C) String-box logs loaded (INV1: 68975, INV2: 37800, INV3: 26483, INV4: 8399)

Loading completed!

FAUL CAUSES (2):
--------------------
1) Allarme string-box
2) String-box con produzione anomala
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132556 entries, 0 to 132555
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   Causa Guasto       132556 non-null  object        
 1   Componente Guasto  132556 non-null  object        
 2   Inizio             132556 non-null  datetime64[ns]
 3   Fine               132556 non-null  object        
 4   Inverter           13255

In [10]:
# Preview of the failure events as loaded, i.e. before the transformations of section B.4.
display(fault_df)

Unnamed: 0,Causa Guasto,Componente Guasto,Inizio,Fine,Inverter,Tipo
0,Allarme string-box,CSP4.5 V180562 s1: [3] Corrente di stringa fuo...,2018-08-08 10:15:00,2018-08-08 10:36:00,4,Log_stringBox - Medium
1,Allarme string-box,CSP2.4 V180541 s1: [3] Corrente di stringa fuo...,2018-08-08 10:15:00,2018-08-08 11:52:00,2,Log_stringBox - Medium
2,Allarme string-box,CSP4.4 V180561 s1: [3] Corrente di stringa fuo...,2018-08-08 10:15:00,2018-08-08 10:59:00,4,Log_stringBox - Medium
3,Allarme string-box,CSP1.4 V180545 s1: [3] Corrente di stringa fuo...,2018-08-08 10:15:00,2018-08-08 12:02:00,1,Log_stringBox - Medium
4,Allarme string-box,CSP1.6 V180544 s7: [3] Corrente di stringa fuo...,2018-08-08 10:34:00,2018-08-08 11:52:00,1,Log_stringBox - Medium
...,...,...,...,...,...,...
132551,Allarme string-box,CSP2.5 V180542 s4: [3] Corrente di stringa fuo...,2021-09-17 08:46:00,2021-09-17 08:52:00,2,Log_stringBox - Medium
132552,Allarme string-box,CSP4.6 V180543 s11: [3] Corrente di stringa fu...,2021-09-17 08:46:00,2021-09-17 08:52:00,4,Log_stringBox - Medium
132553,Allarme string-box,CSP2.5 V180542 s4: [3] Corrente di stringa fuo...,2021-09-17 09:25:00,2021-09-17 09:33:00,2,Log_stringBox - Medium
132554,Allarme string-box,CSP4.6 V180543 s11: [3] Corrente di stringa fu...,2021-09-17 09:42:00,2021-09-17 09:49:00,4,Log_stringBox - Medium


## B.4) Carry out some transformations

In [11]:
# Placeholder returned by the extraction helpers below when a name cannot be found
# in a log message. Possible choices are listed in the trailing comment.
default_value = None # [None, '-', 'Unknown']

def cast_ending_datatime(df_row):
    """Return the end timestamp of a failure event, falling back to its start.

    Parameters
    ----------
    df_row : mapping-like (e.g., a DataFrame row)
        Must provide the keys 'Fine' (end of the event) and 'Inizio' (start of the event).

    Returns
    -------
    The value of 'Fine' converted with `pd.to_datetime`. When that value cannot be
    parsed (e.g., the 'n.d' artefact), the converted value of 'Inizio' is returned instead.
    """
    try:
        ts = pd.to_datetime(df_row['Fine'])
    except ValueError:
        # `dateutil.parser.ParserError` is a subclass of ValueError, and so are the parsing
        # errors raised by pandas itself. Catching ValueError keeps the fallback working
        # whichever of the two the installed pandas version raises.
        ts = pd.to_datetime(df_row['Inizio'])
    return ts

def retrive_generalPlantBox_names(log_message, plantBox_prefix, include_code_component = False):
    """Extract the name of the general plant box from a log message.

    Parameters
    ----------
    log_message : str
        Text of the log entry (e.g., 'CSP4.5 V180562 s1: [3] ...').
    plantBox_prefix : str
        Prefix that marks the beginning of the name (e.g., 'CSP').
    include_code_component : bool, default False
        If True, 14 characters are kept starting from the prefix (e.g., 'CSP4.5 V180562'),
        otherwise only the first 6 (e.g., 'CSP4.5').

    Returns
    -------
    The extracted name without surrounding whitespace, or the module-level
    `default_value` when the prefix does not occur in the message.
    """
    prefix_pos = log_message.find(plantBox_prefix)
    if prefix_pos == -1:
        return default_value

    chars_to_include = 14 if include_code_component else 6

    # The end of the slice must be relative to the position of the prefix: using
    # `chars_to_include` as an absolute index only works when the prefix is at position 0.
    return log_message[prefix_pos:prefix_pos + chars_to_include].strip()

def retrieve_stringNames(log_message):
    """Return the first string name found in a log message.

    A string name is the letter 's' followed by one or two digits (e.g., 's1', 's11').
    When the message contains no such token, the module-level `default_value` is returned.
    """
    found = re.search(r's\d{1,2}', log_message)
    return found.group() if found else default_value

In [12]:
# Dealing with the artefacts in the column 'Fine' ('n.d' values): the end timestamps
# that cannot be parsed are replaced by the start timestamp of the same event.
fault_df['Fine'] = fault_df.apply(cast_ending_datatime, axis = 1)

# Compute the temporal duration
if 'Durata' not in fault_df.columns:
    temporal_duration = fault_df['Fine'] - fault_df['Inizio']
    fault_df.insert(5, "Durata", temporal_duration)
    print("A) Temporal durations have been computed.")

# Extract the name of the General plant box
if 'Quadro Generale' not in fault_df.columns:
    if system_name == SYSTEM_NAMES[2]:
        plantBox_prefix = 'CSP'
    elif system_name in SYSTEM_NAMES[3:5]:
        plantBox_prefix = 'QC'
    else:
        # Without this branch the prefix stays undefined (NameError on a fresh kernel)
        # or, worse, the value left over from a previous run is silently reused.
        raise ValueError(f"No general plant box prefix is defined for the PV system '{system_name}'.")

    generalPlantBox = fault_df['Componente Guasto'].apply(
        lambda log_message: retrive_generalPlantBox_names(log_message, plantBox_prefix,
                                                          include_code_component = False))
    fault_df.insert(1, 'Quadro Generale', generalPlantBox)
    print(f"B) The general plant box names (e.g., {fault_df['Quadro Generale'].sample(1).iloc[0]}) have been retrieved.")

# Extract the name of the string (e.g., 's1')
if 'Stringa' not in fault_df.columns:
    stringhe = fault_df['Componente Guasto'].apply(retrieve_stringNames)
    fault_df.insert(2, 'Stringa', stringhe)

    print("C) The names concerning the string have been retrieved")

# Clean the column: keep only the text after the last ']', before the first '(' and
# after the last ':' of what remains; the raw column is then dropped.
if "Componente Guasto" in fault_df.columns:
    fault_df['Messaggio'] = fault_df['Componente Guasto'].apply(
        lambda message: message.split(']')[-1].strip().split('(')[0].strip().split(':')[-1].strip())
    fault_df = fault_df.drop(columns = 'Componente Guasto')

# Re-order columns
new_col_order = ['Inverter', 'Quadro Generale', 'Stringa', 'Tipo', 'Causa Guasto', 'Messaggio', 'Durata', 'Inizio', 'Fine']
fault_df = fault_df.reindex(columns = new_col_order)

print("\n" + '-' * 30, 'OUTCOME', '-' * 30)
fault_df.info()
display(fault_df)

A) Temporal durations have been computed.
B) The general plant box names (e.g., CSP2.1) have been retrieved.
C) The names concerning the string have been retrieved

------------------------------ OUTCOME ------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132556 entries, 0 to 132555
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype          
---  ------           --------------   -----          
 0   Inverter         132556 non-null  int64          
 1   Quadro Generale  132556 non-null  object         
 2   Stringa          126053 non-null  object         
 3   Tipo             132556 non-null  object         
 4   Causa Guasto     132556 non-null  object         
 5   Messaggio        132556 non-null  object         
 6   Durata           132556 non-null  timedelta64[ns]
 7   Inizio           132556 non-null  datetime64[ns] 
 8   Fine             132556 non-null  datetime64[ns] 
dtypes: datetime64[ns](2), int64(1), object(5), ti

Unnamed: 0,Inverter,Quadro Generale,Stringa,Tipo,Causa Guasto,Messaggio,Durata,Inizio,Fine
0,4,CSP4.5,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:21:00,2018-08-08 10:15:00,2018-08-08 10:36:00
1,2,CSP2.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:37:00,2018-08-08 10:15:00,2018-08-08 11:52:00
2,4,CSP4.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:44:00,2018-08-08 10:15:00,2018-08-08 10:59:00
3,1,CSP1.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:47:00,2018-08-08 10:15:00,2018-08-08 12:02:00
4,1,CSP1.6,s7,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:18:00,2018-08-08 10:34:00,2018-08-08 11:52:00
...,...,...,...,...,...,...,...,...,...
132551,2,CSP2.5,s4,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:06:00,2021-09-17 08:46:00,2021-09-17 08:52:00
132552,4,CSP4.6,s11,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:06:00,2021-09-17 08:46:00,2021-09-17 08:52:00
132553,2,CSP2.5,s4,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:08:00,2021-09-17 09:25:00,2021-09-17 09:33:00
132554,4,CSP4.6,s11,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:07:00,2021-09-17 09:42:00,2021-09-17 09:49:00


## B.5) Visualize the outcome

In [13]:
# Time span covered by the events (start date of the first and of the last event).
# An empty dataframe is reported as 'n.a.' instead of failing on the first row.
if len(fault_df) > 0:
    first_day = fault_df['Inizio'].min().strftime('%Y-%m-%d')
    last_day = fault_df['Inizio'].max().strftime('%Y-%m-%d')
    period = f"{first_day} --> {last_day}"
else:
    period = "n.a."

print("\n"+"-"* 120 + f"\n\t\t\t\t\t\tFAULT EVENTS (Period: {period})")
print("\t\t\t\tPRIORITIES:", ', '.join(fault_df['Tipo'].unique()) + "\n" + "-"* 120)

# Distinct (non-missing) values of the descriptive columns
inverters = sorted(fault_df['Inverter'].dropna().unique())
generalPlantBoxes = sorted(fault_df['Quadro Generale'].dropna().unique())
# String names are sorted by their number (s2 before s10), not alphabetically
stringNames = sorted(fault_df['Stringa'].dropna().unique(), key = lambda name: int(name[1:]))
uniqueEvents = sorted(fault_df['Messaggio'].dropna().unique())

print("-" * 40, 'DATA AVAILABLE', "-" * 40)
print(f"TOTAL: {len(fault_df)} failure events")
print(f"--> Inverter available ({len(inverters)}): ", ', '.join([str(num) for num in inverters]))
print(f"--> Unique events ({len(uniqueEvents)}): \n\t --> " + '\n\t --> '.join(uniqueEvents))
print(f"\n--> General Plant box available ({len(generalPlantBoxes)}):", ', '.join(generalPlantBoxes))
print(f"--> String names available ({len(stringNames)}):", ', '.join(stringNames))

display(fault_df)

# --------------- Isolate only the (general) faults observations ---------------------------------
fault_to_vis = "General Fault"
fault_type_cond = fault_df["Tipo"] == fault_to_vis
print("\n"+"-"* 105 + f"\n\t\t\t\t\tFAULT EVENTS ('{fault_to_vis}')\n" + "-"* 105)
only_fault_df = fault_df[fault_type_cond]
if len(only_fault_df) > 0:
    display(only_fault_df)
else:
    print(f"\n[{system_name}] No faults available for this PV system.\n")


------------------------------------------------------------------------------------------------------------------------
						FAUL EVENTS (Period: 2018-08-08)
				PRIORITIES: Log_stringBox - Medium
------------------------------------------------------------------------------------------------------------------------
---------------------------------------- DATA AVAILABLE ----------------------------------------
TOTAL: 132556 failure events
--> Inverter available (4):  1, 2, 3, 4
--> Unique events (2): 
	 --> Corrente di stringa fuori range
	 --> String-box con produzione anomala

--> General Plant box available (24): CSP1.1, CSP1.2, CSP1.3, CSP1.4, CSP1.5, CSP1.6, CSP2.1, CSP2.2, CSP2.3, CSP2.4, CSP2.5, CSP2.6, CSP3.1, CSP3.2, CSP3.3, CSP3.4, CSP3.5, CSP3.6, CSP4.1, CSP4.2, CSP4.3, CSP4.4, CSP4.5, CSP4.6
--> String names available (12): s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12


Unnamed: 0,Inverter,Quadro Generale,Stringa,Tipo,Causa Guasto,Messaggio,Durata,Inizio,Fine
0,4,CSP4.5,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:21:00,2018-08-08 10:15:00,2018-08-08 10:36:00
1,2,CSP2.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:37:00,2018-08-08 10:15:00,2018-08-08 11:52:00
2,4,CSP4.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:44:00,2018-08-08 10:15:00,2018-08-08 10:59:00
3,1,CSP1.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:47:00,2018-08-08 10:15:00,2018-08-08 12:02:00
4,1,CSP1.6,s7,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:18:00,2018-08-08 10:34:00,2018-08-08 11:52:00
...,...,...,...,...,...,...,...,...,...
132551,2,CSP2.5,s4,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:06:00,2021-09-17 08:46:00,2021-09-17 08:52:00
132552,4,CSP4.6,s11,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:06:00,2021-09-17 08:46:00,2021-09-17 08:52:00
132553,2,CSP2.5,s4,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:08:00,2021-09-17 09:25:00,2021-09-17 09:33:00
132554,4,CSP4.6,s11,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:07:00,2021-09-17 09:42:00,2021-09-17 09:49:00



---------------------------------------------------------------------------------------------------------
					FAULT EVENTS ('General Fault')
---------------------------------------------------------------------------------------------------------

[Soleto 1] No faults available for this PV system.



# C) Save the dataframe 

## Create the saving folder

In [14]:
# Name of the sub-folder (inside 'Imported data') where the failure events are saved.
saving_folder_name = "Failure events"

In [15]:
# Folder layout: data/<SYSTEM NAME>/<SYSTEM NAME>/Imported data/<saving folder>
main_folder = 'Imported data'
system_path = path.join('data', system_name.upper(), system_name.upper())
saving_folder_path = path.join(system_path, main_folder, saving_folder_name)

# Create the folder (and its missing parents) only when it does not exist yet
if not path.exists(saving_folder_path):
    makedirs(saving_folder_path)
    print(f"PV System --> {system_name.upper()}\nA new saving folder has been created: {saving_folder_path}\n")

## Save the enhanced dataframe

In [16]:
# Concatenate the capitalized priorities (e.g., ["High", "Medium"] --> "HighMedium")
priority_name = "".join(map(str.capitalize, fault_priorities))

In [17]:
# The file name tells whether the inverter alarm logs are part of the dataset
file_name = (f'{priority_name}_invFailureEvent_logs.csv' if to_load['inv_alarms']
             else f'{priority_name}_failureEvent_logs.csv')
print(f"FILE NAME: {file_name}")

FILE NAME: Medium_failureEvent_logs.csv


In [18]:
# Write the enhanced dataframe as CSV (without the index column) in the saving folder
file_path = path.join(saving_folder_path, file_name)
fault_df.to_csv(path_or_buf = file_path, index = False)
print(f"[{system_name.upper()}] The dataset ({len(fault_df)} entries) has been saved ")

[SOLETO 1] The dataset (132556 entries) has been saved 
