In [1]:
# Put the parent directory ('..') at the front of the module search path
# (presumably so that the local `_library` package imported below resolves — confirm).
# `sys` is imported as a module on purpose: the bare name `path` is rebound to
# `os.path` by the imports cell, so using `from sys import path` here would make
# `path.join(...)` fail whenever this cell is re-run on its own.
import sys

if '..' not in sys.path:
    sys.path.insert(0, '..')

In [2]:
from _library.utils import SYSTEM_NAMES, SUBFOLDERS, load_datasets
from matplotlib.dates import ConciseDateFormatter, AutoDateFormatter, AutoDateLocator, DateFormatter, DayLocator
from dateutil.parser import ParserError
from string import ascii_uppercase
from scipy.stats import stats
from matplotlib.ticker import MaxNLocator
from os import path
import _library.fault_utils as fault_utils
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import ceil

In [3]:
# Folder path: switch the working directory to the project folder
# (presumably so that the data paths used afterwards resolve from there — confirm).
# NOTE(review): this absolute path is machine-specific and must be adapted before running elsewhere.
%cd /mnt/data/vieri/projects/SAMPLE/

/mnt/data/vieri/projects/SAMPLE


# A.1) Load inverter data

In [4]:
# PV system to analyse: entry at index 2 of SYSTEM_NAMES (reported as "SOLETO 1" by the loading log)
system_name = SYSTEM_NAMES[2]
# Sub-folder name forwarded to load_datasets (presumably selects the 1-hour averaged data — confirm)
subfolder = "1-hour averaged sampling"

In [5]:
# Keep the first three values returned by load_datasets and discard the rest:
#   folder_path -> folder used later to locate the failure-event logs
#   inv_data    -> mapping from inverter name to its DataFrame
#   inv_names   -> inverter names used to iterate over inv_data
folder_path, inv_data, inv_names, *_, = load_datasets(system_name, verbose=True, subfolder = subfolder)

-------------------------------------------------------------------------------- 
				PV SYSTEM --> SOLETO 1 
--------------------------------------------------------------------------------

Loading inverter data...
SOLETO 1: OK, component data loaded (4) --> INV1, INV2, INV3, INV4
-------------------------------------------------------------------------------- 
FINISHED!: All datasets have been loaded. (SYS: 4 - IRR FILE: 0)
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 
EXAMPLE --> Soleto 1: INV1 (FROM '2018-08-08' TO '2021-06-30': 1057 days).
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15586 entries, 0 to 15585
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Date/Tim

## Compute the DC power

In [6]:
 for inv_name in inv_names:
    df = inv_data[inv_name]

    # Compute the power --> P = V * A --> Power = Voltage * Ampere
    power = df["Vcc 1 (V)"] * df["Cc 1 (A)"]

    # Transform the WATT (W) into KILOWATT (kW)
    kilowatt = power / 1000

    if "DC Power (kW)" in df.columns:
        # The cell has already been run on this DataFrame: refresh the values in place.
        # (DataFrame.insert would raise a ValueError because the column already exists.)
        df["DC Power (kW)"] = kilowatt
    else:
        # Insert the computed values right after the DC voltage column
        dc_power_pos = df.columns.get_loc("Vcc 1 (V)") + 1
        df.insert(dc_power_pos, "DC Power (kW)", kilowatt)

    print(f"{inv_name}: Added the computed column 'DC Power (kW)' in the data "\
          "(i.e., using the formula 'P = V *A').")

INV1: Added the computed column 'DC Power (kW)' in the data (i.e., using the formula 'P = V *A').
INV2: Added the computed column 'DC Power (kW)' in the data (i.e., using the formula 'P = V *A').
INV3: Added the computed column 'DC Power (kW)' in the data (i.e., using the formula 'P = V *A').
INV4: Added the computed column 'DC Power (kW)' in the data (i.e., using the formula 'P = V *A').


# A.2) Retrieve *failure events* 

In [7]:
# Priority levels of the failure events to analyse; they are concatenated (in this order)
# to build the name of the log file that is read afterwards.
fault_priorities = ["High", "Medium"]

In [8]:
# Name of the folder holding the failure-event logs; it is looked up one level above `folder_path`.
data_folder = 'Failure events'

In [9]:
# File path: <folder_path>/../<data_folder>/<priorities>_failureEvent_logs.csv
priority_names = ''.join(fault_priorities)
file_name = f'{priority_names}_failureEvent_logs.csv'
file_path = path.join(folder_path, "..", data_folder, file_name)

# Read the csv.
# The timestamp columns are selected by name rather than by position, so the parsing
# does not silently hit the wrong columns if the column order of the file changes.
fault_df = pd.read_csv(file_path, parse_dates = ['Inizio', 'Fine'], dtype = {'Inverter': 'Int64'})
# Vectorised conversion of the duration strings (e.g. '0 days 00:21:00') into timedeltas
fault_df['Durata'] = pd.to_timedelta(fault_df['Durata'])

# Retrieve the distinct (non-missing) inverters, general plant boxes and string names
inverters = sorted(fault_df['Inverter'].dropna().unique())
generalPlantBoxes = sorted(fault_df['Quadro Generale'].dropna().unique())
stringNames = sorted(fault_df['Stringa'].dropna().unique())

# Visualize the information
print("-" * 110 + "\n" + "-" * 50, system_name.upper(), "-" * 50 + "\n" + "-" * 110)
print("Logs concerning failure events have been loaded.\n")
print("-" * 40, 'DATA AVAILABLE', "-" * 40)
print(f"--> Inverter available ({len(inverters)}): ", ', '.join([str(num) for num in inverters]))
print(f"--> General Plant box available ({len(generalPlantBoxes)}):", ', '.join(generalPlantBoxes[:10]), "...")
print(f"--> String names available ({len(stringNames)}):", ', '.join(stringNames))

print("\n", "-" * 40, 'FEATURES', "-" * 40)
fault_df.info()

# The covered period is taken from the earliest/latest starting timestamp, so it stays
# correct even when the rows of the file are not sorted chronologically.
first_start = fault_df["Inizio"].min()
last_start = fault_df["Inizio"].max()
print("\n" + "-" * 20, f'LOG EXAMPLES (COVERED PERIOD: FROM {first_start.strftime("%Y-%m-%d")} '\
        f'TO {last_start.strftime("%Y-%m-%d")})', "-" * 20)
display(fault_df)

--------------------------------------------------------------------------------------------------------------
-------------------------------------------------- SOLETO 1 --------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Logs concerning failure events have been loaded.

---------------------------------------- DATA AVAILABLE ----------------------------------------
--> Inverter available (4):  1, 2, 3, 4
--> General Plant box available (24): CSP1.1, CSP1.2, CSP1.3, CSP1.4, CSP1.5, CSP1.6, CSP2.1, CSP2.2, CSP2.3, CSP2.4 ...
--> String names available (12): s1, s10, s11, s12, s2, s3, s4, s5, s6, s7, s8, s9

 ---------------------------------------- FEATURES ----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134561 entries, 0 to 134560
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype          
---  ------       

Unnamed: 0,Inverter,Quadro Generale,Stringa,Tipo,Causa Guasto,Messaggio,Durata,Inizio,Fine
0,4,CSP4.5,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:21:00,2018-08-08 10:15:00,2018-08-08 10:36:00
1,2,CSP2.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:37:00,2018-08-08 10:15:00,2018-08-08 11:52:00
2,1,CSP1.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:47:00,2018-08-08 10:15:00,2018-08-08 12:02:00
3,1,CSP1.3,s1,Log_stringBox - High,Allarme string-box,Allarme fusibile su polo negativo,0 days 01:06:00,2018-08-08 10:15:00,2018-08-08 11:21:00
4,4,CSP4.4,s1,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:44:00,2018-08-08 10:15:00,2018-08-08 10:59:00
...,...,...,...,...,...,...,...,...,...
134556,2,CSP2.5,s4,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:06:00,2021-09-17 08:46:00,2021-09-17 08:52:00
134557,4,CSP4.6,s11,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:06:00,2021-09-17 08:46:00,2021-09-17 08:52:00
134558,2,CSP2.5,s4,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:08:00,2021-09-17 09:25:00,2021-09-17 09:33:00
134559,4,CSP4.6,s11,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 00:07:00,2021-09-17 09:42:00,2021-09-17 09:49:00


# Utils

In [10]:
# Use seaborn's "whitegrid" theme for the figures created from here on
sns.set_theme(style = "whitegrid")

def generate_num_matrix(alarm_df, alarm_name):
    """
    Build a 1-D integer array carrying the number of instances of a given alarm.

    The array has one cell per row of `alarm_df` (in row order): the cell holds the
    'Instances' value of the row when its 'Errore' equals `alarm_name`, 0 otherwise.

    Rows are addressed by position, not by index label, so the function also works
    on DataFrames whose index is not 0..n-1 (e.g., after sorting or filtering).

    Parameters
    ----------
    alarm_df : pd.DataFrame
        Alarm table with (at least) the columns 'Errore' and 'Instances'.
    alarm_name : str
        Value of the 'Errore' column to select.

    Returns
    -------
    np.ndarray
        Integer array of length `len(alarm_df)`.

    Raises
    ------
    AssertionError
        If the number of non-zero cells differs from the number of selected alarms
        (i.e., a selected alarm has zero instances).
    """
    # Initialize the matrix
    matrix = np.zeros(shape = len(alarm_df), dtype = int)

    # Isolate the selected alarms (positional boolean mask)
    is_selected = (alarm_df['Errore'] == alarm_name).to_numpy(dtype = bool)
    num_selected = int(is_selected.sum())

    # Set the cells of the selected alarms to their number of instances
    matrix[is_selected] = alarm_df.loc[is_selected, 'Instances'].to_numpy()

    # Check the validity
    num_nonZeros = int(np.count_nonzero(matrix))

    assert num_nonZeros == num_selected, f"ISSUE, Matrix ones: {num_nonZeros} || ALARMS: {num_selected}"

    # Return the computed matrix
    return matrix

def compute_previous_alarms(alarm_df, df_row, hours_to_consider, escluding_highAlarms = True):
    """
    Count the instances of the alarm of `df_row` over a look-back window.

    The window spans from `hours_to_consider` hours before the row's 'Hourly timestamp'
    up to that timestamp (both ends included); only the rows of `alarm_df` sharing the
    same 'Errore' are summed.

    The row's own 'Instances' value is returned unchanged when no look-back is requested
    (`hours_to_consider == 0`) or when `escluding_highAlarms` is truthy and the row's
    'Tipo' is 'Log_stringBox - High'.
    """
    keep_own_value = hours_to_consider == 0 or (
        escluding_highAlarms and df_row['Tipo'] == 'Log_stringBox - High'
    )
    if keep_own_value:
        return df_row['Instances']

    # Boundaries of the look-back window
    window_end = df_row['Hourly timestamp']
    window_start = window_end - pd.Timedelta(hours_to_consider, unit = "hours")

    # Rows inside the window that carry the same error
    inside_window = alarm_df['Hourly timestamp'].between(window_start, window_end)
    same_error = alarm_df['Errore'] == df_row['Errore']

    # Sum their instances
    return alarm_df[inside_window & same_error]['Instances'].sum()

def df_with_previous_alarms(alarm_df, hours_to_consider):
    """
    Return a copy of `alarm_df` whose 'Instances' column holds, for every row,
    the value computed by `compute_previous_alarms` over the previous
    `hours_to_consider` hours. The DataFrame passed in is left untouched.
    """
    updated_df = alarm_df.copy()

    def _windowed_instances(row_df):
        # Evaluate the row against the copied table
        return compute_previous_alarms(updated_df, row_df, hours_to_consider)

    # Row-wise evaluation; the column is replaced only once all rows have been computed
    updated_df['Instances'] = updated_df.apply(_windowed_instances, axis = 1)

    return updated_df

## Retrieve the hourly timestamp

In [11]:
def _truncate_to_hour(value):
    """Parse `value` as a timestamp and drop everything below the hour."""
    ts = pd.to_datetime(value)
    if pd.isna(ts):
        # A missing timestamp is handled like an unparsable one
        raise ValueError(f"Missing timestamp: {value!r}")
    return ts.replace(minute = 0, second = 0, microsecond = 0, nanosecond = 0)

def convert_timestamps(df_row, hour_to_consider = 'Fine'):
    """
    Return the timestamp of a failure event truncated to the hour.

    The value of the column `hour_to_consider` is used; when it is missing or cannot be
    parsed, the 'Inizio' column of the same row is used instead.

    Parameters
    ----------
    df_row : pd.Series (or mapping)
        Row holding the `hour_to_consider` and 'Inizio' entries.
    hour_to_consider : str
        Name of the column to read first (default: 'Fine').

    Returns
    -------
    pd.Timestamp
        The selected timestamp with minutes, seconds and sub-seconds set to zero.

    Raises
    ------
    ValueError
        If the 'Inizio' fallback is missing or cannot be parsed either.
    """
    try:
        return _truncate_to_hour(df_row[hour_to_consider])
    except ValueError:
        # ValueError covers dateutil's ParserError as well as the parsing errors raised by
        # recent pandas versions, so the fallback works regardless of the installed version.
        return _truncate_to_hour(df_row['Inizio'])

In [12]:
# Attach to every failure event its hour-truncated timestamp (row-wise conversion),
# then order the events by that timestamp (the DataFrame is sorted in place).
hourly_timestamps = fault_df.apply(convert_timestamps, axis = 1)
fault_df['Hourly timestamp'] = hourly_timestamps
fault_df.sort_values(by = 'Hourly timestamp', inplace = True)

## Extract the error names

In [13]:
# Expose the alarm message as the 'Errore' column, placed where 'Messaggio' used to be.
# The raw message is used as-is (no clean-up of the message text is applied).
# The guard makes the cell safe to re-run: once 'Messaggio' has been replaced there is
# nothing left to do, whereas an unguarded second run would fail with a KeyError.
if 'Errore' not in fault_df.columns:
    errors_df = fault_df['Messaggio'].copy()
    fault_df.insert(5, 'Errore', errors_df)
    fault_df.drop(columns = ['Messaggio'], inplace = True)

## Visualization

In [14]:
# Flags of the visualization section.
# NOTE(review): none of them is read by the cells visible in this part of the notebook;
# confirm their exact effect against the cells that consume them.
filling_empty_hours = True
empty_hours_to_zero = True
drop_high_freq_category = False

In [15]:
# Notebook-level verbosity flag.
# NOTE(review): the explore_situation call in this notebook passes its own `verbose` argument,
# so this global is presumably read by other cells — confirm.
verbose = False

# Master function

## Utils

In [16]:
def explore_situation(fault_df, inv_data, inverter, timestamp, groupByString = False, lower_temporal_tol = 0, 
                      upper_temporal_tol = 1, use_the_failure_star = True, verbose = False):
    """
    Show the failure events and the inverter readings recorded around a timestamp.

    The inspected window goes from `timestamp - lower_temporal_tol` to
    `timestamp + upper_temporal_tol` (tolerances expressed in hours). The function prints the
    window, displays a per-stringbox summary of the failure events of the inverter and the
    inverter readings of the same window, and returns both selections.

    Parameters
    ----------
    fault_df : pd.DataFrame
        Failure events; the columns 'Inverter', 'Inizio', 'Fine', 'Durata', 'Quadro Generale',
        'Causa Guasto', 'Tipo', 'Errore' (and 'Stringa' when `groupByString` is set) are read.
    inv_data : mapping
        Inverter readings keyed by 'INV<inverter>'; each value needs a 'Date/Time' column.
    inverter : int
        Inverter number, matched against fault_df['Inverter'].
    timestamp : str or datetime-like
        Reference timestamp (parsed with pd.to_datetime).
    groupByString : bool
        If True the summary is also broken down by 'Stringa'.
    lower_temporal_tol, upper_temporal_tol : number
        Hours to look before / after `timestamp`.
    use_the_failure_star : bool
        If True the events are selected by their start ('Inizio'), otherwise by their end ('Fine').
    verbose : bool
        If True the raw selected events and all the inverter columns are displayed.

    Returns
    -------
    tuple of pd.DataFrame
        (selected_failure_events, selected_inv_data)

    Notes
    -----
    Relies on the `display` function made available by IPython/Jupyter.
    """
    ts = pd.to_datetime(timestamp)
    
    # PART 1) Failure events
    # 1.1) Retrieve the inverter failure events
    inv_alarms = fault_df[fault_df['Inverter'] == inverter]
    
    # 1.2) Retrieve the situation concerning that timestamp
    starting_ts = ts - pd.Timedelta(lower_temporal_tol, unit = "hours")
    ending_ts = ts + pd.Timedelta(upper_temporal_tol, unit = "hours")
    # Whole length of the window in hours (the 'hours' component alone would wrap around
    # every 24 hours and, e.g., report a 30-hour window as 6 hours)
    period_hours = (ending_ts - starting_ts) / pd.Timedelta(1, unit = "hours")
    print(f"PERIOD ({period_hours:g} hours): FROM '{starting_ts.strftime('%Y-%m-%d (%H:%M)')}' "\
          f"TO '{ending_ts.strftime('%Y-%m-%d (%H:%M)')}'\n")
    
    # 1.3) Select the period according to the start/end of the failure events (bounds included)
    reference_col = 'Inizio' if use_the_failure_star else 'Fine'
    cond = inv_alarms[reference_col].between(starting_ts, ending_ts)
    selected_failure_events = inv_alarms[cond] 
    
    if verbose:
        display(selected_failure_events)
    
    # 1.4) Group by stringbox 
    if groupByString: 
        cols_to_group = ['Quadro Generale', 'Stringa', 'Causa Guasto', 'Tipo', 'Errore']
    else:
        cols_to_group = ['Quadro Generale', 'Causa Guasto', 'Tipo', 'Errore']
    stringBoxSituation = selected_failure_events.groupby(by = cols_to_group)['Durata'].agg(['count', 'sum'])
    stringBoxSituation = stringBoxSituation.rename(columns = {'sum': 'Summed period', 'count' : 'Total events'})
    print("-" * 45, f"{stringBoxSituation['Total events'].sum()} FAILURE EVENTS", "-" * 45)
    display(stringBoxSituation)
    
    # 2) Inverter data
    # 2.1) Select the inverter data
    inv_name = 'INV' + str(inverter)
    inverter_data = inv_data[inv_name]
    
    # 2.2) Select the period: the starting bound is excluded, the ending bound is included
    cond = inverter_data['Date/Time'].between(starting_ts, ending_ts, inclusive = 'right') #both, left, right
    selected_inv_data = inverter_data[cond]
    
    print("-" * 27, "INVERTER SITUATION", "-" * 26)
    if verbose:
        display(selected_inv_data)
    else:
        cols_to_visualize = ['Date/Time', 'Cc 1 (A)', 'Vcc 1 (V)', 'DC Power (kW)', 'E. totale (kWh)', 
                             'Irradiance (W/mq)', 'Amb. Temp (°C)', ]
        # Show only the columns that exist for this inverter (some may not be available),
        # instead of failing with a KeyError
        available_cols = [col for col in cols_to_visualize if col in selected_inv_data.columns]
        display(selected_inv_data[available_cols])
    
    return selected_failure_events, selected_inv_data

## Explore the situation

In [17]:
# Inverter number to inspect: explore_situation matches it against fault_df['Inverter']
# and uses it to build the 'INV<number>' key of inv_data.
inverter = 1

In [20]:
# Reference timestamp of the window to inspect (parsed with pd.to_datetime inside explore_situation)
timestamp = '2021-02-17 08:00'

In [22]:
# Inspect the default window (from `timestamp` to one hour later) for the chosen inverter.
# groupByString = False summarises the events per general plant box (no per-string breakdown);
# verbose = True also displays the raw selected events and every inverter column.
selected_failure_events, selected_inv_data = explore_situation(fault_df, inv_data, inverter, timestamp, 
                                                               groupByString = False, verbose = True)

PERIOD (1 hours): FROM '2021-02-17 (08:00)' TO '2021-02-17 (09:00)'



Unnamed: 0,Inverter,Quadro Generale,Stringa,Tipo,Causa Guasto,Errore,Durata,Inizio,Fine,Hourly timestamp
109387,1,CSP1.2,s5,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:04:00,2021-02-17 08:47:00,2021-02-17 09:51:00,2021-02-17 09:00:00
109386,1,CSP1.2,s8,Log_stringBox - Medium,Allarme string-box,Corrente di stringa fuori range,0 days 01:04:00,2021-02-17 08:47:00,2021-02-17 09:51:00,2021-02-17 09:00:00


--------------------------------------------- 2 FAILURE EVENTS ---------------------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total events,Summed period
Quadro Generale,Causa Guasto,Tipo,Errore,Unnamed: 4_level_1,Unnamed: 5_level_1
CSP1.2,Allarme string-box,Log_stringBox - Medium,Corrente di stringa fuori range,2,0 days 02:08:00


--------------------------- INVERTER SITUATION --------------------------


Unnamed: 0,Date/Time,Iac R (A),Iac S (A),Iac T (A),Vac R (V),Vac S (V),Vac T (V),Pac R (kW),E. totale (kWh),Cc 1 (A),...,DC Power (kW),Allarme,Inverter temp. (°C),Irradiance (W/mq),Amb. Temp (°C),Humidity (%),Atmospheric Pressure (hPa),Rainfall (mm),Wind speed (m/s),Wind direction (°)
13393,2021-02-17 09:00:00,16,15,15,113,113,114,5,3755193.87,12,...,6.456,553701696,8,368,6.64,60.31,1015.61,0.0,,
